From: Kevin Day Date: Thu, 15 Aug 2019 02:36:50 +0000 (-0500) Subject: Update: add UTF-8 BOM detection X-Git-Tag: 0.5.0~483 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=4c73bcae576c427905cb39b8f0c60b60b7e1c793;p=fll Update: add UTF-8 BOM detection Add support for detecting UTF-8 BOM in the f_utf_is_space() function. Add new function f_utf_is_bom(). --- diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index b07f574..85a803f 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -4,6 +4,32 @@ extern "C" { #endif +#ifndef _di_f_utf_is_bom_ + f_return_status f_utf_is_bom(const f_string character, const f_u_short maxWidth) { + #ifndef _di_level_0_parameter_checking_ + if (maxWidth < 1) return f_error_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + f_u_short width = f_macro_utf_byte_width(*character); + + if (width == 1) { + return f_false; + } + + if (width > maxWidth) { + return f_error_set_error(f_maybe); + } + + if (width == 3) { + if (!memcmp(character, f_utf_bom, width)) { + return f_true; + } + } + + return f_false; + } +#endif // _di_f_utf_is_bom_ + #ifndef _di_f_utf_is_space_ f_return_status f_utf_is_space(const f_string character, const f_u_short maxWidth) { #ifndef _di_level_0_parameter_checking_ @@ -129,6 +155,10 @@ extern "C" { return f_true; } + if (!memcmp(character, f_utf_bom, width)) { + return f_true; + } + return f_false; } diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 2965bc0..aee00cc 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -183,8 +183,30 @@ extern "C" { #endif // _di_f_utf_substitute_ /** + * Check to see if the entire byte block of the character is a UTF-8 BOM. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by maxWidth. + * @param maxWidth + * The maximum width available for checking. + * Can be anything greater than 0. 
+ * + * @return + * f_true if a UTF-8 BOM. + * f_false if not a UTF-8 BOM. + * f_maybe (with error bit) if this could be a UTF-8 BOM but width is not long enough. + * f_invalid_parameter (with error bit) if a parameter is invalid. + */ +#ifndef _di_f_utf_is_bom_ + extern f_return_status f_utf_is_bom(const f_string character, const f_u_short maxWidth); +#endif // _di_f_utf_is_bom_ + +/** * Check to see if the entire byte block of the character is a UTF-8 whitespace or substitute character. * + * This will also return TRUE for the UTF-8 BOM. + * * This does not check non-UTF-8 whitespace. * * @param character @@ -195,7 +217,7 @@ extern "C" { * Can be anything greater than 0. * * @return - * f_true if a UTF-8 whitespace or substitute. + * f_true if a UTF-8 whitespace, substitute, or UTF-8 BOM. * f_false if not a UTF-8 whitespace or substitute. * f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough. * f_invalid_parameter (with error bit) if a parameter is invalid.