From b3e951c53c4fddeeb4358a771f1947dc22037526 Mon Sep 17 00:00:00 2001
From: Kevin Day
Date: Thu, 23 Apr 2020 21:43:01 -0500
Subject: [PATCH] Bugfix: fix UTF-8 whitespace detection and provide zero-width
 detection function

The whitespace detection codes for UTF-8 were incorrect.
Non-printing characters, called zero-width, are not whitespace.
Move them out of the whitespace detection and provide a new function for detecting zero-width.

Handle additional UTF-8 whitespace character codes that I had previously missed.
---
Reviewer note: in f_utf_character_is_whitespace() the U+0085 test was written
as "character == 0xc2a00000 || 0xc2850000". The right-hand operand is a
non-zero constant, so every character whose first byte is 0xc2 was reported
as whitespace. The added line now compares character against both codes.

 level_0/f_utf/c/utf.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-----
 level_0/f_utf/c/utf.h | 20 +++++++++++++++
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c
index 5a0faa2..e6d7838 100644
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -2492,20 +2492,79 @@ extern "C" {
       return f_status_is_error(f_invalid_utf);
     }
 
-    // Latin-1 Supplement: U+00A0, U+00AD.
-    if (character == 0xc2a00000 || character == 0xc2ad0000) {
-      return f_true;
+    // reduce the number of checks by grouping checks by first byte.
+    uint8_t byte_first = f_macro_utf_character_to_char_1(character);
+
+    if (byte_first == 0xc2) {
+      // Latin-1 Supplement: U+00A0, U+0085.
+      if (character == 0xc2a00000 || character == 0xc2850000) {
+        return f_true;
+      }
     }
+    else if (byte_first == 0xe2) {
+      // General Punctuation: U+2000, U+2001, U+2002, U+2003.
+      if (character == 0xe2808000 || character == 0xe2808100 || character == 0xe2808200 || character == 0xe2808300) {
+        return f_true;
+      }
 
-    // Tags: U+E0020.
-    if (character == 0xf3a08080) {
-      return f_true;
+      // General Punctuation: U+2004, U+2005, U+2006, U+2007.
+      if (character == 0xe2808400 || character == 0xe2808500 || character == 0xe2808600 || character == 0xe2808700) {
+        return f_true;
+      }
+
+      // General Punctuation: U+2008, U+2009, U+200A, U+2028.
+      if (character == 0xe2808800 || character == 0xe2808900 || character == 0xe2808a00 || character == 0xe280a800) {
+        return f_true;
+      }
+
+      // General Punctuation: U+2029, U+202F, U+205F.
+      if (character == 0xe280a900 || character == 0xe2819f00 || character == 0xe280af00) {
+        return f_true;
+      }
+    }
+    else if (byte_first == 0xe3) {
+      // CJK Symbols and Punctuation: U+3000.
+      if (character == 0xe3808000) {
+        return f_true;
+      }
     }
 
     return f_false;
   }
 #endif // _di_f_utf_character_is_whitespace_
 
+#ifndef _di_f_utf_character_is_zero_width_
+  f_return_status f_utf_character_is_zero_width(const f_utf_character character) {
+    if (f_macro_utf_character_width_is(character) == 1) {
+      return f_status_is_error(f_invalid_utf);
+    }
+
+    // reduce the number of checks by grouping checks by first byte.
+    uint8_t byte_first = f_macro_utf_character_to_char_1(character);
+
+    if (byte_first == 0xe1) {
+      // Mongolian: U+180E.
+      if (character == 0xe1a08e00) {
+        return f_true;
+      }
+    }
+    else if (byte_first == 0xe2) {
+      // General Punctuation: U+200B, U+200C, U+200D, U+2060.
+      if (character == 0xe2808b00 || character == 0xe2808c00 || character == 0xe2808d00 || character == 0xe281a000) {
+        return f_true;
+      }
+    }
+    else if (byte_first == 0xef) {
+      // Arabic Presentation Forms-B: U+FEFF.
+      if (character == 0xefbbbf00) {
+        return f_true;
+      }
+    }
+
+    return f_false;
+  }
+#endif // _di_f_utf_character_is_zero_width_
+
 #ifndef _di_f_utf_character_to_char_
   f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, uint8_t *max_width) {
     #ifndef _di_level_0_parameter_checking_
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h
index 6cf3a73..f81e936 100644
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -620,6 +620,9 @@ extern "C" {
 /**
  * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space character.
  *
+ * Non-printing or zero-width characters are not considered whitespace.
+ * This does include line separators like '\n'.
+ *
  * @param character
  *   The character to validate.
  *
@@ -633,6 +636,23 @@ extern "C" {
 #endif // _di_f_utf_character_is_whitespace_
 
 /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character.
+ *
+ * Only characters that do not print, which are generally called zero-width.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   f_true if a UTF-8 non-printing or zero-width character.
+ *   f_false if not a UTF-8 non-printing or zero-width character.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ */
+#ifndef _di_f_utf_character_is_zero_width_
+  extern f_return_status f_utf_character_is_zero_width(const f_utf_character character);
+#endif // _di_f_utf_character_is_zero_width_
+
+/**
  * Convert a specialized f_utf_character type to a int8_t, stored as a string (character buffer).
  *
  * This will also convert ASCII characters stored in the utf_character array.
-- 
1.8.3.1