From: Kevin Day Date: Mon, 13 Jun 2022 23:58:12 +0000 (-0500) Subject: Bugfix: Problems exposed by unit tests for f_utf. X-Git-Tag: 0.5.10~43 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=2894bf88597b44d5f8ffd1ab36a53e6455facf4b;p=fll Bugfix: Problems exposed by unit tests for f_utf. Correct comments and add missing characters. Add the missing f_utf_character_is_surrogate() function. Change the is valid algorithm to one I developed for the unit tests. These are cleaner and simpler due to bitwise operations. --- diff --git a/level_0/f_utf/c/private-utf_digit.c b/level_0/f_utf/c/private-utf_digit.c index ef7996c..f794ac2 100644 --- a/level_0/f_utf/c/private-utf_digit.c +++ b/level_0/f_utf/c/private-utf_digit.c @@ -254,7 +254,7 @@ extern "C" { return F_true; } - // Tirhuta: U+9199 to U+114D9. + // Tirhuta: U+114D0 to U+114D9. if (character >= 0xf0919390 && character <= 0xf0919399) { return F_true; } diff --git a/level_0/f_utf/c/private-utf_emoji.c b/level_0/f_utf/c/private-utf_emoji.c index 1f54beb..0511052 100644 --- a/level_0/f_utf/c/private-utf_emoji.c +++ b/level_0/f_utf/c/private-utf_emoji.c @@ -502,6 +502,11 @@ extern "C" { if (character >= 0xf09fab90 && character <= 0xf09fab96) { return F_true; } + + // U+1FAF6. + if (character == 0xf09fabb6) { + return F_true; + } } } diff --git a/level_0/f_utf/c/private-utf_valid.c b/level_0/f_utf/c/private-utf_valid.c index 2b7adfa..6331429 100644 --- a/level_0/f_utf/c/private-utf_valid.c +++ b/level_0/f_utf/c/private-utf_valid.c @@ -9,76 +9,46 @@ extern "C" { #if !defined(_di_f_utf_character_is_valid_) || !defined(_di_f_utf_is_valid_) f_status_t private_f_utf_character_is_valid(const f_utf_char_t character) { - if (macro_f_utf_char_t_width_is(character) < 2) { - if (macro_f_utf_char_t_to_char_1(character) >= 0x00 && macro_f_utf_char_t_to_char_1(character) <= 0x7f) { + // Invalid: 11111xxx xxxxxxxx xxxxxxxx xxxxxxxx. 
+ if ((macro_f_utf_char_t_to_char_1(character) & 0b11111000) == 0b11111000) { + return F_false; + } + + // Valid: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. + if ((macro_f_utf_char_t_to_char_1(character) & 0b11111000) == 0b11110000) { + if ((macro_f_utf_char_t_to_char_2(character) & 0b11000000) == 0b10000000 && (macro_f_utf_char_t_to_char_3(character) & 0b11000000) == 0b10000000 && (macro_f_utf_char_t_to_char_4(character) & 0b11000000) == 0b10000000) { return F_true; } - return false; + return F_false; } - if (macro_f_utf_char_t_width_is(character) == 2) { - if (macro_f_utf_char_t_to_char_1(character) >= 0xc2 && macro_f_utf_char_t_to_char_1(character) <= 0xdf) { - if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) { - return F_true; - } + // Valid: 1110xxxx 10xxxxxx 10xxxxxx ????????. + else if ((macro_f_utf_char_t_to_char_1(character) & 0b11110000) == 0b11100000) { + if ((macro_f_utf_char_t_to_char_2(character) & 0b11000000) == 0b10000000 && (macro_f_utf_char_t_to_char_3(character) & 0b11000000) == 0b10000000) { + return F_true; } - return false; + return F_false; } - if (macro_f_utf_char_t_width_is(character) == 3) { - if (macro_f_utf_char_t_to_char_1(character) == 0xe0) { - if (macro_f_utf_char_t_to_char_2(character) >= 0xa0 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) { - if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) { - return F_true; - } - } - } - else if ((macro_f_utf_char_t_to_char_1(character) >= 0xe1 && macro_f_utf_char_t_to_char_1(character) <= 0xec) || (macro_f_utf_char_t_to_char_1(character) >= 0xee && macro_f_utf_char_t_to_char_1(character) <= 0xef)) { - if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) { - if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) { - return F_true; - } - } - } - else if 
(macro_f_utf_char_t_to_char_1(character) == 0xed) { - if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0x9f) { - if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) { - return F_true; - } - } + // Valid: 110xxxxx 10xxxxxx ???????? ????????. + else if ((macro_f_utf_char_t_to_char_1(character) & 0b11100000) == 0b11000000) { + if ((macro_f_utf_char_t_to_char_2(character) & 0b11000000) == 0b10000000) { + return F_true; } - return false; + return F_false; } - if (macro_f_utf_char_t_to_char_1(character) == 0xf0) { - if (macro_f_utf_char_t_to_char_2(character) >= 0x90 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) { - if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) { - if (macro_f_utf_char_t_to_char_4(character) >= 0x80 && macro_f_utf_char_t_to_char_4(character) <= 0xbf) { - return F_true; - } - } - } - } - else if (macro_f_utf_char_t_to_char_1(character) >= 0xf1 && macro_f_utf_char_t_to_char_1(character) <= 0xf3) { - if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) { - if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) { - if (macro_f_utf_char_t_to_char_4(character) >= 0x80 && macro_f_utf_char_t_to_char_4(character) <= 0xbf) { - return F_true; - } - } - } + // Invalid (UTF Fragment): 10xxxxxx ???????? ???????? ????????. 
+ else if ((macro_f_utf_char_t_to_char_1(character) & 0b11000000) == 0b10000000) { + return F_status_set_error(F_utf_fragment); } - else if (macro_f_utf_char_t_to_char_1(character) == 0xf4) { - if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0x8f) { - if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) { - if (macro_f_utf_char_t_to_char_4(character) >= 0x80 && macro_f_utf_char_t_to_char_4(character) <= 0xbf) { - return F_true; - } - } - } + + // Valid: 0xxxxxxx ???????? ???????? ????????. + else if (!(macro_f_utf_char_t_to_char_1(character) & 0b10000000)) { + return F_true; } return F_false; diff --git a/level_0/f_utf/c/utf/is_character.c b/level_0/f_utf/c/utf/is_character.c index ba722d3..f0a0330 100644 --- a/level_0/f_utf/c/utf/is_character.c +++ b/level_0/f_utf/c/utf/is_character.c @@ -391,6 +391,21 @@ extern "C" { } #endif // _di_f_utf_character_is_superscript_ +#ifndef _di_f_utf_character_is_surrogate_ + f_status_t f_utf_character_is_surrogate(const f_utf_char_t character) { + + if (macro_f_utf_char_t_width_is(character)) { + if (macro_f_utf_char_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_surrogate(character); + } + + return F_false; + } +#endif // _di_f_utf_character_is_surrogate_ + #ifndef _di_f_utf_character_is_symbol_ f_status_t f_utf_character_is_symbol(const f_utf_char_t character) { diff --git a/level_0/f_utf/c/utf/is_character.h b/level_0/f_utf/c/utf/is_character.h index fcfd289..dbc7263 100644 --- a/level_0/f_utf/c/utf/is_character.h +++ b/level_0/f_utf/c/utf/is_character.h @@ -418,6 +418,23 @@ extern "C" { #endif // _di_f_utf_character_is_superscript_ /** + * Check to see if the entire byte block of the character is a UTF-8 surrogate character. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 surrogate character.
+ * F_false if not a UTF-8 surrogate character. + * + * F_utf_fragment (with error bit) if character is a UTF-8 fragment. + * F_utf_not (with error bit) if character is an invalid Unicode character. + */ +#ifndef _di_f_utf_character_is_surrogate_ + extern f_status_t f_utf_character_is_surrogate(const f_utf_char_t character); +#endif // _di_f_utf_character_is_surrogate_ + +/** * Check to see if the entire byte block of the character is an ASCII or UTF-8 symbol character. * * @param character