From 57ca3454c3133bb49fd667a32ba781f79b4ca5e1 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Fri, 24 Jun 2022 23:09:26 -0500 Subject: [PATCH] Bugfix: Problems in f_utf exposed by unit tests. The is alphabetic needs to perform the is valid check because its default catch-all is returning F_true. Ideally at some point (probably distant point) in the future, the literal codes for alphabetic will be matched rather than calling all of the other functions. In this situation the is valid check can be removed. Several of the is digit test value assignments are not checking if the value (the pointer) is NULL. Some of the is word sequences are incorrect. Add missing f_utf_character_is_alphabetic_numeric(). Fix function name for f_utf_character_is_control_format(). Several is word checks for f_utf_char_t are improperly comparing the entire sequence to an ASCII value when only the first byte should be compared. --- level_0/f_utf/c/private-utf_alphabetic.c | 25 +++++++ level_0/f_utf/c/private-utf_digit.c | 112 +++++++++++++++++++++++-------- level_0/f_utf/c/private-utf_word.c | 2 +- level_0/f_utf/c/utf/is.c | 1 - level_0/f_utf/c/utf/is.h | 2 +- level_0/f_utf/c/utf/is_character.c | 31 +++++++-- level_0/f_utf/c/utf/is_character.h | 6 +- 7 files changed, 139 insertions(+), 40 deletions(-) diff --git a/level_0/f_utf/c/private-utf_alphabetic.c b/level_0/f_utf/c/private-utf_alphabetic.c index 4464dc4..5278f3e 100644 --- a/level_0/f_utf/c/private-utf_alphabetic.c +++ b/level_0/f_utf/c/private-utf_alphabetic.c @@ -8,6 +8,7 @@ #include "private-utf_phonetic.h" #include "private-utf_punctuation.h" #include "private-utf_symbol.h" +#include "private-utf_valid.h" #include "private-utf_whitespace.h" #include "private-utf_zero_width.h" @@ -18,10 +19,18 @@ extern "C" { #if !defined(_di_f_utf_character_is_alphabetic_) || !defined(_di_f_utf_is_alphabetic_) f_status_t private_f_utf_character_is_alphabetic(const f_utf_char_t sequence) { + if (!private_f_utf_character_is_valid(sequence)) { + return 
F_false; + } + if (private_f_utf_character_is_zero_width(sequence)) { return F_false; } + if (private_f_utf_character_is_combining(sequence)) { + return F_false; + } + // The is_control() handles both is_control_code() and is_control_format(). if (private_f_utf_character_is_control(sequence)) { return F_false; @@ -66,6 +75,10 @@ extern "C" { #if !defined(_di_f_utf_character_is_alphabetic_digit_) || !defined(_di_f_utf_is_alphabetic_digit_) f_status_t private_f_utf_character_is_alphabetic_digit(const f_utf_char_t sequence, uint64_t * const value) { + if (!private_f_utf_character_is_valid(sequence)) { + return F_false; + } + if (private_f_utf_character_is_digit(sequence, value)) { return F_true; } @@ -74,6 +87,10 @@ extern "C" { return F_false; } + if (private_f_utf_character_is_combining(sequence)) { + return F_false; + } + // The is_control() handles both is_control_code() and is_control_format(). if (private_f_utf_character_is_control(sequence)) { return F_false; @@ -114,6 +131,10 @@ extern "C" { #if !defined(_di_f_utf_character_is_alphabetic_numeric_) || !defined(_di_f_utf_is_alphabetic_numeric_) f_status_t private_f_utf_character_is_alphabetic_numeric(const f_utf_char_t sequence) { + if (!private_f_utf_character_is_valid(sequence)) { + return F_false; + } + if (private_f_utf_character_is_numeric(sequence)) { return F_true; } @@ -122,6 +143,10 @@ extern "C" { return F_false; } + if (private_f_utf_character_is_combining(sequence)) { + return F_false; + } + // The is_control() handles both is_control_code() and is_control_format(). if (private_f_utf_character_is_control(sequence)) { return F_false; diff --git a/level_0/f_utf/c/private-utf_digit.c b/level_0/f_utf/c/private-utf_digit.c index 70a759c..a8249c0 100644 --- a/level_0/f_utf/c/private-utf_digit.c +++ b/level_0/f_utf/c/private-utf_digit.c @@ -268,20 +268,26 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c // Tamil: U+0BF0. 
if (sequence == 0xe0afb000) { - *value = 10; + if (value) { + *value = 10; + } return F_true; } // Tamil: U+0BF1. if (sequence == 0xe0afb100) { - *value = 100; + if (value) { + *value = 100; + } return F_true; } // Tamil: U+0BF2. - *value = 1000; + if (value) { + *value = 1000; + } return F_true; } @@ -296,20 +302,26 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c // Telugu: U+0C70. if (sequence == 0xe0afb000) { - *value = 10; + if (value) { + *value = 10; + } return F_true; } // Telugu: U+0C71. if (sequence == 0xe0afb100) { - *value = 100; + if (value) { + *value = 100; + } return F_true; } // Telugu: U+0C72. - *value = 1000; + if (value) { + *value = 1000; + } return F_true; } @@ -366,76 +378,98 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c // Ethiopic: U+1372. if (sequence == 0xe18db200) { - *value = 10; + if (value) { + *value = 10; + } return F_true; } // Ethiopic: U+1373. if (sequence == 0xe18db300) { - *value = 20; + if (value) { + *value = 20; + } return F_true; } // Ethiopic: U+1374. if (sequence == 0xe18db400) { - *value = 30; + if (value) { + *value = 30; + } return F_true; } // Ethiopic: U+1375. if (sequence == 0xe18db500) { - *value = 40; + if (value) { + *value = 40; + } return F_true; } // Ethiopic: U+1376. if (sequence == 0xe18db600) { - *value = 50; + if (value) { + *value = 50; + } return F_true; } // Ethiopic: U+1377. if (sequence == 0xe18db700) { - *value = 60; + if (value) { + *value = 60; + } return F_true; } // Ethiopic: U+1378. if (sequence == 0xe18db800) { - *value = 70; + if (value) { + *value = 70; + } return F_true; } // Ethiopic: U+1379. if (sequence == 0xe18db900) { - *value = 80; + if (value) { + *value = 80; + } return F_true; } // Ethiopic: U+137A. if (sequence == 0xe18dba00) { - *value = 90; + if (value) { + *value = 90; + } return F_true; } // Ethiopic: U+137B. 
if (sequence == 0xe18dbb00) { - *value = 100; + if (value) { + *value = 100; + } return F_true; } // Ethiopic: U+137C. - *value = 1000; + if (value) { + *value = 1000; + } return F_true; } @@ -445,20 +479,26 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c // Runic: U+16EE. if (sequence == 0xe19bae00) { - *value = 17; + if (value) { + *value = 17; + } return F_true; } // Runic: U+16EF. if (sequence == 0xe19baf00) { - *value = 18; + if (value) { + *value = 18; + } return F_true; } // Runic: U+16F0. - *value = 19; + if (value) { + *value = 19; + } return F_true; } @@ -490,7 +530,9 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c // New Tai Lue: U+19DA. if (sequence == 0xe1a79a00) { - *value = 1; + if (value) { + *value = 1; + } return F_true; } @@ -532,49 +574,63 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c // Superscripts and Subscripts: U+2070. if (sequence == 0xe281b000) { - *value = 0; + if (value) { + *value = 0; + } return F_true; } // Superscripts and Subscripts: U+2074. if (sequence == 0xe281b400) { - *value = 4; + if (value) { + *value = 4; + } return F_true; } // Superscripts and Subscripts: U+2075. if (sequence == 0xe281b500) { - *value = 5; + if (value) { + *value = 5; + } return F_true; } // Superscripts and Subscripts: U+2076. if (sequence == 0xe281b600) { - *value = 6; + if (value) { + *value = 6; + } return F_true; } // Superscripts and Subscripts: U+2077. if (sequence == 0xe281b700) { - *value = 7; + if (value) { + *value = 7; + } return F_true; } // Superscripts and Subscripts: U+2078. if (sequence == 0xe281b800) { - *value = 8; + if (value) { + *value = 8; + } return F_true; } // Superscripts and Subscripts: U+2079. 
if (sequence == 0xe281b900) { - *value = 9; + if (value) { + *value = 9; + } return F_true; } diff --git a/level_0/f_utf/c/private-utf_word.c b/level_0/f_utf/c/private-utf_word.c index 58cdb62..1b556cd 100644 --- a/level_0/f_utf/c/private-utf_word.c +++ b/level_0/f_utf/c/private-utf_word.c @@ -37,7 +37,7 @@ extern "C" { if (strict) { // Halfwidth and Fullwidth Forms: U+FE33 (︳), U+FE34 (︴). - if (sequence == 0xefbcbf00 || sequence == 0xefbcbf00) { + if (sequence == 0xefb8b300 || sequence == 0xefb8b400) { return F_true; } } diff --git a/level_0/f_utf/c/utf/is.c b/level_0/f_utf/c/utf/is.c index d5d238a..f25f81a 100644 --- a/level_0/f_utf/c/utf/is.c +++ b/level_0/f_utf/c/utf/is.c @@ -931,7 +931,6 @@ extern "C" { const f_status_t status = private_f_utf_char_to_character(sequence, width_max, &utf); if (F_status_is_error(status)) return status; } - return private_f_utf_character_is_word_dash_plus(utf, strict); } diff --git a/level_0/f_utf/c/utf/is.h b/level_0/f_utf/c/utf/is.h index 0f6394c..fa5339e 100644 --- a/level_0/f_utf/c/utf/is.h +++ b/level_0/f_utf/c/utf/is.h @@ -551,7 +551,7 @@ extern "C" { * F_false if not an unassigned UTF-8 character. * * F_complete_not_utf (with error bit set) if character is an incomplete UTF-8 sequence. - * F_parameter (with error bit) if a parameter is inunassigned. + * F_parameter (with error bit) if a parameter is invalid. * F_utf_fragment (with error bit) if character is a UTF-8 fragment. * F_utf_not (with error bit) if Unicode is an invalid Unicode character.
*/ diff --git a/level_0/f_utf/c/utf/is_character.c b/level_0/f_utf/c/utf/is_character.c index ae1dfc7..2a58183 100644 --- a/level_0/f_utf/c/utf/is_character.c +++ b/level_0/f_utf/c/utf/is_character.c @@ -74,6 +74,25 @@ extern "C" { } #endif // _di_f_utf_character_is_alphabetic_digit_ +#ifndef _di_f_utf_character_is_alphabetic_numeric_ + f_status_t f_utf_character_is_alphabetic_numeric(const f_utf_char_t sequence) { + + if (macro_f_utf_char_t_width_is(sequence)) { + if (macro_f_utf_char_t_width_is(sequence) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_alphabetic_numeric(sequence); + } + + if (isalnum(macro_f_utf_char_t_to_char_1(sequence))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_alphabetic_numeric_ + #ifndef _di_f_utf_character_is_ascii_ f_status_t f_utf_character_is_ascii(const f_utf_char_t sequence) { @@ -139,8 +158,8 @@ extern "C" { } #endif // _di_f_utf_character_is_control_code_ -#ifndef _di_f_utf_character_is_control_picture_ - f_status_t character_is_control_format(const f_utf_char_t sequence) { +#ifndef _di_f_utf_character_is_control_format_ + f_status_t f_utf_character_is_control_format(const f_utf_char_t sequence) { if (macro_f_utf_char_t_width_is(sequence)) { if (macro_f_utf_char_t_width_is(sequence) == 1) { @@ -150,7 +169,7 @@ extern "C" { return private_f_utf_character_is_control_format(sequence); } - // There are no control format characters in ASCII. + // There are no ASCII control formats. 
return F_false; } #endif // _di_f_utf_character_is_control_format_ @@ -496,7 +515,7 @@ extern "C" { return private_f_utf_character_is_word(sequence, strict); } - if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || sequence == f_string_ascii_underscore_s.string[0]) { + if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_underscore_s.string[0]) { return F_true; } @@ -515,7 +534,7 @@ extern "C" { return private_f_utf_character_is_word_dash(sequence, strict); } - if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || sequence == f_string_ascii_underscore_s.string[0] || sequence == f_string_ascii_minus_s.string[0]) { + if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_underscore_s.string[0] || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_minus_s.string[0]) { return F_true; } @@ -534,7 +553,7 @@ extern "C" { return private_f_utf_character_is_word_dash_plus(sequence, strict); } - if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || sequence == f_string_ascii_underscore_s.string[0] || sequence == f_string_ascii_minus_s.string[0] || sequence == f_string_ascii_plus_s.string[0]) { + if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_underscore_s.string[0] || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_minus_s.string[0] || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_plus_s.string[0]) { return F_true; } diff --git a/level_0/f_utf/c/utf/is_character.h b/level_0/f_utf/c/utf/is_character.h index c631c83..56aa23c 100644 --- a/level_0/f_utf/c/utf/is_character.h +++ b/level_0/f_utf/c/utf/is_character.h @@ -27,6 +27,7 @@ extern "C" { * @return * F_true if a UTF-8 character. * F_false if not a UTF-8 character. + * * F_utf_fragment if this is a UTF-8 character fragment. * * @see f_utf_character_is_valid() @@ -586,10 +587,9 @@ extern "C" { * The (UTF-8) character. 
* * @return - * F_none on success. + * F_true if a UTF-8 wide character. + * F_false if not a UTF-8 wide character. * - * F_failure (with error bit) if width is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. * F_utf_fragment (with error bit) if character is a UTF-8 fragment. * F_utf_not (with error bit) if unicode is an invalid Unicode character. */ -- 1.8.3.1