Kevux Git Server - fll/commitdiff
Bugfix: Problems in f_utf exposed by unit tests.
author Kevin Day <thekevinday@gmail.com>
Sat, 25 Jun 2022 04:09:26 +0000 (23:09 -0500)
committer Kevin Day <thekevinday@gmail.com>
Sat, 25 Jun 2022 04:09:26 +0000 (23:09 -0500)
The is-alphabetic check needs to perform the is-valid check first because its default catch-all returns F_true.
Ideally, at some (probably distant) point in the future, the literal codes for alphabetic characters will be matched directly rather than by calling all of the other functions.
Once that is done, the is-valid check can be removed.

Several of the is-digit value assignments do not check whether value (the pointer) is NULL before writing through it.

Some of the byte sequences compared in the is-word checks are incorrect.

Add missing f_utf_character_is_alphabetic_numeric().

Fix function name for f_utf_character_is_control_format().

Several is word checks for f_utf_char_t are improperly comparing the entire sequence to an ASCII value when only the first byte should be compared.

level_0/f_utf/c/private-utf_alphabetic.c
level_0/f_utf/c/private-utf_digit.c
level_0/f_utf/c/private-utf_word.c
level_0/f_utf/c/utf/is.c
level_0/f_utf/c/utf/is.h
level_0/f_utf/c/utf/is_character.c
level_0/f_utf/c/utf/is_character.h

index 4464dc46409336796f7394e517cd39ccddd66472..5278f3e7c922e8c392544d0639451430874d2568 100644 (file)
@@ -8,6 +8,7 @@
 #include "private-utf_phonetic.h"
 #include "private-utf_punctuation.h"
 #include "private-utf_symbol.h"
+#include "private-utf_valid.h"
 #include "private-utf_whitespace.h"
 #include "private-utf_zero_width.h"
 
@@ -18,10 +19,18 @@ extern "C" {
 #if !defined(_di_f_utf_character_is_alphabetic_) || !defined(_di_f_utf_is_alphabetic_)
   f_status_t private_f_utf_character_is_alphabetic(const f_utf_char_t sequence) {
 
+    if (!private_f_utf_character_is_valid(sequence)) {
+      return F_false;
+    }
+
     if (private_f_utf_character_is_zero_width(sequence)) {
       return F_false;
     }
 
+    if (private_f_utf_character_is_combining(sequence)) {
+      return F_false;
+    }
+
     // The is_control() handles both is_control_code() and is_control_format().
     if (private_f_utf_character_is_control(sequence)) {
       return F_false;
@@ -66,6 +75,10 @@ extern "C" {
 #if !defined(_di_f_utf_character_is_alphabetic_digit_) || !defined(_di_f_utf_is_alphabetic_digit_)
   f_status_t private_f_utf_character_is_alphabetic_digit(const f_utf_char_t sequence, uint64_t * const value) {
 
+    if (!private_f_utf_character_is_valid(sequence)) {
+      return F_false;
+    }
+
     if (private_f_utf_character_is_digit(sequence, value)) {
       return F_true;
     }
@@ -74,6 +87,10 @@ extern "C" {
       return F_false;
     }
 
+    if (private_f_utf_character_is_combining(sequence)) {
+      return F_false;
+    }
+
     // The is_control() handles both is_control_code() and is_control_format().
     if (private_f_utf_character_is_control(sequence)) {
       return F_false;
@@ -114,6 +131,10 @@ extern "C" {
 #if !defined(_di_f_utf_character_is_alphabetic_numeric_) || !defined(_di_f_utf_is_alphabetic_numeric_)
   f_status_t private_f_utf_character_is_alphabetic_numeric(const f_utf_char_t sequence) {
 
+    if (!private_f_utf_character_is_valid(sequence)) {
+      return F_false;
+    }
+
     if (private_f_utf_character_is_numeric(sequence)) {
       return F_true;
     }
@@ -122,6 +143,10 @@ extern "C" {
       return F_false;
     }
 
+    if (private_f_utf_character_is_combining(sequence)) {
+      return F_false;
+    }
+
     // The is_control() handles both is_control_code() and is_control_format().
     if (private_f_utf_character_is_control(sequence)) {
       return F_false;
index 70a759cbe54493ca46b0263511f1dd0cde1319e2..a8249c0a532b3ecc86ec997fa862953eafba0169 100644 (file)
@@ -268,20 +268,26 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c
 
           // Tamil: U+0BF0.
           if (sequence == 0xe0afb000) {
-            *value = 10;
+            if (value) {
+              *value = 10;
+            }
 
             return F_true;
           }
 
           // Tamil: U+0BF1.
           if (sequence == 0xe0afb100) {
-            *value = 100;
+            if (value) {
+              *value = 100;
+            }
 
             return F_true;
           }
 
           // Tamil: U+0BF2.
-          *value = 1000;
+          if (value) {
+            *value = 1000;
+          }
 
           return F_true;
         }
@@ -296,20 +302,26 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c
 
           // Telugu: U+0C70.
           if (sequence == 0xe0afb000) {
-            *value = 10;
+            if (value) {
+              *value = 10;
+            }
 
             return F_true;
           }
 
           // Telugu: U+0C71.
           if (sequence == 0xe0afb100) {
-            *value = 100;
+            if (value) {
+              *value = 100;
+            }
 
             return F_true;
           }
 
           // Telugu: U+0C72.
-          *value = 1000;
+          if (value) {
+            *value = 1000;
+          }
 
           return F_true;
         }
@@ -366,76 +378,98 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c
 
           // Ethiopic: U+1372.
           if (sequence == 0xe18db200) {
-            *value = 10;
+            if (value) {
+              *value = 10;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+1373.
           if (sequence == 0xe18db300) {
-            *value = 20;
+            if (value) {
+              *value = 20;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+1374.
           if (sequence == 0xe18db400) {
-            *value = 30;
+            if (value) {
+              *value = 30;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+1375.
           if (sequence == 0xe18db500) {
-            *value = 40;
+            if (value) {
+              *value = 40;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+1376.
           if (sequence == 0xe18db600) {
-            *value = 50;
+            if (value) {
+              *value = 50;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+1377.
           if (sequence == 0xe18db700) {
-            *value = 60;
+            if (value) {
+              *value = 60;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+1378.
           if (sequence == 0xe18db800) {
-            *value = 70;
+            if (value) {
+              *value = 70;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+1379.
           if (sequence == 0xe18db900) {
-            *value = 80;
+            if (value) {
+              *value = 80;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+137A.
           if (sequence == 0xe18dba00) {
-            *value = 90;
+            if (value) {
+              *value = 90;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+137B.
           if (sequence == 0xe18dbb00) {
-            *value = 100;
+            if (value) {
+              *value = 100;
+            }
 
             return F_true;
           }
 
           // Ethiopic: U+137C.
-          *value = 1000;
+          if (value) {
+            *value = 1000;
+          }
 
           return F_true;
         }
@@ -445,20 +479,26 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c
 
           // Runic: U+16EE.
           if (sequence == 0xe19bae00) {
-            *value = 17;
+            if (value) {
+              *value = 17;
+            }
 
             return F_true;
           }
 
           // Runic: U+16EF.
           if (sequence == 0xe19baf00) {
-            *value = 18;
+            if (value) {
+              *value = 18;
+            }
 
             return F_true;
           }
 
           // Runic: U+16F0.
-          *value = 19;
+          if (value) {
+            *value = 19;
+          }
 
           return F_true;
         }
@@ -490,7 +530,9 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c
 
         // New Tai Lue: U+19DA.
         if (sequence == 0xe1a79a00) {
-          *value = 1;
+          if (value) {
+            *value = 1;
+          }
 
           return F_true;
         }
@@ -532,49 +574,63 @@ static inline f_status_t private_inline_f_utf_character_handle_digit_from_four(c
 
           // Superscripts and Subscripts: U+2070.
           if (sequence == 0xe281b000) {
-            *value = 0;
+            if (value) {
+              *value = 0;
+            }
 
             return F_true;
           }
 
           // Superscripts and Subscripts: U+2074.
           if (sequence == 0xe281b400) {
-            *value = 4;
+            if (value) {
+              *value = 4;
+            }
 
             return F_true;
           }
 
           // Superscripts and Subscripts: U+2075.
           if (sequence == 0xe281b500) {
-            *value = 5;
+            if (value) {
+              *value = 5;
+            }
 
             return F_true;
           }
 
           // Superscripts and Subscripts: U+2076.
           if (sequence == 0xe281b600) {
-            *value = 6;
+            if (value) {
+              *value = 6;
+            }
 
             return F_true;
           }
 
           // Superscripts and Subscripts: U+2077.
           if (sequence == 0xe281b700) {
-            *value = 7;
+            if (value) {
+              *value = 7;
+            }
 
             return F_true;
           }
 
           // Superscripts and Subscripts: U+2078.
           if (sequence == 0xe281b800) {
-            *value = 8;
+            if (value) {
+              *value = 8;
+            }
 
             return F_true;
           }
 
           // Superscripts and Subscripts: U+2079.
           if (sequence == 0xe281b900) {
-            *value = 9;
+            if (value) {
+              *value = 9;
+            }
 
             return F_true;
           }
index 58cdb62c27d2d9bf3daf2f045e5c7bdda172fdac..1b556cdad1fa718b1d3431b5a82c0c41e6737234 100644 (file)
@@ -37,7 +37,7 @@ extern "C" {
         if (strict) {
 
           // CJK Compatibility Forms: U+FE33 (︳), U+FE34 (︴).
-          if (sequence == 0xefbcbf00 || sequence == 0xefbcbf00) {
+          if (sequence == 0xefb8b300 || sequence == 0xefb8b400) {
             return F_true;
           }
         }
index d5d238a97cb780c8125bb1b706f16bce20395c12..f25f81a1dbfd335b7de9d3d6bafbae3ff22f999d 100644 (file)
@@ -931,7 +931,6 @@ extern "C" {
         const f_status_t status = private_f_utf_char_to_character(sequence, width_max, &utf);
         if (F_status_is_error(status)) return status;
       }
-
       return private_f_utf_character_is_word_dash_plus(utf, strict);
     }
 
index 0f6394c88cbbcdfce97ba9053b6051ea760cdfef..fa5339efcb10dc74186c8f4ed71c60476fb20074 100644 (file)
@@ -551,7 +551,7 @@ extern "C" {
  *   F_false if not an unassigned UTF-8 character.
  *
  *   F_complete_not_utf (with error bit set) if character is an incomplete UTF-8 sequence.
- *   F_parameter (with error bit) if a parameter is inunassigned.
+ *   F_parameter (with error bit) if a parameter is invalid.
  *   F_utf_fragment (with error bit) if character is a UTF-8 fragment.
  *   F_utf_not (with error bit) if Unicode is an invalid Unicode character.
  */
index ae1dfc785644706ee4280bb005240547b6c875a3..2a58183d04239ecf58a74698758b09e08fcfc195 100644 (file)
@@ -74,6 +74,25 @@ extern "C" {
   }
 #endif // _di_f_utf_character_is_alphabetic_digit_
 
+#ifndef _di_f_utf_character_is_alphabetic_numeric_
+  f_status_t f_utf_character_is_alphabetic_numeric(const f_utf_char_t sequence) {
+
+    if (macro_f_utf_char_t_width_is(sequence)) {
+      if (macro_f_utf_char_t_width_is(sequence) == 1) {
+        return F_status_set_error(F_utf_fragment);
+      }
+
+      return private_f_utf_character_is_alphabetic_numeric(sequence);
+    }
+
+    if (isalnum(macro_f_utf_char_t_to_char_1(sequence))) {
+      return F_true;
+    }
+
+    return F_false;
+  }
+#endif // _di_f_utf_character_is_alphabetic_numeric_
+
 #ifndef _di_f_utf_character_is_ascii_
   f_status_t f_utf_character_is_ascii(const f_utf_char_t sequence) {
 
@@ -139,8 +158,8 @@ extern "C" {
   }
 #endif // _di_f_utf_character_is_control_code_
 
-#ifndef _di_f_utf_character_is_control_picture_
-  f_status_t character_is_control_format(const f_utf_char_t sequence) {
+#ifndef _di_f_utf_character_is_control_format_
+  f_status_t f_utf_character_is_control_format(const f_utf_char_t sequence) {
 
     if (macro_f_utf_char_t_width_is(sequence)) {
       if (macro_f_utf_char_t_width_is(sequence) == 1) {
@@ -150,7 +169,7 @@ extern "C" {
       return private_f_utf_character_is_control_format(sequence);
     }
 
-    // There are no control format characters in ASCII.
+    // There are no ASCII control formats.
     return F_false;
   }
 #endif // _di_f_utf_character_is_control_format_
@@ -496,7 +515,7 @@ extern "C" {
       return private_f_utf_character_is_word(sequence, strict);
     }
 
-    if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || sequence == f_string_ascii_underscore_s.string[0]) {
+    if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_underscore_s.string[0]) {
       return F_true;
     }
 
@@ -515,7 +534,7 @@ extern "C" {
       return private_f_utf_character_is_word_dash(sequence, strict);
     }
 
-    if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || sequence == f_string_ascii_underscore_s.string[0] || sequence == f_string_ascii_minus_s.string[0]) {
+    if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_underscore_s.string[0] || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_minus_s.string[0]) {
       return F_true;
     }
 
@@ -534,7 +553,7 @@ extern "C" {
       return private_f_utf_character_is_word_dash_plus(sequence, strict);
     }
 
-    if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || sequence == f_string_ascii_underscore_s.string[0] || sequence == f_string_ascii_minus_s.string[0] || sequence == f_string_ascii_plus_s.string[0]) {
+    if (isalnum(macro_f_utf_char_t_to_char_1(sequence)) || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_underscore_s.string[0] || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_minus_s.string[0] || macro_f_utf_char_t_to_char_1(sequence) == f_string_ascii_plus_s.string[0]) {
       return F_true;
     }
 
index c631c833f5c5328125fc31bc74f01b168c6e8f71..56aa23cddf0a84cac1cb94ea166013c47f7fc5a9 100644 (file)
@@ -27,6 +27,7 @@ extern "C" {
  * @return
  *   F_true if a UTF-8 character.
  *   F_false if not a UTF-8 character.
+ *
  *   F_utf_fragment if this is a UTF-8 character fragment.
  *
  * @see f_utf_character_is_valid()
@@ -586,10 +587,9 @@ extern "C" {
  *   The (UTF-8) character.
  *
  * @return
- *   F_none on success.
+ *   F_true if a UTF-8 wide character.
+ *   F_false if not a UTF-8 wide character.
  *
- *   F_failure (with error bit) if width is not long enough to convert.
- *   F_parameter (with error bit) if a parameter is invalid.
  *   F_utf_fragment (with error bit) if character is a UTF-8 fragment.
  *   F_utf_not (with error bit) if unicode is an invalid Unicode character.
  */