Kevux Git Server - fll/commitdiff
Bugfix: Problems exposed by unit tests for f_utf.
author Kevin Day <thekevinday@gmail.com>
Mon, 13 Jun 2022 23:58:12 +0000 (18:58 -0500)
committer Kevin Day <thekevinday@gmail.com>
Mon, 13 Jun 2022 23:58:12 +0000 (18:58 -0500)
Correct comments and add missing characters.

Add the missing f_utf_character_is_surrogate() function.

Change the is-valid algorithm to the one I developed for the unit tests.
It is cleaner and simpler due to bitwise operations.

level_0/f_utf/c/private-utf_digit.c
level_0/f_utf/c/private-utf_emoji.c
level_0/f_utf/c/private-utf_valid.c
level_0/f_utf/c/utf/is_character.c
level_0/f_utf/c/utf/is_character.h

index ef7996ce0d94d79ae02bfa12a5ca016a4f06a68d..f794ac294b3e8fa19b14b2e1db4aba6168440f1f 100644 (file)
@@ -254,7 +254,7 @@ extern "C" {
           return F_true;
         }
 
-        // Tirhuta: U+9199 to U+114D9.
+        // Tirhuta: U+114D0 to U+114D9.
         if (character >= 0xf0919390 && character <= 0xf0919399) {
           return F_true;
         }
index 1f54bebd82ca7ae7451cfd98ea83464118910bad..0511052e52ee8e1b1cf3ceb6477f561283b43253 100644 (file)
@@ -502,6 +502,11 @@ extern "C" {
         if (character >= 0xf09fab90 && character <= 0xf09fab96) {
           return F_true;
         }
+
+        // U+1FAF6.
+        if (character == 0xf09fabb6) {
+          return F_true;
+        }
       }
     }
 
index 2b7adfa554ad947fe3e2e8f9ee56bae721b52cd0..63314293ea2fe1ce0e4ee56c28349f7e7f447228 100644 (file)
@@ -9,76 +9,46 @@ extern "C" {
 #if !defined(_di_f_utf_character_is_valid_) || !defined(_di_f_utf_is_valid_)
   f_status_t private_f_utf_character_is_valid(const f_utf_char_t character) {
 
-    if (macro_f_utf_char_t_width_is(character) < 2) {
-      if (macro_f_utf_char_t_to_char_1(character) >= 0x00 && macro_f_utf_char_t_to_char_1(character) <= 0x7f) {
+    // Invalid: 11111xxx xxxxxxxx xxxxxxxx xxxxxxxx.
+    if ((macro_f_utf_char_t_to_char_1(character) & 0b11111000) == 0b11111000) {
+      return F_false;
+    }
+
+    // Valid: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+    if ((macro_f_utf_char_t_to_char_1(character) & 0b11111000) == 0b11110000) {
+      if ((macro_f_utf_char_t_to_char_2(character) & 0b11000000) == 0b10000000 && (macro_f_utf_char_t_to_char_3(character) & 0b11000000) == 0b10000000 && (macro_f_utf_char_t_to_char_4(character) & 0b11000000) == 0b10000000) {
         return F_true;
       }
 
-      return false;
+      return F_false;
     }
 
-    if (macro_f_utf_char_t_width_is(character) == 2) {
-      if (macro_f_utf_char_t_to_char_1(character) >= 0xc2 && macro_f_utf_char_t_to_char_1(character) <= 0xdf) {
-        if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) {
-          return F_true;
-        }
+    // Valid: 1110xxxx 10xxxxxx 10xxxxxx ????????.
+    else if ((macro_f_utf_char_t_to_char_1(character) & 0b11110000) == 0b11100000) {
+      if ((macro_f_utf_char_t_to_char_2(character) & 0b11000000) == 0b10000000 && (macro_f_utf_char_t_to_char_3(character) & 0b11000000) == 0b10000000) {
+        return F_true;
       }
 
-      return false;
+      return F_false;
     }
 
-    if (macro_f_utf_char_t_width_is(character) == 3) {
-      if (macro_f_utf_char_t_to_char_1(character) == 0xe0) {
-        if (macro_f_utf_char_t_to_char_2(character) >= 0xa0 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) {
-          if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) {
-            return F_true;
-          }
-        }
-      }
-      else if ((macro_f_utf_char_t_to_char_1(character) >= 0xe1 && macro_f_utf_char_t_to_char_1(character) <= 0xec) || (macro_f_utf_char_t_to_char_1(character) >= 0xee && macro_f_utf_char_t_to_char_1(character) <= 0xef)) {
-        if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) {
-          if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) {
-            return F_true;
-          }
-        }
-      }
-      else if (macro_f_utf_char_t_to_char_1(character) == 0xed) {
-        if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0x9f) {
-          if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) {
-            return F_true;
-          }
-        }
+    // Valid: 110xxxxx 10xxxxxx ???????? ????????.
+    else if ((macro_f_utf_char_t_to_char_1(character) & 0b11100000) == 0b11000000) {
+      if ((macro_f_utf_char_t_to_char_2(character) & 0b11000000) == 0b10000000) {
+        return F_true;
       }
 
-      return false;
+      return F_false;
     }
 
-    if (macro_f_utf_char_t_to_char_1(character) == 0xf0) {
-      if (macro_f_utf_char_t_to_char_2(character) >= 0x90 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) {
-        if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) {
-          if (macro_f_utf_char_t_to_char_4(character) >= 0x80 && macro_f_utf_char_t_to_char_4(character) <= 0xbf) {
-            return F_true;
-          }
-        }
-      }
-    }
-    else if (macro_f_utf_char_t_to_char_1(character) >= 0xf1 && macro_f_utf_char_t_to_char_1(character) <= 0xf3) {
-      if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0xbf) {
-        if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) {
-          if (macro_f_utf_char_t_to_char_4(character) >= 0x80 && macro_f_utf_char_t_to_char_4(character) <= 0xbf) {
-            return F_true;
-          }
-        }
-      }
+    // Invalid (UTF Fragment): 10xxxxxx ???????? ???????? ????????.
+    else if ((macro_f_utf_char_t_to_char_1(character) & 0b11000000) == 0b10000000) {
+      return F_status_set_error(F_utf_fragment);
     }
-    else if (macro_f_utf_char_t_to_char_1(character) == 0xf4) {
-      if (macro_f_utf_char_t_to_char_2(character) >= 0x80 && macro_f_utf_char_t_to_char_2(character) <= 0x8f) {
-        if (macro_f_utf_char_t_to_char_3(character) >= 0x80 && macro_f_utf_char_t_to_char_3(character) <= 0xbf) {
-          if (macro_f_utf_char_t_to_char_4(character) >= 0x80 && macro_f_utf_char_t_to_char_4(character) <= 0xbf) {
-            return F_true;
-          }
-        }
-      }
+
+    // Valid: 0xxxxxxx ???????? ???????? ????????.
+    else if (!(macro_f_utf_char_t_to_char_1(character) & 0b10000000)) {
+      return F_true;
     }
 
     return F_false;
index ba722d39d8b71ac021c11f233dc604fa8b3131ce..f0a03307c3b9399a6d0c2d2e39d52a5350b37ca9 100644 (file)
@@ -391,6 +391,21 @@ extern "C" {
   }
 #endif // _di_f_utf_character_is_superscript_
 
+#ifndef _di_f_utf_character_is_surrogate_
+  f_status_t f_utf_character_is_surrogate(const f_utf_char_t character) {
+
+    if (macro_f_utf_char_t_width_is(character)) {
+      if (macro_f_utf_char_t_width_is(character) == 1) {
+        return F_status_set_error(F_utf_fragment);
+      }
+
+      return private_f_utf_character_is_surrogate(character);
+    }
+
+    return F_false;
+  }
+#endif // _di_f_utf_character_is_surrogate_
+
 #ifndef _di_f_utf_character_is_symbol_
   f_status_t f_utf_character_is_symbol(const f_utf_char_t character) {
 
index fcfd2899a26edfefd154746961577fe864ad9176..dbc72632d43eb7b6af11bff0249944a0ac320816 100644 (file)
@@ -418,6 +418,23 @@ extern "C" {
 #endif // _di_f_utf_character_is_superscript_
 
 /**
+ * Check to see if the entire byte block of the character is a UTF-8 surrogate character.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   F_true if a UTF-8 surrogate character.
+ *   F_false if not a UTF-8 surrogate character.
+ *
+ *   F_utf_fragment (with error bit) if character is a UTF-8 fragment.
+ *   F_utf_not (with error bit) if unicode is an invalid Unicode character.
+ */
+#ifndef _di_f_utf_character_is_surrogate_
+  extern f_status_t f_utf_character_is_surrogate(const f_utf_char_t character);
+#endif // _di_f_utf_character_is_surrogate_
+
+/**
  * Check to see if the entire byte block of the character is an ASCII or UTF-8 symbol character.
  *
  * @param character