Kevux Git Server - fll/commitdiff
Bugfix: Incomplete UTF-8 zero-width detection, particularly for combining characters.
author: Kevin Day <thekevinday@gmail.com>
Sat, 5 Nov 2022 01:24:38 +0000 (20:24 -0500)
committer: Kevin Day <thekevinday@gmail.com>
Sat, 5 Nov 2022 01:27:31 +0000 (20:27 -0500)
The zero-width calculations should include, but were not including, combining characters and most control characters.
The combining characters and most of the control characters are now considered zero-width.

level_0/f_utf/c/private-utf_combining.c
level_0/f_utf/c/private-utf_zero_width.c
level_0/f_utf/c/utf/is.c
level_0/f_utf/c/utf/is_character.c

index 30485cd1d58c6a2876be5f6178b10d2f5d2f1985..f851bc5c57f4e755140fa24c3a862e5190f1ad02 100644 (file)
@@ -930,6 +930,16 @@ extern "C" {
       return F_true;
     }
 
+    // Hanifi Rohingya: U+10D24 to U+10D27.
+    if (sequence >= 0xf090b4a4 && sequence <= 0xf090b4a7) {
+      return F_true;
+    }
+
+    // Yezidi: U+10EAB to U+10EAD.
+    if (sequence >= 0xf090baab && sequence <= 0xf090baad) {
+      return F_true;
+    }
+
     // Arabic Extended-C: U+10EFD to U+10EFF.
     if (sequence >= 0xf090bbbd && sequence <= 0xf090bbbf) {
       return F_true;
index 47787afb31127cbdcd2a964368eb180b6ab1f9be..2aa2e35d6965eb0a29d7983c965ee6b29ffadccb 100644 (file)
@@ -9,25 +9,27 @@ extern "C" {
 #if !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_)
   f_status_t private_f_utf_character_is_zero_width(const f_utf_char_t sequence) {
 
-    if (macro_f_utf_char_t_to_char_1(sequence) == 0xe1) {
-
-      // Mongolian: U+180E.
-      if (sequence == 0xe1a08e00) {
-        return F_true;
-      }
-    }
-    else if (macro_f_utf_char_t_to_char_1(sequence) == 0xe2) {
-
-      // General Punctuation: U+200B, U+200C, U+200D, U+2060.
-      if (sequence == 0xe2808b00 || sequence == 0xe2808c00 || sequence == 0xe2808d00 || sequence == 0xe281a000) {
-        return F_true;
+    if (macro_f_utf_char_t_width_is(sequence) == 3) {
+      if (macro_f_utf_char_t_to_char_1(sequence) == 0xe2) {
+
+        // General Punctuation: U+2028 to U+2029.
+        if (sequence >= 0xe280a800 && sequence <= 0xe280a900) {
+          return F_true;
+        }
+
+        // General Punctuation: U+2066 to U+2069.
+        if (sequence >= 0xe281a600 && sequence <= 0xe281a900) {
+          return F_true;
+        }
       }
-    }
-    else if (macro_f_utf_char_t_to_char_1(sequence) == 0xef) {
-
-      // Arabic Presentation Forms-B: U+FEFF.
-      if (sequence == 0xefbbbf00) {
-        return F_true;
+      else if (macro_f_utf_char_t_to_char_1(sequence) == 0xef) {
+        if (macro_f_utf_char_t_to_char_2(sequence) == 0xbf) {
+
+          // Unassigned and reserved for future use: U+FFF0 to U+FFF8 (these are assumed to be zero-width).
+          if (sequence >= 0xefbfb000 && sequence <= 0xefbfb900) {
+            return F_true;
+          }
+        }
       }
     }
 
index 18646680f8c4c68930ce60229708eb0d36864ec5..3a482065a4a30bcd030783463d3971bdcfcc1faf 100644 (file)
@@ -994,14 +994,18 @@ extern "C" {
         if (F_status_is_error(status)) return status;
       }
 
+      // Control and combining characters are zero-width.
+      if (private_f_utf_character_is_control(utf)) return F_true;
+      if (private_f_utf_character_is_combining(utf)) return F_true;
+
       return private_f_utf_character_is_zero_width(utf);
     }
 
     // These control characters are considered zero-width spaces.
-    if (*sequence >= 0x00 && *sequence <= 0x08) {
+    if (*sequence >= 0x00 && *sequence < 0x09) {
       return F_true;
     }
-    else if (*sequence >= 0x0c && *sequence <= 0x1f) {
+    else if (*sequence > 0x0b && *sequence < 0x20) {
       return F_true;
     }
     else if (*sequence == 0x7f) {
index 60a9fc944e6a462f016cb6422d2c7bb682457401..4050abc38efed1bc86c60d178de47dbba3d2e782 100644 (file)
@@ -585,22 +585,21 @@ extern "C" {
         return F_status_set_error(F_utf_fragment);
       }
 
+      // Control and combining characters are zero-width.
+      if (private_f_utf_character_is_control(sequence)) return F_true;
+      if (private_f_utf_character_is_combining(sequence)) return F_true;
+
       return private_f_utf_character_is_zero_width(sequence);
     }
 
-    const uint8_t ascii = macro_f_utf_char_t_to_char_1(sequence);
-
     // These control characters are considered zero-width spaces.
-    if (ascii >= 0x00 && ascii <= 0x08) {
-      return F_true;
-    }
-    else if (ascii == 0x0a) {
+    if (macro_f_utf_char_t_to_char_1(sequence) >= 0x00 && macro_f_utf_char_t_to_char_1(sequence) < 0x09) {
       return F_true;
     }
-    else if (ascii >= 0x0c && ascii <= 0x1f) {
+    else if (macro_f_utf_char_t_to_char_1(sequence) > 0x0b && macro_f_utf_char_t_to_char_1(sequence) < 0x20) {
       return F_true;
     }
-    else if (ascii == 0x7f) {
+    else if (macro_f_utf_char_t_to_char_1(sequence) == 0x7f) {
       return F_true;
     }