Bugfix: fix UTF-8 whitespace detection and provide zero-width detection function

author Kevin Day <thekevinday@gmail.com>

Fri, 24 Apr 2020 02:43:01 +0000 (21:43 -0500)

committer Kevin Day <thekevinday@gmail.com>

Fri, 24 Apr 2020 02:43:01 +0000 (21:43 -0500)
author Kevin Day <thekevinday@gmail.com>
Fri, 24 Apr 2020 02:43:01 +0000 (21:43 -0500)
committer Kevin Day <thekevinday@gmail.com>
Fri, 24 Apr 2020 02:43:01 +0000 (21:43 -0500)
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c

index 5a0faa2e008e819c155f64ca4e4449ab35e6905e..e6d783869b51bc08663aab42f8dbcc0c85bf3daa 100644 (file)
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -2492,20 +2492,79 @@ extern "C" {
        return f_status_is_error(f_invalid_utf);
      }
  
-    // Latin-1 Supplement: U+00A0, U+00AD.
-    if (character == 0xc2a00000 || character == 0xc2ad0000) {
-      return f_true;
+    // reduce the number of checks by grouping checks by first byte.
+    uint8_t byte_first = f_macro_utf_character_to_char_1(character);
+
+    if (byte_first == 0xc2) {
+      // Latin-1 Supplement: U+00A0 (no-break space), U+0085 (next line); each value must be compared against character.
+      if (character == 0xc2a00000 || character == 0xc2850000) {
+        return f_true;
+      }
      }
+    else if (byte_first == 0xe2) {
+      // General Punctuation: U+2000, U+2001, U+2002, U+2003.
+      if (character == 0xe2808000 || character == 0xe2808100 || character == 0xe2808200 || character == 0xe2808300) {
+        return f_true;
+      }
  
-    // Tags: U+E0020.
-    if (character == 0xf3a08080) {
-      return f_true;
+      // General Punctuation: U+2004, U+2005, U+2006, U+2007.
+      if (character == 0xe2808400 || character == 0xe2808500 || character == 0xe2808600 || character == 0xe2808700) {
+        return f_true;
+      }
+
+      // General Punctuation: U+2008, U+2009, U+200A, U+2028.
+      if (character == 0xe2808800 || character == 0xe2808900 || character == 0xe2808a00 || character == 0xe280a800) {
+        return f_true;
+      }
+
+      // General Punctuation: U+2029, U+202F, U+205F.
+      if (character == 0xe280a900 || character == 0xe2819f00 || character == 0xe280af00) {
+        return f_true;
+      }
+    }
+    else if (byte_first == 0xe3) {
+      // CJK Symbols and Punctuation: U+3000.
+      if (character == 0xe3808000) {
+        return f_true;
+      }
      }
  
      return f_false;
    }
  #endif // _di_f_utf_character_is_whitespace_
  
+#ifndef _di_f_utf_character_is_zero_width_
+  f_return_status f_utf_character_is_zero_width(const f_utf_character character) {
+    // a reported width of 1 is rejected as an invalid UTF-8 character.
+    // NOTE(review): f_status_is_error() reads like a predicate; confirm it sets the error bit as utf.h documents.
+    if (f_macro_utf_character_width_is(character) == 1) {
+      return f_status_is_error(f_invalid_utf);
+    }
+
+    // the leading byte gates each group of comparisons below.
+    const uint8_t lead = f_macro_utf_character_to_char_1(character);
+
+    // Mongolian: U+180E.
+    if (lead == 0xe1 && character == 0xe1a08e00) {
+      return f_true;
+    }
+
+    // General Punctuation: U+200B, U+200C, U+200D, U+2060.
+    if (lead == 0xe2) {
+      if (character == 0xe2808b00 || character == 0xe2808c00 || character == 0xe2808d00 || character == 0xe281a000) {
+        return f_true;
+      }
+    }
+
+    // Arabic Presentation Forms-B: U+FEFF.
+    if (lead == 0xef && character == 0xefbbbf00) {
+      return f_true;
+    }
+
+    return f_false;
+  }
+#endif // _di_f_utf_character_is_zero_width_
+
  #ifndef _di_f_utf_character_to_char_
    f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, uint8_t *max_width) {
      #ifndef _di_level_0_parameter_checking_
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h

index 6cf3a73fd06fbdb3de77979bf0c0d53059693d3f..f81e93632df719615898d33f2dad645602196e36 100644 (file)
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -620,6 +620,9 @@ extern "C" {
  /**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space character.
   *
+ * Non-printing or zero-width characters are not considered whitespace.
+ * Line separators, such as '\n', are considered whitespace.
+ *
   * @param character
   *   The character to validate.
   *
@@ -633,6 +636,23 @@ extern "C" {
  #endif // _di_f_utf_character_is_whitespace_
  
  /**
+ * Check to see if the entire byte block of the character is a UTF-8 zero-width (non-printing) character.
+ *
+ * Only characters that do not print, generally called zero-width (such as U+200B or U+FEFF), are matched.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   f_true if a UTF-8 non-printing or zero-width character.
+ *   f_false if not a UTF-8 non-printing or zero-width character.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ */
+#ifndef _di_f_utf_character_is_zero_width_
+  extern f_return_status f_utf_character_is_zero_width(const f_utf_character character);
+#endif // _di_f_utf_character_is_zero_width_
+
+/**
   * Convert a specialized f_utf_character type to a int8_t, stored as a string (character buffer).
   *
   * This will also convert ASCII characters stored in the utf_character array.
author	Kevin Day <thekevinday@gmail.com>
	Fri, 24 Apr 2020 02:43:01 +0000 (21:43 -0500)
committer	Kevin Day <thekevinday@gmail.com>
	Fri, 24 Apr 2020 02:43:01 +0000 (21:43 -0500)
level_0/f_utf/c/utf.c		patch \| blob \| history
level_0/f_utf/c/utf.h		patch \| blob \| history