From b3e951c53c4fddeeb4358a771f1947dc22037526 Mon Sep 17 00:00:00 2001
From: Kevin Day <thekevinday@gmail.com>
Date: Thu, 23 Apr 2020 21:43:01 -0500
Subject: [PATCH] Bugfix: fix UTF-8 whitespace detection and provide zero-width
 detection function

The whitespace detection codes for UTF-8 were incorrect.

Non-printing characters, called zero-width characters, are not whitespace.
Move them out of the whitespace detection and provide a new function for detecting zero-width characters.

Handle additional UTF-8 whitespace character codes that I had previously missed.
---
 level_0/f_utf/c/utf.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-----
 level_0/f_utf/c/utf.h | 20 +++++++++++++++
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c
index 5a0faa2..e6d7838 100644
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -2492,20 +2492,79 @@ extern "C" {
       return f_status_is_error(f_invalid_utf);
     }
 
-    // Latin-1 Supplement: U+00A0, U+00AD.
-    if (character == 0xc2a00000 || character == 0xc2ad0000) {
-      return f_true;
+    // reduce the number of checks by grouping checks by first byte.
+    uint8_t byte_first = f_macro_utf_character_to_char_1(character);
+
+    if (byte_first == 0xc2) {
+      // Latin-1 Supplement: U+00A0 (no-break space), U+0085 (next line).
+      if (character == 0xc2a00000 || character == 0xc2850000) {
+        return f_true;
+      }
     }
+    else if (byte_first == 0xe2) {
+      // General Punctuation: U+2000, U+2001, U+2002, U+2003.
+      if (character == 0xe2808000 || character == 0xe2808100 || character == 0xe2808200 || character == 0xe2808300) {
+        return f_true;
+      }
 
-    // Tags: U+E0020.
-    if (character == 0xf3a08080) {
-      return f_true;
+      // General Punctuation: U+2004, U+2005, U+2006, U+2007.
+      if (character == 0xe2808400 || character == 0xe2808500 || character == 0xe2808600 || character == 0xe2808700) {
+        return f_true;
+      }
+
+      // General Punctuation: U+2008, U+2009, U+200A, U+2028.
+      if (character == 0xe2808800 || character == 0xe2808900 || character == 0xe2808a00 || character == 0xe280a800) {
+        return f_true;
+      }
+
+      // General Punctuation: U+2029, U+202F, U+205F.
+      if (character == 0xe280a900 || character == 0xe2819f00 || character == 0xe280af00) {
+        return f_true;
+      }
+    }
+    else if (byte_first == 0xe3) {
+      // CJK Symbols and Punctuation: U+3000.
+      if (character == 0xe3808000) {
+        return f_true;
+      }
     }
 
     return f_false;
   }
 #endif // _di_f_utf_character_is_whitespace_
 
+#ifndef _di_f_utf_character_is_zero_width_
+  f_return_status f_utf_character_is_zero_width(const f_utf_character character) {
+    // a reported width of 1 is treated as invalid UTF-8 before any code is compared.
+    if (f_macro_utf_character_width_is(character) == 1) {
+      return f_status_is_error(f_invalid_utf);
+    }
+
+    // only the codes sharing the first byte of the character need to be compared.
+    const uint8_t lead = f_macro_utf_character_to_char_1(character);
+
+    switch (lead) {
+      case 0xe1:
+        // Mongolian: U+180E.
+        return (character == 0xe1a08e00) ? f_true : f_false;
+
+      case 0xe2:
+        // General Punctuation: U+200B, U+200C, U+200D.
+        if (character == 0xe2808b00 || character == 0xe2808c00 || character == 0xe2808d00) {
+          return f_true;
+        }
+        // General Punctuation: U+2060.
+        return (character == 0xe281a000) ? f_true : f_false;
+
+      case 0xef:
+        // Arabic Presentation Forms-B: U+FEFF.
+        return (character == 0xefbbbf00) ? f_true : f_false;
+    }
+
+    return f_false;
+  }
+#endif // _di_f_utf_character_is_zero_width_
+
 #ifndef _di_f_utf_character_to_char_
   f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, uint8_t *max_width) {
     #ifndef _di_level_0_parameter_checking_
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h
index 6cf3a73..f81e936 100644
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -620,6 +620,9 @@ extern "C" {
 /**
  * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space character.
  *
+ * Non-printing or zero-width characters are not considered whitespace.
+ * Line separators, such as '\n', are considered whitespace.
+ *
  * @param character
  *   The character to validate.
  *
@@ -633,6 +636,23 @@ extern "C" {
 #endif // _di_f_utf_character_is_whitespace_
 
 /**
+ * Check to see if the entire byte block of the character is a UTF-8 zero-width (non-printing) character.
+ *
+ * Zero-width characters are characters that do not print and are not considered whitespace.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   f_true if a UTF-8 non-printing or zero-width character.
+ *   f_false if not a UTF-8 non-printing or zero-width character.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ */
+#ifndef _di_f_utf_character_is_zero_width_
+  extern f_return_status f_utf_character_is_zero_width(const f_utf_character character);
+#endif // _di_f_utf_character_is_zero_width_
+
+/**
  * Convert a specialized f_utf_character type to a int8_t, stored as a string (character buffer).
  *
  * This will also convert ASCII characters stored in the utf_character array.
-- 
1.8.3.1