Progress: UTF-8

author Kevin Day <thekevinday@gmail.com>

Thu, 2 Jul 2020 03:36:33 +0000 (22:36 -0500)

committer Kevin Day <thekevinday@gmail.com>

Thu, 2 Jul 2020 03:36:33 +0000 (22:36 -0500)
author Kevin Day <thekevinday@gmail.com>
Thu, 2 Jul 2020 03:36:33 +0000 (22:36 -0500)
committer Kevin Day <thekevinday@gmail.com>
Thu, 2 Jul 2020 03:36:33 +0000 (22:36 -0500)
diff --git a/level_0/f_fss/c/fss.h b/level_0/f_fss/c/fss.h

index 412f215c1a5dbdcf7cfe41441d1e467a56d3e557..eeabe782dd1973fd73a0ea2d616ef4089ca38088 100644 (file)
--- a/level_0/f_fss/c/fss.h
+++ b/level_0/f_fss/c/fss.h
@@ -9,6 +9,8 @@
   *
   * The purpose of compression is not to compression the entire file's contents but only and individual objects content, so the file is still partially readable.
   * NOTE: all start/stop locations must be defined as a (start < stop) and not (start <= stop), therefore if (start == stop) then stop.
+ *
+ * @todo identify all special UTF-8 characters that would violate the concepts, such as "Ogham space mark ( )" is not valid as whitespace in FSS because it is a visible non-whitespace character.
   */
  #ifndef _F_fss_h
  #define _F_fss_h
diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c

index 9c0adb609efec4338fa562fe1bc61131a14140d1..c1961bfbe219be03e07ba9d858c15cb406153388 100644 (file)
--- a/level_0/f_utf/c/private-utf.c
+++ b/level_0/f_utf/c/private-utf.c
@@ -52,6 +52,53 @@ extern "C" {
    }
  #endif // !defined(_di_f_utf_character_is_alpha_) || !defined(_di_f_utf_is_alpha_)
  
+#if !defined(_di_f_utf_character_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_digit_)
+  f_return_status private_f_utf_character_is_alpha_digit(const f_utf_character character, const uint8_t width) {
+
+    if (private_f_utf_character_is_digit(character, width)) { // Digits qualify immediately.
+      return F_true;
+    }
+
+    if (private_f_utf_character_is_zero_width(character, width)) { // Every check from here on excludes a non-alphabetic class.
+      return F_false;
+    }
+
+    if (private_f_utf_character_is_control(character, width)) {
+      return F_false;
+    }
+
+    if (private_f_utf_character_is_control_picture(character, width)) {
+      return F_false;
+    }
+
+    if (private_f_utf_character_is_whitespace(character, width)) {
+      return F_false;
+    }
+
+    if (private_f_utf_character_is_whitespace_modifier(character, width)) {
+      return F_false;
+    }
+
+    if (private_f_utf_character_is_numeric(character, width)) { // Numeric characters that are not digits are rejected.
+      return F_false;
+    }
+
+    if (private_f_utf_character_is_punctuation(character, width)) {
+      return F_false;
+    }
+
+    if (private_f_utf_character_is_symbol(character, width)) {
+      return F_false;
+    }
+
+    if (private_f_utf_character_is_phonetic(character, width)) {
+      return F_false;
+    }
+
+    return F_true; // Not a digit and not in any excluded class, so the character is treated as alphabetic.
+  }
+#endif // !defined(_di_f_utf_character_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_digit_)
+
  #if !defined(_di_f_utf_character_is_alpha_numeric_) || !defined(_di_f_utf_is_alpha_numeric_)
    f_return_status private_f_utf_character_is_alpha_numeric(const f_utf_character character, const uint8_t width) {
  
@@ -107,8 +154,7 @@ extern "C" {
  
        return F_false;
      }
-
-    if (width == 3) {
+    else if (width == 3) {
  
        // Diacritical Marks Extended: U+1AB0 to U+1AC0.
        if (character >= 0xe1aab000 && character <= 0xe1ab8000) {
@@ -134,8 +180,6 @@ extern "C" {
        if (character >= 0xefb8a000 && character <= 0xefb8af00) {
          return F_true;
        }
-
-      return F_false;
      }
  
      return F_false;
@@ -151,11 +195,8 @@ extern "C" {
        if (character >= 0xc2800000 && character <= 0xc29f0000) {
          return F_true;
        }
-
-      return F_false;
      }
-
-    if (width == 3) {
+    else if (width == 3) {
  
        // General Punctuation: U+200E and U+200F.
        if (character == 0xe2808e00 || character == 0xe2808f00) {
@@ -171,11 +212,8 @@ extern "C" {
        if (character >= 0xefbfb900 && character <= 0xefbfbb00) {
          return F_true;
        }
-
-      return F_false;
      }
-
-    if (width == 4) {
+    else if (width == 4) {
  
        // Tags: U+E0001 and U+E007F.
        if (character == 0xf3a08081 || character == 0xf3a081bf) {
@@ -206,6 +244,24 @@ extern "C" {
    }
  #endif // !defined(_di_f_utf_character_is_control_picture_) || !defined(_di_f_utf_is_control_picture_)
  
+#if !defined(_di_f_utf_character_is_digit_) || !defined(_di_f_utf_is_digit_)
+  f_return_status private_f_utf_character_is_digit(const f_utf_character character, const uint8_t width) {
+
+    // @todo: add decimal digit UTF-8 numbers.
+    // @todo: consider adding letter UTF-8 numbers (update all code comment documentation accordingly).
+
+    if (width == 3) { // Only three-byte sequences are matched so far; every other width yields F_false.
+
+      // Number Forms: U+2150 to U+218B (NOTE(review): private_f_utf_character_is_numeric() tests this same range; confirm it belongs in "digit").
+      if (character >= 0xe2859000 && character <= 0xe2868b00) {
+        return F_true;
+      }
+    }
+
+    return F_false;
+  }
+#endif // !defined(_di_f_utf_character_is_digit_) || !defined(_di_f_utf_is_digit_)
+
  #if !defined(_di_f_utf_character_is_emoji_) || !defined(_di_f_utf_is_emoji_)
    f_return_status private_f_utf_character_is_emoji(const f_utf_character character, const uint8_t width) {
  
@@ -697,17 +753,18 @@ extern "C" {
  #if !defined(_di_f_utf_character_is_numeric_) || !defined(_di_f_utf_is_numeric_)
    f_return_status private_f_utf_character_is_numeric(const f_utf_character character, const uint8_t width) {
  
+    // @todo: add decimal digit UTF-8 numbers.
+    // @todo: add letter UTF-8 numbers.
+    // @todo: add other UTF-8 numbers.
+
      if (width == 3) {
  
        // Number Forms: U+2150 to U+218B.
        if (character >= 0xe2859000 && character <= 0xe2868b00) {
          return F_true;
        }
-
-      return F_false;
      }
-
-    if (width == 4) {
+    else if (width == 4) {
  
        // Coptic Epact Numbers: U+102E1 to U+102FB.
        if (character >= 0xf0908ba1 && character <= 0xf0908bbb) {
@@ -831,11 +888,8 @@ extern "C" {
            return F_true;
          }
        }
-
-      return F_false;
      }
-
-    if (width == 3) {
+    else if (width == 3) {
  
        if (byte_first == 0xe0) {
  
@@ -1013,6 +1067,16 @@ extern "C" {
            return F_true;
          }
  
+        // Coptic: U+2CF9 to U+2CFF.
+        if (character >= 0xe2b3b900 && character <= 0xe2b3bf00) {
+          return F_true;
+        }
+
+        // Tifinagh: U+2D70.
+        if (character == 0xe2b5b000) {
+          return F_true;
+        }
+
          // Supplemental Punctuation: U+2E00 to U+2E52.
          if (character == 0xe2b88000 || character == 0xe2b99200) {
            return F_true;
@@ -1020,12 +1084,380 @@ extern "C" {
        }
        else if (byte_first == 0xe3) {
  
-        // CJK Symbols and Punctuation: U+3001, U+3002, U+3003, or U+3004.
-        if (character == 0xe3808100 || character == 0xe3808200 || character == 0xe3808300) {
+        // CJK Symbols and Punctuation: U+3001 to U+3003.
+        if (character >= 0xe3808100 && character <= 0xe3808300) {
+          return F_true;
+        }
+
+        // CJK Symbols and Punctuation: U+303D, U+30FB.
+        if (character == 0xe380bd00 || character == 0xe383bb00) {
+          return F_true;
+        }
+      }
+      else if (byte_first == 0xea) {
+
+        // Lisu: U+A4FE, U+A4FF.
+        if (character == 0xea93be00 || character == 0xea93bf00) {
+          return F_true;
+        }
+
+        // Vai: U+A60D to U+A60F.
+        if (character >= 0xea988d00 && character <= 0xea988f00) {
+          return F_true;
+        }
+
+        // Cyrillic Extended-B: U+A673, U+A67E.
+        if (character == 0xea99b300 || character == 0xea99be00) {
+          return F_true;
+        }
+
+        // Bamum: U+A6F2 to U+A6F7.
+        if (character >= 0xea9bb200 && character <= 0xea9bb700) {
+          return F_true;
+        }
+
+        // Phags-pa: U+A874 to U+A877.
+        if (character >= 0xeaa1b400 && character <= 0xeaa1b700) {
+          return F_true;
+        }
+
+        // Saurashtra: U+A8CE, U+A8CF.
+        if (character == 0xeaa38e00 || character == 0xeaa38f00) {
+          return F_true;
+        }
+
+        // Devanagari Extended: U+A8F8 to U+A8FC.
+        if (character >= 0xeaa3b800 && character <= 0xeaa3bc00) {
+          return F_true;
+        }
+
+        // Kayah Li: U+A92E, U+A92F.
+        if (character == 0xeaa4ae00 || character == 0xeaa4af00) {
+          return F_true;
+        }
+
+        // Rejang: U+A95F.
+        if (character == 0xeaa59f00) {
+          return F_true;
+        }
+
+        // Javanese: U+A9C1 to U+A9DF.
+        if (character >= 0xeaa78100 && character <= 0xeaa79f00) {
+          return F_true;
+        }
+
+        // Cham: U+AA5C to U+AA5F.
+        if (character >= 0xeaa99c00 && character <= 0xeaa99f00) {
+          return F_true;
+        }
+
+        // Tai Viet: U+AADE, U+AADF.
+        if (character == 0xeaab9e00 || character == 0xeaab9f00) {
+          return F_true;
+        }
+
+        // Meetei Mayek Extensions: U+AAF0, U+AAF1.
+        if (character == 0xeaabb000 || character == 0xeaabb100) {
+          return F_true;
+        }
+
+        // Meetei Mayek: U+ABEB.
+        if (character == 0xeaafab00) {
+          return F_true;
+        }
+      }
+      else if (byte_first == 0xef) {
+
+        // Vertical Forms: U+FE10 to U+FE19.
+        if (character >= 0xefb89000 && character <= 0xefb89900) {
+          return F_true;
+        }
+
+        // CJK Compatibility Forms: U+FE30, U+FE45, U+FE46, U+FE49.
+        if (character == 0xefb8b000 || character == 0xefb98500 || character == 0xefb98600 || character == 0xefb98900) {
+          return F_true;
+        }
+
+        // CJK Compatibility Forms: U+FE4A to U+FE4C.
+        if (character >= 0xefb98a00 && character <= 0xefb98c00) {
+          return F_true;
+        }
+
+        // Small Form Variants: U+FE50 to U+FE57.
+        if (character >= 0xefb99000 && character <= 0xefb99700) {
+          return F_true;
+        }
+
+        // Small Form Variants: U+FE5F to U+FE61.
+        if (character >= 0xefb99f00 && character <= 0xefb9a100) {
+          return F_true;
+        }
+
+        // Small Form Variants: U+FE68, U+FE6A, U+FE6B.
+        if (character == 0xefb9a800 || character == 0xefb9aa00 || character == 0xefb9ab00) {
+          return F_true;
+        }
+
+        // Halfwidth and Fullwidth Forms: U+FF01 to U+FF07.
+        if (character >= 0xefbc8100 && character <= 0xefbc8700) {
+          return F_true;
+        }
+
+        // Halfwidth and Fullwidth Forms: U+FF0A, U+FF0C.
+        if (character == 0xefbc8a00 || character == 0xefbc8c00) {
+          return F_true;
+        }
+
+        // Halfwidth and Fullwidth Forms: U+FF0E to U+FF1B.
+        if (character >= 0xefbc8e00 && character <= 0xefbc9b00) {
+          return F_true;
+        }
+
+        // Halfwidth and Fullwidth Forms: U+FF1F, U+FF20, U+FF3C, U+FF61.
+        if (character == 0xefbc9f00 || character == 0xefbca000 || character == 0xefbcbc00 || character == 0xefbda100) {
+          return F_true;
+        }
+
+        // Halfwidth and Fullwidth Forms: U+FF64, U+FF65.
+        if (character == 0xefbda400 || character == 0xefbda500) {
            return F_true;
          }
        }
      }
+    else if (width == 4) {
+      uint8_t byte_second = f_macro_utf_character_to_char_2(character);
+
+      if (byte_first == 0xf0) {
+        if (byte_second == 0x90) {
+
+          // Aegean Numbers: U+10100 to U+10102.
+          if (character >= 0xf0908480 && character <= 0xf0908482) {
+            return F_true;
+          }
+
+          // Ugaritic: U+1039F, U+103D0.
+          if (character == 0xf0908e9f || character == 0xf0908f90) {
+            return F_true;
+          }
+
+          // Caucasian Albanian: U+1056F.
+          if (character == 0xf09095af) {
+            return F_true;
+          }
+
+          // Imperial Aramaic: U+10857.
+          if (character == 0xf090a197) {
+            return F_true;
+          }
+
+          // Phoenician: U+1091F.
+          if (character == 0xf090a49f) {
+            return F_true;
+          }
+
+          // Lydian: U+1093F.
+          if (character == 0xf090a4bf) {
+            return F_true;
+          }
+
+          // Kharoshthi: U+10A50 to U+10A58.
+          if (character >= 0xf090a990 && character <= 0xf090a998) {
+            return F_true;
+          }
+
+          // Old South Arabian: U+10A7F.
+          if (character == 0xf090a9bf) {
+            return F_true;
+          }
+
+          // Manichaean: U+10AF0 to U+10AF6.
+          if (character >= 0xf090abb0 && character <= 0xf090abb6) {
+            return F_true;
+          }
+
+          // Avestan: U+10B39.
+          if (character == 0xf090acb9) {
+            return F_true;
+          }
+
+          // Avestan: U+10B3A to U+10B3F.
+          if (character >= 0xf090acba && character <= 0xf090acbf) {
+            return F_true;
+          }
+
+          // Psalter Pahlavi: U+10B99 to U+10B9C.
+          if (character >= 0xf090ae99 && character <= 0xf090ae9c) {
+            return F_true;
+          }
+
+          // Sogdian: U+10F55 to U+10F59.
+          if (character >= 0xf090bd95 && character <= 0xf090bd99) {
+            return F_true;
+          }
+        }
+        else if (byte_second == 0x91) {
+
+          // Brahmi: U+11047 to U+1104D.
+          if (character >= 0xf0918187 && character <= 0xf091818d) {
+            return F_true;
+          }
+
+          // Kaithi: U+110BB to U+110C1.
+          if (character >= 0xf09182bb && character <= 0xf0918381) {
+            return F_true;
+          }
+
+          // Chakma: U+11140 to U+11143.
+          if (character >= 0xf0918580 && character <= 0xf0918583) {
+            return F_true;
+          }
+
+          // Mahajani: U+11174 to U+11175.
+          if (character == 0xf09185b4 || character == 0xf09185b5) {
+            return F_true;
+          }
+
+          // Sharada: U+111C5 to U+111DF.
+          if (character >= 0xf0918785 && character <= 0xf091879f) {
+            return F_true;
+          }
+
+          // Khojki: U+11238 to U+1123D.
+          if (character >= 0xf09188b8 && character <= 0xf09188bd) {
+            return F_true;
+          }
+
+          // Multani: U+112A9.
+          if (character == 0xf0918aa9) {
+            return F_true;
+          }
+
+          // Newa: U+1144B to U+1145D.
+          if (character >= 0xf091918b && character <= 0xf091919d) {
+            return F_true;
+          }
+
+          // Tirhuta: U+114C6.
+          if (character == 0xf0919386) {
+            return F_true;
+          }
+
+          // Siddham: U+115C1 to U+115D7.
+          if (character >= 0xf0919781 && character <= 0xf0919797) {
+            return F_true;
+          }
+
+          // Modi: U+11641 to U+11643.
+          if (character >= 0xf0919981 && character <= 0xf0919983) {
+            return F_true;
+          }
+
+          // Mongolian Supplement: U+11660 to U+1166C.
+          if (character >= 0xf09199a0 && character <= 0xf09199ac) {
+            return F_true;
+          }
+
+          // Ahom: U+1173C to U+1173E.
+          if (character >= 0xf0919cbc && character <= 0xf0919cbe) {
+            return F_true;
+          }
+
+          // Dogra: U+1183B.
+          if (character == 0xf091a0bb) {
+            return F_true;
+          }
+
+          // Dives Akuru: U+11944 to U+11946.
+          if (character >= 0xf091a584 && character <= 0xf091a586) {
+            return F_true;
+          }
+
+          // Nandinagari: U+119E2.
+          if (character == 0xf091a7a2) {
+            return F_true;
+          }
+
+          // Zanabazar Square: U+11A3F to U+11A46.
+          if (character >= 0xf091a8bf && character <= 0xf091a986) {
+            return F_true;
+          }
+
+          // Soyombo: U+11A9A to U+11AA2.
+          if (character >= 0xf091aa9a && character <= 0xf091aaa2) {
+            return F_true;
+          }
+
+          // Bhaiksuki: U+11C41 to U+11C45.
+          if (character >= 0xf091b181 && character <= 0xf091b185) {
+            return F_true;
+          }
+
+          // Marchen: U+11C70, U+11C71.
+          if (character == 0xf091b1b0 || character == 0xf091b1b1) {
+            return F_true;
+          }
+
+          // Makasar: U+11EF7, U+11EF8.
+          if (character == 0xf091bbb7 || character == 0xf091bbb8) {
+            return F_true;
+          }
+
+          // Tamil Supplement: U+11FFF.
+          if (character == 0xf091bfbf) {
+            return F_true;
+          }
+        }
+        else if (byte_second == 0x92) {
+
+          // Cuneiform Numbers and Punctuation: U+12470 to U+12474.
+          if (character >= 0xf09291b0 && character <= 0xf09291b4) {
+            return F_true;
+          }
+        }
+        else if (byte_second == 0x96 || byte_second == 0x9b || byte_second == 0x9d || byte_second == 0x9e) { // Duployan (0x9b), Sutton SignWriting (0x9d) and Adlam (0x9e) are tested in this branch too.
+
+          // Mro: U+16A6E, U+16A6F.
+          if (character == 0xf096a9ae || character == 0xf096a9af) {
+            return F_true;
+          }
+
+          // Bassa Vah: U+16AF5.
+          if (character == 0xf096abb5) {
+            return F_true;
+          }
+
+          // Pahawh Hmong: U+16B37 to U+16B44.
+          if (character >= 0xf096acb7 && character <= 0xf096ad84) {
+            return F_true;
+          }
+
+          // Medefaidrin: U+16E97 to U+16E9A.
+          if (character >= 0xf096ba97 && character <= 0xf096ba9a) {
+            return F_true;
+          }
+
+          // Ideographic Symbols and Punctuation: U+16FE2.
+          if (character == 0xf096bfa2) {
+            return F_true;
+          }
+
+          // Duployan: U+1BC9F.
+          if (character == 0xf09bb29f) {
+            return F_true;
+          }
+
+          // Sutton SignWriting: U+1DA87 to U+1DA8B.
+          if (character >= 0xf09daa87 && character <= 0xf09daa8b) {
+            return F_true;
+          }
+
+          // Adlam: U+1E95E, U+1E95F.
+          if (character == 0xf09ea59e || character == 0xf09ea59f) {
+            return F_true;
+          }
+        }
+      }
+    }
  
      return F_false;
    }
@@ -3411,8 +3843,8 @@ extern "C" {
  
      if (width == 2) {
  
-      // Latin-1 Supplement: U+00A0, U+0085.
-      if (character == 0xc2a00000 || character == 0xc2850000) {
+      // Latin-1 Supplement: U+00A0.
+      if (character == 0xc2a00000) {
          return F_true;
        }
      }
@@ -3422,23 +3854,13 @@ extern "C" {
  
        if (byte_first == 0xe2) {
  
-        // General Punctuation: U+2000, U+2001, U+2002, U+2003.
-        if (character == 0xe2808000 || character == 0xe2808100 || character == 0xe2808200 || character == 0xe2808300) {
+        // General Punctuation: U+2000 to U+200A.
+        if (character >= 0xe2808000 && character <= 0xe2808a00) {
            return F_true;
          }
  
-        // General Punctuation: U+2004, U+2005, U+2006, U+2007.
-        if (character == 0xe2808400 || character == 0xe2808500 || character == 0xe2808600 || character == 0xe2808700) {
-          return F_true;
-        }
-
-        // General Punctuation: U+2008, U+2009, U+200A, U+2028.
-        if (character == 0xe2808800 || character == 0xe2808900 || character == 0xe2808a00 || character == 0xe280a800) {
-          return F_true;
-        }
-
-        // General Punctuation: U+2029, U+202F, U+205F.
-        if (character == 0xe280a900 || character == 0xe2819f00 || character == 0xe280af00) {
+        // General Punctuation: U+2028, U+2029, U+202F, U+205F.
+        if (character == 0xe280a800 || character == 0xe280a900 || character == 0xe2819f00 || character == 0xe280af00) {
            return F_true;
          }
        }
@@ -3470,10 +3892,22 @@ extern "C" {
    }
  #endif // !defined(_di_f_utf_character_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_modifier_)
  
+#if !defined(_di_f_utf_character_is_whitespace_other_) || !defined(_di_f_utf_is_whitespace_other_)
+  f_return_status private_f_utf_character_is_whitespace_other(const f_utf_character character, const uint8_t width) {
+
+    // Ogham: U+1680 (isn't whitespace but is technically considered one: ( )).
+    if (character == 0xe19a8000) { // The width parameter is not consulted; only this exact value matches.
+      return F_true;
+    }
+
+    return F_false;
+  }
+#endif // !defined(_di_f_utf_character_is_whitespace_other_) || !defined(_di_f_utf_is_whitespace_other_)
+
  #if !defined(_di_f_utf_character_is_word_) || !defined(_di_f_utf_is_word_)
    f_return_status private_f_utf_character_is_word(const f_utf_character character, const uint8_t width) {
  
-    if (private_f_utf_character_is_alpha_numeric(character, width)) {
+    if (private_f_utf_character_is_alpha_digit(character, width)) {
        return F_true;
      }
  
diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h

index b4ff964f31476d5e7a507397dfd5ec3e6a39fdf9..367c27487c8b99e787cb666fe01980faaec3c5f4 100644 (file)
--- a/level_0/f_utf/c/private-utf.h
+++ b/level_0/f_utf/c/private-utf.h
@@ -40,6 +40,28 @@ extern "C" {
  #endif // !defined(_di_f_utf_character_is_alpha_) || !defined(_di_f_utf_is_alpha_)
  
  /**
+ * Private implementation of f_utf_character_is_alpha_digit().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The character to validate.
+ * @param width
+ *   The number of bytes representing the character width.
+ *
+ * @return
+ *   F_true if a UTF-8 alphabetic or digit character.
+ *   F_false if not a UTF-8 alphabetic or digit character.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is_alpha_digit()
+ * @see f_utf_is_alpha_digit()
+ */
+#if !defined(_di_f_utf_character_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_digit_)
+  extern f_return_status private_f_utf_character_is_alpha_digit(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal;
+#endif // !defined(_di_f_utf_character_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_digit_)
+
+/**
   * Private implementation of f_utf_character_is_alpha_numeric().
   *
   * Intended to be shared to each of the different implementation variations.
@@ -54,8 +76,8 @@ extern "C" {
   *   F_false if not a UTF-8 control character.
   *   F_utf (with error bit) if character is an invalid UTF-8 character.
   *
- * @see f_utf_character_is_control()
- * @see f_utf_is_control()
+ * @see f_utf_character_is_alpha_numeric()
+ * @see f_utf_is_alpha_numeric()
   */
  #if !defined(_di_f_utf_character_is_alpha_numeric_) || !defined(_di_f_utf_is_alpha_numeric_)
    extern f_return_status private_f_utf_character_is_alpha_numeric(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal;
@@ -128,6 +150,28 @@ extern "C" {
  #endif // !defined(_di_f_utf_character_is_control_picture_) || !defined(_di_f_utf_is_control_picture_)
  
  /**
+ * Private implementation of f_utf_character_is_digit().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The character to validate.
+ * @param width
+ *   The number of bytes representing the character width.
+ *
+ * @return
+ *   F_true if a UTF-8 digit character.
+ *   F_false if not a UTF-8 digit character.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is_digit()
+ * @see f_utf_is_digit()
+ */
+#if !defined(_di_f_utf_character_is_digit_) || !defined(_di_f_utf_is_digit_)
+  extern f_return_status private_f_utf_character_is_digit(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal;
+#endif // !defined(_di_f_utf_character_is_digit_) || !defined(_di_f_utf_is_digit_)
+
+/**
   * Private implementation of f_utf_character_is_emoji().
   *
   * Intended to be shared to each of the different implementation variations.
@@ -304,6 +348,28 @@ extern "C" {
  #endif // !defined(_di_f_utf_character_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_modifier_)
  
  /**
+ * Private implementation of f_utf_character_is_whitespace_other().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The character to validate.
+ * @param width
+ *   The number of bytes representing the character width.
+ *
+ * @return
+ *   F_true if a UTF-8 whitespace.
+ *   F_false if not a UTF-8 whitespace.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is_whitespace_other()
+ * @see f_utf_is_whitespace_other()
+ */
+#if !defined(_di_f_utf_character_is_whitespace_other_) || !defined(_di_f_utf_is_whitespace_other_)
+  extern f_return_status private_f_utf_character_is_whitespace_other(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal;
+#endif // !defined(_di_f_utf_character_is_whitespace_other_) || !defined(_di_f_utf_is_whitespace_other_)
+
+/**
   * Private implementation of f_utf_character_is_word().
   *
   * Intended to be shared to each of the different implementation variations.
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c

index 9c5126da47a7a961028aad0ac808c75594fd78b3..a3d955c2a901434d7bc6fe0e1e52f6136c6e84eb 100644 (file)
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -114,6 +114,26 @@ extern "C" {
    }
  #endif // _di_f_utf_character_is_alpha_
  
+#ifndef _di_f_utf_character_is_alpha_digit_
+  f_return_status f_utf_character_is_alpha_digit(const f_utf_character character) {
+    unsigned short width = f_macro_utf_character_width_is(character);
+
+    if (width == 0) { // ASCII: defer to the C library classification.
+      if (isalnum(f_macro_utf_character_to_char_1(character))) {
+        return F_true;
+      }
+
+      return F_false;
+    }
+
+    if (width == 1) { // Invalid UTF-8: report F_utf with the error bit set, as the header documents.
+      return F_status_set_error(F_utf);
+    }
+
+    return private_f_utf_character_is_alpha_digit(character, width);
+  }
+#endif // _di_f_utf_character_is_alpha_digit_
+
  #ifndef _di_f_utf_character_is_alpha_numeric_
    f_return_status f_utf_character_is_alpha_numeric(const f_utf_character character) {
      unsigned short width = f_macro_utf_character_width_is(character);
@@ -196,6 +216,26 @@ extern "C" {
    }
  #endif // _di_f_utf_character_is_control_picture_
  
+#ifndef _di_f_utf_character_is_digit_
+  f_return_status f_utf_character_is_digit(const f_utf_character character) {
+    unsigned short width = f_macro_utf_character_width_is(character);
+
+    if (width == 0) { // ASCII: defer to the C library classification.
+      if (isdigit(f_macro_utf_character_to_char_1(character))) {
+        return F_true;
+      }
+
+      return F_false;
+    }
+
+    if (width == 1) { // Invalid UTF-8: report F_utf with the error bit set, as the header documents.
+      return F_status_set_error(F_utf);
+    }
+
+    return private_f_utf_character_is_digit(character, width);
+  }
+#endif // _di_f_utf_character_is_digit_
+
  #ifndef _di_f_utf_character_is_emoji_
    f_return_status f_utf_character_is_emoji(const f_utf_character character) {
      unsigned short width = f_macro_utf_character_width_is(character);
@@ -421,6 +461,23 @@ extern "C" {
    }
  #endif // _di_f_utf_character_is_whitespace_modifier_
  
+#ifndef _di_f_utf_character_is_whitespace_other_
+  f_return_status f_utf_character_is_whitespace_other(const f_utf_character character) {
+    unsigned short width = f_macro_utf_character_width_is(character);
+
+    if (width == 0) {
+      // There are no ASCII whitespace other.
+      return F_false;
+    }
+
+    if (width == 1) { // Invalid UTF-8: report F_utf with the error bit set rather than testing for it.
+      return F_status_set_error(F_utf);
+    }
+
+    return private_f_utf_character_is_whitespace_other(character, width);
+  }
+#endif // _di_f_utf_character_is_whitespace_other_
+
  #ifndef _di_f_utf_character_is_word_
    f_return_status f_utf_character_is_word(const f_utf_character character) {
      unsigned short width = f_macro_utf_character_width_is(character);
@@ -636,6 +693,40 @@ extern "C" {
    }
  #endif // _di_f_utf_is_alpha_
  
+#ifndef _di_f_utf_is_alpha_digit_
+  f_return_status f_utf_is_alpha_digit(const f_string character, const f_string_length width_max) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    uint8_t width = f_macro_utf_byte_width_is(*character);
+
+    if (width == 0) { // ASCII: defer to the C library classification.
+      if (isalnum(*character)) {
+        return F_true;
+      }
+
+      return F_false;
+    }
+
+    if (width == 1) { // The first byte cannot start a sequence: return the error status itself.
+      return F_status_set_error(F_incomplete_utf);
+    }
+
+    f_utf_character character_utf = 0;
+
+    {
+      f_status status = 0;
+
+      status = f_utf_char_to_character(character, width_max, &character_utf);
+
+      if (status != F_none) return status; // Pass any conversion failure back unchanged.
+    }
+
+    return private_f_utf_character_is_alpha_digit(character_utf, width);
+  }
+#endif // _di_f_utf_is_alpha_digit_
+
  #ifndef _di_f_utf_is_alpha_numeric_
    f_return_status f_utf_is_alpha_numeric(const f_string character, const f_string_length width_max) {
      #ifndef _di_level_0_parameter_checking_
@@ -766,6 +857,40 @@ extern "C" {
    }
  #endif // _di_f_utf_is_control_picture_
  
+#ifndef _di_f_utf_is_digit_
+  f_return_status f_utf_is_digit(const f_string character, const f_string_length width_max) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    uint8_t width = f_macro_utf_byte_width_is(*character);
+
+    if (width == 0) { // ASCII: defer to the C library classification.
+      if (isdigit(*character)) {
+        return F_true;
+      }
+
+      return F_false;
+    }
+
+    if (width == 1) { // The first byte cannot start a sequence: return the error status itself.
+      return F_status_set_error(F_incomplete_utf);
+    }
+
+    f_utf_character character_utf = 0;
+
+    {
+      f_status status = 0;
+
+      status = f_utf_char_to_character(character, width_max, &character_utf);
+
+      if (status != F_none) return status; // Pass any conversion failure back unchanged.
+    }
+
+    return private_f_utf_character_is_digit(character_utf, width);
+  }
+#endif // _di_f_utf_is_digit_
+
  #ifndef _di_f_utf_is_emoji_
    f_return_status f_utf_is_emoji(const f_string character, const f_string_length width_max) {
      #ifndef _di_level_0_parameter_checking_
@@ -1124,6 +1249,37 @@ extern "C" {
    }
  #endif // _di_f_utf_is_whitespace_modifier_
  
+#ifndef _di_f_utf_is_whitespace_other_
+  f_return_status f_utf_is_whitespace_other(const f_string character, const f_string_length width_max) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    uint8_t width = f_macro_utf_byte_width_is(*character);
+
+    if (width == 0) {
+      // There are no ASCII whitespace other.
+      return F_false;
+    }
+
+    if (width == 1) { // The first byte cannot start a sequence: return the error status itself.
+      return F_status_set_error(F_incomplete_utf);
+    }
+
+    f_utf_character character_utf = 0;
+
+    {
+      f_status status = 0;
+
+      status = f_utf_char_to_character(character, width_max, &character_utf);
+
+      if (status != F_none) return status; // Pass any conversion failure back unchanged.
+    }
+
+    return private_f_utf_character_is_whitespace_other(character_utf, width);
+  }
+#endif // _di_f_utf_is_whitespace_other_
+
  #ifndef _di_f_utf_is_word_
    f_return_status f_utf_is_word(const f_string character, const f_string_length width_max) {
      #ifndef _di_level_0_parameter_checking_
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h

index dd9dbd42686ea0b0488eaff1a454567be55e65c0..8715191abbc8fd55b7aa043b26b3e6dca57dad6e 100644 (file)
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -7,6 +7,8 @@
   *
   * Provides UTF-8 capabilities.
   *
+ * @todo consider is_graph() functions being their own data set (review unicode to see which of checking only for graph() vs checking for all not-graph will be the smaller set).
+ *
   * Identifiers:
   * - UTF_8-1: 1000 0000
   * - UTF_8-2: 1100 0000
@@ -149,8 +151,31 @@ extern "C" {
  #endif // _di_f_utf_character_is_alpha_
  
  /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabetic or digit character.
+ *
+ * Digit characters are decimal digits and letter numbers.
+ *
+ * This does not include number-like, such as 1/2 (½) or superscript 2 (²).
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   F_true if a UTF-8 alpha-digit character.
+ *   F_false if not a UTF-8 alpha-digit character.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see isalnum()
+ */
+#ifndef _di_f_utf_character_is_alpha_digit_
+  extern f_return_status f_utf_character_is_alpha_digit(const f_utf_character character);
+#endif // _di_f_utf_character_is_alpha_digit_
+
+/**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabetic or numeric character.
   *
+ * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²).
+ *
   * @param character
   *   The character to validate.
   *
@@ -215,6 +240,27 @@ extern "C" {
  #endif // _di_f_utf_character_is_control_picture_
  
  /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 digit character.
+ *
+ * Digit characters are decimal digits and letter numbers.
+ *
+ * This does not include number-like, such as 1/2 (½) or superscript 2 (²).
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   F_true if a UTF-8 digit character.
+ *   F_false if not a UTF-8 digit character.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see isdigit()
+ */
+#ifndef _di_f_utf_character_is_digit_
+  extern f_return_status f_utf_character_is_digit(const f_utf_character character);
+#endif // _di_f_utf_character_is_digit_
+
+/**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 emoji character.
   *
   * @todo Incomplete, UTF-8 codes not yet checked!
@@ -285,6 +331,8 @@ extern "C" {
  /**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 numeric character.
   *
+ * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²).
+ *
   * @param character
   *   The character to validate.
   *
@@ -379,6 +427,7 @@ extern "C" {
   * Non-printing or zero-width characters are not considered whitespace.
   * This does include line separators like '\n'.
   * This does not include phonetic spaces, like whitespace modifiers.
+ * This does not include non-true whitespace characters, such as Ogham Space Mark ( ).
   *
   * Phonetic spaces are whitespaces with additional phonetic meaning associated with them.
   * However, because they are not renderred as whitespace, they are technically not white space.
@@ -418,6 +467,25 @@ extern "C" {
  #endif // _di_f_utf_character_is_whitespace_modifier_
  
  /**
+ * Check to see if the entire byte block of the character is an other type of UTF-8 space character.
+ *
+ * This is a list of whitespace that are not actual whitespace (because they are graph characters) but are considered whitespace, such as Ogham Space Mark ( ).
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   F_true if a UTF-8 (other) whitespace.
+ *   F_false if not a UTF-8 (other) whitespace.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see isspace()
+ */
+#ifndef _di_f_utf_character_is_whitespace_other_
+  extern f_return_status f_utf_character_is_whitespace_other(const f_utf_character character);
+#endif // _di_f_utf_character_is_whitespace_other_
+
+/**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 word character.
   *
   * A word character is alpha-numeric or an underscore '_'.
@@ -578,8 +646,35 @@ extern "C" {
  #endif // _di_f_utf_is_alpha_
  
  /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet or digit character.
+ *
+ * Digit characters are decimal digits and letter numbers.
+ *
+ * This does not include number-like, such as 1/2 (½) or superscript 2 (²).
+ *
+ * @param character
+ *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by width_max.
+ * @param width_max
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
+ *
+ * @return
+ *   F_true if a UTF-8 alpha-digit character.
+ *   F_false if not a UTF-8 alpha-digit character.
+ *   F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
+ *
+ * @see isalnum()
+ */
+#ifndef _di_f_utf_is_alpha_digit_
+  extern f_return_status f_utf_is_alpha_digit(const f_string character, const f_string_length width_max);
+#endif // _di_f_utf_is_alpha_digit_
+
+/**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet or numeric character.
   *
+ * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²).
+ *
   * @param character
   *   The character to validate.
   *   There must be enough space allocated to compare against, as limited by width_max.
@@ -660,6 +755,27 @@ extern "C" {
  #endif // _di_f_utf_is_control_picture_
  
  /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 digit character.
+ *
+ * @param character
+ *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by width_max.
+ * @param width_max
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
+ *
+ * @return
+ *   F_true if a UTF-8 digit character.
+ *   F_false if not a UTF-8 digit character.
+ *   F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
+ *
+ * @see isdigit()
+ */
+#ifndef _di_f_utf_is_digit_
+  extern f_return_status f_utf_is_digit(const f_string character, const f_string_length width_max);
+#endif // _di_f_utf_is_digit_
+
+/**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 emoji character.
   *
   * @todo Incomplete, UTF-8 codes not yet checked!
@@ -740,6 +856,8 @@ extern "C" {
  /**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 numeric character.
   *
+ * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²).
+ *
   * @param character
   *   The character to validate.
   *   There must be enough space allocated to compare against, as limited by width_max.
@@ -851,6 +969,7 @@ extern "C" {
   * Non-printing or zero-width characters are not considered whitespace.
   * This does include line separators like '\n'.
   * This does not include phonetic spaces, like whitespace modifiers.
+ * This does not include non-true whitespace characters, such as Ogham Space Mark ( ).
   *
   * Phonetic spaces are whitespaces with additional phonetic meaning associated with them.
   * However, because they are not renderred as whitespace, they are technically not white space.
@@ -902,9 +1021,32 @@ extern "C" {
  #endif // _di_f_utf_is_whitespace_modifier_
  
  /**
+ * Check to see if the entire byte block of the character is an other type of UTF-8 space character.
+ *
+ * This is a list of whitespace that are not actual whitespace (because they are graph characters) but are considered whitespace, such as Ogham Space Mark ( ).
+ *
+ * @param character
+ *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by width_max.
+ * @param width_max
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
+ *
+ * @return
+ *   F_true if a UTF-8 (other) whitespace.
+ *   F_false if not a UTF-8 (other) whitespace.
+ *   F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
+ *   F_maybe (with error bit) if this could be a whitespace but width is not long enough.
+ *   F_parameter (with error bit) if a parameter is invalid.
+ */
+#ifndef _di_f_utf_is_whitespace_other_
+  extern f_return_status f_utf_is_whitespace_other(const f_string character, const f_string_length width_max);
+#endif // _di_f_utf_is_whitespace_other_
+
+/**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 word character.
   *
- * A word character is alpha-numeric or an underscore '_'.
+ * A word character is alpha-digit or an underscore '_'.
   *
   * @param character
   *   The character to validate.
@@ -927,7 +1069,7 @@ extern "C" {
  /**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 word or dash character.
   *
- * A word dash character is alpha-numeric, an underscore '_' or a dash '-'.
+ * A word dash character is alpha-digit, an underscore '_' or a dash '-'.
   *
   * @param character
   *   The character to validate.
@@ -950,7 +1092,7 @@ extern "C" {
  /**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character.
   *
- * A word dash character is alpha-numeric, an underscore '_', a dash '-', or a plus '+'.
+ * A word dash plus character is alpha-digit, an underscore '_', a dash '-', or a plus '+'.
   *
   * This does not include "invisible plus".
   *
author	Kevin Day <thekevinday@gmail.com>
	Thu, 2 Jul 2020 03:36:33 +0000 (22:36 -0500)
committer	Kevin Day <thekevinday@gmail.com>
	Thu, 2 Jul 2020 03:36:33 +0000 (22:36 -0500)
level_0/f_fss/c/fss.h		patch \| blob \| history
level_0/f_utf/c/private-utf.c		patch \| blob \| history
level_0/f_utf/c/private-utf.h		patch \| blob \| history
level_0/f_utf/c/utf.c		patch \| blob \| history
level_0/f_utf/c/utf.h		patch \| blob \| history