]> Kevux Git Server - fll/commitdiff
Update: Improve UTF-8 Control detecting, expanding to distinguish Control Code and...
authorKevin Day <thekevinday@gmail.com>
Sun, 14 Nov 2021 04:39:06 +0000 (22:39 -0600)
committerKevin Day <thekevinday@gmail.com>
Sun, 14 Nov 2021 04:41:50 +0000 (22:41 -0600)
There also seems to be a "Control Format" category of characters.
Create functions for "Control Code" and "Control Format" (is_control_code and is_control_format functions).
The is_control functions now check for both.

level_0/f_utf/c/private-utf.c
level_0/f_utf/c/private-utf.h
level_0/f_utf/c/utf.c
level_0/f_utf/c/utf.h

index 50b09085e3b874d9398b5f7c23a0d27ab5a39185..beed62caf130c456289b8820c4af41866b558e77 100644 (file)
@@ -54,6 +54,7 @@ extern "C" {
       return F_false;
     }
 
+    // is_control() handles both is_control_code() and is_control_format().
     if (private_f_utf_character_is_control(character, width)) {
       return F_false;
     }
@@ -105,6 +106,7 @@ extern "C" {
       return F_false;
     }
 
+    // is_control() handles both is_control_code() and is_control_format().
     if (private_f_utf_character_is_control(character, width)) {
       return F_false;
     }
@@ -152,6 +154,7 @@ extern "C" {
       return F_false;
     }
 
+    // is_control() handles both is_control_code() and is_control_format().
     if (private_f_utf_character_is_control(character, width)) {
       return F_false;
     }
@@ -248,32 +251,110 @@ extern "C" {
 
     if (width == 2) {
 
+      // Control Codes.
+
       // Latin-1 Supplement: U+0080 to U+009F.
       if (character >= 0xc2800000 && character <= 0xc29f0000) {
         return F_true;
       }
+
+      // Control Formats.
+
+      // Latin-1 Supplement: U+00AD.
+      if (character == 0xc2ad0000) {
+        return F_true;
+      }
+
+      // Arabic: U+0600 to U+0605.
+      if (character >= 0xd8800000 && character <= 0xd8850000) {
+        return F_true;
+      }
+
+      // Arabic: U+061C, U+06DD.
+      if (character == 0xd89c0000 || character == 0xdb9d0000) {
+        return F_true;
+      }
+
+      // Syriac: U+070F.
+      if (character == 0xdc8f0000) {
+        return F_true;
+      }
     }
     else if (width == 3) {
 
-      // General Punctuation: U+200E and U+200F.
-      if (character == 0xe2808e00 || character == 0xe2808f00) {
+      // Control Formats.
+
+      // Arabic Extended-A: U+08E2.
+      if (character == 0xe0a3a200) {
         return F_true;
       }
 
-      // General Punctuation: U+2066 to U+2069.
-      if (character >= 0xe281a600 && character <= 0xe281a900) {
+      // Mongolian: U+180E.
+      if (character == 0xe1a08e00) {
+        return F_true;
+      }
+
+      // General Punctuation: U+200B to U+200F.
+      if (character >= 0xe2808b00 && character <= 0xe2808f00) {
+        return F_true;
+      }
+
+      // General Punctuation: U+202A to U+202E.
+      if (character >= 0xe280aa00 && character <= 0xe280ae00) {
+        return F_true;
+      }
+
+      // General Punctuation: U+2060 to U+2064.
+      if (character >= 0xe281a000 && character <= 0xe281a400) {
+        return F_true;
+      }
+
+      // General Punctuation: U+2066 to U+206F.
+      if (character >= 0xe281a600 && character <= 0xe281af00) {
+        return F_true;
+      }
+
+      // Arabic Presentation Forms-B: U+FEFF.
+      if (character == 0xefbbbf00) {
         return F_true;
       }
 
-      // Special: U+FFF9 to U+FFFB.
+      // Specials: U+FFF9 to U+FFFB.
       if (character >= 0xefbfb900 && character <= 0xefbfbb00) {
         return F_true;
       }
     }
     else if (width == 4) {
 
-      // Tags: U+E0001 and U+E007F.
-      if (character == 0xf3a08081 || character == 0xf3a081bf) {
+      // Control Formats.
+
+      // Kaithi: U+110BD, U+110CD.
+      if (character == 0xf09182bd || character == 0xf091838d) {
+        return F_true;
+      }
+
+      // Egyptian Hieroglyphics: U+13430 to U+13438.
+      if (character >= 0xf09390b0 && character <= 0xf09390b8) {
+        return F_true;
+      }
+
+      // Shorthand Format Controls: U+1BCA0 to U+1BCA3.
+      if (character >= 0xf09bb2a0 && character <= 0xf09bb2a3) {
+        return F_true;
+      }
+
+      // Music Symbols: U+1D173 to U+1D17A.
+      if (character >= 0xf09d85b3 && character <= 0xf09d85ba) {
+        return F_true;
+      }
+
+      // Tags: U+E0001.
+      if (character == 0xf3a08081) {
+        return F_true;
+      }
+
+      // Tags: U+E0020 to U+E007F.
+      if (character >= 0xf3a080a0 && character <= 0xf3a081bf) {
         return F_true;
       }
     }
@@ -282,6 +363,125 @@ extern "C" {
   }
 #endif // !defined(_di_f_utf_character_is_control_) || !defined(_di_f_utf_is_control_)
 
+#if !defined(_di_f_utf_character_is_control_code_) || !defined(_di_f_utf_is_control_code_)
+  f_status_t private_f_utf_character_is_control_code(const f_utf_character_t character, const uint8_t width) {
+
+    // The only multi-byte Control Codes recognized here are two bytes wide.
+    if (width != 2) {
+      return F_false;
+    }
+
+    // Latin-1 Supplement: anything outside of U+0080 to U+009F is not a Control Code.
+    if (character < 0xc2800000 || character > 0xc29f0000) {
+      return F_false;
+    }
+
+    return F_true;
+  }
+#endif // !defined(_di_f_utf_character_is_control_code_) || !defined(_di_f_utf_is_control_code_)
+
+#if !defined(_di_f_utf_character_is_control_format_) || !defined(_di_f_utf_is_control_format_)
+  f_status_t private_f_utf_character_is_control_format(const f_utf_character_t character, const uint8_t width) {
+
+    // The constants below hold the UTF-8 byte sequence in the high-order bytes, zero padded on the right.
+    switch (width) {
+      case 2:
+
+        // Single code points: U+00AD (Latin-1 Supplement), U+061C and U+06DD (Arabic), U+070F (Syriac).
+        if (character == 0xc2ad0000 || character == 0xd89c0000 || character == 0xdb9d0000 || character == 0xdc8f0000) {
+          return F_true;
+        }
+
+        // Range: U+0600 to U+0605 (Arabic).
+        if (character >= 0xd8800000 && character <= 0xd8850000) {
+          return F_true;
+        }
+
+        break;
+
+      case 3:
+
+        // Single code points: U+08E2 (Arabic Extended-A), U+180E (Mongolian), U+FEFF (Arabic Presentation Forms-B).
+        if (character == 0xe0a3a200 || character == 0xe1a08e00 || character == 0xefbbbf00) {
+          return F_true;
+        }
+
+        // General Punctuation ranges: U+200B to U+200F and U+202A to U+202E.
+        if ((character >= 0xe2808b00 && character <= 0xe2808f00) || (character >= 0xe280aa00 && character <= 0xe280ae00)) {
+          return F_true;
+        }
+
+        // General Punctuation ranges: U+2060 to U+2064 and U+2066 to U+206F.
+        if ((character >= 0xe281a000 && character <= 0xe281a400) || (character >= 0xe281a600 && character <= 0xe281af00)) {
+          return F_true;
+        }
+
+        // Specials range: U+FFF9 to U+FFFB.
+        if (character >= 0xefbfb900 && character <= 0xefbfbb00) {
+          return F_true;
+        }
+
+        break;
+
+      case 4:
+
+        // Single code points: U+110BD and U+110CD (Kaithi), U+E0001 (Tags).
+        if (character == 0xf09182bd || character == 0xf091838d || character == 0xf3a08081) {
+          return F_true;
+        }
+
+        // Egyptian Hieroglyph Format Controls range: U+13430 to U+13438.
+        if (character >= 0xf09390b0 && character <= 0xf09390b8) {
+          return F_true;
+        }
+
+        // Shorthand Format Controls range: U+1BCA0 to U+1BCA3.
+        if (character >= 0xf09bb2a0 && character <= 0xf09bb2a3) {
+          return F_true;
+        }
+
+        // Musical Symbols range: U+1D173 to U+1D17A.
+        if (character >= 0xf09d85b3 && character <= 0xf09d85ba) {
+          return F_true;
+        }
+
+        // Tags range: U+E0020 to U+E007F.
+        if (character >= 0xf3a080a0 && character <= 0xf3a081bf) {
+          return F_true;
+        }
+
+        break;
+
+      default:
+        break;
+    }
+
+    return F_false;
+  }
+#endif // !defined(_di_f_utf_character_is_control_format_) || !defined(_di_f_utf_is_control_format_)
+
 #if !defined(_di_f_utf_character_is_control_picture_) || !defined(_di_f_utf_is_control_picture_)
   f_status_t private_f_utf_character_is_control_picture(const f_utf_character_t character, const uint8_t width) {
 
index f2e0c3e1ba3844043d677383a75bfd4a8ed60e01..ff453a8b63fe184aa8c815ce9c49398a89f0bb88 100644 (file)
@@ -175,8 +175,8 @@ extern "C" {
  *   The number of bytes representing the character width.
  *
  * @return
- *   F_true if a UTF-8 control picture character.
- *   F_false if not a UTF-8 control picture character.
+ *   F_true if a UTF-8 combining character.
+ *   F_false if not a UTF-8 combining character.
  *
  *   F_utf (with error bit) if character is an invalid UTF-8 character.
  *
@@ -211,6 +211,52 @@ extern "C" {
 #endif // !defined(_di_f_utf_character_is_control_) || !defined(_di_f_utf_is_control_)
 
 /**
+ * Private implementation of f_utf_character_is_control_code().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The character to validate.
+ * @param width
+ *   The number of bytes representing the character width.
+ *
+ * @return
+ *   F_true if a UTF-8 control code character.
+ *   F_false if not a UTF-8 control code character.
+ *
+ *   The character is not validated here, so no error status is returned.
+ *
+ * @see f_utf_character_is_control_code()
+ * @see f_utf_is_control_code()
+ */
+#if !defined(_di_f_utf_character_is_control_code_) || !defined(_di_f_utf_is_control_code_)
+  extern f_status_t private_f_utf_character_is_control_code(const f_utf_character_t character, const uint8_t width) F_attribute_visibility_internal_d;
+#endif // !defined(_di_f_utf_character_is_control_code_) || !defined(_di_f_utf_is_control_code_)
+
+/**
+ * Private implementation of f_utf_character_is_control_format().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The character to validate.
+ * @param width
+ *   The number of bytes representing the character width.
+ *
+ * @return
+ *   F_true if a UTF-8 control format character.
+ *   F_false if not a UTF-8 control format character.
+ *
+ *   The character is not validated here, so no error status is returned.
+ *
+ * @see f_utf_character_is_control_format()
+ * @see f_utf_is_control_format()
+ */
+#if !defined(_di_f_utf_character_is_control_format_) || !defined(_di_f_utf_is_control_format_)
+  extern f_status_t private_f_utf_character_is_control_format(const f_utf_character_t character, const uint8_t width) F_attribute_visibility_internal_d;
+#endif // !defined(_di_f_utf_character_is_control_format_) || !defined(_di_f_utf_is_control_format_)
+
+/**
  * Private implementation of f_utf_character_is_control_picture().
  *
  * Intended to be shared to each of the different implementation variations.
index 6081392610079619b1fcccaa23d5ecf674c4ccff..9c3281343b3d24e9b94e1c1e4298b33cdd8fdc78 100644 (file)
@@ -229,6 +229,46 @@ extern "C" {
   }
 #endif // _di_f_utf_character_is_control_
 
+#ifndef _di_f_utf_character_is_control_code_
+  f_status_t f_utf_character_is_control_code(const f_utf_character_t character) {
+
+    const uint8_t width = macro_f_utf_character_t_width_is(character);
+
+    // A width of 0 designates ASCII, where the standard iscntrl() decides.
+    if (!width) {
+      if (iscntrl(macro_f_utf_character_t_to_char_1(character))) {
+        return F_true;
+      }
+
+      return F_false;
+    }
+
+    // A width of 1 is treated as an invalid UTF-8 character: return F_utf with the error bit set.
+    // F_status_set_error() sets the bit, whereas F_status_is_error() would only test for it.
+    if (width == 1) {
+      return F_status_set_error(F_utf);
+    }
+
+    return private_f_utf_character_is_control_code(character, width);
+  }
+#endif // _di_f_utf_character_is_control_code_
+
+#ifndef _di_f_utf_character_is_control_format_
+  f_status_t f_utf_character_is_control_format(const f_utf_character_t character) {
+
+    const uint8_t width = macro_f_utf_character_t_width_is(character);
+
+    if (!width) {
+
+      // There are no control format characters in ASCII.
+      return F_false;
+    }
+
+    // A width of 1 is treated as an invalid UTF-8 character: return F_utf with the error bit set.
+    // F_status_set_error() sets the bit, whereas F_status_is_error() would only test for it.
+    if (width == 1) {
+      return F_status_set_error(F_utf);
+    }
+
+    return private_f_utf_character_is_control_format(character, width);
+  }
+#endif // _di_f_utf_character_is_control_format_
+
 #ifndef _di_f_utf_character_is_control_picture_
   f_status_t f_utf_character_is_control_picture(const f_utf_character_t character) {
 
@@ -1008,6 +1048,63 @@ extern "C" {
   }
 #endif // _di_f_utf_is_control_
 
+#ifndef _di_f_utf_is_control_code_
+  f_status_t f_utf_is_control_code(const f_string_t character, const f_array_length_t width_max) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    const uint8_t width = macro_f_utf_byte_width_is(*character);
+
+    // A width of 0 designates ASCII, where the standard iscntrl() decides.
+    if (!width) {
+      if (iscntrl(*character)) {
+        return F_true;
+      }
+
+      return F_false;
+    }
+
+    // A width of 1 is treated as an incomplete UTF-8 fragment: return F_complete_not_utf with the error bit set.
+    // F_status_set_error() sets the bit, whereas F_status_is_error() would only test for it.
+    if (width == 1) {
+      return F_status_set_error(F_complete_not_utf);
+    }
+
+    f_utf_character_t character_utf = 0;
+
+    // Any error from the conversion is passed back to the caller unchanged.
+    {
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
+      if (F_status_is_error(status)) return status;
+    }
+
+    return private_f_utf_character_is_control_code(character_utf, width);
+  }
+#endif // _di_f_utf_is_control_code_
+
+#ifndef _di_f_utf_is_control_format_
+  f_status_t f_utf_is_control_format(const f_string_t character, const f_array_length_t width_max) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    const uint8_t width = macro_f_utf_byte_width_is(*character);
+
+    // There are no ASCII control formats.
+    if (!width) {
+      return F_false;
+    }
+
+    // A width of 1 is treated as an incomplete UTF-8 fragment: return F_complete_not_utf with the error bit set.
+    // F_status_set_error() sets the bit, whereas F_status_is_error() would only test for it.
+    if (width == 1) {
+      return F_status_set_error(F_complete_not_utf);
+    }
+
+    f_utf_character_t character_utf = 0;
+
+    // Any error from the conversion is passed back to the caller unchanged.
+    {
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
+      if (F_status_is_error(status)) return status;
+    }
+
+    return private_f_utf_character_is_control_format(character_utf, width);
+  }
+#endif // _di_f_utf_is_control_format_
+
 #ifndef _di_f_utf_is_control_picture_
   f_status_t f_utf_is_control_picture(const f_string_t character, const f_array_length_t width_max) {
     #ifndef _di_level_0_parameter_checking_
@@ -1025,6 +1122,10 @@ extern "C" {
       return F_status_is_error(F_complete_not_utf);
     }
 
+    if (width != 3) {
+      return F_false;
+    }
+
     f_utf_character_t character_utf = 0;
 
     {
index 5d957749f8d300e37c1c64a66d9678441ac0656e..1a0b48097b4f16f5abe10fa9ee3dc383c9b87b5b 100644 (file)
@@ -224,8 +224,8 @@ extern "C" {
  *   The character to validate.
  *
  * @return
- *   F_true if a UTF-8 control picture character.
- *   F_false if not a UTF-8 control picture character.
+ *   F_true if a UTF-8 combining character.
+ *   F_false if not a UTF-8 combining character.
  *
  *   F_utf (with error bit) if character is an invalid UTF-8 character.
  */
@@ -236,6 +236,8 @@ extern "C" {
 /**
  * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character.
  *
+ * This includes control code and control format characters.
+ *
  * @param character
  *   The character to validate.
  *
@@ -252,6 +254,45 @@ extern "C" {
 #endif // _di_f_utf_character_is_control_
 
 /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 control code character.
+ *
+ * Control Code characters are the traditional control characters, such as "\n" as well as some newer Unicode ones.
+ * ASCII characters are checked using iscntrl().
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   F_true if an ASCII or UTF-8 control code character.
+ *   F_false if not an ASCII or UTF-8 control code character.
+ *
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see iscntrl()
+ */
+#ifndef _di_f_utf_character_is_control_code_
+  extern f_status_t f_utf_character_is_control_code(const f_utf_character_t character);
+#endif // _di_f_utf_character_is_control_code_
+
+/**
+ * Check to see if the entire byte block of the character is a UTF-8 control format character.
+ *
+ * Control Format characters are special characters used for formatting.
+ * These are considered control characters.
+ * There are no ASCII Control Format characters, so an ASCII character always results in F_false.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   F_true if a UTF-8 control format character.
+ *   F_false if not a UTF-8 control format character.
+ *
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ */
+#ifndef _di_f_utf_character_is_control_format_
+  extern f_status_t f_utf_character_is_control_format(const f_utf_character_t character);
+#endif // _di_f_utf_character_is_control_format_
+
+/**
  * Check to see if the entire byte block of the character is a UTF-8 control picture character.
  *
  * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters.
@@ -876,8 +917,8 @@ extern "C" {
  *   Can be anything greater than 0.
  *
  * @return
- *   F_true if a UTF-8 control picture character.
- *   F_false if not a UTF-8 control picture character.
+ *   F_true if a UTF-8 combining character.
+ *   F_false if not a UTF-8 combining character.
  *
  *   F_complete_not_utf (with error bit) if character is an incomplete UTF-8 fragment.
  */
@@ -888,6 +929,8 @@ extern "C" {
 /**
  * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character.
  *
+ * This includes control code and control format characters.
+ *
  * @param character
  *   The character to validate.
  *   There must be enough space allocated to compare against, as limited by width_max.
@@ -908,6 +951,51 @@ extern "C" {
 #endif // _di_f_utf_is_control_
 
 /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 control code character.
+ *
+ * Control Code characters are the traditional control characters, such as "\n" as well as some newer Unicode ones.
+ * ASCII characters are checked using iscntrl().
+ *
+ * @param character
+ *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by width_max.
+ * @param width_max
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
+ *
+ * @return
+ *   F_true if an ASCII or UTF-8 control code character.
+ *   F_false if not an ASCII or UTF-8 control code character.
+ *
+ *   F_complete_not_utf (with error bit) if character is an incomplete UTF-8 fragment.
+ *   F_parameter (with error bit) if width_max is less than 1 (when parameter checking is enabled).
+ *
+ *   Errors (with error bit) from the character conversion are passed through.
+ *
+ * @see iscntrl()
+ */
+#ifndef _di_f_utf_is_control_code_
+  extern f_status_t f_utf_is_control_code(const f_string_t character, const f_array_length_t width_max);
+#endif // _di_f_utf_is_control_code_
+
+/**
+ * Check to see if the entire byte block of the character is a UTF-8 control format character.
+ *
+ * Control Format characters are special characters used for formatting.
+ * These are considered control characters.
+ * There are no ASCII Control Format characters, so an ASCII character always results in F_false.
+ *
+ * @param character
+ *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by width_max.
+ * @param width_max
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
+ *
+ * @return
+ *   F_true if a UTF-8 control format character.
+ *   F_false if not a UTF-8 control format character.
+ *
+ *   F_complete_not_utf (with error bit) if character is an incomplete UTF-8 fragment.
+ *   F_parameter (with error bit) if width_max is less than 1 (when parameter checking is enabled).
+ *
+ *   Errors (with error bit) from the character conversion are passed through.
+ */
+#ifndef _di_f_utf_is_control_format_
+  extern f_status_t f_utf_is_control_format(const f_string_t character, const f_array_length_t width_max);
+#endif // _di_f_utf_is_control_format_
+
+/**
  * Check to see if the entire byte block of the character is a UTF-8 control picture character.
  *
  * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters.