Update: Add missing function in f_utf needed for completeness and reduce repeated...

author Kevin Day <thekevinday@gmail.com>

Tue, 7 Dec 2021 03:51:02 +0000 (21:51 -0600)

committer Kevin Day <thekevinday@gmail.com>

Tue, 7 Dec 2021 03:51:02 +0000 (21:51 -0600)
author Kevin Day <thekevinday@gmail.com>
Tue, 7 Dec 2021 03:51:02 +0000 (21:51 -0600)
committer Kevin Day <thekevinday@gmail.com>
Tue, 7 Dec 2021 03:51:02 +0000 (21:51 -0600)
diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c

index 1af2b538713f327af1d82adfc9420bf4e28dacc9..ff15202d7ae5461dc8fba65ad734a91ff0414dec 100644 (file)
--- a/level_0/f_utf/c/private-utf.c
+++ b/level_0/f_utf/c/private-utf.c
@@ -3596,6 +3596,47 @@ extern "C" {
    }
  #endif // !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_)
  
+#if !defined(_di_f_utf_unicode_to_) || !defined(_di_f_utf_character_unicode_to_)
+  f_status_t private_f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode) {
+
+    if (macro_f_utf_character_t_width_is(character) == 1) {
+      return F_status_set_error(F_utf_fragment);
+    }
+
+    if (private_f_utf_character_is_valid(character) == F_false) {
+      return F_status_set_error(F_utf);
+    }
+
+    // U+0000 -> U+007F (ASCII): 7 bits from char 1.
+    if (macro_f_utf_character_t_width(character) == 1) {
+      *unicode = macro_f_utf_character_t_to_char_1(character) & 0x7f;
+    }
+
+    // U+0080 -> U+07FF: 5 bits from char 1, then 6 bits from char 2.
+    else if (macro_f_utf_character_t_width(character) == 2) {
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x1f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_2(character) & 0x3f;
+    }
+
+    // U+0800 -> U+FFFF: 4 bits from char 1, then 6 bits each from chars 2 and 3.
+    else if (macro_f_utf_character_t_width(character) == 3) {
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0xf) << 12;
+      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_3(character) & 0x3f;
+    }
+
+    // U+10000 -> U+10FFFF: 3 bits from char 1, then 6 bits each from chars 2, 3, and 4 (each char contributes exactly once).
+    else if (macro_f_utf_character_t_width(character) == 4) {
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x7) << 18;
+      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 12;
+      *unicode |= (macro_f_utf_character_t_to_char_3(character) & 0x3f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_4(character) & 0x3f;
+    }
+
+    return F_none;
+  }
+#endif // !defined(_di_f_utf_unicode_to_) || !defined(_di_f_utf_character_unicode_to_)
+
  #ifdef __cplusplus
  } // extern "C"
  #endif
diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h

index 5b3336cf827e44aa1efdf473ef850234e93e7643..a2dbf7d84bf6d92a1d496a105027f25f29cc0f04 100644 (file)
--- a/level_0/f_utf/c/private-utf.h
+++ b/level_0/f_utf/c/private-utf.h
@@ -633,6 +633,33 @@ extern "C" {
    extern f_status_t private_f_utf_character_is_zero_width(const f_utf_character_t character) F_attribute_visibility_internal_d;
  #endif // !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_)
  
+/**
+ * Private implementation of f_utf_character_unicode_to().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The (UTF-8) character to convert to the Unicode representation.
+ *   The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged.
+ * @param unicode
+ *   A 32-bit integer representing the Unicode (such as U+0001).
+ *   Does not need to be interpreted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF).
+ *   Must not be NULL, this is dereferenced without being checked here.
+ *   This is not modified when an error is returned.
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *   F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment.
+ *
+ * @see f_utf_character_unicode_to()
+ * @see f_utf_unicode_to()
+ */
+#if !defined(_di_f_utf_character_unicode_to_) || !defined(_di_f_utf_unicode_to_)
+  extern f_status_t private_f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode) F_attribute_visibility_internal_d;
+#endif // !defined(_di_f_utf_character_unicode_to_) || !defined(_di_f_utf_unicode_to_)
+
  #ifdef __cplusplus
  } // extern "C"
  #endif
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c

index 9122fa71678e95486e78c71a095305228b51a150..80a9e8a192076cf4043dc1aa9cc134416f68cec4 100644 (file)
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -720,41 +720,7 @@ extern "C" {
        if (!unicode) return F_status_set_error(F_parameter);
      #endif // _di_level_0_parameter_checking_
  
-    // ASCII.
-    if (!macro_f_utf_character_t_width_is(character)) {
-      *unicode = macro_f_utf_character_t_to_char_1(character) & 0x7f;
-    }
-
-    if (macro_f_utf_character_t_width_is(character) == 1) {
-      return F_status_set_error(F_utf_fragment);
-    }
-
-    if (private_f_utf_character_is_valid(character) == F_false) {
-      return F_status_set_error(F_utf);
-    }
-
-    // U+0080 -> U+07FF.
-    if (macro_f_utf_character_t_width_is(character) == 2) {
-      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x1f) << 6;
-      *unicode |= macro_f_utf_character_t_to_char_2(character) & 0x3f;
-    }
-
-    // U+0800 -> U+FFFF.
-    else if (macro_f_utf_character_t_width_is(character) == 3) {
-      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0xf) << 12;
-      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6;
-      *unicode |= macro_f_utf_character_t_to_char_3(character) & 0x3f;
-    }
-
-    // U+10000 -> U+10FFFF.
-    else if (macro_f_utf_character_t_width_is(character) == 4) {
-      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x7) << 18;
-      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 12;
-      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6;
-      *unicode |= macro_f_utf_character_t_to_char_4(character) & 0x3f;
-    }
-
-    return F_none;
+    return private_f_utf_character_unicode_to(character, unicode);
    }
  #endif // _di_f_utf_character_unicode_to_
  
@@ -801,6 +767,81 @@ extern "C" {
    }
  #endif // _di_f_utf_character_unicode_from_
  
+#ifndef _di_f_utf_character_unicode_string_to_
+  f_status_t f_utf_character_unicode_string_to(const f_utf_string_t string, const f_array_length_t length, uint32_t *unicode) {
+    #ifndef _di_level_0_parameter_checking_
+      if (!string) return F_status_set_error(F_parameter);
+      if (!unicode) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    f_array_length_t i = 0;
+
+    while (i < length && !string[i]) {
+      ++i;
+    } // while
+
+    // The first non-NULL characters must be an ASCII "U" (either case) and then "+", setting i to length marks the prefix as invalid.
+    if (i < length) {
+      if (macro_f_utf_character_t_width_is(string[i])) {
+        i = length;
+      }
+      else if (macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_u_s[0] || macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_U_s[0]) {
+        do {
+          ++i;
+        } while (i < length && !string[i]);
+
+        if (i < length && !macro_f_utf_character_t_width_is(string[i]) && macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_plus_s[0]) {
+          ++i;
+        }
+        else {
+          i = length;
+        }
+      }
+      else {
+        i = length;
+      }
+    }
+
+    if (i == length) {
+      return F_status_set_error(F_valid_not);
+    }
+
+    uint32_t value = 0;
+
+    for (; i < length; ++i) {
+      if (!string[i]) continue;
+
+      // Only ASCII characters are allowed to represent the hexadecimal digits.
+      if (macro_f_utf_character_t_width_is(string[i])) {
+        return F_status_set_error(F_valid_not);
+      }
+
+      const uint8_t character = macro_f_utf_character_t_to_char_1(string[i]);
+      value *= 16;
+
+      if (character > 0x2f && character < 0x3a) { // '0' -> '9'.
+        value += character - 0x30;
+      }
+      else if (character > 0x40 && character < 0x47) { // 'A' -> 'F'.
+        value += (character - 0x41) + 10;
+      }
+      else if (character > 0x60 && character < 0x67) { // 'a' -> 'f'.
+        value += (character - 0x61) + 10;
+      }
+      else {
+        return F_status_set_error(F_valid_not);
+      }
+
+      // Codepoints end at U+10FFFF, rejecting here also keeps value from wrapping around the uint32_t on overly long input.
+      if (value > 0x10ffff) return F_status_set_error(F_valid_not);
+    } // for
+
+    *unicode = value;
+
+    return F_none;
+  }
+#endif // _di_f_utf_character_unicode_string_to_
+
  #ifndef _di_f_utf_is_
    f_status_t f_utf_is(const f_string_t character) {
  
@@ -1806,48 +1847,14 @@ extern "C" {
        if (!unicode) return F_status_set_error(F_parameter);
      #endif // _di_level_0_parameter_checking_
  
-    if (macro_f_utf_byte_width_is(*character) == 1) {
-      return F_status_set_error(F_utf_fragment);
-    }
+    f_utf_character_t character_utf = 0;
  
      {
-      f_utf_character_t character_utf = 0;
-
        const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
-
-      if (private_f_utf_character_is_valid(character_utf) == F_false) {
-        return F_status_set_error(F_utf);
-      }
-    }
-
-    // U+0000 -> U+007F.
-    if (macro_f_utf_byte_width(*character) == 1) {
-      *unicode = ((uint8_t) character[0]) & 0x7f;
-    }
-
-    // U+0080 -> U+07FF.
-    else if (macro_f_utf_byte_width(*character) == 2) {
-      *unicode = (((uint8_t) character[0]) & 0x1f) << 6;
-      *unicode |= ((uint8_t) character[1]) & 0x3f;
      }
  
-    // U+0800 -> U+FFFF.
-    else if (macro_f_utf_byte_width(*character) == 3) {
-      *unicode = (((uint8_t) character[0]) & 0xf) << 12;
-      *unicode |= (((uint8_t) character[1]) & 0x3f) << 6;
-      *unicode |= ((uint8_t) character[2]) & 0x3f;
-    }
-
-    // U+10000 -> U+10FFFF.
-    else if (macro_f_utf_byte_width(*character) == 4) {
-      *unicode = (((uint8_t) character[0]) & 0x7) << 18;
-      *unicode |= (((uint8_t) character[1]) & 0x3f) << 12;
-      *unicode |= (((uint8_t) character[2]) & 0x3f) << 6;
-      *unicode |= ((uint8_t) character[3]) & 0x3f;
-    }
-
-    return F_none;
+    return private_f_utf_character_unicode_to(character_utf, unicode);
    }
  #endif // _di_f_utf_unicode_to_
  
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h

index 477537c23cfa272b351d087c374bff43259138ed..389bead9e53663fad7475e482443b8f1788a9ea6 100644 (file)
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -836,6 +836,33 @@ extern "C" {
  #endif // _di_f_utf_character_unicode_from_
  
  /**
+ * Convert a string of the format "U+FFFF" into the codepoint value.
+ *
+ * This ignores NULL characters.
+ * The string must contain "U+" (or "u+") followed by hexadecimal digits, upper or lower case.
+ * The "U+" prefix is required, a string without it is not a valid Unicode string.
+ * Only ASCII characters are allowed to represent the Unicode sequence string.
+ *
+ * @param string
+ *   The string representing a Unicode sequence.
+ * @param length
+ *   The maximum number of characters.
+ * @param unicode
+ *   A 32-bit integer representing the Unicode (such as U+0001).
+ *   Does not need to be interpreted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF).
+ *   This is not modified when an error is returned.
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_valid_not (with error bit) if string is not a valid Unicode string.
+ */
+#ifndef _di_f_utf_character_unicode_string_to_
+  extern f_status_t f_utf_character_unicode_string_to(const f_utf_string_t string, const f_array_length_t length, uint32_t *unicode);
+#endif // _di_f_utf_character_unicode_string_to_
+
+/**
   * Check to see if the entire byte block of the character is a non-ASCII UTF-8 character.
   *
   * This does not check the validity of the character, for that instead use f_utf_is_valid().
@@ -1672,7 +1699,7 @@ extern "C" {
   *   F_parameter (with error bit) if a parameter is invalid.
   *   F_valid_not (with error bit) if string is not a valid Unicode string.
   */
-#ifndef _di_f_utf_unicode_string_to_f_
+#ifndef _di_f_utf_unicode_string_to_
    extern f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, uint32_t *unicode);
  #endif // _di_f_utf_unicode_string_to_
author	Kevin Day <thekevinday@gmail.com>
	Tue, 7 Dec 2021 03:51:02 +0000 (21:51 -0600)
committer	Kevin Day <thekevinday@gmail.com>
	Tue, 7 Dec 2021 03:51:02 +0000 (21:51 -0600)
level_0/f_utf/c/private-utf.c		patch \| blob \| history
level_0/f_utf/c/private-utf.h		patch \| blob \| history
level_0/f_utf/c/utf.c		patch \| blob \| history
level_0/f_utf/c/utf.h		patch \| blob \| history