Feature: Add Unicode to/from UTF-8 translations and fix usage of function that should...

author Kevin Day <thekevinday@gmail.com>

Tue, 25 May 2021 23:27:30 +0000 (18:27 -0500)

committer Kevin Day <thekevinday@gmail.com>

Tue, 25 May 2021 23:27:30 +0000 (18:27 -0500)
author Kevin Day <thekevinday@gmail.com>
Tue, 25 May 2021 23:27:30 +0000 (18:27 -0500)
committer Kevin Day <thekevinday@gmail.com>
Tue, 25 May 2021 23:27:30 +0000 (18:27 -0500)
diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c

index 031ad859581923b4846ecca04eb6c2dae7c7290e..52d8f643015c5186b30efe1db3f1c8389bacf26f 100644 (file)
--- a/level_0/f_utf/c/private-utf.c
+++ b/level_0/f_utf/c/private-utf.c
@@ -5,6 +5,48 @@
  extern "C" {
  #endif
  
+#if !defined(_di_f_utf_char_to_character_) || !defined(_di_f_utf_is_alpha_) || !defined(_di_f_utf_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_numeric_) || !defined(_di_f_utf_is_ascii_) || !defined(_di_f_utf_is_combining_) || !defined(_di_f_utf_is_control_) || !defined(_di_f_utf_is_control_picture_) || !defined(_di_f_utf_is_digit_) || !defined(_di_f_utf_is_emoji_) || !defined(_di_f_utf_is_graph_) || !defined(_di_f_utf_is_numeric_) || !defined(_di_f_utf_is_phonetic_) || !defined(_di_f_utf_is_private_) || !defined(_di_f_utf_is_punctuation_) || !defined(_di_f_utf_is_symbol_) || !defined(_di_f_utf_is_unassigned_) || !defined(_di_f_utf_is_valid_) || !defined(_di_f_utf_is_whitespace_) || !defined(_di_f_utf_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_other_) || !defined(_di_f_utf_is_word_) || !defined(_di_f_utf_is_word_dash_) || !defined(_di_f_utf_is_word_dash_plus_) || !defined(_di_f_utf_is_zero_width_) || !defined(_di_f_utf_unicode_to_)
+  f_status_t private_f_utf_char_to_character(const f_string_t character, const f_array_length_t width_max, f_utf_character_t *character_utf) {
+
+    // Packs the bytes of the single character at the start of the string into one f_utf_character_t.
+    const uint8_t width = macro_f_utf_byte_width_is(*character);
+
+    if (!width) {
+
+      // A width of 0 designates ASCII, so only the first byte is used.
+      *character_utf = macro_f_utf_character_t_from_char_1(character[0]);
+
+      return F_none;
+    }
+
+    if (width == 1) {
+
+      // The first byte cannot begin a character.
+      // The error bit must be set here: F_status_is_error() only tests for the bit and would yield a non-error status.
+      return F_status_set_error(F_utf);
+    }
+
+    if (width > width_max) {
+      return F_status_set_error(F_failure);
+    }
+
+    // The width is at least 2 from this point on, so the first two bytes are always used.
+    *character_utf = macro_f_utf_character_t_from_char_1(character[0]);
+    *character_utf |= macro_f_utf_character_t_from_char_2(character[1]);
+
+    if (width == 2) {
+      return F_none;
+    }
+
+    *character_utf |= macro_f_utf_character_t_from_char_3(character[2]);
+
+    if (width == 3) {
+      return F_none;
+    }
+
+    *character_utf |= macro_f_utf_character_t_from_char_4(character[3]);
+
+    return F_none;
+  }
+#endif // !defined(_di_f_utf_char_to_character_) || !defined(_di_f_utf_is_alpha_) || !defined(_di_f_utf_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_numeric_) || !defined(_di_f_utf_is_ascii_) || !defined(_di_f_utf_is_combining_) || !defined(_di_f_utf_is_control_) || !defined(_di_f_utf_is_control_picture_) || !defined(_di_f_utf_is_digit_) || !defined(_di_f_utf_is_emoji_) || !defined(_di_f_utf_is_graph_) || !defined(_di_f_utf_is_numeric_) || !defined(_di_f_utf_is_phonetic_) || !defined(_di_f_utf_is_private_) || !defined(_di_f_utf_is_punctuation_) || !defined(_di_f_utf_is_symbol_) || !defined(_di_f_utf_is_unassigned_) || !defined(_di_f_utf_is_valid_) || !defined(_di_f_utf_is_whitespace_) || !defined(_di_f_utf_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_other_) || !defined(_di_f_utf_is_word_) || !defined(_di_f_utf_is_word_dash_) || !defined(_di_f_utf_is_word_dash_plus_) || !defined(_di_f_utf_is_zero_width_) || !defined(_di_f_utf_unicode_to_)
+
  #if !defined(_di_f_utf_character_is_alpha_) || !defined(_di_f_utf_is_alpha_)
    f_status_t private_f_utf_character_is_alpha(const f_utf_character_t character, const uint8_t width) {
  
diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h

index 91ace79b6f5d52c8f677ded0b689a8e905123897..1b3a1df0ab1cc32c3d29ec14525e5ab744067091 100644 (file)
--- a/level_0/f_utf/c/private-utf.h
+++ b/level_0/f_utf/c/private-utf.h
@@ -18,6 +18,60 @@ extern "C" {
  #endif
  
  /**
+ * Private implementation of f_utf_char_to_character().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The character string to be converted to the f_utf_character_t type.
+ *   There must be enough space allocated to convert against, as limited by width_max.
+ * @param width_max
+ *   The maximum width available for converting.
+ *   Can be anything greater than 0.
+ * @param character_utf
+ *   The generated character of type f_utf_character_t.
+ *   This value may be cleared, even on error.
+ *
+ * @return
+ *   F_none if conversion was successful.
+ *
+ *   F_failure (with error bit) if width is not long enough to convert.
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_char_to_character()
+ * @see f_utf_character_is_valid()
+ * @see f_utf_is_valid()
+ * @see f_utf_is_alpha()
+ * @see f_utf_is_alpha_digit()
+ * @see f_utf_is_alpha_numeric()
+ * @see f_utf_is_ascii()
+ * @see f_utf_is_combining_  _di_f_utf_is_control()
+ * @see f_utf_is_control_picture()
+ * @see f_utf_is_digit()
+ * @see f_utf_is_emoji()
+ * @see f_utf_is_graph()
+ * @see f_utf_is_numeric()
+ * @see f_utf_is_phonetic()
+ * @see f_utf_is_private()
+ * @see f_utf_is_punctuation()
+ * @see f_utf_is_symbol()
+ * @see f_utf_is_unassigned()
+ * @see f_utf_is_valid()
+ * @see f_utf_is_whitespace()
+ * @see f_utf_is_whitespace_modifier()
+ * @see f_utf_is_whitespace_other()
+ * @see f_utf_is_word()
+ * @see f_utf_is_word_dash()
+ * @see f_utf_is_word_dash_plus()
+ * @see f_utf_is_zero_width()
+ * @see f_utf_unicode_to()
+ */
+#if !defined(_di_f_utf_char_to_character_) || !defined(_di_f_utf_is_alpha_) || !defined(_di_f_utf_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_numeric_) || !defined(_di_f_utf_is_ascii_) || !defined(_di_f_utf_is_combining_) || !defined(_di_f_utf_is_control_) || !defined(_di_f_utf_is_control_picture_) || !defined(_di_f_utf_is_digit_) || !defined(_di_f_utf_is_emoji_) || !defined(_di_f_utf_is_graph_) || !defined(_di_f_utf_is_numeric_) || !defined(_di_f_utf_is_phonetic_) || !defined(_di_f_utf_is_private_) || !defined(_di_f_utf_is_punctuation_) || !defined(_di_f_utf_is_symbol_) || !defined(_di_f_utf_is_unassigned_) || !defined(_di_f_utf_is_valid_) || !defined(_di_f_utf_is_whitespace_) || !defined(_di_f_utf_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_other_) || !defined(_di_f_utf_is_word_) || !defined(_di_f_utf_is_word_dash_) || !defined(_di_f_utf_is_word_dash_plus_) || !defined(_di_f_utf_is_zero_width_) || !defined(f_utf_unicode_to)
+  extern f_status_t private_f_utf_char_to_character(const f_string_t character, const f_array_length_t width_max, f_utf_character_t *character_utf) f_attribute_visibility_internal;
+#endif // !defined(_di_f_utf_char_to_character_) || !defined(_di_f_utf_is_alpha_) || !defined(_di_f_utf_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_numeric_) || !defined(_di_f_utf_is_ascii_) || !defined(_di_f_utf_is_combining_) || !defined(_di_f_utf_is_control_) || !defined(_di_f_utf_is_control_picture_) || !defined(_di_f_utf_is_digit_) || !defined(_di_f_utf_is_emoji_) || !defined(_di_f_utf_is_graph_) || !defined(_di_f_utf_is_numeric_) || !defined(_di_f_utf_is_phonetic_) || !defined(_di_f_utf_is_private_) || !defined(_di_f_utf_is_punctuation_) || !defined(_di_f_utf_is_symbol_) || !defined(_di_f_utf_is_unassigned_) || !defined(_di_f_utf_is_valid_) || !defined(_di_f_utf_is_whitespace_) || !defined(_di_f_utf_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_other_) || !defined(_di_f_utf_is_word_) || !defined(_di_f_utf_is_word_dash_) || !defined(_di_f_utf_is_word_dash_plus_) || !defined(_di_f_utf_is_zero_width_) || !defined(f_utf_unicode_to)
+
+/**
   * Private implementation of f_utf_character_is_alpha().
   *
   * Intended to be shared to each of the different implementation variations.
diff --git a/level_0/f_utf/c/utf-common.h b/level_0/f_utf/c/utf-common.h

index 9a3139c2d42d61957f5d5b046b52147b8d1f2738..b2110bf250c5dd4fccb73ed230bfea185e65580c 100644 (file)
--- a/level_0/f_utf/c/utf-common.h
+++ b/level_0/f_utf/c/utf-common.h
@@ -7,6 +7,8 @@
   *
   * Defines common data to be used for/by project utf.
   *
+ * @fixme this code probably works on little-endian only as-is; this needs to be checked and possibly redesigned to support both big and little endian.
+ *
   * This is auto-included by utf.h and should not need to be explicitly included.
   */
  #ifndef _F_utf_common_h
@@ -31,7 +33,7 @@ extern "C" {
   * The macro_f_utf_byte_is_* macros are used to determine a width of the character (either 1, 2, 3, or 4, respectively).
   *
   * The macro_f_utf_byte_width macro determines a width of the character.
- * The macro_f_utf_byte_width_is is identical to macro_f_utf_byte_width, except it returns 0 when character is not UTF-8.
+ * The macro_f_utf_byte_width_is is identical to macro_f_utf_byte_width, except it returns 0 when character is ASCII.
   */
  #ifndef _di_f_utf_byte_
    #define f_utf_byte_1 0x80 // 1000 0000
@@ -166,6 +168,9 @@ extern "C" {
   * The macro_f_utf_character_t_width is used to determine the width of the UTF-8 character based on macro_f_utf_byte_width.
   * The macro_f_utf_character_t_width_is is used to determine the width of the UTF-8 character based on macro_f_utf_byte_width_is.
   *
+ * The macro_f_utf_character_t_width macro determines a width of the UTF-8 character based on macro_f_utf_byte_width.
+ * The macro_f_utf_character_t_width_is is identical to macro_f_utf_character_t_width, except it returns 0 when character is ASCII.
+ *
   * @see f_utf_is_big_endian()
   */
  #ifndef _di_f_utf_character_t_
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c

index a6e2d86de1d8b8c048951d71aac71e381b9df49a..893fc6f51568a20b8685b84b48ab2ed52c4cf54b 100644 (file)
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -89,42 +89,7 @@ extern "C" {
        if (!character_utf) return F_status_set_error(F_parameter);
      #endif // _di_level_0_parameter_checking_
  
-    const uint8_t width = macro_f_utf_byte_width_is(*character);
-
-    if (!width) {
-      *character_utf = macro_f_utf_character_t_from_char_1(character[0]);
-
-      return F_none;
-    }
-    else if (width == 1) {
-      return F_status_is_error(F_utf);
-    }
-
-    if (width > width_max) {
-      return F_status_set_error(F_failure);
-    }
-
-    *character_utf = macro_f_utf_character_t_from_char_1(character[0]);
-
-    if (width < 2) {
-      return F_none;
-    }
-
-    *character_utf |= macro_f_utf_character_t_from_char_2(character[1]);
-
-    if (width == 2) {
-      return F_none;
-    }
-
-    *character_utf |= macro_f_utf_character_t_from_char_3(character[2]);
-
-    if (width == 3) {
-      return F_none;
-    }
-
-    *character_utf |= macro_f_utf_character_t_from_char_4(character[3]);
-
-    return F_none;
+    return private_f_utf_char_to_character(character, width_max, character_utf);
    }
  #endif // _di_f_utf_char_to_character_
  
@@ -749,6 +714,93 @@ extern "C" {
    }
  #endif // _di_f_utf_character_to_char_
  
+#ifndef _di_f_utf_character_unicode_to_
+  f_status_t f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode) {
+    #ifndef _di_level_0_parameter_checking_
+      if (!unicode) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    const uint8_t width = macro_f_utf_character_t_width(character);
+
+    if (private_f_utf_character_is_valid(character, width) == F_false) {
+      return F_status_set_error(F_utf);
+    }
+
+    // Each branch masks off the UTF-8 marker bits of every byte and packs the remaining payload bits, most significant byte first.
+    if (width < 2) {
+
+      // U+0000 -> U+007F
+      *unicode = macro_f_utf_character_t_to_char_1(character) & 0x7f;
+    }
+    else if (width == 2) {
+
+      // U+0080 -> U+07FF
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x1f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_2(character) & 0x3f;
+    }
+    else if (width == 3) {
+
+      // U+0800 -> U+FFFF
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0xf) << 12;
+      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_3(character) & 0x3f;
+    }
+    else if (width == 4) {
+
+      // U+10000 -> U+10FFFF
+      // The third byte supplies bits 6 through 11; reading the second byte again here would corrupt the result.
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x7) << 18;
+      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 12;
+      *unicode |= (macro_f_utf_character_t_to_char_3(character) & 0x3f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_4(character) & 0x3f;
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_character_unicode_to_
+
+#ifndef _di_f_utf_character_unicode_from_
+  f_status_t f_utf_character_unicode_from(const uint32_t unicode, f_utf_character_t *character) {
+    #ifndef _di_level_0_parameter_checking_
+      if (!character) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    // Nothing above U+10FFFF can be represented.
+    if (unicode > 0x10ffff) {
+      return F_status_set_error(F_utf);
+    }
+
+    // Each branch spreads the payload bits over the bytes and then adds the UTF-8 marker bits of every byte.
+    // NOTE(review): the 1, 2, and 3 byte results are right-aligned in the integer while the 4 byte result fills it;
+    //               confirm this matches the byte positions used by the macro_f_utf_character_t_from_char_*() and macro_f_utf_character_t_to_char_*() macros.
+    if (unicode < 0x80) {
+
+      // U+0000 -> U+007F
+      *character = unicode;
+    }
+    else if (unicode < 0x800) {
+
+      // U+0080 -> U+07FF
+      *character = (unicode & 0x7c0) << 2;
+      *character |= unicode & 0x3f;
+      *character |= 0xc080;
+    }
+    else if (unicode < 0x10000) {
+
+      // U+0800 -> U+FFFF
+      *character = (unicode & 0xf000) << 4;
+      *character |= (unicode & 0xfc0) << 2;
+      *character |= unicode & 0x3f;
+      *character |= 0xe08080;
+    }
+    else {
+
+      // U+10000 -> U+10FFFF
+      // The first byte of a 4-byte sequence is 1111 0xxx (0xf0), the remaining bytes are 10xx xxxx (0x80).
+      *character = (unicode & 0x1c0000) << 6;
+      *character |= (unicode & 0x3f000) << 4;
+      *character |= (unicode & 0xfc0) << 2;
+      *character |= unicode & 0x3f;
+      *character |= 0xf0808080;
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_character_unicode_from_
+
  #ifndef _di_f_utf_is_big_endian_
    f_status_t f_utf_is_big_endian() {
      uint16_t test_int = (0x01 << 8) | 0x02;
@@ -805,7 +857,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -836,7 +888,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -867,7 +919,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -890,7 +942,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -918,7 +970,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -949,7 +1001,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -977,7 +1029,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1008,7 +1060,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1039,7 +1091,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1086,7 +1138,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1130,7 +1182,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1159,7 +1211,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1188,7 +1240,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1246,7 +1298,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1289,7 +1341,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1312,7 +1364,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1335,7 +1387,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1366,7 +1418,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1395,7 +1447,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1424,7 +1476,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1455,7 +1507,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1486,7 +1538,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1517,7 +1569,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1556,7 +1608,7 @@ extern "C" {
      f_utf_character_t character_utf = 0;
  
      {
-      const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
        if (F_status_is_error(status)) return status;
      }
  
@@ -1930,6 +1982,134 @@ extern "C" {
    }
  #endif // _di_f_utf_string_seek_to_
  
+#ifndef _di_f_utf_unicode_to_
+  f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+      if (!unicode) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    const uint8_t width = macro_f_utf_byte_width(*character);
+
+    // Pack the bytes into a single character so that the sequence can be validated before it is decoded.
+    {
+      f_utf_character_t packed = 0;
+
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &packed);
+
+      if (F_status_is_error(status)) {
+        return status;
+      }
+
+      if (private_f_utf_character_is_valid(packed, width) == F_false) {
+        return F_status_set_error(F_utf);
+      }
+    }
+
+    // @fixme review the decoding below for endianness accuracy on both big and little endian systems.
+    switch (width) {
+      case 1:
+
+        // U+0000 -> U+007F
+        *unicode = character[0] & 0x7f;
+
+        break;
+
+      case 2:
+
+        // U+0080 -> U+07FF
+        *unicode = (character[0] & 0x1f) << 6;
+        *unicode |= character[1] & 0x3f;
+
+        break;
+
+      case 3:
+
+        // U+0800 -> U+FFFF
+        *unicode = (character[0] & 0xf) << 12;
+        *unicode |= (character[1] & 0x3f) << 6;
+        *unicode |= character[2] & 0x3f;
+
+        break;
+
+      case 4:
+
+        // U+10000 -> U+10FFFF
+        *unicode = (character[0] & 0x7) << 18;
+        *unicode |= (character[1] & 0x3f) << 12;
+        *unicode |= (character[2] & 0x3f) << 6;
+        *unicode |= character[3] & 0x3f;
+
+        break;
+
+      default:
+
+        // Any other width leaves the Unicode value untouched.
+        break;
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_unicode_to_
+
+#ifndef _di_f_utf_unicode_from_
+  f_status_t f_utf_unicode_from(const uint32_t unicode, const f_array_length_t width_max, f_string_t *character) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+
+      // The output is what must be checked: a Unicode value of 0 (U+0000) is valid input.
+      if (!character) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    // @fixme the code here needs to be reviewed for endianess accuracy for both big and little endian.
+    if (unicode > 0x10ffff) {
+      return F_status_set_error(F_utf);
+    }
+
+    // The number of bytes written for the encoded character.
+    uint8_t width = 0;
+
+    if (unicode < 0x80) {
+
+      // U+0000 -> U+007F
+      (*character)[0] = (char) unicode;
+
+      width = 1;
+    }
+    else if (unicode < 0x800) {
+
+      // Not enough space is a failure of the buffer rather than an invalid Unicode value.
+      if (width_max < 2) {
+        return F_status_set_error(F_failure);
+      }
+
+      // U+0080 -> U+07FF
+      (*character)[0] = f_utf_byte_2 | ((char) ((unicode & 0x7c0) >> 6));
+      (*character)[1] = f_utf_byte_1 | ((char) (unicode & 0x3f));
+
+      width = 2;
+    }
+    else if (unicode < 0x10000) {
+      if (width_max < 3) {
+        return F_status_set_error(F_failure);
+      }
+
+      // U+0800 -> U+FFFF
+      (*character)[0] = f_utf_byte_3 | ((char) ((unicode & 0xf000) >> 12));
+      (*character)[1] = f_utf_byte_1 | ((char) ((unicode & 0xfc0) >> 6));
+      (*character)[2] = f_utf_byte_1 | ((char) (unicode & 0x3f));
+
+      width = 3;
+    }
+    else {
+      if (width_max < 4) {
+        return F_status_set_error(F_failure);
+      }
+
+      // U+10000 -> U+10FFFF
+      (*character)[0] = f_utf_byte_4 | ((char) ((unicode & 0x1c0000) >> 18));
+      (*character)[1] = f_utf_byte_1 | ((char) ((unicode & 0x3f000) >> 12));
+      (*character)[2] = f_utf_byte_1 | ((char) ((unicode & 0xfc0) >> 6));
+      (*character)[3] = f_utf_byte_1 | ((char) (unicode & 0x3f));
+
+      width = 4;
+    }
+
+    // Clear the unused bytes of the 4-byte block, but never write beyond the space the caller provided.
+    for (; width < 4 && width < width_max; ++width) {
+      (*character)[width] = 0;
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_unicode_from_
+
  #ifdef __cplusplus
  } // extern "C"
  #endif
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h

index 1a90547f879387050d9bc1f2c64c08c3da27c795..f5f35f024873bc659f71c88c526329a8a202c2f2 100644 (file)
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -701,6 +701,52 @@ extern "C" {
  #endif // _di_f_utf_character_to_char_
  
  /**
+ * Convert a given (UTF-8) character into Unicode.
+ *
+ * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged.
+ * The Unicode is a 32-bit integer representing the Unicode (such as U+0001).
+ * The Unicode does not need to be interpreted like UTF-8, it is simply a number in the range from 0 up to the maximum supported Unicode value (U+10FFFF).
+ *
+ * @param character
+ *   The (UTF-8) character.
+ * @param unicode
+ *   The Unicode number.
+ *   Must not be NULL.
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is_valid()
+ */
+#ifndef _di_f_utf_character_unicode_to_
+  extern f_status_t f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode);
+#endif // _di_f_utf_character_unicode_to_
+
+/**
+ * Convert a given Unicode into (UTF-8) character.
+ *
+ * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged.
+ * The Unicode is a 32-bit integer representing the Unicode (such as U+0001).
+ * The Unicode does not need to be interpreted like UTF-8, it is simply a number in the range from 0 up to the maximum supported Unicode value (U+10FFFF).
+ *
+ * @param unicode
+ *   The Unicode number.
+ * @param character
+ *   The (UTF-8) character.
+ *   Must not be NULL.
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_utf (with error bit) if unicode is an invalid Unicode character (greater than U+10FFFF).
+ */
+#ifndef _di_f_utf_character_unicode_from_
+  extern f_status_t f_utf_character_unicode_from(const uint32_t unicode, f_utf_character_t *character);
+#endif // _di_f_utf_character_unicode_from_
+
+/**
   * Helper function for UTF-8 processing code to determine endianess of the system.
   *
   * @todo relocate this outside of f_utf into a more general path, perhaps f_memory (f_memory_is_big_endian).
@@ -1778,6 +1824,62 @@ extern "C" {
    extern f_status_t f_utf_string_seek_to(const f_utf_string_t string, const uint8_t seek_to, f_utf_string_range_t *range);
  #endif // _di_f_utf_string_seek_to_
  
+/**
+ * Convert a given string block representing a single character into Unicode.
+ *
+ * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged.
+ * The Unicode is a 32-bit integer representing the Unicode (such as U+0001).
+ * The Unicode does not need to be interpreted like UTF-8, it is simply a number in the range from 0 up to the maximum supported Unicode value (U+10FFFF).
+ *
+ * @param character
+ *   The (UTF-8) character to convert to the Unicode representation.
+ * @param width_max
+ *   The max width available for representing the UTF-8 character.
+ *   There must be enough space in the character buffer to handle the Unicode width.
+ *   It is recommended to always have 4 characters (4 uint8_t) of space available in character.
+ * @param unicode
+ *   The Unicode number.
+ *   Must not be NULL.
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_failure (with error bit) if width is not long enough to convert.
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is_valid()
+ */
+#ifndef _di_f_utf_unicode_to_
+  extern f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode);
+#endif // _di_f_utf_unicode_to_
+
+/**
+ * Convert a given Unicode into a string block representing a single character.
+ *
+ * The character is a string block of up to 4 bytes that receives the UTF-8 sequence.
+ * The Unicode is a 32-bit integer representing the Unicode (such as U+0001).
+ * The Unicode does not need to be interpreted like UTF-8, it is simply a number in the range from 0 up to the maximum supported Unicode value (U+10FFFF).
+ *
+ * @param unicode
+ *   The Unicode number.
+ * @param width_max
+ *   The max width available for representing the UTF-8 character.
+ *   There must be enough space in the character buffer to handle the Unicode width.
+ *   It is recommended to always have 4 characters (4 uint8_t) of space available in character.
+ * @param character
+ *   The (UTF-8) character.
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_failure (with error bit) if width is not long enough to convert.
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_utf (with error bit) if unicode is an invalid Unicode character.
+ */
+#ifndef _di_f_utf_unicode_from_
+  extern f_status_t f_utf_unicode_from(const uint32_t unicode, const f_array_length_t width_max, f_string_t *character);
+#endif // _di_f_utf_unicode_from_
+
  #ifdef __cplusplus
  } // extern "C"
  #endif
author	Kevin Day <thekevinday@gmail.com>
	Tue, 25 May 2021 23:27:30 +0000 (18:27 -0500)
committer	Kevin Day <thekevinday@gmail.com>
	Tue, 25 May 2021 23:27:30 +0000 (18:27 -0500)
level_0/f_utf/c/private-utf.c		patch \| blob \| history
level_0/f_utf/c/private-utf.h		patch \| blob \| history
level_0/f_utf/c/utf-common.h		patch \| blob \| history
level_0/f_utf/c/utf.c		patch \| blob \| history
level_0/f_utf/c/utf.h		patch \| blob \| history