Update: Complete incomplete unicode processing code.

author Kevin Day <thekevinday@gmail.com>

Sat, 4 Dec 2021 00:29:50 +0000 (18:29 -0600)

committer Kevin Day <thekevinday@gmail.com>

Sat, 4 Dec 2021 00:38:01 +0000 (18:38 -0600)
author Kevin Day <thekevinday@gmail.com>
Sat, 4 Dec 2021 00:29:50 +0000 (18:29 -0600)
committer Kevin Day <thekevinday@gmail.com>
Sat, 4 Dec 2021 00:38:01 +0000 (18:38 -0600)
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c

index d0cbef5f5ff3cbe45835954e9637df6fdd76b617..c6b162a9b1c4fed7bd936b095e5f23ac0e73327d 100644 (file)
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -1798,8 +1798,60 @@ extern "C" {
    }
  #endif // _di_f_utf_unicode_from_
  
-#ifndef _di_f_utf_unicode_string_from_f_
-  f_status_t f_utf_unicode_string_from(const f_string_t string, const f_array_length_t length, uint32_t *unicode) {
+#ifndef _di_f_utf_unicode_to_
+  f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+      if (!unicode) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    if (macro_f_utf_byte_width_is(*character) == 1) {
+      return F_status_set_error(F_utf_fragment);
+    }
+
+    {
+      f_utf_character_t character_utf = 0;
+
+      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
+      if (F_status_is_error(status)) return status;
+
+      if (private_f_utf_character_is_valid(character_utf) == F_false) {
+        return F_status_set_error(F_utf);
+      }
+    }
+
+    // U+0000 -> U+007F.
+    if (macro_f_utf_byte_width(*character) == 1) {
+      *unicode = ((uint8_t) character[0]) & 0x7f;
+    }
+
+    // U+0080 -> U+07FF.
+    else if (macro_f_utf_byte_width(*character) == 2) {
+      *unicode = (((uint8_t) character[0]) & 0x1f) << 6;
+      *unicode |= ((uint8_t) character[1]) & 0x3f;
+    }
+
+    // U+0800 -> U+FFFF.
+    else if (macro_f_utf_byte_width(*character) == 3) {
+      *unicode = (((uint8_t) character[0]) & 0xf) << 12;
+      *unicode |= (((uint8_t) character[1]) & 0x3f) << 6;
+      *unicode |= ((uint8_t) character[2]) & 0x3f;
+    }
+
+    // U+10000 -> U+10FFFF.
+    else if (macro_f_utf_byte_width(*character) == 4) {
+      *unicode = (((uint8_t) character[0]) & 0x7) << 18;
+      *unicode |= (((uint8_t) character[1]) & 0x3f) << 12;
+      *unicode |= (((uint8_t) character[2]) & 0x3f) << 6;
+      *unicode |= ((uint8_t) character[3]) & 0x3f;
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_unicode_to_
+
+#ifndef _di_f_utf_unicode_string_to_f_
+  f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, uint32_t *unicode) {
      #ifndef _di_level_0_parameter_checking_
        if (!unicode) return F_status_set_error(F_parameter);
      #endif // _di_level_0_parameter_checking_
@@ -1858,59 +1910,7 @@ extern "C" {
  
      return F_none;
    }
-#endif // _di_f_utf_unicode_string_from_
-
-#ifndef _di_f_utf_unicode_to_
-  f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode) {
-    #ifndef _di_level_0_parameter_checking_
-      if (width_max < 1) return F_status_set_error(F_parameter);
-      if (!unicode) return F_status_set_error(F_parameter);
-    #endif // _di_level_0_parameter_checking_
-
-    if (macro_f_utf_byte_width_is(*character) == 1) {
-      return F_status_set_error(F_utf_fragment);
-    }
-
-    {
-      f_utf_character_t character_utf = 0;
-
-      const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
-      if (F_status_is_error(status)) return status;
-
-      if (private_f_utf_character_is_valid(character_utf) == F_false) {
-        return F_status_set_error(F_utf);
-      }
-    }
-
-    // U+0000 -> U+007F.
-    if (macro_f_utf_byte_width(*character) == 1) {
-      *unicode = ((uint8_t) character[0]) & 0x7f;
-    }
-
-    // U+0080 -> U+07FF.
-    else if (macro_f_utf_byte_width(*character) == 2) {
-      *unicode = (((uint8_t) character[0]) & 0x1f) << 6;
-      *unicode |= ((uint8_t) character[1]) & 0x3f;
-    }
-
-    // U+0800 -> U+FFFF.
-    else if (macro_f_utf_byte_width(*character) == 3) {
-      *unicode = (((uint8_t) character[0]) & 0xf) << 12;
-      *unicode |= (((uint8_t) character[1]) & 0x3f) << 6;
-      *unicode |= ((uint8_t) character[2]) & 0x3f;
-    }
-
-    // U+10000 -> U+10FFFF.
-    else if (macro_f_utf_byte_width(*character) == 4) {
-      *unicode = (((uint8_t) character[0]) & 0x7) << 18;
-      *unicode |= (((uint8_t) character[1]) & 0x3f) << 12;
-      *unicode |= (((uint8_t) character[2]) & 0x3f) << 6;
-      *unicode |= ((uint8_t) character[3]) & 0x3f;
-    }
-
-    return F_none;
-  }
-#endif // _di_f_utf_unicode_to_
+#endif // _di_f_utf_unicode_string_to_
  
  #ifdef __cplusplus
  } // extern "C"
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h

index 085da41c15ad0af2bfcc234e9fd2e801ac2b8e74..5c66ccce4730c535caeac48138b5bab705ac0ede 100644 (file)
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -1622,33 +1622,6 @@ extern "C" {
  #endif // _di_f_utf_unicode_from_
  
  /**
- * Convert a string of the format "U+FFFF" into the codepoint value.
- *
- * This ignores NULL characters.
- * The string may only contain "U+" followed by a hexidecimal digit, upper or lower case.
- * The "U+" prefix is optional.
- * Only ASCII characters are allowed to represent the Unicode sequence string.
- *
- * @param string
- *   The string representing a Unicode sequence.
- * @param length
- *   The maximum number of characters.
- * @param unicode
- *   A 32-bit integer representing the Unicode (such as U+0001).
- *   Does not need to be interpretted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF).
- *
- * @return
- *   F_none on success.
- *
- *   F_failure (with error bit) if width_max is not long enough to convert.
- *   F_parameter (with error bit) if a parameter is invalid.
- *   F_valid_not (with error bit) if string is not a valid Unicode string.
- */
-#ifndef _di_f_utf_unicode_string_from_f_
-  extern f_status_t f_utf_unicode_string_from(const f_string_t string, const f_array_length_t length, uint32_t *unicode);
-#endif // _di_f_utf_unicode_string_from_
-
-/**
   * Convert a given string block representing a single character into Unicode.
   *
   * @param character
@@ -1676,6 +1649,33 @@ extern "C" {
    extern f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode);
  #endif // _di_f_utf_unicode_to_
  
+/**
+ * Convert a string of the format "U+FFFF" into the codepoint value.
+ *
+ * This ignores NULL characters.
+ * The string may only contain "U+" followed by hexadecimal digits, upper or lower case.
+ * The "U+" prefix is optional.
+ * Only ASCII characters are allowed to represent the Unicode sequence string.
+ *
+ * @param string
+ *   The string representing a Unicode sequence.
+ * @param length
+ *   The maximum number of characters.
+ * @param unicode
+ *   A 32-bit integer representing the Unicode (such as U+0001).
+ *   Does not need to be interpreted like UTF-8; this is a number from 0 up to the maximum supported Unicode integer value (U+10FFFF).
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_failure (with error bit) if length is not long enough to convert.
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_valid_not (with error bit) if string is not a valid Unicode string.
+ */
+#ifndef _di_f_utf_unicode_string_to_f_
+  extern f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, uint32_t *unicode);
+#endif // _di_f_utf_unicode_string_to_
+
  #ifdef __cplusplus
  } // extern "C"
  #endif
diff --git a/level_3/utf8/c/private-print.c b/level_3/utf8/c/private-print.c

index 68cc1aa7205d9c0aefe833cc74fdbc00e26875ad..617c66c0fe52973763f74f9abc60548f61a9ed8d 100644 (file)
--- a/level_3/utf8/c/private-print.c
+++ b/level_3/utf8/c/private-print.c
@@ -14,9 +14,7 @@ extern "C" {
      }
  
      if (data->mode & utf8_mode_to_binary_d) {
-      fl_print_format("%s%[", data->file.stream, data->prepend, set);
-      f_print_dynamic_raw(character, data->file.stream);
-      fl_print_format("%s%]", data->file.stream, data->append, set);
+      fl_print_format("%s%[%r%]%s", data->file.stream, data->prepend, set, character, set, data->append);
      }
      else {
        fl_print_format("%s%[0x", data->file.stream, data->prepend, set);
@@ -25,7 +23,7 @@ extern "C" {
          fl_print_format("%04_uii", data->file.stream, (uint8_t) character.string[i]);
        } // for
  
-      fl_print_format("%s%]", data->file.stream, data->append, set);
+      fl_print_format("%]%s", data->file.stream, set, data->append);
      }
    }
  #endif // _di_utf8_print_character_
@@ -35,10 +33,10 @@ extern "C" {
  
      if (data->main->error.verbosity == f_console_verbosity_quiet) return;
  
-    fl_print_format("%c%[%SFailed to decode character '%]", data->main->error.to.stream, f_string_eol_s[0], data->main->context.set.error, data->main->context.set.error);
+    fl_print_format("%c%[%SFailed to decode character '%]", data->main->error.to.stream, f_string_eol_s[0], data->main->context.set.error, data->main->error.prefix, data->main->context.set.error);
      fl_print_format("%[%r%]", data->main->error.to.stream, data->main->context.set.notable, character, data->main->context.set.notable);
      fl_print_format("%[', error status code%] ", data->main->error.to.stream, data->main->context.set.error, data->main->context.set.error, f_string_eol_s[0]);
-    fl_print_format("%[%S%]", data->main->error.to.stream, data->main->context.set.notable, F_status_set_fine(status), data->main->context.set.notable);
+    fl_print_format("%[%ui%]", data->main->error.to.stream, data->main->context.set.notable, F_status_set_fine(status), data->main->context.set.notable);
      fl_print_format("%[.%]%c", data->main->error.to.stream, data->main->context.set.error, data->main->context.set.error, f_string_eol_s[0]);
    }
  #endif // _di_utf8_print_error_decode_
diff --git a/level_3/utf8/c/private-utf8_binary.c b/level_3/utf8/c/private-utf8_binary.c

index accba66776b542b8ba988196e3796ddada02027d..718c2f1bcc8d8d56064d5cdf95f2915a096d9abc 100644 (file)
--- a/level_3/utf8/c/private-utf8_binary.c
+++ b/level_3/utf8/c/private-utf8_binary.c
@@ -35,9 +35,7 @@ extern "C" {
      }
      else if (data->main->parameters[utf8_parameter_verify].result == f_console_result_none) {
        if (data->mode & utf8_mode_to_binary_d) {
-        f_print_terminated(data->prepend, data->file.stream);
-        f_print_dynamic_raw(character, data->file.stream);
-        f_print_terminated(data->append, data->file.stream);
+        fl_print_format("%s%r%s", data->file.stream, data->prepend, character, data->append);
        }
        else {
          fl_print_format(codepoint < 0xffff ? "%sU+%04_U%s" : "%sU+%6_U%s", data->file.stream, data->prepend, codepoint, data->append);
diff --git a/level_3/utf8/c/private-utf8_codepoint.c b/level_3/utf8/c/private-utf8_codepoint.c

index b373decbf96ef18586986ee319e52b0e1677ffe2..849c937f2d976e343ba37218324cfed671170634 100644 (file)
--- a/level_3/utf8/c/private-utf8_codepoint.c
+++ b/level_3/utf8/c/private-utf8_codepoint.c
@@ -32,7 +32,7 @@ extern "C" {
      if (*mode == utf8_codepoint_mode_end) {
        uint32_t codepoint = 0;
  
-      status = f_utf_unicode_string_from(data->text.string, data->text.used, &codepoint);
+      status = f_utf_unicode_string_to(data->text.string, data->text.used, &codepoint);
  
        if (F_status_is_error(status)) {
          if (F_status_set_fine(status) == F_failure || F_status_set_fine(status) == F_utf) {
@@ -50,23 +50,16 @@ extern "C" {
            text.used = macro_f_utf_byte_width(codepoint);
            text.size = 5;
  
-          byte[0] = macro_f_utf_character_t_to_char_1(codepoint);
+          status = f_utf_unicode_from(codepoint, 4, &text.string);
  
-          if (text.used > 1) {
-            byte[1] = macro_f_utf_character_t_to_char_2(codepoint);
-
-            if (text.used > 2) {
-              byte[2] = macro_f_utf_character_t_to_char_3(codepoint);
-
-              if (text.used > 3) {
-                byte[3] = macro_f_utf_character_t_to_char_4(codepoint);
-              }
-            }
+          if (F_status_is_error(status)) {
+            utf8_print_error_decode(data, status, character);
            }
+          else {
+            status = F_none;
  
-          f_print_terminated(data->prepend, data->file.stream);
-          f_print_dynamic_raw(text, data->file.stream);
-          f_print_terminated(data->append, data->file.stream);
+            fl_print_format("%s%r%s", data->file.stream, data->prepend, text, data->append);
+          }
          }
          else {
            fl_print_format(codepoint < 0xffff ? "%sU+%04_U%s" : "%sU+%6_U%s", data->file.stream, data->prepend, codepoint, data->append);
author	Kevin Day <thekevinday@gmail.com>
	Sat, 4 Dec 2021 00:29:50 +0000 (18:29 -0600)
committer	Kevin Day <thekevinday@gmail.com>
	Sat, 4 Dec 2021 00:38:01 +0000 (18:38 -0600)
level_0/f_utf/c/utf.c		patch \| blob \| history
level_0/f_utf/c/utf.h		patch \| blob \| history
level_3/utf8/c/private-print.c		patch \| blob \| history
level_3/utf8/c/private-utf8_binary.c		patch \| blob \| history
level_3/utf8/c/private-utf8_codepoint.c		patch \| blob \| history