From 6761261276ef712883e642740a6fa329fdaaf21f Mon Sep 17 00:00:00 2001
From: Kevin Day <thekevinday@gmail.com>
Date: Sun, 22 May 2022 20:45:31 -0500
Subject: [PATCH] Feature: Add missing functionality allowing the utf8 program
 to convert back to binary data with invalid codepoints.

Even when there are invalid codepoints produced, it should be possible to convert the entire output back to the original data.
This is possible because the codepoint output, by default, still prints the invalid data as a hexadecimal value (prefixed with 0x) representing up to 4 bytes of data.
The combining and width parameters are also supported.
---
 level_3/utf8/c/common.c                 |   2 +
 level_3/utf8/c/common.h                 |   6 +
 level_3/utf8/c/private-common.h         |  25 +++--
 level_3/utf8/c/private-print.c          | 107 ++++++++++++++++++
 level_3/utf8/c/private-print.h          |  40 +++++++
 level_3/utf8/c/private-utf8.c           |  12 +-
 level_3/utf8/c/private-utf8_bytecode.c  |   2 +-
 level_3/utf8/c/private-utf8_codepoint.c | 141 ++++++++++++++++++++++--
 level_3/utf8/c/private-utf8_codepoint.h |  32 +++++-
 9 files changed, 344 insertions(+), 23 deletions(-)

diff --git a/level_3/utf8/c/common.c b/level_3/utf8/c/common.c
index 20e8b88fe..4f81c0f93 100644
--- a/level_3/utf8/c/common.c
+++ b/level_3/utf8/c/common.c
@@ -35,6 +35,8 @@ extern "C" {
   const f_string_static_t utf8_string_width_0_s = macro_f_string_static_t_initialize(UTF8_string_width_0_s, 0, UTF8_string_width_0_s_length);
   const f_string_static_t utf8_string_width_1_s = macro_f_string_static_t_initialize(UTF8_string_width_1_s, 0, UTF8_string_width_1_s_length);
   const f_string_static_t utf8_string_width_2_s = macro_f_string_static_t_initialize(UTF8_string_width_2_s, 0, UTF8_string_width_2_s_length);
+  const f_string_static_t utf8_string_width_3_s = macro_f_string_static_t_initialize(UTF8_string_width_3_s, 0, UTF8_string_width_3_s_length);
+  const f_string_static_t utf8_string_width_4_s = macro_f_string_static_t_initialize(UTF8_string_width_4_s, 0, UTF8_string_width_4_s_length);
 #endif // _di_utf8_defines_
 
 #ifndef _di_utf8_parameters_
diff --git a/level_3/utf8/c/common.h b/level_3/utf8/c/common.h
index ab64ec506..1a477d002 100644
--- a/level_3/utf8/c/common.h
+++ b/level_3/utf8/c/common.h
@@ -99,6 +99,8 @@ extern "C" {
   #define UTF8_string_width_0_s "0"
   #define UTF8_string_width_1_s "1"
   #define UTF8_string_width_2_s "2"
+  #define UTF8_string_width_3_s "3"
+  #define UTF8_string_width_4_s "4"
 
   #define UTF8_string_combining_is_s_length  1
   #define UTF8_string_combining_not_s_length 1
@@ -117,6 +119,8 @@ extern "C" {
   #define UTF8_string_width_0_s_length 1
   #define UTF8_string_width_1_s_length 1
   #define UTF8_string_width_2_s_length 1
+  #define UTF8_string_width_3_s_length 1
+  #define UTF8_string_width_4_s_length 1
 
   extern const f_string_static_t utf8_string_combining_is_s;
   extern const f_string_static_t utf8_string_combining_not_s;
@@ -133,6 +137,8 @@ extern "C" {
   extern const f_string_static_t utf8_string_width_0_s;
   extern const f_string_static_t utf8_string_width_1_s;
   extern const f_string_static_t utf8_string_width_2_s;
+  extern const f_string_static_t utf8_string_width_3_s;
+  extern const f_string_static_t utf8_string_width_4_s;
 
   extern const f_string_static_t utf8_string_valid_not_s;
 #endif // _di_utf8_defines_
diff --git a/level_3/utf8/c/private-common.h b/level_3/utf8/c/private-common.h
index c3f5d4967..1226c1bb7 100644
--- a/level_3/utf8/c/private-common.h
+++ b/level_3/utf8/c/private-common.h
@@ -13,14 +13,22 @@ extern "C" {
 #endif
 
 /**
+ * Codepoint modes for converting to/from binary and codepoint values.
+ *
+ * The special "raw" format is used only when reading from codepoint format where the text represents a binary character that is not a valid Unicode character.
+ * This is intended to be used to save and restore the original binary data even if that data is invalid.
+ *
  * private_utf8_codepoint_mode_*:
- *   - ready:     The codepoint has yet to be processed, skip leading spaces until first 'U' is matched.
- *   - begin:     The first 'U' is matched, look for the '+'.
- *   - number:    The '+' is matched, process numbers.
- *   - end:       The last number is reached (at either white space or EOS/EOF).
- *   - bad:       This is not a valid codepoint.
- *   - bad_begin: This is the beginning of an invalid codepoint.
- *   - bad_end:   The end of bad is detected, which happens on white space or end of buffer.
+ *   - ready:      The codepoint has yet to be processed, skip leading spaces until first 'U' is matched.
+ *   - begin:      The first 'U' is matched, look for the '+'.
+ *   - number:     The '+' is matched, process numbers.
+ *   - end:        The last number is reached (at either white space or EOS/EOF).
+ *   - bad:        This is not a valid codepoint.
+ *   - bad_begin:  This is the beginning of an invalid codepoint.
+ *   - bad_end:    The end of bad is detected, which happens on white space or end of buffer.
+ *   - raw_begin:  This is the beginning of a potential raw data (matched '0').
+ *   - raw_number: This is the confirmed beginning of raw data (matched 'X'), process numbers.
+ *   - raw_end:    The end of raw data is detected, which happens on white space or end of buffer.
  */
 #ifndef _di_utf8_codepoint_modes_
   enum {
@@ -31,6 +39,9 @@ extern "C" {
     utf8_codepoint_mode_bad_e,
     utf8_codepoint_mode_bad_begin_e,
     utf8_codepoint_mode_bad_end_e,
+    utf8_codepoint_mode_raw_begin_e,
+    utf8_codepoint_mode_raw_number_e,
+    utf8_codepoint_mode_raw_end_e,
   };
 #endif // _di__utf8_codepoint_modes_
 
diff --git a/level_3/utf8/c/private-print.c b/level_3/utf8/c/private-print.c
index cec0a5832..7a6fcb1f5 100644
--- a/level_3/utf8/c/private-print.c
+++ b/level_3/utf8/c/private-print.c
@@ -209,6 +209,113 @@ extern "C" {
   }
 #endif // _di_utf8_print_error_parameter_file_to_too_many_
 
+/**
+ * Print a raw (invalid) character as binary data.
+ *
+ * Nothing is printed when either the strip invalid parameter or the verify parameter is found.
+ *
+ * The raw value is treated as a number whose least significant byte is the last byte of the sequence.
+ * Shifting operates on the numeric value rather than on the memory layout.
+ * The bytes are therefore extracted in the same way regardless of the host byte order.
+ *
+ * @param data
+ *   The program data.
+ * @param raw
+ *   The raw bytes represented as a number.
+ *   A value of 0 results in printing bytes that are all 0.
+ * @param width
+ *   The number of bytes the raw character represents.
+ *   A width larger than 4 is reduced to 4 because at most 4 bytes are extracted from raw.
+ *   A width of 0 results in an empty character being printed.
+ *
+ * @see fl_print_format()
+ */
+#ifndef _di_utf8_print_raw_bytecode_
+  void utf8_print_raw_bytecode(utf8_data_t * const data, const f_utf_char_t raw, const uint8_t width) {
+
+    if (data->main->parameters.array[utf8_parameter_strip_invalid_e].result == f_console_result_found_e) return;
+    if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_found_e) return;
+
+    // A fixed size array avoids both a zero-length variable length array and writing beyond the array when the width is out of range.
+    f_char_t byte[4] = { 0, 0, 0, 0 };
+    const uint8_t used = width > 4 ? 4 : width;
+
+    f_string_static_t character = macro_f_string_static_t_initialize(byte, 0, used);
+
+    // The array is already zeroed, so a raw value of 0 needs no special handling.
+    if (used == 1) {
+      byte[0] = (f_char_t) (raw & 0xff);
+    }
+    else if (used == 2) {
+      byte[0] = (f_char_t) ((raw & 0xff00) >> 8);
+      byte[1] = (f_char_t) (raw & 0xff);
+    }
+    else if (used == 3) {
+      byte[0] = (f_char_t) ((raw & 0xff0000) >> 16);
+      byte[1] = (f_char_t) ((raw & 0xff00) >> 8);
+      byte[2] = (f_char_t) (raw & 0xff);
+    }
+    else if (used == 4) {
+      byte[0] = (f_char_t) ((raw & 0xff000000) >> 24);
+      byte[1] = (f_char_t) ((raw & 0xff0000) >> 16);
+      byte[2] = (f_char_t) ((raw & 0xff00) >> 8);
+      byte[3] = (f_char_t) (raw & 0xff);
+    }
+
+    fl_print_format("%r%[%r%]%r", data->file.stream, data->prepend, data->valid_not, character, data->valid_not, data->append);
+  }
+#endif // _di_utf8_print_raw_bytecode_
+
+#ifndef _di_utf8_print_raw_codepoint_
+  void utf8_print_raw_codepoint(utf8_data_t * const data, const f_string_static_t raw) {
+
+    // Print nothing when either the strip invalid parameter or the verify parameter is found.
+    if (data->main->parameters.array[utf8_parameter_strip_invalid_e].result == f_console_result_found_e || data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_found_e) return;
+
+    fl_print_format("%r%[%r%]%r", data->file.stream, data->prepend, data->valid_not, raw, data->valid_not, data->append);
+  }
+#endif // _di_utf8_print_raw_codepoint_
+
+#ifndef _di_utf8_print_raw_combining_or_width_
+  void utf8_print_raw_combining_or_width(utf8_data_t * const data, const uint8_t width) {
+
+    // Print nothing when either the strip invalid parameter or the verify parameter is found.
+    if (data->main->parameters.array[utf8_parameter_strip_invalid_e].result == f_console_result_found_e) return;
+    if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_found_e) return;
+
+    if (data->mode & utf8_mode_to_combining_d) {
+      fl_print_format("%r%[%r%]%r", data->file.stream, data->prepend, data->valid_not, utf8_string_unknown_s, data->valid_not, data->append);
+    }
+    else if (data->mode & utf8_mode_to_width_d) {
+      const f_string_static_t *character = 0;
+
+      // Any width other than 1, 2, 3, or 4 is printed as a width of 0.
+      switch (width) {
+        case 1:
+          character = &utf8_string_width_1_s;
+          break;
+
+        case 2:
+          character = &utf8_string_width_2_s;
+          break;
+
+        case 3:
+          character = &utf8_string_width_3_s;
+          break;
+
+        case 4:
+          character = &utf8_string_width_4_s;
+          break;
+
+        default:
+          character = &utf8_string_width_0_s;
+      }
+
+      fl_print_format("%r%[%r%]%r", data->file.stream, data->prepend, data->valid_not, *character, data->valid_not, data->append);
+    }
+  }
+#endif // _di_utf8_print_raw_combining_or_width_
+
 #ifndef _di_utf8_print_section_header_file_
   void utf8_print_section_header_file(utf8_data_t * const data, const f_string_static_t name) {
 
diff --git a/level_3/utf8/c/private-print.h b/level_3/utf8/c/private-print.h
index df35aa07e..c6f794d81 100644
--- a/level_3/utf8/c/private-print.h
+++ b/level_3/utf8/c/private-print.h
@@ -151,6 +151,46 @@ extern "C" {
   extern void utf8_print_error_parameter_file_to_too_many(utf8_data_t * const data) F_attribute_visibility_internal_d;
 #endif // _di_utf8_print_error_parameter_file_to_too_many_
 
+/**
+ * Print the raw character data as binary (bytecode), unless the strip invalid or verify parameter is found.
+ *
+ * @param data
+ *   The program data.
+ * @param raw
+ *   The raw character in integer format.
+ * @param width
+ *   The width the raw character represents (a value from 1 to 4, inclusive).
+ */
+#ifndef _di_utf8_print_raw_bytecode_
+  extern void utf8_print_raw_bytecode(utf8_data_t * const data, const f_utf_char_t raw, const uint8_t width) F_attribute_visibility_internal_d;
+#endif // _di_utf8_print_raw_bytecode_
+
+/**
+ * Print the raw character data in codepoint format, unless the strip invalid or verify parameter is found.
+ *
+ * @param data
+ *   The program data.
+ * @param raw
+ *   The raw string already in codepoint format.
+ */
+#ifndef _di_utf8_print_raw_codepoint_
+  extern void utf8_print_raw_codepoint(utf8_data_t * const data, const f_string_static_t raw) F_attribute_visibility_internal_d;
+#endif // _di_utf8_print_raw_codepoint_
+
+/**
+ * Print the width or the combining state for a raw character, unless the strip invalid or verify parameter is found.
+ *
+ * @param data
+ *   The program data.
+ * @param width
+ *   The pre-calculated width.
+ *
+ * @see utf8_print_width()
+ */
+#ifndef _di_utf8_print_raw_combining_or_width_
+  extern void utf8_print_raw_combining_or_width(utf8_data_t * const data, const uint8_t width) F_attribute_visibility_internal_d;
+#endif // _di_utf8_print_raw_combining_or_width_
+
 /**
  * Print the input file section header.
  *
diff --git a/level_3/utf8/c/private-utf8.c b/level_3/utf8/c/private-utf8.c
index 3535e21f9..e159450a8 100644
--- a/level_3/utf8/c/private-utf8.c
+++ b/level_3/utf8/c/private-utf8.c
@@ -66,10 +66,13 @@ extern "C" {
     } // for
 
     if (F_status_is_error_not(status) && !(data->mode & utf8_mode_from_bytecode_d)) {
-      if (mode_codepoint != utf8_codepoint_mode_ready_e && mode_codepoint != utf8_codepoint_mode_end_e && mode_codepoint != utf8_codepoint_mode_bad_end_e) {
+      if (mode_codepoint != utf8_codepoint_mode_ready_e && mode_codepoint != utf8_codepoint_mode_end_e && mode_codepoint != utf8_codepoint_mode_bad_end_e && mode_codepoint != utf8_codepoint_mode_raw_end_e) {
         if (mode_codepoint == utf8_codepoint_mode_number_e) {
           mode_codepoint = utf8_codepoint_mode_end_e;
         }
+        else if (mode_codepoint == utf8_codepoint_mode_raw_number_e) {
+          mode_codepoint = utf8_codepoint_mode_raw_end_e;
+        }
         else {
           mode_codepoint = utf8_codepoint_mode_bad_end_e;
           valid = F_false;
@@ -77,7 +80,12 @@ extern "C" {
 
         text.used = 0;
 
-        status = utf8_convert_codepoint(data, text, &mode_codepoint);
+        if (mode_codepoint == utf8_codepoint_mode_raw_end_e) { // The raw number mode is already changed into the raw end mode above.
+          status = utf8_convert_raw(data, text, &mode_codepoint);
+        }
+        else {
+          status = utf8_convert_codepoint(data, text, &mode_codepoint);
+        }
       }
     }
 
diff --git a/level_3/utf8/c/private-utf8_bytecode.c b/level_3/utf8/c/private-utf8_bytecode.c
index 419aef0d0..6628d39c0 100644
--- a/level_3/utf8/c/private-utf8_bytecode.c
+++ b/level_3/utf8/c/private-utf8_bytecode.c
@@ -73,7 +73,7 @@ extern "C" {
     f_array_length_t j = 0;
 
     f_char_t block_character[4] = { 0, 0, 0, 0 };
-    f_string_static_t character = macro_f_string_static_t_initialize2(block_character, 4);
+    f_string_static_t character = macro_f_string_static_t_initialize(block_character, 0, 4);
 
     do {
       status = f_file_read_block(file, &data->buffer);
diff --git a/level_3/utf8/c/private-utf8_codepoint.c b/level_3/utf8/c/private-utf8_codepoint.c
index 689b71803..d0fdd6cf7 100644
--- a/level_3/utf8/c/private-utf8_codepoint.c
+++ b/level_3/utf8/c/private-utf8_codepoint.c
@@ -26,10 +26,6 @@ extern "C" {
       } // for
     }
 
-    if (!(*mode == utf8_codepoint_mode_end_e || *mode == utf8_codepoint_mode_bad_end_e)) {
-      return F_none;
-    }
-
     if (*mode == utf8_codepoint_mode_end_e) {
       uint32_t codepoint = 0;
 
@@ -53,8 +49,8 @@ extern "C" {
       }
       else if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_none_e) {
         if (data->mode & utf8_mode_to_bytecode_d) {
-          f_char_t byte[5] = { 0, 0, 0, 0, 0 };
-          f_string_static_t character = macro_f_string_static_t_initialize2(byte, 5);
+          f_char_t byte[4] = { 0, 0, 0, 0 };
+          f_string_static_t character = macro_f_string_static_t_initialize(byte, 0, 4);
 
           status = f_utf_unicode_from(codepoint, 4, &character.string);
 
@@ -76,11 +72,14 @@ extern "C" {
         }
       }
     }
-    else {
+    else if (*mode == utf8_codepoint_mode_bad_end_e) {
       status = F_none;
 
       utf8_print_character_invalid(data, character);
     }
+    else {
+      return F_none;
+    }
 
     *mode = utf8_codepoint_mode_ready_e;
     data->text.used = 0;
@@ -93,6 +92,90 @@ extern "C" {
   }
 #endif // _di_utf8_convert_codepoint_
 
+#ifndef _di_utf8_convert_raw_
+  f_status_t utf8_convert_raw(utf8_data_t * const data, const f_string_static_t character, uint8_t *mode) {
+
+    f_status_t status = F_none;
+    bool valid_not = F_false;
+
+    // Everything before the end of the raw data is appended to the text for later conversion.
+    if (*mode != utf8_codepoint_mode_raw_end_e) {
+      if (data->text.used + character.used >= data->text.size) {
+        status = f_string_dynamic_increase_by(utf8_default_allocation_step_d, &data->text);
+        if (F_status_is_error(status)) return status;
+      }
+
+      for (f_array_length_t i = 0; i < character.used; ++i) {
+        data->text.string[data->text.used++] = character.string[i];
+      } // for
+    }
+
+    // At the end of the raw data the appended text is converted into a number.
+    if (*mode == utf8_codepoint_mode_raw_end_e) {
+      f_utf_char_t raw = 0;
+
+      {
+        f_number_unsigned_t number = 0;
+
+        status = fl_conversion_dynamic_to_number_unsigned(data->text, &number);
+
+        raw = (f_utf_char_t) number;
+      }
+
+      if (F_status_is_error(status)) {
+        status = F_status_set_fine(status);
+
+        if (status == F_number || status == F_utf_not || status == F_complete_not_utf || status == F_utf_fragment || status == F_number_decimal || status == F_number_negative || status == F_number_positive || status == F_number_overflow) {
+          valid_not = F_true;
+
+          utf8_print_character_invalid(data, character);
+        }
+        else {
+          status = F_status_set_error(status);
+
+          utf8_print_error_decode(data, status, character);
+
+          return status;
+        }
+      }
+      else if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_none_e) {
+
+        // The text includes the leading '0x', which is not part of the width of the digits in binary form.
+        // Every two hexadecimal digits form a single byte, with a lone remaining digit still requiring a byte of its own.
+        const uint8_t width = data->text.used > 2 ? (uint8_t) ((data->text.used - 1) / 2) : 0;
+
+        if (data->mode & utf8_mode_to_bytecode_d) {
+          utf8_print_raw_bytecode(data, raw, width);
+        }
+        else if (data->mode & utf8_mode_to_codepoint_d) {
+          utf8_print_raw_codepoint(data, data->text);
+        }
+        else {
+          utf8_print_raw_combining_or_width(data, width);
+        }
+      }
+    }
+    else if (*mode == utf8_codepoint_mode_bad_end_e) {
+      status = F_none;
+
+      utf8_print_character_invalid(data, character);
+    }
+    else {
+      return F_none;
+    }
+
+    // The raw data is now processed, so reset the state for whatever follows.
+    *mode = utf8_codepoint_mode_ready_e;
+    data->text.used = 0;
+
+    if (valid_not || F_status_is_error(status)) {
+      return F_valid_not;
+    }
+
+    return status;
+  }
+#endif // _di_utf8_convert_raw_
+
 #ifndef _di_utf8_detect_codepoint_
   f_status_t utf8_detect_codepoint(utf8_data_t * const data, const f_string_static_t character, uint8_t *mode) {
 
@@ -106,6 +189,9 @@ extern "C" {
     if (character.string[0] == f_string_ascii_u_s.string[0] || character.string[0] == f_string_ascii_U_s.string[0] || character.string[0] == f_string_ascii_plus_s.string[0]) {
       // Do nothing.
     }
+    else if (character.string[0] == f_string_ascii_0_s.string[0] || character.string[0] == f_string_ascii_x_s.string[0] || character.string[0] == f_string_ascii_X_s.string[0]) {
+      // Do nothing.
+    }
     else if (character.string[0] == f_string_ascii_space_s.string[0]) {
       status = F_space;
     }
@@ -171,6 +257,10 @@ extern "C" {
           *mode = utf8_codepoint_mode_begin_e;
           data->text.used = 0;
         }
+        else if (character.string[0] == f_string_ascii_0_s.string[0]) {
+          *mode = utf8_codepoint_mode_raw_begin_e;
+          data->text.used = 0;
+        }
         else {
           *mode = utf8_codepoint_mode_bad_e;
         }
@@ -183,6 +273,19 @@ extern "C" {
           *mode = utf8_codepoint_mode_bad_e;
         }
       }
+      else if (*mode == utf8_codepoint_mode_raw_begin_e) {
+        if (character.string[0] == f_string_ascii_x_s.string[0] || character.string[0] == f_string_ascii_X_s.string[0]) {
+          *mode = utf8_codepoint_mode_raw_number_e;
+        }
+        else {
+          *mode = utf8_codepoint_mode_bad_e;
+        }
+      }
+      else if (*mode == utf8_codepoint_mode_raw_number_e) {
+        if (status == F_space) {
+          *mode = utf8_codepoint_mode_raw_end_e;
+        }
+      }
       else if (*mode == utf8_codepoint_mode_number_e) {
         if (status == F_space) {
           *mode = utf8_codepoint_mode_end_e;
@@ -205,8 +308,8 @@ extern "C" {
     f_array_length_t i = 0;
     f_array_length_t j = 0;
 
-    f_char_t block[4] = { 0, 0, 0, 0 };
-    f_string_static_t character = macro_f_string_static_t_initialize2(block, 4);
+    f_char_t block[5] = { 0, 0, 0, 0, 0 };
+    f_string_static_t character = macro_f_string_static_t_initialize(block, 0, 4);
 
     do {
       status = f_file_read_block(file, &data->buffer);
@@ -247,7 +350,15 @@ extern "C" {
             status = utf8_detect_codepoint(data, character, &mode_codepoint);
 
             if (F_status_is_fine(status) && status != F_next) {
-              status = utf8_convert_codepoint(data, character, &mode_codepoint);
+              if (mode_codepoint == utf8_codepoint_mode_raw_begin_e || mode_codepoint == utf8_codepoint_mode_raw_number_e || mode_codepoint == utf8_codepoint_mode_raw_end_e) {
+                status = utf8_convert_raw(data, character, &mode_codepoint);
+
+                // Raw mode represents an invalid Unicode sequence.
+                valid = F_false;
+              }
+              else {
+                status = utf8_convert_codepoint(data, character, &mode_codepoint);
+              }
             }
           }
 
@@ -276,7 +387,15 @@ extern "C" {
         status = utf8_detect_codepoint(data, character, &mode_codepoint);
 
         if (F_status_is_fine(status) && status != F_next) {
-          status = utf8_convert_codepoint(data, character, &mode_codepoint);
+          if (mode_codepoint == utf8_codepoint_mode_raw_begin_e || mode_codepoint == utf8_codepoint_mode_raw_number_e || mode_codepoint == utf8_codepoint_mode_raw_end_e) {
+            status = utf8_convert_raw(data, character, &mode_codepoint);
+
+            // Raw mode represents an invalid Unicode sequence.
+            valid = F_false;
+          }
+          else {
+            status = utf8_convert_codepoint(data, character, &mode_codepoint);
+          }
         }
       }
 
diff --git a/level_3/utf8/c/private-utf8_codepoint.h b/level_3/utf8/c/private-utf8_codepoint.h
index 184b9fb8c..199895e22 100644
--- a/level_3/utf8/c/private-utf8_codepoint.h
+++ b/level_3/utf8/c/private-utf8_codepoint.h
@@ -15,7 +15,7 @@ extern "C" {
 /**
  * Convert a codepoint character representation to another format.
  *
- * This automatically determines the output format and is also handles the verify process.
+ * This automatically determines the output format and also handles the verify process.
  *
  * @param data
  *   The program data.
@@ -28,14 +28,42 @@ extern "C" {
  *   F_none on success.
  *   F_utf_not on invalid UTF-8 (which is still "success" when verifying).
  *
- *   F_utf_not (with error bit) if not verifying and
+ *   F_utf_not (with error bit) if not verifying and the Unicode value is invalid.
  *
  *   Errors (with error bit) from: f_utf_unicode_to()
+ *
+ * @see f_utf_unicode_to()
  */
 #ifndef _di_utf8_convert_codepoint_
   extern f_status_t utf8_convert_codepoint(utf8_data_t * const data, const f_string_static_t character, uint8_t *mode) F_attribute_visibility_internal_d;
 #endif // _di_utf8_convert_codepoint_
 
+/**
+ * Convert a raw character representation (hexadecimal digits) to another format.
+ *
+ * This automatically determines the output format and also handles the verify process.
+ *
+ * @param data
+ *   The program data.
+ * @param character
+ *   The single character currently being processed.
+ * @param mode
+ *   The codepoint mode the text is currently in.
+ *
+ * @return
+ *   F_none on success.
+ *   F_valid_not if the raw text cannot be converted into a number.
+ *
+ *   Errors (with error bit) from: f_string_dynamic_increase_by().
+ *   Errors (with error bit) from: fl_conversion_dynamic_to_number_unsigned().
+ *
+ * @see f_string_dynamic_increase_by()
+ * @see fl_conversion_dynamic_to_number_unsigned()
+ */
+#ifndef _di_utf8_convert_raw_
+  extern f_status_t utf8_convert_raw(utf8_data_t * const data, const f_string_static_t character, uint8_t *mode) F_attribute_visibility_internal_d;
+#endif // _di_utf8_convert_raw_
+
 /**
  * Detect a codepoint character.
  *
-- 
2.52.0