From 95a51fdfd022435ba0fd21dbf5fbabaa8daf1927 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Fri, 3 Dec 2021 18:29:50 -0600 Subject: [PATCH] Update: Complete incomplete unicode processing code. I had originally accidentally committed the utf8 program before it was ready. I followed up with a cleanup after I noticed this. It seems that there is still more work to finish. Looking at what I need to do to finish this, it has become clear to me that I was originally working on this and realized I should move functionality into the level_0 f_utf project. When I did this, I probably noticed a Unicode bug and stopped what I was doing to fix it. I then forgot to come back and fix this code, leaving it in this incomplete and broken state. I also noticed that the f_utf_unicode_string_from() function is misnamed. This is a "to" function rather than a "from" function because it converts to a Unicode codepoint. The "raw" print mode is now supported, so use fl_print_format() to print. Move the printing of "append" to after the closing color context. This makes more sense, but I have not bothered to check to see if the design logic is intended to be used this way. 
--- level_0/f_utf/c/utf.c | 110 ++++++++++++++++---------------- level_0/f_utf/c/utf.h | 54 ++++++++-------- level_3/utf8/c/private-print.c | 10 ++- level_3/utf8/c/private-utf8_binary.c | 4 +- level_3/utf8/c/private-utf8_codepoint.c | 23 +++---- 5 files changed, 95 insertions(+), 106 deletions(-) diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index d0cbef5..c6b162a 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -1798,8 +1798,60 @@ extern "C" { } #endif // _di_f_utf_unicode_from_ -#ifndef _di_f_utf_unicode_string_from_f_ - f_status_t f_utf_unicode_string_from(const f_string_t string, const f_array_length_t length, uint32_t *unicode) { +#ifndef _di_f_utf_unicode_to_ + f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + if (!unicode) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + { + f_utf_character_t character_utf = 0; + + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + + if (private_f_utf_character_is_valid(character_utf) == F_false) { + return F_status_set_error(F_utf); + } + } + + // U+0000 -> U+007F. + if (macro_f_utf_byte_width(*character) == 1) { + *unicode = ((uint8_t) character[0]) & 0x7f; + } + + // U+0080 -> U+07FF. + else if (macro_f_utf_byte_width(*character) == 2) { + *unicode = (((uint8_t) character[0]) & 0x1f) << 6; + *unicode |= ((uint8_t) character[1]) & 0x3f; + } + + // U+0800 -> U+FFFF. 
+ else if (macro_f_utf_byte_width(*character) == 3) { + *unicode = (((uint8_t) character[0]) & 0xf) << 12; + *unicode |= (((uint8_t) character[1]) & 0x3f) << 6; + *unicode |= ((uint8_t) character[2]) & 0x3f; + } + + // U+10000 -> U+10FFFF. + else if (macro_f_utf_byte_width(*character) == 4) { + *unicode = (((uint8_t) character[0]) & 0x7) << 18; + *unicode |= (((uint8_t) character[1]) & 0x3f) << 12; + *unicode |= (((uint8_t) character[2]) & 0x3f) << 6; + *unicode |= ((uint8_t) character[3]) & 0x3f; + } + + return F_none; + } +#endif // _di_f_utf_unicode_to_ + +#ifndef _di_f_utf_unicode_string_to_f_ + f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, uint32_t *unicode) { #ifndef _di_level_0_parameter_checking_ if (!unicode) return F_status_set_error(F_parameter); #endif // _di_level_0_parameter_checking_ @@ -1858,59 +1910,7 @@ extern "C" { return F_none; } -#endif // _di_f_utf_unicode_string_from_ - -#ifndef _di_f_utf_unicode_to_ - f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - if (!unicode) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - { - f_utf_character_t character_utf = 0; - - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - - if (private_f_utf_character_is_valid(character_utf) == F_false) { - return F_status_set_error(F_utf); - } - } - - // U+0000 -> U+007F. - if (macro_f_utf_byte_width(*character) == 1) { - *unicode = ((uint8_t) character[0]) & 0x7f; - } - - // U+0080 -> U+07FF. 
- else if (macro_f_utf_byte_width(*character) == 2) { - *unicode = (((uint8_t) character[0]) & 0x1f) << 6; - *unicode |= ((uint8_t) character[1]) & 0x3f; - } - - // U+0800 -> U+FFFF. - else if (macro_f_utf_byte_width(*character) == 3) { - *unicode = (((uint8_t) character[0]) & 0xf) << 12; - *unicode |= (((uint8_t) character[1]) & 0x3f) << 6; - *unicode |= ((uint8_t) character[2]) & 0x3f; - } - - // U+10000 -> U+10FFFF. - else if (macro_f_utf_byte_width(*character) == 4) { - *unicode = (((uint8_t) character[0]) & 0x7) << 18; - *unicode |= (((uint8_t) character[1]) & 0x3f) << 12; - *unicode |= (((uint8_t) character[2]) & 0x3f) << 6; - *unicode |= ((uint8_t) character[3]) & 0x3f; - } - - return F_none; - } -#endif // _di_f_utf_unicode_to_ +#endif // _di_f_utf_unicode_string_to_ #ifdef __cplusplus } // extern "C" diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 085da41..5c66ccc 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -1622,33 +1622,6 @@ extern "C" { #endif // _di_f_utf_unicode_from_ /** - * Convert a string of the format "U+FFFF" into the codepoint value. - * - * This ignores NULL characters. - * The string may only contain "U+" followed by a hexidecimal digit, upper or lower case. - * The "U+" prefix is optional. - * Only ASCII characters are allowed to represent the Unicode sequence string. - * - * @param string - * The string representing a Unicode sequence. - * @param length - * The maximum number of characters. - * @param unicode - * A 32-bit integer representing the Unicode (such as U+0001). - * Does not need to be interpretted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). - * - * @return - * F_none on success. - * - * F_failure (with error bit) if width_max is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_valid_not (with error bit) if string is not a valid Unicode string. 
- */ -#ifndef _di_f_utf_unicode_string_from_f_ - extern f_status_t f_utf_unicode_string_from(const f_string_t string, const f_array_length_t length, uint32_t *unicode); -#endif // _di_f_utf_unicode_string_from_ - -/** * Convert a given string block representing a single character into Unicode. * * @param character @@ -1676,6 +1649,33 @@ extern "C" { extern f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode); #endif // _di_f_utf_unicode_to_ +/** + * Convert a string of the format "U+FFFF" into the codepoint value. + * + * This ignores NULL characters. + * The string may only contain "U+" followed by a hexidecimal digit, upper or lower case. + * The "U+" prefix is optional. + * Only ASCII characters are allowed to represent the Unicode sequence string. + * + * @param string + * The string representing a Unicode sequence. + * @param length + * The maximum number of characters. + * @param unicode + * A 32-bit integer representing the Unicode (such as U+0001). + * Does not need to be interpretted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). + * + * @return + * F_none on success. + * + * F_failure (with error bit) if width_max is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_valid_not (with error bit) if string is not a valid Unicode string. 
+ */ +#ifndef _di_f_utf_unicode_string_to_f_ + extern f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, uint32_t *unicode); +#endif // _di_f_utf_unicode_string_to_ + #ifdef __cplusplus } // extern "C" #endif diff --git a/level_3/utf8/c/private-print.c b/level_3/utf8/c/private-print.c index 68cc1aa..617c66c 100644 --- a/level_3/utf8/c/private-print.c +++ b/level_3/utf8/c/private-print.c @@ -14,9 +14,7 @@ extern "C" { } if (data->mode & utf8_mode_to_binary_d) { - fl_print_format("%s%[", data->file.stream, data->prepend, set); - f_print_dynamic_raw(character, data->file.stream); - fl_print_format("%s%]", data->file.stream, data->append, set); + fl_print_format("%s%[%r%]%s", data->file.stream, data->prepend, set, character, set, data->append); } else { fl_print_format("%s%[0x", data->file.stream, data->prepend, set); @@ -25,7 +23,7 @@ extern "C" { fl_print_format("%04_uii", data->file.stream, (uint8_t) character.string[i]); } // for - fl_print_format("%s%]", data->file.stream, data->append, set); + fl_print_format("%]%s", data->file.stream, set, data->append); } } #endif // _di_utf8_print_character_ @@ -35,10 +33,10 @@ extern "C" { if (data->main->error.verbosity == f_console_verbosity_quiet) return; - fl_print_format("%c%[%SFailed to decode character '%]", data->main->error.to.stream, f_string_eol_s[0], data->main->context.set.error, data->main->context.set.error); + fl_print_format("%c%[%SFailed to decode character '%]", data->main->error.to.stream, f_string_eol_s[0], data->main->context.set.error, data->main->error.prefix, data->main->context.set.error); fl_print_format("%[%r%]", data->main->error.to.stream, data->main->context.set.notable, character, data->main->context.set.notable); fl_print_format("%[', error status code%] ", data->main->error.to.stream, data->main->context.set.error, data->main->context.set.error, f_string_eol_s[0]); - fl_print_format("%[%S%]", data->main->error.to.stream, data->main->context.set.notable, 
F_status_set_fine(status), data->main->context.set.notable); + fl_print_format("%[%ui%]", data->main->error.to.stream, data->main->context.set.notable, F_status_set_fine(status), data->main->context.set.notable); fl_print_format("%[.%]%c", data->main->error.to.stream, data->main->context.set.error, data->main->context.set.error, f_string_eol_s[0]); } #endif // _di_utf8_print_error_decode_ diff --git a/level_3/utf8/c/private-utf8_binary.c b/level_3/utf8/c/private-utf8_binary.c index accba66..718c2f1 100644 --- a/level_3/utf8/c/private-utf8_binary.c +++ b/level_3/utf8/c/private-utf8_binary.c @@ -35,9 +35,7 @@ extern "C" { } else if (data->main->parameters[utf8_parameter_verify].result == f_console_result_none) { if (data->mode & utf8_mode_to_binary_d) { - f_print_terminated(data->prepend, data->file.stream); - f_print_dynamic_raw(character, data->file.stream); - f_print_terminated(data->append, data->file.stream); + fl_print_format("%s%r%s", data->file.stream, data->prepend, character, data->append); } else { fl_print_format(codepoint < 0xffff ? 
"%sU+%04_U%s" : "%sU+%6_U%s", data->file.stream, data->prepend, codepoint, data->append); diff --git a/level_3/utf8/c/private-utf8_codepoint.c b/level_3/utf8/c/private-utf8_codepoint.c index b373dec..849c937 100644 --- a/level_3/utf8/c/private-utf8_codepoint.c +++ b/level_3/utf8/c/private-utf8_codepoint.c @@ -32,7 +32,7 @@ extern "C" { if (*mode == utf8_codepoint_mode_end) { uint32_t codepoint = 0; - status = f_utf_unicode_string_from(data->text.string, data->text.used, &codepoint); + status = f_utf_unicode_string_to(data->text.string, data->text.used, &codepoint); if (F_status_is_error(status)) { if (F_status_set_fine(status) == F_failure || F_status_set_fine(status) == F_utf) { @@ -50,23 +50,16 @@ extern "C" { text.used = macro_f_utf_byte_width(codepoint); text.size = 5; - byte[0] = macro_f_utf_character_t_to_char_1(codepoint); + status = f_utf_unicode_from(codepoint, 4, &text.string); - if (text.used > 1) { - byte[1] = macro_f_utf_character_t_to_char_2(codepoint); - - if (text.used > 2) { - byte[2] = macro_f_utf_character_t_to_char_3(codepoint); - - if (text.used > 3) { - byte[3] = macro_f_utf_character_t_to_char_4(codepoint); - } - } + if (F_status_is_error(status)) { + utf8_print_error_decode(data, status, character); } + else { + status = F_none; - f_print_terminated(data->prepend, data->file.stream); - f_print_dynamic_raw(text, data->file.stream); - f_print_terminated(data->append, data->file.stream); + fl_print_format("%s%r%s", data->file.stream, data->prepend, text, data->append); + } } else { fl_print_format(codepoint < 0xffff ? "%sU+%04_U%s" : "%sU+%6_U%s", data->file.stream, data->prepend, codepoint, data->append); -- 1.8.3.1