From 6761261276ef712883e642740a6fa329fdaaf21f Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sun, 22 May 2022 20:45:31 -0500 Subject: [PATCH] Feature: Add missing functionality allowing the utf8 program to convert back to binary data with invalid codepoints. Even when there are invalid codepoints produced, it should be possible to convert the entire output back to the original data. This is possible because the codepoint output by default still prints the invalid data as a hex-digit representing up to 4 bytes of data. The combining and width parameters are also supported. --- level_3/utf8/c/common.c | 2 + level_3/utf8/c/common.h | 6 ++ level_3/utf8/c/private-common.h | 25 ++++-- level_3/utf8/c/private-print.c | 107 ++++++++++++++++++++++++ level_3/utf8/c/private-print.h | 40 +++++++++ level_3/utf8/c/private-utf8.c | 12 ++- level_3/utf8/c/private-utf8_bytecode.c | 2 +- level_3/utf8/c/private-utf8_codepoint.c | 141 +++++++++++++++++++++++++++++--- level_3/utf8/c/private-utf8_codepoint.h | 32 +++++++- 9 files changed, 344 insertions(+), 23 deletions(-) diff --git a/level_3/utf8/c/common.c b/level_3/utf8/c/common.c index 20e8b88..4f81c0f 100644 --- a/level_3/utf8/c/common.c +++ b/level_3/utf8/c/common.c @@ -35,6 +35,8 @@ extern "C" { const f_string_static_t utf8_string_width_0_s = macro_f_string_static_t_initialize(UTF8_string_width_0_s, 0, UTF8_string_width_0_s_length); const f_string_static_t utf8_string_width_1_s = macro_f_string_static_t_initialize(UTF8_string_width_1_s, 0, UTF8_string_width_1_s_length); const f_string_static_t utf8_string_width_2_s = macro_f_string_static_t_initialize(UTF8_string_width_2_s, 0, UTF8_string_width_2_s_length); + const f_string_static_t utf8_string_width_3_s = macro_f_string_static_t_initialize(UTF8_string_width_3_s, 0, UTF8_string_width_3_s_length); + const f_string_static_t utf8_string_width_4_s = macro_f_string_static_t_initialize(UTF8_string_width_4_s, 0, UTF8_string_width_4_s_length); #endif // _di_utf8_defines_ #ifndef 
_di_utf8_parameters_ diff --git a/level_3/utf8/c/common.h b/level_3/utf8/c/common.h index ab64ec5..1a477d0 100644 --- a/level_3/utf8/c/common.h +++ b/level_3/utf8/c/common.h @@ -99,6 +99,8 @@ extern "C" { #define UTF8_string_width_0_s "0" #define UTF8_string_width_1_s "1" #define UTF8_string_width_2_s "2" + #define UTF8_string_width_3_s "3" + #define UTF8_string_width_4_s "4" #define UTF8_string_combining_is_s_length 1 #define UTF8_string_combining_not_s_length 1 @@ -117,6 +119,8 @@ extern "C" { #define UTF8_string_width_0_s_length 1 #define UTF8_string_width_1_s_length 1 #define UTF8_string_width_2_s_length 1 + #define UTF8_string_width_3_s_length 1 + #define UTF8_string_width_4_s_length 1 extern const f_string_static_t utf8_string_combining_is_s; extern const f_string_static_t utf8_string_combining_not_s; @@ -133,6 +137,8 @@ extern "C" { extern const f_string_static_t utf8_string_width_0_s; extern const f_string_static_t utf8_string_width_1_s; extern const f_string_static_t utf8_string_width_2_s; + extern const f_string_static_t utf8_string_width_3_s; + extern const f_string_static_t utf8_string_width_4_s; extern const f_string_static_t utf8_string_valid_not_s; #endif // _di_utf8_defines_ diff --git a/level_3/utf8/c/private-common.h b/level_3/utf8/c/private-common.h index c3f5d49..1226c1b 100644 --- a/level_3/utf8/c/private-common.h +++ b/level_3/utf8/c/private-common.h @@ -13,14 +13,22 @@ extern "C" { #endif /** + * Codepoint modes for converting to/from binary and codepoint values. + * + * The special "raw" format is used only for reading from codepoint format where that format represents binary character that is not a valid Unicode character. + * This is intended to be used to save and restore the original binary data even if that data is invalid. + * * private_utf8_codepoint_mode_*: - * - ready: The codepoint has yet to be processed, skip leading spaces until first 'U' is matched. - * - begin: The first 'U' is matched, look for the '+'. 
- * - number: The '+' is matched, process numbers. - * - end: The last number is reached (at either white space or EOS/EOF). - * - bad: This is not a valid codepoint. - * - bad_begin: This is the beginning of an invalid codepoint. - * - bad_end: The end of bad is detected, which happens on white space or end of buffer. + * - ready: The codepoint has yet to be processed, skip leading spaces until first 'U' is matched. + * - begin: The first 'U' is matched, look for the '+'. + * - number: The '+' is matched, process numbers. + * - end: The last number is reached (at either white space or EOS/EOF). + * - bad: This is not a valid codepoint. + * - bad_begin: This is the beginning of an invalid codepoint. + * - bad_end: The end of bad is detected, which happens on white space or end of buffer. + * - raw_begin: This is the beginning of a potential raw data (matched '0'). + * - raw_number: This is the confirmed beginning of raw data (matched 'X'), process numbers. + * - raw_end: The end of raw data is detected, which happens on white space or end of buffer. 
*/ #ifndef _di_utf8_codepoint_modes_ enum { @@ -31,6 +39,9 @@ extern "C" { utf8_codepoint_mode_bad_e, utf8_codepoint_mode_bad_begin_e, utf8_codepoint_mode_bad_end_e, + utf8_codepoint_mode_raw_begin_e, + utf8_codepoint_mode_raw_number_e, + utf8_codepoint_mode_raw_end_e, }; #endif // _di__utf8_codepoint_modes_ diff --git a/level_3/utf8/c/private-print.c b/level_3/utf8/c/private-print.c index cec0a58..7a6fcb1 100644 --- a/level_3/utf8/c/private-print.c +++ b/level_3/utf8/c/private-print.c @@ -209,6 +209,113 @@ extern "C" { } #endif // _di_utf8_print_error_parameter_file_to_too_many_ +#ifndef _di_utf8_print_raw_bytecode_ + void utf8_print_raw_bytecode(utf8_data_t * const data, const f_utf_char_t raw, const uint8_t width) { + + if (data->main->parameters.array[utf8_parameter_strip_invalid_e].result == f_console_result_found_e) return; + if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_found_e) return; + + f_string_static_t character = macro_f_string_static_t_initialize(0, 0, width); + + uint8_t byte[character.used]; + character.string = byte; + + if (raw) { + if (width == 1) { + byte[0] = (uint8_t) (raw & 0xff); + } + else if (width == 2) { + #ifdef _is_F_endian_big + byte[0] = (uint8_t) (raw & 0xff); + byte[1] = (uint8_t) ((raw & 0xff00) >> 8); + #else + byte[0] = (uint8_t) ((raw & 0xff00) >> 8); + byte[1] = (uint8_t) (raw & 0xff); + #endif // _is_F_endian_big + } + else if (width == 3) { + #ifdef _is_F_endian_big + byte[0] = (uint8_t) (raw & 0xff); + byte[1] = (uint8_t) ((raw & 0xff00) >> 8); + byte[2] = (uint8_t) ((raw & 0xff0000) >> 16); + #else + byte[0] = (uint8_t) ((raw & 0xff0000) >> 16); + byte[1] = (uint8_t) ((raw & 0xff00) >> 8); + byte[2] = (uint8_t) (raw & 0xff); + #endif // _is_F_endian_big + } + else { + #ifdef _is_F_endian_big + byte[0] = (uint8_t) (raw & 0xff); + byte[1] = (uint8_t) ((raw & 0xff00) >> 8); + byte[2] = (uint8_t) ((raw & 0xff0000) >> 16); + byte[3] = (uint8_t) ((raw & 0xff000000) >> 24); + #else +
byte[0] = (uint8_t) ((raw & 0xff000000) >> 24); + byte[1] = (uint8_t) ((raw & 0xff0000) >> 16); + byte[2] = (uint8_t) ((raw & 0xff00) >> 8); + byte[3] = (uint8_t) (raw & 0xff); + #endif // _is_F_endian_big + } + } + else { + memset(byte, 0, sizeof(uint8_t) * width); + } + + fl_print_format("%r%[%r%]%r", data->file.stream, data->prepend, data->valid_not, character, data->valid_not, data->append); + } +#endif // _di_utf8_print_raw_bytecode_ + +#ifndef _di_utf8_print_raw_codepoint_ + void utf8_print_raw_codepoint(utf8_data_t * const data, const f_string_static_t raw) { + + if (data->main->parameters.array[utf8_parameter_strip_invalid_e].result == f_console_result_found_e) return; + if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_found_e) return; + + fl_print_format("%r%[%r%]%r", data->file.stream, data->prepend, data->valid_not, raw, data->valid_not, data->append); + } +#endif // _di_utf8_print_raw_codepoint_ + +#ifndef _di_utf8_print_raw_combining_or_width_ + void utf8_print_raw_combining_or_width(utf8_data_t * const data, const uint8_t width) { + + if (data->main->parameters.array[utf8_parameter_strip_invalid_e].result == f_console_result_found_e) return; + if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_found_e) return; + + f_status_t status = F_none; + + if (data->mode & utf8_mode_to_combining_d) { + fl_print_format("%r%[%r%]%r", data->file.stream, data->prepend, data->valid_not, utf8_string_unknown_s, data->valid_not, data->append); + } + else if (data->mode & utf8_mode_to_width_d) { + const f_string_static_t *character = 0; + + switch (width) { + case 1: + character = &utf8_string_width_1_s; + break; + + case 2: + character = &utf8_string_width_2_s; + break; + + case 3: + character = &utf8_string_width_3_s; + break; + + case 4: + character = &utf8_string_width_4_s; + break; + + default: + character = &utf8_string_width_0_s; + } + + fl_print_format("%r%[%r%]%r", data->file.stream, 
data->prepend, data->valid_not, *character, data->valid_not, data->append); + } + } +#endif // _di_utf8_print_raw_combining_or_width_ + #ifndef _di_utf8_print_section_header_file_ void utf8_print_section_header_file(utf8_data_t * const data, const f_string_static_t name) { diff --git a/level_3/utf8/c/private-print.h b/level_3/utf8/c/private-print.h index df35aa0..c6f794d 100644 --- a/level_3/utf8/c/private-print.h +++ b/level_3/utf8/c/private-print.h @@ -152,6 +152,46 @@ extern "C" { #endif // _di_utf8_print_error_parameter_file_to_too_many_ /** + * Print the raw character data (binary / bytecode). + * + * @param data + * The program data. + * @param raw + * The raw string in integer format. + * @param width + * The width the raw character represents (a value inclusively from 1 to 4). + */ +#ifndef _di_utf8_print_raw_bytecode_ + extern void utf8_print_raw_bytecode(utf8_data_t * const data, const f_utf_char_t raw, const uint8_t width) F_attribute_visibility_internal_d; +#endif // _di_utf8_print_raw_bytecode_ + +/** + * Print the raw character data (codepoint). + * + * @param data + * The program data. + * @param raw + * The raw string already in codepoint format. + */ +#ifndef _di_utf8_print_raw_codepoint_ + extern void utf8_print_raw_codepoint(utf8_data_t * const data, const f_string_static_t raw) F_attribute_visibility_internal_d; +#endif // _di_utf8_print_raw_codepoint_ + +/** + * Print the width or combining state for a raw character. + * + * @param data + * The program data. + * @param width + * The pre-calculated width. + * + * @see utf8_print_width() + */ +#ifndef _di_utf8_print_raw_combining_or_width_ + extern void utf8_print_raw_combining_or_width(utf8_data_t * const data, const uint8_t width) F_attribute_visibility_internal_d; +#endif // _di_utf8_print_raw_combining_or_width_ + +/** * Print the input file section header.
* * @param data diff --git a/level_3/utf8/c/private-utf8.c b/level_3/utf8/c/private-utf8.c index 3535e21..e159450 100644 --- a/level_3/utf8/c/private-utf8.c +++ b/level_3/utf8/c/private-utf8.c @@ -66,10 +66,13 @@ extern "C" { } // for if (F_status_is_error_not(status) && !(data->mode & utf8_mode_from_bytecode_d)) { - if (mode_codepoint != utf8_codepoint_mode_ready_e && mode_codepoint != utf8_codepoint_mode_end_e && mode_codepoint != utf8_codepoint_mode_bad_end_e) { + if (mode_codepoint != utf8_codepoint_mode_ready_e && mode_codepoint != utf8_codepoint_mode_end_e && mode_codepoint != utf8_codepoint_mode_bad_end_e && mode_codepoint != utf8_codepoint_mode_raw_end_e) { if (mode_codepoint == utf8_codepoint_mode_number_e) { mode_codepoint = utf8_codepoint_mode_end_e; } + else if (mode_codepoint == utf8_codepoint_mode_raw_number_e) { + mode_codepoint = utf8_codepoint_mode_raw_end_e; + } else { mode_codepoint = utf8_codepoint_mode_bad_end_e; valid = F_false; @@ -77,7 +80,12 @@ extern "C" { text.used = 0; - status = utf8_convert_codepoint(data, text, &mode_codepoint); + if (mode_codepoint == utf8_codepoint_mode_raw_end_e) { + status = utf8_convert_raw(data, text, &mode_codepoint); + } + else { + status = utf8_convert_codepoint(data, text, &mode_codepoint); + } } } diff --git a/level_3/utf8/c/private-utf8_bytecode.c b/level_3/utf8/c/private-utf8_bytecode.c index 419aef0..6628d39 100644 --- a/level_3/utf8/c/private-utf8_bytecode.c +++ b/level_3/utf8/c/private-utf8_bytecode.c @@ -73,7 +73,7 @@ extern "C" { f_array_length_t j = 0; f_char_t block_character[4] = { 0, 0, 0, 0 }; - f_string_static_t character = macro_f_string_static_t_initialize2(block_character, 4); + f_string_static_t character = macro_f_string_static_t_initialize(block_character, 0, 4); do { status = f_file_read_block(file, &data->buffer); diff --git a/level_3/utf8/c/private-utf8_codepoint.c b/level_3/utf8/c/private-utf8_codepoint.c index 689b718..d0fdd6c 100644 --- a/level_3/utf8/c/private-utf8_codepoint.c
+++ b/level_3/utf8/c/private-utf8_codepoint.c @@ -26,10 +26,6 @@ extern "C" { } // for } - if (!(*mode == utf8_codepoint_mode_end_e || *mode == utf8_codepoint_mode_bad_end_e)) { - return F_none; - } - if (*mode == utf8_codepoint_mode_end_e) { uint32_t codepoint = 0; @@ -53,8 +49,8 @@ extern "C" { } else if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_none_e) { if (data->mode & utf8_mode_to_bytecode_d) { - f_char_t byte[5] = { 0, 0, 0, 0, 0 }; - f_string_static_t character = macro_f_string_static_t_initialize2(byte, 5); + f_char_t byte[4] = { 0, 0, 0, 0 }; + f_string_static_t character = macro_f_string_static_t_initialize(byte, 0, 4); status = f_utf_unicode_from(codepoint, 4, &character.string); @@ -76,11 +72,14 @@ extern "C" { } } } - else { + else if (*mode == utf8_codepoint_mode_bad_end_e) { status = F_none; utf8_print_character_invalid(data, character); } + else { + return F_none; + } *mode = utf8_codepoint_mode_ready_e; data->text.used = 0; @@ -93,6 +92,90 @@ extern "C" { } #endif // _di_utf8_convert_codepoint_ +#ifndef _di_utf8_convert_raw_ + f_status_t utf8_convert_raw(utf8_data_t * const data, const f_string_static_t character, uint8_t *mode) { + + f_status_t status = F_none; + bool valid_not = F_false; + + if (*mode != utf8_codepoint_mode_raw_end_e) { + if (data->text.used + character.used >= data->text.size) { + status = f_string_dynamic_increase_by(utf8_default_allocation_step_d, &data->text); + if (F_status_is_error(status)) return status; + } + + for (f_array_length_t i = 0; i < character.used; ++i) { + data->text.string[data->text.used++] = character.string[i]; + } // for + } + + if (*mode == utf8_codepoint_mode_raw_end_e) { + f_utf_char_t raw = 0; + + { + f_number_unsigned_t number = 0; + + status = fl_conversion_dynamic_to_number_unsigned(data->text, &number); + + raw = (f_utf_char_t) number; + } + + if (F_status_is_error(status)) { + status = F_status_set_fine(status); + + if (status == F_number || status == 
F_utf_not || status == F_complete_not_utf || status == F_utf_fragment || status == F_number_decimal || status == F_number_negative || status == F_number_positive || status == F_number_overflow) { + valid_not = F_true; + + utf8_print_character_invalid(data, character); + } + else { + status = F_status_set_error(status); + + utf8_print_error_decode(data, status, character); + + return status; + } + } + else if (data->main->parameters.array[utf8_parameter_verify_e].result == f_console_result_none_e) { + + // The width actually includes the leading '0x', which is not part of the width of the digit in binary form. + uint8_t width = data->text.used > 1 ? (data->text.used - 2) / 2 : 0; + + if ((data->text.used - 2) % 2) { + ++width; + } + + if (data->mode & utf8_mode_to_bytecode_d) { + utf8_print_raw_bytecode(data, raw, width); + } + else if (data->mode & utf8_mode_to_codepoint_d) { + utf8_print_raw_codepoint(data, data->text); + } + else { + utf8_print_raw_combining_or_width(data, width); + } + } + } + else if (*mode == utf8_codepoint_mode_bad_end_e) { + status = F_none; + + utf8_print_character_invalid(data, character); + } + else { + return F_none; + } + + *mode = utf8_codepoint_mode_ready_e; + data->text.used = 0; + + if (valid_not || F_status_is_error(status)) { + return F_valid_not; + } + + return status; + } +#endif // _di_utf8_convert_raw_ + #ifndef _di_utf8_detect_codepoint_ f_status_t utf8_detect_codepoint(utf8_data_t * const data, const f_string_static_t character, uint8_t *mode) { @@ -106,6 +189,9 @@ extern "C" { if (character.string[0] == f_string_ascii_u_s.string[0] || character.string[0] == f_string_ascii_U_s.string[0] || character.string[0] == f_string_ascii_plus_s.string[0]) { // Do nothing. } + else if (character.string[0] == f_string_ascii_0_s.string[0] || character.string[0] == f_string_ascii_x_s.string[0] || character.string[0] == f_string_ascii_X_s.string[0]) { + // Do nothing. 
+ } else if (character.string[0] == f_string_ascii_space_s.string[0]) { status = F_space; } @@ -171,6 +257,10 @@ extern "C" { *mode = utf8_codepoint_mode_begin_e; data->text.used = 0; } + else if (character.string[0] == f_string_ascii_0_s.string[0]) { + *mode = utf8_codepoint_mode_raw_begin_e; + data->text.used = 0; + } else { *mode = utf8_codepoint_mode_bad_e; } @@ -183,6 +273,19 @@ extern "C" { *mode = utf8_codepoint_mode_bad_e; } } + else if (*mode == utf8_codepoint_mode_raw_begin_e) { + if (character.string[0] == f_string_ascii_x_s.string[0] || character.string[0] == f_string_ascii_X_s.string[0]) { + *mode = utf8_codepoint_mode_raw_number_e; + } + else { + *mode = utf8_codepoint_mode_bad_e; + } + } + else if (*mode == utf8_codepoint_mode_raw_number_e) { + if (status == F_space) { + *mode = utf8_codepoint_mode_raw_end_e; + } + } else if (*mode == utf8_codepoint_mode_number_e) { if (status == F_space) { *mode = utf8_codepoint_mode_end_e; @@ -205,8 +308,8 @@ extern "C" { f_array_length_t i = 0; f_array_length_t j = 0; - f_char_t block[4] = { 0, 0, 0, 0 }; - f_string_static_t character = macro_f_string_static_t_initialize2(block, 4); + f_char_t block[5] = { 0, 0, 0, 0, 0 }; + f_string_static_t character = macro_f_string_static_t_initialize(block, 0, 4); do { status = f_file_read_block(file, &data->buffer); @@ -247,7 +350,15 @@ extern "C" { status = utf8_detect_codepoint(data, character, &mode_codepoint); if (F_status_is_fine(status) && status != F_next) { - status = utf8_convert_codepoint(data, character, &mode_codepoint); + if (mode_codepoint == utf8_codepoint_mode_raw_begin_e || mode_codepoint == utf8_codepoint_mode_raw_number_e || mode_codepoint == utf8_codepoint_mode_raw_end_e) { + status = utf8_convert_raw(data, character, &mode_codepoint); + + // Raw mode represents an invalid Unicode sequence. 
+ valid = F_false; + } + else { + status = utf8_convert_codepoint(data, character, &mode_codepoint); + } } } @@ -276,7 +387,15 @@ extern "C" { status = utf8_detect_codepoint(data, character, &mode_codepoint); if (F_status_is_fine(status) && status != F_next) { - status = utf8_convert_codepoint(data, character, &mode_codepoint); + if (mode_codepoint == utf8_codepoint_mode_raw_begin_e || mode_codepoint == utf8_codepoint_mode_raw_number_e || mode_codepoint == utf8_codepoint_mode_raw_end_e) { + status = utf8_convert_raw(data, character, &mode_codepoint); + + // Raw mode represents an invalid Unicode sequence. + valid = F_false; + } + else { + status = utf8_convert_codepoint(data, character, &mode_codepoint); + } } } diff --git a/level_3/utf8/c/private-utf8_codepoint.h b/level_3/utf8/c/private-utf8_codepoint.h index 184b9fb..199895e 100644 --- a/level_3/utf8/c/private-utf8_codepoint.h +++ b/level_3/utf8/c/private-utf8_codepoint.h @@ -15,7 +15,7 @@ extern "C" { /** * Convert a codepoint character representation to another format. * - * This automatically determines the output format and is also handles the verify process. + * This automatically determines the output format and also handles the verify process. * * @param data * The program data. @@ -28,15 +28,43 @@ extern "C" { * F_none on success. * F_utf_not on invalid UTF-8 (which is still "success" when verifying). * - * F_utf_not (with error bit) if not verifying and + * F_utf_not (with error bit) if not verifying and the Unicode value is invalid. * * Errors (with error bit) from: f_utf_unicode_to() + * + * @see f_utf_unicode_to() */ #ifndef _di_utf8_convert_codepoint_ extern f_status_t utf8_convert_codepoint(utf8_data_t * const data, const f_string_static_t character, uint8_t *mode) F_attribute_visibility_internal_d; #endif // _di_utf8_convert_codepoint_ /** + * Convert a raw character representation (hex-digit) to another format. 
+ * + * This automatically determines the output format and also handles the verify process. + * + * @param data + * The program data. + * @param character + * The single character currently being processed. + * @param mode + * The codepoint mode the text is currently in. + * + * @return + * F_none on success. + * F_valid_not on invalid raw (which is still "success" when verifying). + * + * F_valid_not (with error bit) if not verifying and the raw value is invalid. + * + * Errors (with error bit) from: fl_conversion_dynamic_to_number_unsigned() + * + * @see fl_conversion_dynamic_to_number_unsigned() + */ +#ifndef _di_utf8_convert_raw_ + extern f_status_t utf8_convert_raw(utf8_data_t * const data, const f_string_static_t character, uint8_t *mode) F_attribute_visibility_internal_d; +#endif // _di_utf8_convert_raw_ + +/** * Detect a codepoint character. * * @param data -- 1.8.3.1