From 2ee43d6a407f5a968f49f536beb5313251c67451 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sat, 11 Dec 2021 16:06:37 -0600 Subject: [PATCH] Feature: Support outputting width or combining state of characters. The width is reported as one of: '0', '1', or '2'. A '?' is reported for unknown or invalid characters. The private use area is considered valid but unknown. The combining state is reported as either 'C' or 'N'. The 'N' can be considered either 'Not' or 'No' as the meaning is synonymous in this case. The to_combining and to_width parameters may be used together. Now that I know how this is to be implemented, remove unneeded functions. Remove some extra newlines printed in the help. --- level_3/utf8/c/private-print.c | 114 ++++++++++++++++++++++---------- level_3/utf8/c/private-print.h | 56 ++++++++-------- level_3/utf8/c/private-utf8_binary.c | 5 +- level_3/utf8/c/private-utf8_codepoint.c | 5 +- level_3/utf8/c/utf8.c | 68 ++++++++++++++++--- level_3/utf8/c/utf8.h | 38 +++++++++-- 6 files changed, 209 insertions(+), 77 deletions(-) diff --git a/level_3/utf8/c/private-print.c b/level_3/utf8/c/private-print.c index f362bf6..e697d8c 100644 --- a/level_3/utf8/c/private-print.c +++ b/level_3/utf8/c/private-print.c @@ -13,37 +13,33 @@ extern "C" { } #endif // _di_utf8_print_binary_ -#ifndef _di_utf8_print_character_ - void utf8_print_character(utf8_data_t * const data, const f_string_static_t character, const f_color_set_t set) { +#ifndef _di_utf8_print_character_invalid_ + void utf8_print_character_invalid(utf8_data_t * const data, const f_string_static_t character) { + + if (data->main->parameters[utf8_parameter_strip_invalid].result == f_console_result_found) return; + if (data->main->parameters[utf8_parameter_verify].result == f_console_result_found) return; if (!character.used) return; - if (data->mode & utf8_mode_to_binary_d) { - fl_print_format("%s%[%r%]%s", data->file.stream, data->prepend, set, character, set, data->append); + if ((data->mode & utf8_mode_to_combining_d) || 
(data->mode & utf8_mode_to_width_d)) { + utf8_print_combining_or_width(data, character); + } + else if (data->mode & utf8_mode_to_binary_d) { + fl_print_format("%s%[%r%]%s", data->file.stream, data->prepend, data->valid_not, character, data->valid_not, data->append); } else if (data->mode & utf8_mode_from_codepoint_d) { - fl_print_format("%s%[%Q%]%s", data->file.stream, data->prepend, set, character, set, data->append); + fl_print_format("%s%[%Q%]%s", data->file.stream, data->prepend, data->valid_not, character, data->valid_not, data->append); } else { - fl_print_format("%s%[0x", data->file.stream, data->prepend, set); + fl_print_format("%s%[0x", data->file.stream, data->prepend, data->valid_not); for (uint8_t i = 0; i < character.used; ++i) { fl_print_format("%02_uii", data->file.stream, (uint8_t) character.string[i]); } // for - fl_print_format("%]%s", data->file.stream, set, data->append); + fl_print_format("%]%s", data->file.stream, data->valid_not, data->append); } } -#endif // _di_utf8_print_character_ - -#ifndef _di_utf8_print_character_invalid_ - void utf8_print_character_invalid(utf8_data_t * const data, const f_string_static_t character) { - - if (data->main->parameters[utf8_parameter_strip_invalid].result == f_console_result_found) return; - if (data->main->parameters[utf8_parameter_verify].result == f_console_result_found) return; - - utf8_print_character(data, character, data->valid_not); - } #endif // _di_utf8_print_character_invalid_ #ifndef _di_utf8_print_codepoint_ @@ -61,6 +57,40 @@ extern "C" { } #endif // _di_utf8_print_codepoint_ +#ifndef _di_utf8_print_combining_or_width_ + void utf8_print_combining_or_width(utf8_data_t * const data, const f_string_static_t character) { + + f_status_t status = F_none; + + if (data->mode & utf8_mode_to_combining_d) { + status = f_utf_is_combining(character.string, character.used); + + if (status == F_true) { + fl_print_format("%s%s%s", data->file.stream, data->prepend, utf8_string_combining_is_s, data->append); 
+ } + else if (status == F_false) { + status = f_utf_is_private(character.string, character.used); + + if (status == F_true) { + fl_print_format("%s%s%s", data->file.stream, data->prepend, utf8_string_unknown_s, data->append); + } + else if (data->mode & utf8_mode_to_width_d) { + utf8_print_width(data, character); + } + else { + fl_print_format("%s%s%s", data->file.stream, data->prepend, utf8_string_combining_not_s, data->append); + } + } + else if (data->main->parameters[utf8_parameter_strip_invalid].result == f_console_result_none && data->main->parameters[utf8_parameter_verify].result == f_console_result_none) { + fl_print_format("%s%[%s%]%s", data->file.stream, data->prepend, data->valid_not, utf8_string_unknown_s, data->valid_not, data->append); + } + } + else if (data->mode & utf8_mode_to_width_d) { + utf8_print_width(data, character); + } + } +#endif // _di_utf8_print_combining_or_width_ + #ifndef _di_utf8_print_error_decode_ void utf8_print_error_decode(utf8_data_t * const data, const f_status_t status, const f_string_static_t character) { @@ -137,23 +167,6 @@ extern "C" { } #endif // _di_utf8_print_error_no_value_ -#ifndef _di_utf8_print_error_parameter_conflict_ - void utf8_print_error_parameter_conflict(utf8_data_t * const data, const f_string_t first, const f_string_t second) { - - if (data->main->error.verbosity == f_console_verbosity_quiet) return; - - flockfile(data->main->output.to.stream); - - fl_print_format("%c%[%sThe parameter '%]", data->main->error.to.stream, f_string_eol_s[0], data->main->error.context, data->main->error.prefix, data->main->error.context); - fl_print_format("%[%s%S%]", data->main->error.to.stream, data->main->error.notable, f_console_symbol_long_enable_s, first, data->main->error.notable); - fl_print_format("%[' cannot be used with the parameter '%]", data->main->error.to.stream, data->main->error.context, data->main->error.context); - fl_print_format("%[%s%S%]", data->main->error.to.stream, data->main->error.notable, 
f_console_symbol_long_enable_s, second, data->main->error.notable); - fl_print_format("%['.%]%c", data->main->error.to.stream, data->main->error.context, data->main->error.context, f_string_eol_s[0]); - - funlockfile(data->main->output.to.stream); - } -#endif // _di_utf8_print_error_parameter_conflict_ - #ifndef _di_utf8_print_error_parameter_file_name_empty_ void utf8_print_error_parameter_file_name_empty(utf8_data_t * const data, const f_array_length_t index) { @@ -257,6 +270,39 @@ extern "C" { } #endif // _di_utf8_print_signal_received_ +#ifndef _di_utf8_print_width_ + void utf8_print_width(utf8_data_t * const data, const f_string_static_t character) { + + f_status_t status = f_utf_is_wide(character.string, character.used); + + if (status == F_true) { + fl_print_format("%s%s%s", data->file.stream, data->prepend, utf8_string_width_2_s, data->append); + + return; + } + + if (status == F_false) { + status = f_utf_is_graph(character.string, character.used); + + if (status == F_true) { + fl_print_format("%s%s%s", data->file.stream, data->prepend, utf8_string_width_1_s, data->append); + + return; + } + + if (status == F_false) { + fl_print_format("%s%s%s", data->file.stream, data->prepend, utf8_string_width_0_s, data->append); + + return; + } + } + + if (data->main->parameters[utf8_parameter_strip_invalid].result == f_console_result_none && data->main->parameters[utf8_parameter_verify].result == f_console_result_none) { + fl_print_format("%s%[%s%]%s", data->file.stream, data->prepend, data->valid_not, utf8_string_unknown_s, data->valid_not, data->append); + } + } +#endif // _di_utf8_print_width_ + #ifdef __cplusplus } // extern "C" #endif diff --git a/level_3/utf8/c/private-print.h b/level_3/utf8/c/private-print.h index 18f462d..38170e1 100644 --- a/level_3/utf8/c/private-print.h +++ b/level_3/utf8/c/private-print.h @@ -26,20 +26,6 @@ extern "C" { #endif // _di_utf8_print_binary_ /** - * Print the character either as a Unicode codeblock or as a binary. 
- * - * @param data - * The program data. - * @param character - * The character block to print. - * @param set - * The output context set. - */ -#ifndef _di_utf8_print_character_ - extern void utf8_print_character(utf8_data_t * const data, const f_string_static_t character, const f_color_set_t set) F_attribute_visibility_internal_d; -#endif // _di_utf8_print_character_ - -/** * Print an invalid character either as a Unicode codeblock or as a binary. * * This handles whether or not the invalid character should be printed or not based on program parameters. @@ -48,6 +34,8 @@ extern "C" { * The program data. * @param character * The character block to print. + * + * @see utf8_print_combining_or_width() */ #ifndef _di_utf8_print_character_invalid_ extern void utf8_print_character_invalid(utf8_data_t * const data, const f_string_static_t character) F_attribute_visibility_internal_d; @@ -67,6 +55,20 @@ extern "C" { #endif // _di_utf8_print_codepoint_ /** + * Print the width or combining state of the given character. + * + * @param data + * The program data. + * @param character + * The character block whose width or combining state is to be printed. + * + * @see utf8_print_width() + */ +#ifndef _di_utf8_print_combining_or_width_ + extern void utf8_print_combining_or_width(utf8_data_t * const data, const f_string_static_t character) F_attribute_visibility_internal_d; +#endif // _di_utf8_print_combining_or_width_ + +/** * Print error message when attempt to decode the character failed. * * @param data @@ -113,20 +115,6 @@ extern "C" { #endif // _di_utf8_print_error_no_value_ /** - * Print error message for two parameters not being allowed to be used together. - * - * @param data - * The program data. - * @param first - * The long parameter name for the first parameter. - * @param second - * The long parameter name for the second parameter. 
- */ -#ifndef _di_utf8_print_error_parameter_conflict_ - extern void utf8_print_error_parameter_conflict(utf8_data_t * const data, const f_string_t first, const f_string_t second) F_attribute_visibility_internal_d; -#endif // _di_utf8_print_error_parameter_conflict_ - -/** * Print error message for when the file parameter is an empty string. * * @param data @@ -209,6 +197,18 @@ extern "C" { extern void utf8_print_signal_received(utf8_data_t * const data, const f_status_t signal) F_attribute_visibility_internal_d; #endif // _di_utf8_print_signal_received_ +/** + * Print the width of the given character. + * + * @param data + * The program data. + * @param character + * The character block whose width is to be printed. + */ +#ifndef _di_utf8_print_width_ + extern void utf8_print_width(utf8_data_t * const data, const f_string_static_t character) F_attribute_visibility_internal_d; +#endif // _di_utf8_print_width_ + #ifdef __cplusplus } // extern "C" #endif diff --git a/level_3/utf8/c/private-utf8_binary.c b/level_3/utf8/c/private-utf8_binary.c index 26e5edb..8ef75fd 100644 --- a/level_3/utf8/c/private-utf8_binary.c +++ b/level_3/utf8/c/private-utf8_binary.c @@ -44,9 +44,12 @@ extern "C" { if (data->mode & utf8_mode_to_binary_d) { utf8_print_binary(data, character); } - else { + else if (data->mode & utf8_mode_to_codepoint_d) { utf8_print_codepoint(data, codepoint); } + else { + utf8_print_combining_or_width(data, character); + } } if (valid_not || F_status_is_error(status)) { diff --git a/level_3/utf8/c/private-utf8_codepoint.c b/level_3/utf8/c/private-utf8_codepoint.c index 601bc57..83b714f 100644 --- a/level_3/utf8/c/private-utf8_codepoint.c +++ b/level_3/utf8/c/private-utf8_codepoint.c @@ -68,9 +68,12 @@ extern "C" { utf8_print_binary(data, character); } } - else { + else if (data->mode & utf8_mode_to_codepoint_d) { utf8_print_codepoint(data, codepoint); } + else { + utf8_print_combining_or_width(data, character); + } } } else { diff --git a/level_3/utf8/c/utf8.c 
b/level_3/utf8/c/utf8.c index 60b118c..d56607a 100644 --- a/level_3/utf8/c/utf8.c +++ b/level_3/utf8/c/utf8.c @@ -36,7 +36,9 @@ extern "C" { fll_program_print_help_option(file, context, utf8_short_to_binary_s, utf8_long_to_binary_s, f_console_symbol_short_enable_s, f_console_symbol_long_enable_s, " The output format is binary (character data)."); fll_program_print_help_option(file, context, utf8_short_to_codepoint_s, utf8_long_to_codepoint_s, f_console_symbol_short_enable_s, f_console_symbol_long_enable_s, "The output format is codepoint (such as U+0000)."); + fll_program_print_help_option(file, context, utf8_short_to_combining_s, utf8_long_to_combining_s, f_console_symbol_short_enable_s, f_console_symbol_long_enable_s, "The output format is to print whether or not character is combining or not."); fll_program_print_help_option(file, context, utf8_short_to_file_s, utf8_long_to_file_s, f_console_symbol_short_enable_s, f_console_symbol_long_enable_s, " Use the given file as the output destination."); + fll_program_print_help_option(file, context, utf8_short_to_width_s, utf8_long_to_width_s, f_console_symbol_short_enable_s, f_console_symbol_long_enable_s, " The output format is to print the width of a character (either 0, 1, or 2)."); f_print_character(f_string_eol_s[0], file.stream); @@ -45,16 +47,18 @@ extern "C" { fll_program_print_help_option(file, context, utf8_short_strip_invalid_s, utf8_long_strip_invalid_s, f_console_symbol_short_enable_s, f_console_symbol_long_enable_s, "Strip invalid Unicode characters (do not print invalid sequences)."); fll_program_print_help_option(file, context, utf8_short_verify_s, utf8_long_verify_s, f_console_symbol_short_enable_s, f_console_symbol_long_enable_s, " Only perform verification of valid sequences."); - f_print_character(f_string_eol_s[0], file.stream); - f_print_character(f_string_eol_s[0], file.stream); - fll_program_print_help_usage(file, context, utf8_program_name_s, "filename(s)"); fl_print_format(" The default 
behavior is to assume the expected input is binary from the command line to be output to the screen as codepoints.%c%c", file.stream, f_string_eol_s[0], f_string_eol_s[0]); fl_print_format(" Multiple input sources are allowed but only a single output destination is allowed.%c%c", file.stream, f_string_eol_s[0], f_string_eol_s[0]); - fl_print_format(" When using the parameter '%[%s%s%]', no data is printed and 0 is returned if valid or 1 is returned if invalid.%c", file.stream, context.set.notable, f_console_symbol_long_enable_s, utf8_long_verify_s, context.set.notable, f_string_eol_s[0]); + fl_print_format(" When using the parameter '%[%s%s%]', no data is printed and 0 is returned if valid or 1 is returned if invalid.%c%c", file.stream, context.set.notable, f_console_symbol_long_enable_s, utf8_long_verify_s, context.set.notable, f_string_eol_s[0], f_string_eol_s[0]); + + fl_print_format(" When using the parameter '%[%s%s%]' with the parameter ", file.stream, context.set.notable, f_console_symbol_long_enable_s, utf8_long_to_combining_s, context.set.notable); + fl_print_format("'%[%s%s%]', the ", file.stream, context.set.notable, f_console_symbol_long_enable_s, utf8_long_to_width_s, context.set.notable); + fl_print_format("'%[%s%]' character is printed to represent the combining and the digits are used to represent widths.%c", file.stream, context.set.notable, utf8_string_combining_is_s, context.set.notable, f_string_eol_s[0]); + fl_print_format(" The combining characters should be considered 1-width by themselves or 0-width when combined.%c%c", file.stream, f_string_eol_s[0], f_string_eol_s[0]); funlockfile(file.stream); @@ -151,7 +155,7 @@ extern "C" { // Identify and prioritize from mode parameters. 
{ - f_console_parameter_id_t ids[4] = { utf8_parameter_from_binary, utf8_parameter_from_codepoint }; + f_console_parameter_id_t ids[2] = { utf8_parameter_from_binary, utf8_parameter_from_codepoint }; f_console_parameter_id_t choice = 0; const f_console_parameter_ids_t choices = macro_f_console_parameter_ids_t_initialize(ids, 2); @@ -184,9 +188,9 @@ extern "C" { // Identify and prioritize to mode parameters. { - f_console_parameter_id_t ids[4] = { utf8_parameter_to_binary, utf8_parameter_to_codepoint }; + f_console_parameter_id_t ids[4] = { utf8_parameter_to_binary, utf8_parameter_to_codepoint, utf8_parameter_to_combining, utf8_parameter_to_width }; f_console_parameter_id_t choice = 0; - const f_console_parameter_ids_t choices = macro_f_console_parameter_ids_t_initialize(ids, 2); + const f_console_parameter_ids_t choices = macro_f_console_parameter_ids_t_initialize(ids, 4); status = f_console_parameter_prioritize_right(parameters, choices, &choice); @@ -204,6 +208,14 @@ extern "C" { data.mode -= utf8_mode_to_codepoint_d; } + if (data.mode & utf8_mode_to_combining_d) { + data.mode -= utf8_mode_to_combining_d; + } + + if (data.mode & utf8_mode_to_width_d) { + data.mode -= utf8_mode_to_width_d; + } + data.mode |= utf8_mode_to_binary_d; } else if (choice == utf8_parameter_to_codepoint) { @@ -211,8 +223,48 @@ extern "C" { data.mode -= utf8_mode_to_binary_d; } + if (data.mode & utf8_mode_to_combining_d) { + data.mode -= utf8_mode_to_combining_d; + } + + if (data.mode & utf8_mode_to_width_d) { + data.mode -= utf8_mode_to_width_d; + } + data.mode |= utf8_mode_to_codepoint_d; } + else if (choice == utf8_parameter_to_combining) { + if (data.mode & utf8_mode_to_binary_d) { + data.mode -= utf8_mode_to_binary_d; + } + + if (data.mode & utf8_mode_to_codepoint_d) { + data.mode -= utf8_mode_to_codepoint_d; + } + + // --to_width may be specified with --to_combining. 
+ if (main->parameters[utf8_parameter_to_width].result == f_console_result_found) { + data.mode |= utf8_mode_to_width_d; + } + + data.mode |= utf8_mode_to_combining_d; + } + else if (choice == utf8_parameter_to_width) { + if (data.mode & utf8_mode_to_binary_d) { + data.mode -= utf8_mode_to_binary_d; + } + + if (data.mode & utf8_mode_to_codepoint_d) { + data.mode -= utf8_mode_to_codepoint_d; + } + + // --to_width may be specified with --to_combining. + if (main->parameters[utf8_parameter_to_combining].result == f_console_result_found) { + data.mode |= utf8_mode_to_combining_d; + } + + data.mode |= utf8_mode_to_width_d; + } } status = F_none; @@ -314,7 +366,7 @@ extern "C" { status = F_status_set_error(F_parameter); } - if (data.mode & utf8_mode_to_codepoint_d) { + if (!(data.mode & utf8_mode_to_binary_d)) { if (main->parameters[utf8_parameter_separate].result == f_console_result_found || main->parameters[utf8_parameter_headers].result == f_console_result_found) { data.prepend = " "; data.append = f_string_eol_s; diff --git a/level_3/utf8/c/utf8.h b/level_3/utf8/c/utf8.h index 9829038..24cc0d1 100644 --- a/level_3/utf8/c/utf8.h +++ b/level_3/utf8/c/utf8.h @@ -98,6 +98,14 @@ extern "C" { #define utf8_string_verified_valid_s "Verified Valid" #define utf8_string_verified_valid_not_s "Verified Invalid" + #define utf8_string_combining_is_s "C" + #define utf8_string_combining_not_s "N" + #define utf8_string_unknown_s "?" 
+ + #define utf8_string_width_0_s "0" + #define utf8_string_width_1_s "1" + #define utf8_string_width_2_s "2" + #define utf8_string_from_s_length 4 #define utf8_string_to_s_length 2 @@ -107,6 +115,14 @@ extern "C" { #define utf8_string_verified_valid_s_length 14 #define utf8_string_verified_valid_not_s_length 16 + #define utf8_string_combining_is_s_length 1 + #define utf8_string_combining_not_s_length 1 + #define utf8_string_unknown_s_length 1 + + #define utf8_string_width_0_s_length 1 + #define utf8_string_width_1_s_length 1 + #define utf8_string_width_2_s_length 1 + #define utf8_character_valid_not_s "�" #define utf8_short_from_binary_s "b" @@ -120,7 +136,9 @@ extern "C" { #define utf8_short_to_binary_s "B" #define utf8_short_to_codepoint_s "C" + #define utf8_short_to_combining_s "O" #define utf8_short_to_file_s "F" + #define utf8_short_to_width_s "W" #define utf8_long_from_binary_s "from_binary" #define utf8_long_from_codepoint_s "from_codepoint" @@ -133,7 +151,9 @@ extern "C" { #define utf8_long_to_binary_s "to_binary" #define utf8_long_to_codepoint_s "to_codepoint" + #define utf8_long_to_combining_s "to_combining" #define utf8_long_to_file_s "to_file" + #define utf8_long_to_width_s "to_width" enum { utf8_parameter_help, @@ -156,7 +176,9 @@ extern "C" { utf8_parameter_to_binary, utf8_parameter_to_codepoint, + utf8_parameter_to_combining, utf8_parameter_to_file, + utf8_parameter_to_width, utf8_parameter_verify, }; @@ -180,29 +202,35 @@ extern "C" { f_console_parameter_t_initialize(utf8_short_strip_invalid_s, utf8_long_strip_invalid_s, 0, 0, f_console_type_normal), \ f_console_parameter_t_initialize(utf8_short_to_binary_s, utf8_long_to_binary_s, 0, 0, f_console_type_normal), \ f_console_parameter_t_initialize(utf8_short_to_codepoint_s, utf8_long_to_codepoint_s, 0, 0, f_console_type_normal), \ + f_console_parameter_t_initialize(utf8_short_to_combining_s, utf8_long_to_combining_s, 0, 0, f_console_type_normal), \ 
f_console_parameter_t_initialize(utf8_short_to_file_s, utf8_long_to_file_s, 0, 1, f_console_type_normal), \ + f_console_parameter_t_initialize(utf8_short_to_width_s, utf8_long_to_width_s, 0, 0, f_console_type_normal), \ f_console_parameter_t_initialize(utf8_short_verify_s, utf8_long_verify_s, 0, 0, f_console_type_normal), \ } - #define utf8_total_parameters_d 19 + #define utf8_total_parameters_d 21 #endif // _di_utf8_defines_ /** * Modes used to designate how to the input and output are to be processed. * * utf8_mode_from_*: - * - binary: The input source is binary. - * - codepoint: The input source is codepoint (U+XXXX or U+XXXXXX). + * - binary: The input format is binary. + * - codepoint: The input format is codepoint (U+XXXX or U+XXXXXX). * * utf8_mode_to_*: - * - binary: The outout destination is binary. - * - codepoint: The outout destination is codepoint (U+XXXX or U+XXXXXX). + * - binary: The outout format is binary. + * - codepoint: The outout format is codepoint (U+XXXX or U+XXXXXX). + * - combining: The outout format is whether or not character is combining (may be used with "width"). + * - width: The outout format is how wide the character is (may be used with "combining"). */ #ifndef _di_utf8_modes_ #define utf8_mode_from_binary_d 0x1 #define utf8_mode_from_codepoint_d 0x2 #define utf8_mode_to_binary_d 0x4 #define utf8_mode_to_codepoint_d 0x8 + #define utf8_mode_to_combining_d 0x10 + #define utf8_mode_to_width_d 0x20 #endif // _di_utf8_modes_ /** -- 1.8.3.1