From: Kevin Day Date: Wed, 18 Sep 2019 00:09:44 +0000 (-0500) Subject: Update: finish implementing f_utf_character_is_valid() and related UTF-8 changes X-Git-Tag: 0.5.0~407 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=0b913142dff7ac3a3a968b1b138acc7af9f1e5d6;p=fll Update: finish implementing f_utf_character_is_valid() and related UTF-8 changes UTF-8 BOM is actually not a thing but only a suggestion, see RFC 3629. I consider it a very bad practice now that I have learned that it is also the zero width space. Get rid of the UTF-8 BOM support, it is a bad idea and is not to be supported by this project. The referenced rfc also provides an easier way to view the valid ranges that my previous resources (such as wikipedia). This helped me finish this function. Updated byte_dump to better utilize this and to remove no longer necessary code. Fix an accidental incorrect "invalid detection" check use before calling f_utf_character_is_valid() in byte_dump. Explicitly print a "." or " " for UTF-8 control characters (ASCII control characters are already handled before this point so it is safe to call f_utf_character_is_control()). --- diff --git a/level_0/f_fss/c/fss.h b/level_0/f_fss/c/fss.h index 96d75cf..696386d 100644 --- a/level_0/f_fss/c/fss.h +++ b/level_0/f_fss/c/fss.h @@ -84,10 +84,14 @@ extern "C" { * Max size of a FSS header. * * The standard FSS character header is: "# fss-0000\n\0", which is 10 characters + newline + EOS = 12. - * This includes the possibility of the first character being a UTF-8 BOM (which is 3-bytes long, which results in a max size of 15 bytes). + * + * The UTF-8 BOM is not supported because it is not an actual thing (only a suggestion according to rfc3629). + * The UTF-8 BOM sequence is actually a different character called "zero-width non breaking space". + * Because it already has use, this project considers the existence of UTF-8 BOM bad practice in all cases. 
+ * After all, if your file begins with a "zero-width non breaking space", you may want to actually use a space and not a "BOM". */ #ifndef _di_f_fss_max_header_length_ - #define f_fss_max_header_length 15 + #define f_fss_max_header_length 12 #endif // _di_f_fss_max_header_length_ /** diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 2ed0389..045f6c4 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -20,22 +20,6 @@ extern "C" { } #endif // _di_f_utf_character_is_ -#ifndef _di_f_utf_character_is_bom_ - f_return_status f_utf_character_is_bom(const f_utf_character character) { - if (character == f_utf_character_mask_bom) { - return f_true; - } - - unsigned short width = f_macro_utf_character_width_is(character); - - if (width == 1) { - return f_status_is_error(f_invalid_utf); - } - - return f_false; - } -#endif // _di_f_utf_character_is_bom_ - #ifndef _di_f_utf_character_is_control_ f_return_status f_utf_character_is_control(const f_utf_character character) { unsigned short width = f_macro_utf_character_width_is(character); @@ -171,6 +155,11 @@ extern "C" { if (width == 2) { uint8_t byte = f_macro_utf_character_to_char_2(character); + if (byte_first < 0xc2 || byte_first > 0xdf) { + // Valid UTF-8-2 range = %xC2-DF UTF8-tail. + return f_false; + } + if (byte_first == 0xcd) { // Greek and Coptic: U+0378, U+0379. if (byte == 0xb8 || byte == 0xb9) { @@ -261,7 +250,21 @@ extern "C" { else if (width == 3) { uint16_t bytes = (uint16_t) ((character & 0x00ffff00) >> 4); + if (byte_first < 0xe0 || byte_first > 0xef) { + // Valid UTF-8-3 ranges = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + return f_false; + } + if (byte_first == 0xe0) { + { + uint8_t byte_second = f_macro_utf_character_to_char_2(character); + + // Valid UTF-8-3 ranges = %xE0 %xA0-BF UTF8-tail + if (byte_second < 0xa0 || byte_second > 0xbf) { + return f_false; + } + } + // Arabic Extended-A: U+08B5, U+08BE to U+08D3. 
if (bytes == 0xa2b5 || bytes >= 0xa2be && bytes <= 0xa393) { return f_false; @@ -1299,6 +1302,15 @@ extern "C" { } } else if (byte_first == 0xed) { + { + uint8_t byte_second = f_macro_utf_character_to_char_2(character); + + // Valid UTF-8-3 ranges = %xED %x80-9F UTF8-tail + if (byte_second < 0x80 || byte_second > 0x9f) { + return f_false; + } + } + // Hangul Jamo Extended-B: U+D7C7 to U+D7CA. if (bytes >= 0x9f87 && bytes <= 0x9f8a) { return f_false; @@ -2583,36 +2595,6 @@ extern "C" { } #endif // _di_f_utf_is_ -#ifndef _di_f_utf_is_bom_ - f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width) { - #ifndef _di_level_0_parameter_checking_ - if (max_width < 1) return f_status_set_error(f_invalid_parameter); - #endif // _di_level_0_parameter_checking_ - - unsigned short width = f_macro_utf_byte_width_is(*character); - - if (width == 0) { - return f_false; - } - - if (width == 1) { - return f_status_is_error(f_incomplete_utf); - } - - if (width > max_width) { - return f_status_set_error(f_maybe); - } - - if (width == 3) { - if (!memcmp(character, f_utf_bom, width)) { - return f_true; - } - } - - return f_false; - } -#endif // _di_f_utf_is_bom_ - #ifndef _di_f_utf_is_control_ f_return_status f_utf_is_control(const f_string character, const unsigned short max_width) { #ifndef _di_level_0_parameter_checking_ diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 54a2bd8..ba1b765 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -48,20 +48,6 @@ extern "C" { #endif /** - * Define the UTF-8 BOM. - * - * The BOM designates that a string is in UTF-8. - * The BOM must be checked for when processing strings. - * - * In many cases, this should be removed such that only one exists in some string block. - */ -#ifndef _di_f_utf_bom_ - #define f_utf_bom_length 3 - - const static int8_t f_utf_bom[f_utf_bom_length] = { 0xef, 0xbb, 0xbf }; // 1110 1111, 1011 1011, 1011 1111 -#endif // _di_f_utf_bom_ - -/** * Define the UTF-8 bytes. 
* * The bytes are for checking a single 8-bit character value (specifically, checking the first bits). @@ -126,8 +112,6 @@ extern "C" { #ifndef _di_f_utf_character_ typedef uint32_t f_utf_character; - #define f_utf_character_mask_bom 0xefbbbf00 // 1110 1111, 1011 1011, 1011 1111, 0000 0000 - #define f_utf_character_mask_byte_1 0xff000000 // 1111 1111, 0000 0000, 0000 0000, 0000 0000 #define f_utf_character_mask_byte_2 0xffff0000 // 1111 1111, 1111 1111, 0000 0000, 0000 0000 #define f_utf_character_mask_byte_3 0xffffff00 // 1111 1111, 1111 1111, 1111 1111, 0000 0000 @@ -531,25 +515,8 @@ extern "C" { #endif // _di_f_utf_character_is_ /** - * Check to see if the entire byte block of the character is a UTF-8 BOM. - * - * @param character - * The character to validate. - * - * @return - * f_true if a UTF-8 BOM. - * f_false if not a UTF-8 BOM. - * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. - */ -#ifndef _di_f_utf_character_is_bom_ - extern f_return_status f_utf_character_is_bom(const f_utf_character character); -#endif // _di_f_utf_character_is_bom_ - -/** * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. * - * The UTF-8 BOM is considered a control character. - * * @param character * The character to validate. * @@ -727,31 +694,8 @@ extern "C" { #endif // _di_f_utf_is_ /** - * Check to see if the entire byte block of the character is a UTF-8 BOM. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by max_width. - * @param max_width - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * f_true if a UTF-8 whitespace or substitute. - * f_false if not a UTF-8 whitespace or substitute. - * f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough. - * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. 
- * f_invalid_parameter (with error bit) if a parameter is invalid. - */ -#ifndef _di_f_utf_is_bom_ - extern f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width); -#endif // _di_f_utf_is_bom_ - -/** * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. * - * The UTF-8 BOM is considered a control character. - * * @param character * The character to validate. * There must be enough space allocated to compare against, as limited by max_width. @@ -799,6 +743,17 @@ extern "C" { * * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid(). * + * According to rfc3629, the valid octet sequences for UTF-8 are: + * UTF8-octets = *( UTF8-char ) + * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 + * UTF8-1 = %x00-7F + * UTF8-2 = %xC2-DF UTF8-tail + * UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + * %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + * %xF4 %x80-8F 2( UTF8-tail ) + * UTF8-tail = %x80-BF + * * @param character * The character to validate. * There must be enough space allocated to compare against, as limited by max_width. diff --git a/level_1/fl_console/c/console.h b/level_1/fl_console/c/console.h index eb2334a..93834ce 100644 --- a/level_1/fl_console/c/console.h +++ b/level_1/fl_console/c/console.h @@ -46,8 +46,6 @@ extern "C" { * - May not be grouped and must be separated from any subsequent parameter, such as: "tar extract create file". * - Additional parameters must immediately follow the parameter, such as "tar extract file file.tar.gz create". * - * The UTF-8 BOM is not allowed in the parameters. - * * @param arguments * The parameters passed to the process. 
* @param parameters diff --git a/level_1/fl_fss/c/fss.c b/level_1/fl_fss/c/fss.c index 0388567..0a00d63 100644 --- a/level_1/fl_fss/c/fss.c +++ b/level_1/fl_fss/c/fss.c @@ -47,27 +47,8 @@ extern "C" { register f_string_length i = 0; - // A single UTF-8 BOM is allowed to exist before the valid FSS identifier. - if (buffer.used > 3) { - f_status status = f_utf_is_bom(buffer.string, 4); - - if (f_status_is_error(status)) { - return f_status_set_error(fl_fss_no_header); - } - - if (status == f_true) { - i = f_utf_bom_length; - - if (buffer.used < 10 + f_utf_bom_length) { - return fl_fss_no_header; - } - } - else if (buffer.used < 10) { - // "# fss-0000" without UTF-8 BOM is always 10 characters. - return fl_fss_no_header; - } - } - else { + if (buffer.used < 10) { + // "# fss-0000" is always 10 characters. return fl_fss_no_header; } diff --git a/level_1/fl_fss/c/fss.h b/level_1/fl_fss/c/fss.h index ccf0bc5..24b07e1 100644 --- a/level_1/fl_fss/c/fss.h +++ b/level_1/fl_fss/c/fss.h @@ -60,8 +60,6 @@ extern "C" { /** * Identify FSS type from a buffered string. * - * The UTF-8 BOM is allowed to exist as the first character of the FSS header, but not anywhere else. - * * @param buffer * The string to process. * @param header diff --git a/level_1/fl_string/c/string.h b/level_1/fl_string/c/string.h index b62862c..dc97956 100644 --- a/level_1/fl_string/c/string.h +++ b/level_1/fl_string/c/string.h @@ -49,8 +49,6 @@ extern "C" { /** * Increment buffer location until a graph character (including UTF-8) or an EOL is matched. * - * This will ignore the UTF-8 BOM. - * * @param buffer * The buffer to traverse. * @param location @@ -76,8 +74,6 @@ extern "C" { /** * Increment buffer location until a non-graph character (including UTF-8) or an EOL is matched. * - * This will ignore the UTF-8 BOM. - * * @param buffer * The buffer to traverse. 
* @param location diff --git a/level_1/fl_utf/c/utf.h b/level_1/fl_utf/c/utf.h index 854a3df..41b7d80 100644 --- a/level_1/fl_utf/c/utf.h +++ b/level_1/fl_utf/c/utf.h @@ -28,8 +28,6 @@ extern "C" { /** * Increment buffer location until a graph character or an EOL is matched. * - * This will ignore the UTF-8 BOM. - * * @param buffer * The buffer to traverse. * @param location @@ -51,8 +49,6 @@ extern "C" { /** * Increment buffer location until a non-graph character or an EOL is matched. * - * This will ignore the UTF-8 BOM. - * * @param buffer * The buffer to traverse. * @param location diff --git a/level_3/byte_dump/c/byte_dump.h b/level_3/byte_dump/c/byte_dump.h index 523d8a4..c44ac40 100644 --- a/level_3/byte_dump/c/byte_dump.h +++ b/level_3/byte_dump/c/byte_dump.h @@ -109,7 +109,6 @@ extern "C" { #define byte_dump_sequence_tab "␉" #define byte_dump_sequence_tab_vertical "␋" #define byte_dump_sequence_unit_separator "␟" - #define byte_dump_sequence_utf_bom "␂" #define byte_dump_character_wall "|" #define byte_dump_character_placeholder "␣" // other likely choices: (substitute form 1: '␚', substitute form 2: '␦'). diff --git a/level_3/byte_dump/c/private-byte_dump.c b/level_3/byte_dump/c/private-byte_dump.c index 354b70d..a47716d 100644 --- a/level_3/byte_dump/c/private-byte_dump.c +++ b/level_3/byte_dump/c/private-byte_dump.c @@ -71,21 +71,6 @@ found_invalid_utf = f_true; invalid[character_current] = 1; } - // UTF-8 characters with width of 4 cannot have any characters of 0x8f as the first byte. - else if (width_utf == 4 && byte == 0x8f) { - found_invalid_utf = f_true; - invalid[character_current] = width_utf; - } - // These are not defined in Unicode, and so are considered invalid in UTF-8, regardless of their width_utf. - else if (byte >= 0xf5) { - found_invalid_utf = f_true; - invalid[character_current] = width_utf; - } - // Sequences that start with 0xc1 are invalid because UTF-8 does not support overlong ASCII. 
- else if (byte == 0xc1) { - found_invalid_utf = f_true; - invalid[character_current] = width_utf; - } // Process the UTF-8 character. else if (width_utf > 1) { position++; @@ -129,21 +114,14 @@ } // At this point: an ASCII character is collected, the entire UTF-8 character sequence is collected, or an invalid UTF-8 was processed. - if (!found_invalid_utf && width_utf > 1) { + if (!invalid[character_current] && width_utf > 1) { if (f_utf_character_is_valid(characters.string[character_current]) == f_false) { found_invalid_utf = f_true; invalid[character_current] = width_utf; } - // @todo: remove this check once implemented in f_utf_character_is_valid(). - // Handle special case invalid situations, 0xc0 and 0xc1 are used for two-byte encoding of a 7-bit ASCII but are considered invalid by UTF-8. - // Does not include 0xc0 0x80 because this is considered a overlong NULL in UTF-8, which is a valid NULL. - else if (width_utf == 2 && characters.string[character_current] > 0xc0800000 && characters.string[character_current] <= 0xc0ff0000) { - found_invalid_utf = f_true; - invalid[character_current] = width_utf; - } } - if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 1, &previous_bytes, &previous_invalid, &column, &row)) { + if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 1, &previous_bytes, &previous_invalid, &column, &row) == f_true) { character_reset = f_true; } @@ -153,12 +131,12 @@ } if (width_utf > 2) { - if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 3, &previous_bytes, &previous_invalid, &column, &row)) { + if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 3, &previous_bytes, &previous_invalid, &column, &row) == f_true) { character_reset = f_true; } if (width_utf > 3) { - if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 4, &previous_bytes, &previous_invalid, &column, &row)) { + if (byte_dump_print_character_fragment(data, 
characters, invalid, width_utf, 4, &previous_bytes, &previous_invalid, &column, &row) == f_true) { character_reset = f_true; } } @@ -525,19 +503,19 @@ else if (f_utf_character_is_whitespace(characters.string[i]) == f_true) { printf("%s", byte_dump_sequence_space); } - else if (width_utf == 2 && characters.string[i] == 0xc0800000) { - // This is an "Overlong Null" and is a valid NULL character. - printf("%s", byte_dump_sequence_null); + else if (f_utf_character_is_control(characters.string[i]) == f_true) { + // print a space (or '.') for control characters. + if (data.presentation == byte_dump_presentation_classic) { + printf("."); + } + else { + printf(" "); + } } else if (width_utf == 2 && characters.string[i] == 0xd89d0000) { // U+061C printf(" "); } - else if (width_utf == 2 && characters.string[i] >= 0xc2800000 && characters.string[i] <= 0xc29f0000) { - // Use space to represent unprintable Latin-1 supplement control codes. - // 0xc2a00000 happens to be the non-breaking space character and is explicitly handled above. - printf(" "); - } else if (width_utf == 3 && characters.string[i] >= 0xefbfb000 && characters.string[i] <= 0xefbfbc00) { // Use space to represent Specials codes. // 0xefbfbd00 is excluded because it is printable (and is the "Replacement Character" code). @@ -563,9 +541,6 @@ // Use space to represent Supplemental Private Use Area-B codes. printf(" "); } - else if (characters.string[i] == f_utf_character_mask_bom) { - fl_color_print(f_standard_output, data.context.warning, data.context.reset, "%s", byte_dump_sequence_utf_bom); - } else if (width_utf == 1) { // print invalid placeholder for invalid UTF-8 widths. if (invalid[i]) { @@ -593,6 +568,7 @@ } } + // @todo: implement a function in f_utf, such as f_utf_is_combining(), for detecting these combining characters. // print a space for combining characters to combine into, thereby allowing it to be safely and readably displayed. 
if (width_utf == 2 && characters.string[i] >= 0xdea60000 && characters.string[i] <= 0xdeb00000) { // Thana combining codes: U+07A6 to U+07B0. diff --git a/level_3/byte_dump/c/private-byte_dump.h b/level_3/byte_dump/c/private-byte_dump.h index e0c8f06..0be072c 100644 --- a/level_3/byte_dump/c/private-byte_dump.h +++ b/level_3/byte_dump/c/private-byte_dump.h @@ -60,6 +60,10 @@ extern "C" { * The current row that the character is being printed on. * When the max width is reached byte_dump_print_text() is called and this value is incremented. * + * @return + * f_true is returned to designate that a reset is needed. + * f_false is returned to designate that a reset is not needed. + * * @see byte_dump_print_text() */ #ifndef _di_byte_dump_print_character_fragment_