From: Kevin Day Date: Sat, 31 Aug 2019 20:59:55 +0000 (-0500) Subject: Update: handle invalid UTF-8 fragments X-Git-Tag: 0.5.0~465 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=397823e1b7ff07f3891763f2f414c6259684a4d5;p=fll Update: handle invalid UTF-8 fragments A 1-width UTF-8 character (that is not a valid ASCII character) is used to designate part of a complete UTF-8 character block (aka: 1-width UTF-8 characters are fragments). Because this fragment cannot exist in isolation, it must be handled as either an invalid or an incomplete UTF-8 fragment. Provide new status codes for handling incomplete UTF-8 fragments. Update appropriate functions to detect and handle these invalid or incomplete fragments. --- diff --git a/level_0/f_status/c/status.h b/level_0/f_status/c/status.h index 0b85466..9120699 100644 --- a/level_0/f_status/c/status.h +++ b/level_0/f_status/c/status.h @@ -221,6 +221,9 @@ enum { f_unterminated_group_on_eol, f_unterminated_group_on_eos, f_unterminated_group_on_stop, + f_incomplete_utf, + f_incomplete_utf_on_eof, + f_incomplete_utf_on_eol, f_incomplete_utf_on_eos, f_incomplete_utf_on_stop, #endif // _di_f_status_buffers_ diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index a814581..e4e268e 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -10,11 +10,14 @@ extern "C" { if (max_width < 1) return f_status_set_error(f_invalid_parameter); #endif // _di_level_0_parameter_checking_ - f_u_short width = f_macro_utf_byte_width(*character); + f_u_short width = f_macro_utf_byte_width_is(*character); - if (width == 1) { + if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width > max_width) { return f_status_set_error(f_maybe); @@ -41,6 +44,14 @@ extern "C" { if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } + + // Do not operate on UTF-8 fragments that are not the first byte of the character. 
+ if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width > max_width) { return f_status_set_error(f_maybe); @@ -66,11 +77,14 @@ extern "C" { if (max_width < 1) return f_status_set_error(f_invalid_parameter); #endif // _di_level_0_parameter_checking_ - f_u_short width = f_macro_utf_byte_width(*character); + f_u_short width = f_macro_utf_byte_width_is(*character); - if (width == 1) { + if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width > max_width) { return f_status_set_error(f_maybe); @@ -198,11 +212,14 @@ extern "C" { if (max_width < 1) return f_status_set_error(f_invalid_parameter); #endif // _di_level_0_parameter_checking_ - f_u_short width = f_macro_utf_byte_width(*character); + f_u_short width = f_macro_utf_byte_width_is(*character); - if (width == 1) { + if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width > max_width) { return f_status_set_error(f_maybe); @@ -246,11 +263,14 @@ extern "C" { if (max_width < 1) return f_status_set_error(f_invalid_parameter); #endif // _di_level_0_parameter_checking_ - f_u_short width = f_macro_utf_byte_width(*character); + f_u_short width = f_macro_utf_byte_width_is(*character); - if (width == 1) { + if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width > max_width) { return f_status_set_error(f_maybe); @@ -369,6 +389,9 @@ extern "C" { if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } // for now, just assume that any non-whitespace, non-substitute utf-8 character is a graph. 
@@ -386,11 +409,14 @@ extern "C" { #ifndef _di_f_utf_is_space_character_ f_return_status f_utf_is_space_character(const f_utf_character character) { - f_u_short width = f_macro_utf_character_width(character); + f_u_short width = f_macro_utf_character_width_is(character); - if (width == 1) { + if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width == 2) { char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) }; @@ -514,11 +540,14 @@ extern "C" { #ifndef _di_f_utf_is_substitute_character_ f_return_status f_utf_is_substitute_character(const f_utf_character character) { - f_u_short width = f_macro_utf_character_width(character); + f_u_short width = f_macro_utf_character_width_is(character); - if (width == 1) { + if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width == 2) { char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) }; @@ -558,11 +587,19 @@ extern "C" { #ifndef _di_f_utf_is_whitespace_character_ f_return_status f_utf_is_whitespace_character(const f_utf_character character) { - f_u_short width = f_macro_utf_character_width(character); + f_u_short width = f_macro_utf_character_width_is(character); - if (width == 1) { + if (width == 0) { return f_false; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } + + // Do not operate on UTF-8 fragments that are not the first byte of the character. 
+ if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width == 2) { char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) }; @@ -671,12 +708,15 @@ extern "C" { if (utf_character == 0) return f_status_set_error(f_invalid_parameter); #endif // _di_level_0_parameter_checking_ - f_u_short width = f_macro_utf_byte_width(*character); + f_u_short width = f_macro_utf_byte_width_is(*character); - if (width == 1) { + if (width == 0) { *utf_character = f_macro_utf_character_from_char_1(character[0]); return f_none; } + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } if (width > max_width) { return f_status_set_error(f_failure); diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 7017951..d5eb5f2 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -69,6 +69,9 @@ extern "C" { * * The f_utf_byte_is method will return non-zero if the character is a UTF-8 character of any width. * + * The f_utf_byte_1 is specifically used only on UTF-8 fragments. + * For example, with the 2-byte-wide UTF-8 character '110x xxxx 10yy yyyy', the 8-bit block '10yy yyyy' would be a fragment. + * * The f_macro_utf_byte_is_* macros are used to determine a width of the character (either 1, 2, 3, or 4, respectively). * * The f_macro_utf_byte_width macro determines a width of the character. @@ -245,6 +248,7 @@ extern "C" { * f_true if a UTF-8 whitespace or substitute. * f_false if not a UTF-8 whitespace or substitute. * f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_bom_ @@ -267,6 +271,7 @@ extern "C" { * f_true if a UTF-8 graph. * f_false if not a UTF-8 graph. * f_maybe (with error bit) if this could be a graph but width is not long enough. 
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_graph_ @@ -289,6 +294,7 @@ extern "C" { * f_true if a UTF-8 whitespace or substitute. * f_false if not a UTF-8 whitespace or substitute. * f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_space_ @@ -311,6 +317,7 @@ extern "C" { * f_true if a UTF-8 substitute. * f_false if not a UTF-8 substitute. * f_maybe (with error bit) if this could be a substitute but width is not long enough. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_substitute_ @@ -333,6 +340,7 @@ extern "C" { * f_true if a UTF-8 whitespace. * f_false if not a UTF-8 whitespace. * f_maybe (with error bit) if this could be a whitespace but width is not long enough. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_whitespace_ @@ -349,6 +357,7 @@ extern "C" { * f_true if a UTF-8 whitespace or substitute. * f_false if not a UTF-8 whitespace or substitute. * f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_bom_character_ @@ -366,6 +375,7 @@ extern "C" { * @return * f_true if a UTF-8 graph. * f_false if not a UTF-8 graph. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. 
* f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_graph_character_ @@ -383,6 +393,7 @@ extern "C" { * @return * f_true if a UTF-8 whitespace or substitute. * f_false if not a UTF-8 whitespace or substitute. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_space_character_ @@ -400,6 +411,7 @@ extern "C" { * @return * f_true if a UTF-8 substitute. * f_false if not a UTF-8 substitute. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_substitute_character_ @@ -417,6 +429,7 @@ extern "C" { * @return * f_true if a UTF-8 whitespace. * f_false if not a UTF-8 whitespace. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_whitespace_character_ @@ -441,6 +454,7 @@ extern "C" { * @return * f_none if conversion was successful. * f_failure (with error bit) if width is not long enough to convert. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. * f_invalid_parameter (with error bit) if a parameter is invalid. 
*/ #ifndef _di_f_utf_char_to_character_ diff --git a/level_1/fl_status/c/status.c b/level_1/fl_status/c/status.c index 6bee91f..3d71563 100644 --- a/level_1/fl_status/c/status.c +++ b/level_1/fl_status/c/status.c @@ -486,6 +486,15 @@ extern "C" { case f_unterminated_group_on_stop: *string = fl_status_string_unterminated_group_on_stop; break; + case f_incomplete_utf: + *string = fl_status_string_incomplete_utf; + break; + case f_incomplete_utf_on_eof: + *string = fl_status_string_incomplete_utf_on_eof; + break; + case f_incomplete_utf_on_eol: + *string = fl_status_string_incomplete_utf_on_eol; + break; case f_incomplete_utf_on_eos: *string = fl_status_string_incomplete_utf_on_eos; break; diff --git a/level_1/fl_status/c/status.h b/level_1/fl_status/c/status.h index 904cf81..65894e1 100644 --- a/level_1/fl_status/c/status.h +++ b/level_1/fl_status/c/status.h @@ -489,6 +489,15 @@ extern "C" { #define fl_status_string_unterminated_group_on_stop "f_unterminated_group_on_stop" #define fl_status_string_unterminated_group_on_stop_length 29 + #define fl_status_string_incomplete_utf "f_incomplete_utf" + #define fl_status_string_incomplete_utf_length 17 + + #define fl_status_string_incomplete_utf_on_eof "f_incomplete_utf_on_eof" + #define fl_status_string_incomplete_utf_on_eof_length 24 + + #define fl_status_string_incomplete_utf_on_eol "f_incomplete_utf_on_eol" + #define fl_status_string_incomplete_utf_on_eol_length 24 + #define fl_status_string_incomplete_utf_on_eos "f_incomplete_utf_on_eos" #define fl_status_string_incomplete_utf_on_eos_length 24 diff --git a/level_1/fl_strings/c/strings.c b/level_1/fl_strings/c/strings.c index 1a39b82..58a6fa0 100644 --- a/level_1/fl_strings/c/strings.c +++ b/level_1/fl_strings/c/strings.c @@ -62,9 +62,16 @@ extern "C" { while (buffer.string[location->start] == placeholder || (!isgraph(buffer.string[location->start]) && (status = f_utf_is_graph(buffer.string + location->start, max_width)) == f_false)) { if (buffer.string[location->start] 
== f_eol) return f_none_on_eol; - width = f_macro_utf_byte_width(buffer.string[location->start]); + width = f_macro_utf_byte_width_is(buffer.string[location->start]); - if (width > 1) { + if (width == 0) { + width = 1; + } + // Do not operate on UTF-8 fragments that are not the first byte of the character. + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } + else { if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos); if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop); } @@ -111,9 +118,16 @@ extern "C" { while (buffer.string[location->start] == placeholder || (isgraph(buffer.string[location->start]) && (status = f_utf_is_space(buffer.string + location->start, max_width)) == f_false)) { if (buffer.string[location->start] == f_eol) return f_none_on_eol; - width = f_macro_utf_byte_width(buffer.string[location->start]); + width = f_macro_utf_byte_width_is(buffer.string[location->start]); - if (width > 1) { + if (width == 0) { + width = 1; + } + // Do not operate on UTF-8 fragments that are not the first byte of the character. + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } + else { if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos); if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop); } @@ -186,15 +200,21 @@ extern "C" { max_width = buffer.used - location->start; } - width = f_macro_utf_byte_width(buffer.string[location->start]); + width = f_macro_utf_byte_width_is(buffer.string[location->start]); + + if (width == 0) { + width = 1; - if (width == 1) { if (buffer.string[location->start] == f_eol) return f_none_on_eol; if (seek_width == width) { if (buffer.string[location->start] == seek_to_this) return f_none; } } + // Do not operate on UTF-8 fragments that are not the first byte of the character. 
+ else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } else { if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos); if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop); @@ -268,13 +288,19 @@ extern "C" { max_width = buffer.used - location->start; } - width = f_macro_utf_byte_width(buffer.string[location->start]); + width = f_macro_utf_byte_width_is(buffer.string[location->start]); + + if (width == 0) { + width = 1; - if (width == 1) { if (seek_width == width) { if (buffer.string[location->start] == seek_to_this) return f_none; } } + // Do not operate on UTF-8 fragments that are not the first byte of the character. + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } else { if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos); if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop); diff --git a/level_1/fl_strings/c/strings.h b/level_1/fl_strings/c/strings.h index 4566b69..7e82cbd 100644 --- a/level_1/fl_strings/c/strings.h +++ b/level_1/fl_strings/c/strings.h @@ -38,7 +38,6 @@ extern "C" { * @return * f_none on success. * f_no_data if nothing to rip, no allocations or reallocations are performed. - * f_incomplete_utf_on_eos if end of sting is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. * f_allocation_error (with error bit) on memory allocation error. * f_reallocation_error (with error bit) on memory reallocation error. @@ -63,6 +62,7 @@ extern "C" { * f_none on success. * f_none_on_eol on success, but stopped at EOL. * f_none_on_eos on success, but stopped at end of buffer. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. 
* f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed. * f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. @@ -90,6 +90,7 @@ extern "C" { * f_none_on_eol on success, but stopped at EOL. * f_none_on_eos on success, but stopped at end of buffer. * f_none_on_stop on success, but stopped stop location. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed. * f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. @@ -139,6 +140,8 @@ extern "C" { * f_none on success. * f_none_on_eol on success, but stopped at EOL. * f_none_on_eos on success, but stopped at end of buffer. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed. * f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. @@ -164,6 +167,7 @@ extern "C" { * f_none on success. * f_none_on_eos on success, but stopped at end of buffer. * f_none_on_stop on success, but stopped stop location. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed. 
* f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. @@ -188,6 +192,8 @@ extern "C" { * @return * f_none on success. * f_none_on_eos on success, but stopped at end of buffer. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed. * f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. diff --git a/level_2/fll_status/c/status.c b/level_2/fll_status/c/status.c index 016cc52..d62e6df 100644 --- a/level_2/fll_status/c/status.c +++ b/level_2/fll_status/c/status.c @@ -783,6 +783,21 @@ extern "C" { return f_none; } + if (fl_compare_strings(string, fl_status_string_incomplete_utf, length, fl_status_string_incomplete_utf_length) == f_equal_to) { + *code = f_incomplete_utf; + return f_none; + } + + if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eof, length, fl_status_string_incomplete_utf_on_eof_length) == f_equal_to) { + *code = f_incomplete_utf_on_eof; + return f_none; + } + + if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eol, length, fl_status_string_incomplete_utf_on_eol_length) == f_equal_to) { + *code = f_incomplete_utf_on_eol; + return f_none; + } + if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eos, length, fl_status_string_incomplete_utf_on_eos_length) == f_equal_to) { *code = f_incomplete_utf_on_eos; return f_none;