From: Kevin Day Date: Sat, 4 Jun 2022 03:40:29 +0000 (-0500) Subject: Update: Handle more cases of combining characters following white space. X-Git-Tag: 0.5.10~76 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=1ccb15dbbd246358408a84cbb20b39f3e7c201ff;p=fll Update: Handle more cases of combining characters following white space. The way UTF-8 combining characters work continues to give me a headache. This is a pass at resolving the case, but the logic is unfortunate. I have a strong feeling that there is a regression from this. At this point I feel unit tests are necessary. Perform the combining check after checking white space. Make the code more flexible by not returning on any invalid character. An invalid character cannot be a white space, so treat it as not a white space and continue. The resulting code should be more robust. --- diff --git a/level_1/fl_string/c/private-string.c b/level_1/fl_string/c/private-string.c index 5c4f459..14f0b95 100644 --- a/level_1/fl_string/c/private-string.c +++ b/level_1/fl_string/c/private-string.c @@ -118,6 +118,7 @@ extern "C" { uint8_t width = 0; f_array_length_t width_max = 0; + f_array_length_t previous = 0; f_status_t status = F_none; @@ -137,19 +138,36 @@ extern "C" { } width_max = (stop1 - i1) + 1; + status = f_utf_is_whitespace(string1 + i1, width_max); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } + if (F_status_set_fine(status) == F_parameter) return status; - return status; + break; } if (status == F_false) break; width = macro_f_utf_byte_width(string1[i1]); + width_max = (stop1 - i1) + 1; + + status = f_utf_is_combining(string1 + i1, width_max); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + + break; + } + + // This is a combining character, so the previous character is no longer considered a space. 
+ if (status == F_true) { + i1 = previous; + + break; + } + + previous = i1; } // for // Skip past leading whitespace in string2. @@ -168,19 +186,36 @@ extern "C" { } width_max = (stop2 - i2) + 1; + status = f_utf_is_whitespace(string2 + i2, width_max); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } + if (F_status_set_fine(status) == F_parameter) return status; - return status; + break; } if (status == F_false) break; width = macro_f_utf_byte_width(string2[i2]); + width_max = (stop2 - i2) + 1; + + status = f_utf_is_combining(string2 + i2, width_max); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + + break; + } + + // This is a combining character, so the previous character is no longer considered a space. + if (status == F_true) { + i2 = previous; + + break; + } + + previous = i2; } // for f_array_length_t last1 = i1; @@ -211,21 +246,42 @@ extern "C" { } width_max = (stop1 - j) + 1; + status = f_utf_is_whitespace(string1 + j, width_max); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } - - return status; + if (F_status_set_fine(status) == F_parameter) return status; } width = macro_f_utf_byte_width(string1[j]); if (status == F_false) { + status = f_utf_is_combining(string1 + j, width_max); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + } + + // This is a combining character, so the previous character is no longer considered a space. 
+ if (status == F_true) { + if (last1 != previous) { + size1 -= macro_f_utf_byte_width(string1[last1]); + last1 = previous; + } + } + else { + last1 = j; + size1 += width; + previous = j; + } + } + else if (F_status_is_error(status)) { last1 = j; size1 += width; + previous = j; + } + else { + previous = j; } } // for @@ -242,10 +298,12 @@ extern "C" { while (ej < except2.used && except2.array[ej] < j) ++ej; if (ej < except2.used && except2.array[ej] == j) { width = 1; + continue; } width_max = (stop2 - j) + 1; + status = f_utf_is_whitespace(string2 + j, width_max); if (F_status_is_error(status)) { @@ -339,55 +397,77 @@ extern "C" { f_array_length_t i1 = offset1; f_array_length_t i2 = offset2; - - uint8_t width = 0; - f_array_length_t width_max = 0; + f_array_length_t previous = 0; f_status_t status = F_none; // Skip past leading whitespace in string1. - for (; i1 < stop1; i1 += width) { + for (; i1 < stop1; i1 += macro_f_utf_byte_width(string1[i1])) { // Skip past NULL in string1. while (i1 < stop1 && !string1[i1]) ++i1; if (i1 == stop1) break; - width_max = (stop1 - i1) + 1; - status = f_utf_is_whitespace(string1 + i1, width_max); + status = f_utf_is_whitespace(string1 + i1, (stop1 - i1) + 1); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } + if (F_status_set_fine(status) == F_parameter) return status; - return status; + break; } if (status == F_false) break; - width = macro_f_utf_byte_width(string1[i1]); + status = f_utf_is_combining(string1 + i1, (stop1 - i1) + 1); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + + break; + } + + // This is a combining character, so the previous character is no longer considered a space. + if (status == F_true) { + i1 = previous; + + break; + } + + previous = i1; } // for // Skip past leading whitespace in string2. 
- for (; i2 < stop2; i2 += width) { + for (; i2 < stop2; i2 += macro_f_utf_byte_width(string2[i2])) { // Skip past NULL in string2. while (i2 < stop2 && !string2[i2]) ++i2; if (i2 == stop2) break; - width_max = (stop2 - i2) + 1; - status = f_utf_is_whitespace(string2 + i2, width_max); + status = f_utf_is_whitespace(string2 + i2, (stop2 - i2) + 1); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } + if (F_status_set_fine(status) == F_parameter) return status; - return status; + break; } if (status == F_false) break; - width = macro_f_utf_byte_width(string2[i2]); + status = f_utf_is_combining(string2 + i2, (stop2 - i2) + 1); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + + break; + } + + // This is a combining character, so the previous character is no longer considered a space. + if (status == F_true) { + i2 = previous; + + break; + } + + previous = i2; } // for f_array_length_t last1 = i1; @@ -398,55 +478,97 @@ extern "C" { f_array_length_t size1 = 0; f_array_length_t size2 = 0; + previous = i1; + // Determine where the last non-whitespace is in string1. - for (f_array_length_t j = i1; j < stop1; j += width) { + for (f_array_length_t j = i1; j < stop1; j += macro_f_utf_byte_width(string1[j])) { // Skip past NULL in string1. 
while (j < stop1 && !string1[j]) ++j; if (j == stop1) break; - width_max = (stop1 - j) + 1; - status = f_utf_is_whitespace(string1 + j, width_max); + status = f_utf_is_whitespace(string1 + j, (stop1 - j) + 1); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } + if (F_status_set_fine(status) == F_parameter) return status; - return status; + break; } - width = macro_f_utf_byte_width(string1[j]); - if (status == F_false) { + status = f_utf_is_combining(string1 + j, (stop1 - j) + 1); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + } + + // This is a combining character, so the previous character is no longer considered a space. + if (status == F_true) { + if (last1 != previous) { + size1 -= macro_f_utf_byte_width(string1[last1]); + last1 = previous; + } + } + else { + last1 = j; + size1 += macro_f_utf_byte_width(string1[last1]); + previous = j; + } + } + else if (F_status_is_error(status)) { last1 = j; - ++size1; + size1 += macro_f_utf_byte_width(string1[last1]); + previous = j; + } + else { + previous = j; } } // for + previous = i2; + // Determine where the last non-whitespace is in string2. - for (f_array_length_t j = i2; j < stop2; j += width) { + for (f_array_length_t j = i2; j < stop2; j += macro_f_utf_byte_width(string2[j])) { // Skip past NULL in string2. 
while (j < stop2 && !string2[j]) ++j; if (j == stop2) break; - width_max = (stop2 - j) + 1; - status = f_utf_is_whitespace(string2 + j, width_max); + status = f_utf_is_whitespace(string2 + j, (stop2 - j) + 1); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } + if (F_status_set_fine(status) == F_parameter) return status; - return status; + break; } - width = macro_f_utf_byte_width(string2[j]); - if (status == F_false) { + status = f_utf_is_combining(string2 + j, (stop1 - j) + 1); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + } + + // This is a combining character, so the previous character is no longer considered a space. + if (status == F_true) { + if (last2 != previous) { + size2 -= macro_f_utf_byte_width(string2[last2]); + last2 = previous; + } + } + else { + last2 = j; + size2 += macro_f_utf_byte_width(string2[last2]); + previous = j; + } + } + else if (F_status_is_error(status)) { last2 = j; - ++size2; + size2 += macro_f_utf_byte_width(string2[last2]); + previous = j; + } + else { + previous = j; } } // for @@ -484,51 +606,60 @@ extern "C" { #endif // !defined(_di_fl_string_compare_trim_) || !defined(_di_fl_string_dynamic_compare_trim_) || !defined(_di_fl_string_dynamic_partial_compare_trim_) #if !defined(_di_fl_string_rip_) || !defined(_di_fl_string_dynamic_partial_rip_) || !defined(_di_fl_string_rip_nulless_) || !defined(_di_fl_string_dynamic_partial_rip_nulless_) - f_status_t private_fl_string_rip_find_range(const f_string_t source, f_array_length_t * const start, f_array_length_t * const stop) { + f_status_t private_fl_string_rip_find_range(const f_string_t string, f_array_length_t * const start, f_array_length_t * const stop) { const f_array_length_t stop_original = *stop; - + f_array_length_t previous = 0; f_status_t status = F_none; - uint8_t width = 0; - // Skip past leading whitespace. 
- for (; *start <= *stop; *start += width) { + for (; *start <= *stop; *start += macro_f_utf_byte_width(string[*start])) { // Skip past NULL. - while (*start < *stop && !source[*start]) ++(*start); + while (*start < *stop && !string[*start]) ++(*start); if (*start > *stop) break; - status = f_utf_is_whitespace(source + *start, (*stop - *start) + 1); + status = f_utf_is_whitespace(string + *start, (*stop - *start) + 1); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } + if (F_status_set_fine(status) == F_parameter) return status; - return status; + break; } if (status == F_false) break; - width = macro_f_utf_byte_width(source[*start]); + status = f_utf_is_combining(string + *start, (*stop - *start) + 1); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + + break; + } + + // This is a combining character, so the previous character is no longer considered a space. + if (status == F_true) { + *start = previous; + + break; + } + + previous = *start; } // for - for (; *stop > *start; --(*stop)) { + // Use previous as boolean here to designate that a combining is found in a previous character. + for (previous = F_false; *stop > *start; --(*stop)) { // Skip past NULL. - while (*stop > *start && !source[*stop]) --(*stop); + while (*stop > *start && !string[*stop]) --(*stop); - if (!source[*stop]) continue; + if (!string[*stop]) continue; if (*stop == *start) break; - // Each UTF-8 character of width 1 is an incomplete part. - // Go left until either width is 0 (ascii, or > 1) to determine the character. + // Go left until either width is 0 (ASCII, or > 1) to determine the character. 
for (;;) { - width = macro_f_utf_byte_width_is(source[*stop]); - - if (width == 1) { + if (macro_f_utf_byte_width_is(string[*stop]) == 1) { --(*stop); if (*stop == *start) break; @@ -540,28 +671,38 @@ extern "C" { if (*stop == *start) break; - status = f_utf_is_whitespace(source + *stop, (stop_original - *stop) + 1); + status = f_utf_is_whitespace(string + *stop, (stop_original - *stop) + 1); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } + if (F_status_set_fine(status) == F_parameter) return status; - return status; + break; } if (status == F_false) break; + if (status == F_true && previous == F_true) break; + + previous = F_false; + + status = f_utf_is_combining(string + *stop, (stop_original - *stop) + 1); + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_parameter) return status; + + break; + } + + // This is a combining character, so the next character is no longer considered a space. + if (status == F_true) { + previous = F_true; + } } // for if (*stop == *start) { - status = f_utf_is_whitespace(source + *stop, (stop_original - *stop) + 1); + status = f_utf_is_whitespace(string + *stop, (stop_original - *stop) + 1); if (F_status_is_error(status)) { - if (F_status_set_fine(status) == F_maybe) { - return F_status_set_error(F_utf_not); - } - - return status; + if (F_status_set_fine(status) == F_parameter) return status; } if (status == F_true) { diff --git a/level_1/fl_string/c/private-string.h b/level_1/fl_string/c/private-string.h index 9bdbacc..ee9939a 100644 --- a/level_1/fl_string/c/private-string.h +++ b/level_1/fl_string/c/private-string.h @@ -111,8 +111,10 @@ extern "C" { * * F_utf_not (with error bit) if a character is not valid UTF-8. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). 
* + * @see f_utf_is_combining() * @see f_utf_is_whitespace() * @see fl_string_compare_except_trim() * @see fl_string_dynamic_compare_except_trim() @@ -146,8 +148,10 @@ extern "C" { * * F_utf_not (with error bit) if a character is not valid UTF-8. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() * @see fl_string_compare_trim() * @see fl_string_dynamic_compare_trim() @@ -162,7 +166,7 @@ extern "C" { * * Intended to be shared to each of the different implementation variations. * - * @param source + * @param string * The string to rip from. * @param start * Inclusive start point of string to rip. @@ -177,15 +181,18 @@ extern "C" { * * F_utf_not (with error bit) if a character is not valid UTF-8. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). 
* + * @see f_utf_is_combining() + * @see f_utf_is_whitespace() * @see fl_string_rip() * @see fl_string_rip_nulless() * @see fl_string_dynamic_partial_rip() * @see fl_string_dynamic_partial_rip_nulless() */ #if !defined(_di_fl_string_rip_) || !defined(_di_fl_string_dynamic_partial_rip_) || !defined(_di_fl_string_rip_nulless_) || !defined(_di_fl_string_dynamic_partial_rip_nulless_) - extern f_status_t private_fl_string_rip_find_range(const f_string_t source, f_array_length_t * const start, f_array_length_t * const stop) F_attribute_visibility_internal_d; + extern f_status_t private_fl_string_rip_find_range(const f_string_t string, f_array_length_t * const start, f_array_length_t * const stop) F_attribute_visibility_internal_d; #endif // !defined(_di_fl_string_rip_) || !defined(_di_fl_string_dynamic_partial_rip_) || !defined(_di_fl_string_rip_nulless_) || !defined(_di_fl_string_dynamic_partial_rip_nulless_) #ifdef __cplusplus diff --git a/level_1/fl_string/c/string.h b/level_1/fl_string/c/string.h index a2df68c..4637ac5 100644 --- a/level_1/fl_string/c/string.h +++ b/level_1/fl_string/c/string.h @@ -120,8 +120,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_compare_except_trim_ @@ -150,8 +152,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_compare_trim_ @@ -236,8 +240,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. 
* - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_compare_except_string_ @@ -269,8 +275,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_compare_except_trim_ @@ -306,8 +314,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_compare_except_trim_string_ @@ -336,8 +346,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_compare_string_ @@ -362,8 +374,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_compare_trim_ @@ -392,8 +406,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. 
* - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_compare_trim_string_ @@ -573,8 +589,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_partial_compare_except_trim_ @@ -608,8 +626,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_partial_compare_except_trim_dynamic_ @@ -647,8 +667,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_partial_compare_except_trim_string_ @@ -702,8 +724,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). 
* + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_partial_compare_trim_dynamic_ @@ -734,8 +758,10 @@ extern "C" { * * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_dynamic_partial_compare_trim_string_ @@ -1007,11 +1033,13 @@ extern "C" { * F_parameter (with error bit) if a parameter is invalid. * F_utf_not (with error bit) if character is an invalid UTF-8 character. * - * Errors (with error bit) from: f_utf_is_whitespace(). - * Errors (with error bit) from: f_utf_is_word(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_word(). * * @see isxdigit() * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() * @see f_utf_is_word() */ @@ -1040,8 +1068,10 @@ extern "C" { * F_memory_not (with error bit) on out of memory. * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_rip_ @@ -1071,8 +1101,10 @@ extern "C" { * F_memory_not (with error bit) on out of memory. * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). 
* + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_rip_nulless_ @@ -1159,8 +1191,10 @@ extern "C" { * F_memory_not (with error bit) on out of memory. * F_parameter (with error bit) if a parameter is invalid. * - * Errors (with error bit) from: f_utf_is_whitespace(). + * F_parameter (with error bit) from: f_utf_is_combining(). + * F_parameter (with error bit) from: f_utf_is_whitespace(). * + * @see f_utf_is_combining() * @see f_utf_is_whitespace() */ #ifndef _di_fl_string_seek_line_until_graph_non_