From: Kevin Day <kevin@kevux.org>
Date: Fri, 31 Mar 2023 04:10:53 +0000 (-0500)
Subject: Update: Back port utf string compare code from 0.7.x.
X-Git-Tag: 0.6.5~22
X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=fd46d4ae77199050f261cf11573d4ca2862db4f5;p=fll

Update: Back port utf string compare code from 0.7.x.

This backports only the functions that already exist in 0.6.x.
No new functions are introduced, so there are neither API nor ABI changes.
---

diff --git a/level_1/fl_utf/c/private-utf.c b/level_1/fl_utf/c/private-utf.c
index fa15a5e..ebd00c9 100644
--- a/level_1/fl_utf/c/private-utf.c
+++ b/level_1/fl_utf/c/private-utf.c
@@ -42,11 +42,11 @@ extern "C" {
 
     f_array_length_t i1 = offset1;
     f_array_length_t i2 = offset2;
-
+    f_array_length_t previous = 0;
     f_status_t status = F_none;
 
     // Skip past leading whitespace in string1.
-    for (; i1 < stop1; ++i1) {
+    for (; i1 < stop1; i1 += macro_f_utf_byte_width(string1[i1])) {
 
       // Skip past NULL in string1.
       while (i1 < stop1 && !string1[i1]) ++i1;
@@ -55,18 +55,33 @@ extern "C" {
       status = f_utf_character_is_whitespace(string1[i1], F_false);
 
       if (F_status_is_error(status)) {
+        if (F_status_set_fine(status) == F_parameter) return status;
 
-        // Ignore possibly invalid UTF-8 codes.
-        if (F_status_set_fine(status) != F_maybe) {
-          return status;
-        }
+        break;
       }
 
       if (status == F_false) break;
+
+      status = f_utf_character_is_combining(string1[i1]);
+
+      if (F_status_is_error(status)) {
+        if (F_status_set_fine(status) == F_parameter) return status;
+
+        break;
+      }
+
+      // This is a combining character, so the previous character is no longer considered a space.
+      if (status == F_true) {
+        i1 = previous;
+
+        break;
+      }
+
+      previous = i1;
     } // for
 
     // Skip past leading whitespace in string2.
-    for (; i2 < stop2; i2++) {
+    for (; i2 < stop2; i2 += macro_f_utf_byte_width(string2[i2])) {
 
       // Skip past NULL in string2.
       while (i2 < stop2 && !string2[i2]) ++i2;
@@ -75,27 +90,43 @@ extern "C" {
       status = f_utf_character_is_whitespace(string2[i2], F_false);
 
       if (F_status_is_error(status)) {
+        if (F_status_set_fine(status) == F_parameter) return status;
 
-        // Ignore possibly invalid UTF-8 codes.
-        if (F_status_set_fine(status) != F_maybe) {
-          return status;
-        }
+        break;
       }
 
       if (status == F_false) break;
+
+      status = f_utf_character_is_combining(string2[i2]);
+
+      if (F_status_is_error(status)) {
+        if (F_status_set_fine(status) == F_parameter) return status;
+
+        break;
+      }
+
+      // This is a combining character, so the previous character is no longer considered a space.
+      if (status == F_true) {
+        i2 = previous;
+
+        break;
+      }
+
+      previous = i2;
     } // for
 
     f_array_length_t last1 = i1;
     f_array_length_t last2 = i2;
 
     {
-      // The size1 and size2 are to represent to total number of characters after trim.
+      // Size1 and size2 represent the total number of characters after trim.
       f_array_length_t size1 = 0;
       f_array_length_t size2 = 0;
-      f_array_length_t j = 0;
+
+      previous = i1;
 
       // Determine where the last non-whitespace is in string1.
-      for (j = i1; j < stop1; ++j) {
+      for (f_array_length_t j = i1; j < stop1; j += macro_f_utf_byte_width(string1[j])) {
 
         // Skip past NULL in string1.
         while (j < stop1 && !string1[j]) ++j;
@@ -104,20 +135,45 @@ extern "C" {
         status = f_utf_character_is_whitespace(string1[j], F_false);
 
         if (F_status_is_error(status)) {
-          // ignore possibly invalid UTF-8 codes.
-          if (F_status_set_fine(status) != F_maybe) {
-            return status;
-          }
+          if (F_status_set_fine(status) == F_parameter) return status;
+
+          break;
         }
 
         if (status == F_false) {
+          status = f_utf_character_is_combining(string1[j]);
+
+          if (F_status_is_error(status)) {
+            if (F_status_set_fine(status) == F_parameter) return status;
+          }
+
+          // This is a combining character, so the previous character is no longer considered a space.
+          if (status == F_true) {
+            if (last1 != previous) {
+              size1 -= macro_f_utf_byte_width(string1[last1]);
+              last1 = previous;
+            }
+          }
+          else {
+            last1 = j;
+            size1 += macro_f_utf_byte_width(string1[last1]);
+            previous = j;
+          }
+        }
+        else if (F_status_is_error(status)) {
           last1 = j;
-          ++size1;
+          size1 += macro_f_utf_byte_width(string1[last1]);
+          previous = j;
+        }
+        else {
+          previous = j;
         }
       } // for
 
+      previous = i2;
+
       // Determine where the last non-whitespace is in string2.
-      for (j = i2; j < stop2; ++j) {
+      for (f_array_length_t j = i2; j < stop2; j += macro_f_utf_byte_width(string2[j])) {
 
         // Skip past NULL in string2.
         while (j < stop2 && !string2[j]) ++j;
@@ -126,45 +182,71 @@ extern "C" {
         status = f_utf_character_is_whitespace(string2[j], F_false);
 
         if (F_status_is_error(status)) {
+          if (F_status_set_fine(status) == F_parameter) return status;
 
-          // Ignore possibly invalid UTF-8 codes.
-          if (F_status_set_fine(status) != F_maybe) {
-            return status;
-          }
+          break;
         }
 
         if (status == F_false) {
+          status = f_utf_character_is_combining(string2[j]);
+
+          if (F_status_is_error(status)) {
+            if (F_status_set_fine(status) == F_parameter) return status;
+          }
+
+          // This is a combining character, so the previous character is no longer considered a space.
+          if (status == F_true) {
+            if (last2 != previous) {
+              size2 -= macro_f_utf_byte_width(string2[last2]);
+              last2 = previous;
+            }
+          }
+          else {
+            last2 = j;
+            size2 += macro_f_utf_byte_width(string2[last2]);
+            previous = j;
+          }
+        }
+        else if (F_status_is_error(status)) {
           last2 = j;
-          ++size2;
+          size2 += macro_f_utf_byte_width(string2[last2]);
+          previous = j;
+        }
+        else {
+          previous = j;
         }
       } // for
 
-      if (size1 != size2) {
-        return F_equal_to_not;
-      }
+      if (size1 != size2) return F_equal_to_not;
     }
 
-    for (; i1 < last1 && i2 < last2; ++i1, ++i2) {
+    if (last1 < stop1 && last2 < stop2) {
+      for (; i1 < last1 && i2 < last2; ++i1, ++i2) {
 
-      // Skip past NULL in string1.
-      while (i1 < last1 && !string1[i1]) ++i1;
-      if (i1 == last1) break;
+        // Skip past NULL in string1.
+        while (i1 < last1 && !string1[i1]) ++i1;
+        if (i1 == last1) break;
 
-      // Skip past NULL in string2.
-      while (i2 < last2 && !string2[i2]) ++i2;
-      if (i2 == last2) break;
+        // Skip past NULL in string2.
+        while (i2 < last2 && !string2[i2]) ++i2;
+        if (i2 == last2) break;
 
-      if (string1[i1] != string2[i2]) return F_equal_to_not;
-    } // for
+        if (string1[i1] != string2[i2]) return F_equal_to_not;
+      } // for
+    }
 
     // Only return F_equal_to if all remaining characters are NULL.
-    for (; i1 < last1; ++i1) {
-      if (string1[i1] != 0) return F_equal_to_not;
-    } // for
+    if (last1 < stop1) {
+      for (; i1 < last1; ++i1) {
+        if (string1[i1] != 0) return F_equal_to_not;
+      } // for
+    }
 
-    for (; i2 < last2; ++i2) {
-      if (string2[i2] != 0) return F_equal_to_not;
-    } // for
+    if (last2 < stop2) {
+      for (; i2 < last2; ++i2) {
+        if (string2[i2] != 0) return F_equal_to_not;
+      } // for
+    }
 
     return F_equal_to;
   }
diff --git a/level_1/fl_utf/c/utf.c b/level_1/fl_utf/c/utf.c
index 364c74d..45aad7d 100644
--- a/level_1/fl_utf/c/utf.c
+++ b/level_1/fl_utf/c/utf.c
@@ -31,10 +31,6 @@ extern "C" {
 
 #ifndef _di_fl_utf_string_dynamic_partial_compare_
   f_status_t fl_utf_string_dynamic_partial_compare(const f_utf_string_static_t string1, const f_utf_string_static_t string2, const f_string_range_t range1, const f_string_range_t range2) {
-    #ifndef _di_level_1_parameter_checking_
-      if (string1.used <= range1.stop) return F_status_set_error(F_parameter);
-      if (string2.used <= range2.stop) return F_status_set_error(F_parameter);
-    #endif // _di_level_1_parameter_checking_
 
     return private_fl_utf_string_compare(string1.string, string2.string, range1.start, range2.start, range1.stop + 1, range2.stop + 1);
   }
@@ -42,10 +38,6 @@ extern "C" {
 
 #ifndef _di_fl_utf_string_dynamic_partial_compare_trim_
   f_status_t fl_utf_string_dynamic_partial_compare_trim(const f_utf_string_static_t string1, const f_utf_string_static_t string2, const f_string_range_t range1, const f_string_range_t range2) {
-    #ifndef _di_level_1_parameter_checking_
-      if (string1.used <= range1.stop) return F_status_set_error(F_parameter);
-      if (string2.used <= range2.stop) return F_status_set_error(F_parameter);
-    #endif // _di_level_1_parameter_checking_
 
     return private_fl_utf_string_compare_trim(string1.string, string2.string, range1.start, range2.start, range1.stop + 1, range2.stop + 1);
   }