From: Kevin Day <thekevinday@gmail.com>
Date: Sat, 31 Aug 2019 20:59:55 +0000 (-0500)
Subject: Update: handle invalid UTF-8 fragments
X-Git-Tag: 0.5.0~465
X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=397823e1b7ff07f3891763f2f414c6259684a4d5;p=fll

Update: handle invalid UTF-8 fragments

A 1-width UTF-8 character (that is not a valid ASCII character) is used to designate part of a complete UTF-8 character block (aka: 1-width UTF-8 characters are fragments).
Because this fragment cannot exist in isolation, it must be handled as either an invalid or an incomplete UTF-8 fragment.

Provide new status codes for handling incomplete UTF-8 fragments.
Update appropriate functions to detect and handle these invalid or incomplete fragments.
---

diff --git a/level_0/f_status/c/status.h b/level_0/f_status/c/status.h
index 0b85466..9120699 100644
--- a/level_0/f_status/c/status.h
+++ b/level_0/f_status/c/status.h
@@ -221,6 +221,9 @@ enum {
     f_unterminated_group_on_eol,
     f_unterminated_group_on_eos,
     f_unterminated_group_on_stop,
+    f_incomplete_utf,
+    f_incomplete_utf_on_eof,
+    f_incomplete_utf_on_eol,
     f_incomplete_utf_on_eos,
     f_incomplete_utf_on_stop,
   #endif // _di_f_status_buffers_
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c
index a814581..e4e268e 100644
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -10,11 +10,14 @@ extern "C" {
       if (max_width < 1) return f_status_set_error(f_invalid_parameter);
     #endif // _di_level_0_parameter_checking_
 
-    f_u_short width = f_macro_utf_byte_width(*character);
+    f_u_short width = f_macro_utf_byte_width_is(*character);
 
-    if (width == 1) {
+    if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
 
     if (width > max_width) {
       return f_status_set_error(f_maybe);
@@ -41,6 +44,14 @@ extern "C" {
     if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
+
+    // Do not operate on UTF-8 fragments that are not the first byte of the character.
+    if (width == 1) {
+      return f_status_set_error(f_incomplete_utf);
+    }
 
     if (width > max_width) {
       return f_status_set_error(f_maybe);
@@ -66,11 +77,14 @@ extern "C" {
       if (max_width < 1) return f_status_set_error(f_invalid_parameter);
     #endif // _di_level_0_parameter_checking_
 
-    f_u_short width = f_macro_utf_byte_width(*character);
+    f_u_short width = f_macro_utf_byte_width_is(*character);
 
-    if (width == 1) {
+    if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
 
     if (width > max_width) {
       return f_status_set_error(f_maybe);
@@ -198,11 +212,14 @@ extern "C" {
       if (max_width < 1) return f_status_set_error(f_invalid_parameter);
     #endif // _di_level_0_parameter_checking_
 
-    f_u_short width = f_macro_utf_byte_width(*character);
+    f_u_short width = f_macro_utf_byte_width_is(*character);
 
-    if (width == 1) {
+    if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
 
     if (width > max_width) {
       return f_status_set_error(f_maybe);
@@ -246,11 +263,14 @@ extern "C" {
       if (max_width < 1) return f_status_set_error(f_invalid_parameter);
     #endif // _di_level_0_parameter_checking_
 
-    f_u_short width = f_macro_utf_byte_width(*character);
+    f_u_short width = f_macro_utf_byte_width_is(*character);
 
-    if (width == 1) {
+    if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
 
     if (width > max_width) {
       return f_status_set_error(f_maybe);
@@ -369,6 +389,9 @@ extern "C" {
     if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
 
     // for now, just assume that any non-whitespace, non-substitute utf-8 character is a graph.
 
@@ -386,11 +409,14 @@ extern "C" {
 
 #ifndef _di_f_utf_is_space_character_
   f_return_status f_utf_is_space_character(const f_utf_character character) {
-    f_u_short width = f_macro_utf_character_width(character);
+    f_u_short width = f_macro_utf_character_width_is(character);
 
-    if (width == 1) {
+    if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
 
     if (width == 2) {
       char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) };
@@ -514,11 +540,14 @@ extern "C" {
 
 #ifndef _di_f_utf_is_substitute_character_
   f_return_status f_utf_is_substitute_character(const f_utf_character character) {
-    f_u_short width = f_macro_utf_character_width(character);
+    f_u_short width = f_macro_utf_character_width_is(character);
 
-    if (width == 1) {
+    if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
 
     if (width == 2) {
       char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) };
@@ -558,11 +587,19 @@ extern "C" {
 
 #ifndef _di_f_utf_is_whitespace_character_
   f_return_status f_utf_is_whitespace_character(const f_utf_character character) {
-    f_u_short width = f_macro_utf_character_width(character);
+    f_u_short width = f_macro_utf_character_width_is(character);
 
-    if (width == 1) {
+    if (width == 0) {
       return f_false;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
+
+    // Do not operate on UTF-8 fragments that are not the first byte of the character.
+    if (width == 1) {
+      return f_status_set_error(f_incomplete_utf);
+    }
 
     if (width == 2) {
       char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) };
@@ -671,12 +708,15 @@ extern "C" {
       if (utf_character == 0) return f_status_set_error(f_invalid_parameter);
     #endif // _di_level_0_parameter_checking_
 
-    f_u_short width = f_macro_utf_byte_width(*character);
+    f_u_short width = f_macro_utf_byte_width_is(*character);
 
-    if (width == 1) {
+    if (width == 0) {
       *utf_character = f_macro_utf_character_from_char_1(character[0]);
       return f_none;
     }
+    else if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
 
     if (width > max_width) {
       return f_status_set_error(f_failure);
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h
index 7017951..d5eb5f2 100644
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -69,6 +69,9 @@ extern "C" {
  *
  * The f_utf_byte_is method will return non-zero if the character is a UTF-8 character of any width.
  *
+ * The f_utf_byte_1 is specifically used only on UTF-8 fragments.
+ * For example, with the 2-byte-wide UTF-8 character '110x xxxx 10yy yyyy', the 8-bit block '10yy yyyy' would be a fragment.
+ *
  * The f_macro_utf_byte_is_* macros are used to determine a width of the character (either 1, 2, 3, or 4, respectively).
  *
  * The f_macro_utf_byte_width macro determines a width of the character.
@@ -245,6 +248,7 @@ extern "C" {
  *   f_true if a UTF-8 whitespace or substitute.
  *   f_false if not a UTF-8 whitespace or substitute.
  *   f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_bom_
@@ -267,6 +271,7 @@ extern "C" {
  *   f_true if a UTF-8 graph.
  *   f_false if not a UTF-8 graph.
  *   f_maybe (with error bit) if this could be a graph but width is not long enough.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_graph_
@@ -289,6 +294,7 @@ extern "C" {
  *   f_true if a UTF-8 whitespace or substitute.
  *   f_false if not a UTF-8 whitespace or substitute.
  *   f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_space_
@@ -311,6 +317,7 @@ extern "C" {
  *   f_true if a UTF-8 substitute.
  *   f_false if not a UTF-8 substitute.
  *   f_maybe (with error bit) if this could be a substitute but width is not long enough.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_substitute_
@@ -333,6 +340,7 @@ extern "C" {
  *   f_true if a UTF-8 whitespace.
  *   f_false if not a UTF-8 whitespace.
  *   f_maybe (with error bit) if this could be a whitespace but width is not long enough.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_whitespace_
@@ -349,6 +357,7 @@ extern "C" {
  *   f_true if a UTF-8 whitespace or substitute.
  *   f_false if not a UTF-8 whitespace or substitute.
  *   f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_bom_character_
@@ -366,6 +375,7 @@ extern "C" {
  * @return
  *   f_true if a UTF-8 graph.
  *   f_false if not a UTF-8 graph.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_graph_character_
@@ -383,6 +393,7 @@ extern "C" {
  * @return
  *   f_true if a UTF-8 whitespace or substitute.
  *   f_false if not a UTF-8 whitespace or substitute.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_space_character_
@@ -400,6 +411,7 @@ extern "C" {
  * @return
  *   f_true if a UTF-8 substitute.
  *   f_false if not a UTF-8 substitute.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_substitute_character_
@@ -417,6 +429,7 @@ extern "C" {
  * @return
  *   f_true if a UTF-8 whitespace.
  *   f_false if not a UTF-8 whitespace.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_is_whitespace_character_
@@ -441,6 +454,7 @@ extern "C" {
  * @return
  *   f_none if conversion was successful.
  *   f_failure (with error bit) if width is not long enough to convert.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  */
 #ifndef _di_f_utf_char_to_character_
diff --git a/level_1/fl_status/c/status.c b/level_1/fl_status/c/status.c
index 6bee91f..3d71563 100644
--- a/level_1/fl_status/c/status.c
+++ b/level_1/fl_status/c/status.c
@@ -486,6 +486,15 @@ extern "C" {
         case f_unterminated_group_on_stop:
           *string = fl_status_string_unterminated_group_on_stop;
           break;
+        case f_incomplete_utf:
+          *string = fl_status_string_incomplete_utf;
+          break;
+        case f_incomplete_utf_on_eof:
+          *string = fl_status_string_incomplete_utf_on_eof;
+          break;
+        case f_incomplete_utf_on_eol:
+          *string = fl_status_string_incomplete_utf_on_eol;
+          break;
         case f_incomplete_utf_on_eos:
           *string = fl_status_string_incomplete_utf_on_eos;
           break;
diff --git a/level_1/fl_status/c/status.h b/level_1/fl_status/c/status.h
index 904cf81..65894e1 100644
--- a/level_1/fl_status/c/status.h
+++ b/level_1/fl_status/c/status.h
@@ -489,6 +489,15 @@ extern "C" {
     #define fl_status_string_unterminated_group_on_stop "f_unterminated_group_on_stop"
     #define fl_status_string_unterminated_group_on_stop_length 29
 
+    #define fl_status_string_incomplete_utf "f_incomplete_utf"
+    #define fl_status_string_incomplete_utf_length 17
+
+    #define fl_status_string_incomplete_utf_on_eof "f_incomplete_utf_on_eof"
+    #define fl_status_string_incomplete_utf_on_eof_length 24
+
+    #define fl_status_string_incomplete_utf_on_eol "f_incomplete_utf_on_eol"
+    #define fl_status_string_incomplete_utf_on_eol_length 24
+
     #define fl_status_string_incomplete_utf_on_eos "f_incomplete_utf_on_eos"
     #define fl_status_string_incomplete_utf_on_eos_length 24
 
diff --git a/level_1/fl_strings/c/strings.c b/level_1/fl_strings/c/strings.c
index 1a39b82..58a6fa0 100644
--- a/level_1/fl_strings/c/strings.c
+++ b/level_1/fl_strings/c/strings.c
@@ -62,9 +62,16 @@ extern "C" {
     while (buffer.string[location->start] == placeholder || (!isgraph(buffer.string[location->start]) && (status = f_utf_is_graph(buffer.string + location->start, max_width)) == f_false)) {
       if (buffer.string[location->start] == f_eol) return f_none_on_eol;
 
-      width = f_macro_utf_byte_width(buffer.string[location->start]);
+      width = f_macro_utf_byte_width_is(buffer.string[location->start]);
 
-      if (width > 1) {
+      if (width == 0) {
+        width = 1;
+      }
+      // Do not operate on UTF-8 fragments that are not the first byte of the character.
+      else if (width == 1) {
+        return f_status_set_error(f_incomplete_utf);
+      }
+      else {
         if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos);
         if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop);
       }
@@ -111,9 +118,16 @@ extern "C" {
     while (buffer.string[location->start] == placeholder || (isgraph(buffer.string[location->start]) && (status = f_utf_is_space(buffer.string + location->start, max_width)) == f_false)) {
       if (buffer.string[location->start] == f_eol) return f_none_on_eol;
 
-      width = f_macro_utf_byte_width(buffer.string[location->start]);
+      width = f_macro_utf_byte_width_is(buffer.string[location->start]);
 
-      if (width > 1) {
+      if (width == 0) {
+        width = 1;
+      }
+      // Do not operate on UTF-8 fragments that are not the first byte of the character.
+      else if (width == 1) {
+        return f_status_set_error(f_incomplete_utf);
+      }
+      else {
         if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos);
         if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop);
       }
@@ -186,15 +200,21 @@ extern "C" {
         max_width = buffer.used - location->start;
       }
 
-      width = f_macro_utf_byte_width(buffer.string[location->start]);
+      width = f_macro_utf_byte_width_is(buffer.string[location->start]);
+
+      if (width == 0) {
+        width = 1;
 
-      if (width == 1) {
         if (buffer.string[location->start] == f_eol) return f_none_on_eol;
 
         if (seek_width == width) {
           if (buffer.string[location->start] == seek_to_this) return f_none;
         }
       }
+      // Do not operate on UTF-8 fragments that are not the first byte of the character.
+      else if (width == 1) {
+        return f_status_set_error(f_incomplete_utf);
+      }
       else {
         if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos);
         if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop);
@@ -268,13 +288,19 @@ extern "C" {
         max_width = buffer.used - location->start;
       }
 
-      width = f_macro_utf_byte_width(buffer.string[location->start]);
+      width = f_macro_utf_byte_width_is(buffer.string[location->start]);
+
+      if (width == 0) {
+        width = 1;
 
-      if (width == 1) {
         if (seek_width == width) {
           if (buffer.string[location->start] == seek_to_this) return f_none;
         }
       }
+      // Do not operate on UTF-8 fragments that are not the first byte of the character.
+      else if (width == 1) {
+        return f_status_set_error(f_incomplete_utf);
+      }
       else {
         if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos);
         if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop);
diff --git a/level_1/fl_strings/c/strings.h b/level_1/fl_strings/c/strings.h
index 4566b69..7e82cbd 100644
--- a/level_1/fl_strings/c/strings.h
+++ b/level_1/fl_strings/c/strings.h
@@ -38,7 +38,6 @@ extern "C" {
  * @return
  *   f_none on success.
  *   f_no_data if nothing to rip, no allocations or reallocations are performed.
- *   f_incomplete_utf_on_eos if end of sting is reached before a complete UTF-8 character can be processed.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
  *   f_allocation_error (with error bit) on memory allocation error.
  *   f_reallocation_error (with error bit) on memory reallocation error.
@@ -63,6 +62,7 @@ extern "C" {
  *   f_none on success.
  *   f_none_on_eol on success, but stopped at EOL.
  *   f_none_on_eos on success, but stopped at end of buffer.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
  *   f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
@@ -90,6 +90,7 @@ extern "C" {
  *   f_none_on_eol on success, but stopped at EOL.
  *   f_none_on_eos on success, but stopped at end of buffer.
  *   f_none_on_stop on success, but stopped stop location.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
  *   f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
@@ -139,6 +140,8 @@ extern "C" {
  *   f_none on success.
  *   f_none_on_eol on success, but stopped at EOL.
  *   f_none_on_eos on success, but stopped at end of buffer.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
  *   f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
@@ -164,6 +167,7 @@ extern "C" {
  *   f_none on success.
  *   f_none_on_eos on success, but stopped at end of buffer.
  *   f_none_on_stop on success, but stopped stop location.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
  *   f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
@@ -188,6 +192,8 @@ extern "C" {
  * @return
  *   f_none on success.
  *   f_none_on_eos on success, but stopped at end of buffer.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
  *   f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
  *   f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
  *   f_invalid_parameter (with error bit) if a parameter is invalid.
diff --git a/level_2/fll_status/c/status.c b/level_2/fll_status/c/status.c
index 016cc52..d62e6df 100644
--- a/level_2/fll_status/c/status.c
+++ b/level_2/fll_status/c/status.c
@@ -783,6 +783,21 @@ extern "C" {
         return f_none;
       }
 
+      if (fl_compare_strings(string, fl_status_string_incomplete_utf, length, fl_status_string_incomplete_utf_length) == f_equal_to) {
+        *code = f_incomplete_utf;
+        return f_none;
+      }
+
+      if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eof, length, fl_status_string_incomplete_utf_on_eof_length) == f_equal_to) {
+        *code = f_incomplete_utf_on_eof;
+        return f_none;
+      }
+
+      if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eol, length, fl_status_string_incomplete_utf_on_eol_length) == f_equal_to) {
+        *code = f_incomplete_utf_on_eol;
+        return f_none;
+      }
+
       if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eos, length, fl_status_string_incomplete_utf_on_eos_length) == f_equal_to) {
         *code = f_incomplete_utf_on_eos;
         return f_none;