Update: finish implementing f_utf_character_is_valid() and related UTF-8 changes

author Kevin Day <thekevinday@gmail.com>

Wed, 18 Sep 2019 00:09:44 +0000 (19:09 -0500)

committer Kevin Day <thekevinday@gmail.com>

Wed, 18 Sep 2019 00:09:44 +0000 (19:09 -0500)
author Kevin Day <thekevinday@gmail.com>
Wed, 18 Sep 2019 00:09:44 +0000 (19:09 -0500)
committer Kevin Day <thekevinday@gmail.com>
Wed, 18 Sep 2019 00:09:44 +0000 (19:09 -0500)
diff --git a/level_0/f_fss/c/fss.h b/level_0/f_fss/c/fss.h

index 96d75cf5c00dbeca42d5c65fa5eec8fe834e88f0..696386d732572cfefc3473fe7341ff2f475782b1 100644 (file)
--- a/level_0/f_fss/c/fss.h
+++ b/level_0/f_fss/c/fss.h
@@ -84,10 +84,14 @@ extern "C" {
   * Max size of a FSS header.
   *
   * The standard FSS character header is: "# fss-0000\n\0", which is 10 characters + newline + EOS = 12.
- * This includes the possibility of the first character being a UTF-8 BOM (which is 3-bytes long, which results in a max size of 15 bytes).
+ *
+ * The UTF-8 BOM is not supported because it is not a required part of UTF-8 (only an optional signature according to rfc3629).
+ * The UTF-8 BOM sequence is actually the encoding of a different character, U+FEFF, called "zero width no-break space".
+ * Because that character already has a use, this project considers the existence of a UTF-8 BOM bad practice in all cases.
+ * After all, if your file begins with a "zero width no-break space", you may want to actually use a space and not a "BOM".
   */
  #ifndef _di_f_fss_max_header_length_
-  #define f_fss_max_header_length 15
+  #define f_fss_max_header_length 12
  #endif // _di_f_fss_max_header_length_
  
  /**
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c

index 2ed03899c60ae5b462bd0162447114a3e152391c..045f6c49d4d8257cdd4c3f3c0287e8b13d49921b 100644 (file)
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -20,22 +20,6 @@ extern "C" {
    }
  #endif // _di_f_utf_character_is_
  
-#ifndef _di_f_utf_character_is_bom_
-  f_return_status f_utf_character_is_bom(const f_utf_character character) {
-    if (character == f_utf_character_mask_bom) {
-      return f_true;
-    }
-
-    unsigned short width = f_macro_utf_character_width_is(character);
-
-    if (width == 1) {
-      return f_status_is_error(f_invalid_utf);
-    }
-
-    return f_false;
-  }
-#endif // _di_f_utf_character_is_bom_
-
  #ifndef _di_f_utf_character_is_control_
    f_return_status f_utf_character_is_control(const f_utf_character character) {
      unsigned short width = f_macro_utf_character_width_is(character);
@@ -171,6 +155,11 @@ extern "C" {
      if (width == 2) {
        uint8_t byte = f_macro_utf_character_to_char_2(character);
  
+      if (byte_first < 0xc2 || byte_first > 0xdf) {
+        // Valid UTF-8-2 range = %xC2-DF UTF8-tail.
+        return f_false;
+      }
+
        if (byte_first == 0xcd) {
          // Greek and Coptic: U+0378, U+0379.
          if (byte == 0xb8 || byte == 0xb9) {
@@ -261,7 +250,21 @@ extern "C" {
      else if (width == 3) {
        uint16_t bytes = (uint16_t) ((character & 0x00ffff00) >> 4);
  
+      if (byte_first < 0xe0 || byte_first > 0xef) {
+        // Valid UTF-8-3 ranges = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
+        return f_false;
+      }
+
        if (byte_first == 0xe0) {
+        {
+          uint8_t byte_second = f_macro_utf_character_to_char_2(character);
+
+          // Valid UTF-8-3 ranges = %xE0 %xA0-BF UTF8-tail
+          if (byte_second < 0xa0 || byte_second > 0xbf) {
+            return f_false;
+          }
+        }
+
          // Arabic Extended-A: U+08B5, U+08BE to U+08D3.
          if (bytes == 0xa2b5 || bytes >= 0xa2be && bytes <= 0xa393) {
            return f_false;
@@ -1299,6 +1302,15 @@ extern "C" {
          }
        }
        else if (byte_first == 0xed) {
+        {
+          uint8_t byte_second = f_macro_utf_character_to_char_2(character);
+
+          // Valid UTF-8-3 ranges = %xED %x80-9F UTF8-tail
+          if (byte_second < 0x80 || byte_second > 0x9f) {
+            return f_false;
+          }
+        }
+
          // Hangul Jamo Extended-B: U+D7C7 to U+D7CA.
          if (bytes >= 0x9f87 && bytes <= 0x9f8a) {
            return f_false;
@@ -2583,36 +2595,6 @@ extern "C" {
    }
  #endif // _di_f_utf_is_
  
-#ifndef _di_f_utf_is_bom_
-  f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width) {
-    #ifndef _di_level_0_parameter_checking_
-      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
-    #endif // _di_level_0_parameter_checking_
-
-    unsigned short width = f_macro_utf_byte_width_is(*character);
-
-    if (width == 0) {
-      return f_false;
-    }
-
-    if (width == 1) {
-      return f_status_is_error(f_incomplete_utf);
-    }
-
-    if (width > max_width) {
-      return f_status_set_error(f_maybe);
-    }
-
-    if (width == 3) {
-      if (!memcmp(character, f_utf_bom, width)) {
-        return f_true;
-      }
-    }
-
-    return f_false;
-  }
-#endif // _di_f_utf_is_bom_
-
  #ifndef _di_f_utf_is_control_
    f_return_status f_utf_is_control(const f_string character, const unsigned short max_width) {
      #ifndef _di_level_0_parameter_checking_
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h

index 54a2bd85247787cacaf5d6f395504d10b4e2147b..ba1b7650a1374200c8a38ebcb9eaae8a1104d14f 100644 (file)
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -48,20 +48,6 @@ extern "C" {
  #endif
  
  /**
- * Define the UTF-8 BOM.
- *
- * The BOM designates that a string is in UTF-8.
- * The BOM must be checked for when processing strings.
- *
- * In many cases, this should be removed such that only one exists in some string block.
- */
-#ifndef _di_f_utf_bom_
-  #define f_utf_bom_length 3
-
-  const static int8_t f_utf_bom[f_utf_bom_length] = { 0xef, 0xbb, 0xbf }; // 1110 1111, 1011 1011, 1011 1111
-#endif // _di_f_utf_bom_
-
-/**
   * Define the UTF-8 bytes.
   *
   * The bytes are for checking a single 8-bit character value (specifically, checking the first bits).
@@ -126,8 +112,6 @@ extern "C" {
  #ifndef _di_f_utf_character_
    typedef uint32_t f_utf_character;
  
-  #define f_utf_character_mask_bom 0xefbbbf00 // 1110 1111, 1011 1011, 1011 1111, 0000 0000
-
    #define f_utf_character_mask_byte_1 0xff000000 // 1111 1111, 0000 0000, 0000 0000, 0000 0000
    #define f_utf_character_mask_byte_2 0xffff0000 // 1111 1111, 1111 1111, 0000 0000, 0000 0000
    #define f_utf_character_mask_byte_3 0xffffff00 // 1111 1111, 1111 1111, 1111 1111, 0000 0000
@@ -531,25 +515,8 @@ extern "C" {
  #endif // _di_f_utf_character_is_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 BOM.
- *
- * @param character
- *   The character to validate.
- *
- * @return
- *   f_true if a UTF-8 BOM.
- *   f_false if not a UTF-8 BOM.
- *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
- */
-#ifndef _di_f_utf_character_is_bom_
-  extern f_return_status f_utf_character_is_bom(const f_utf_character character);
-#endif // _di_f_utf_character_is_bom_
-
-/**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character.
   *
- * The UTF-8 BOM is considered a control character.
- *
   * @param character
   *   The character to validate.
   *
@@ -727,31 +694,8 @@ extern "C" {
  #endif // _di_f_utf_is_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 BOM.
- *
- * @param character
- *   The character to validate.
- *   There must be enough space allocated to compare against, as limited by max_width.
- * @param max_width
- *   The maximum width available for checking.
- *   Can be anything greater than 0.
- *
- * @return
- *   f_true if a UTF-8 whitespace or substitute.
- *   f_false if not a UTF-8 whitespace or substitute.
- *   f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
- *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
- *   f_invalid_parameter (with error bit) if a parameter is invalid.
- */
-#ifndef _di_f_utf_is_bom_
-  extern f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width);
-#endif // _di_f_utf_is_bom_
-
-/**
   * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character.
   *
- * The UTF-8 BOM is considered a control character.
- *
   * @param character
   *   The character to validate.
   *   There must be enough space allocated to compare against, as limited by max_width.
@@ -799,6 +743,17 @@ extern "C" {
   *
   * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid().
   *
+ * According to rfc3629, the valid octet sequences for UTF-8 are:
+ *   UTF8-octets = *( UTF8-char )
+ *   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+ *   UTF8-1      = %x00-7F
+ *   UTF8-2      = %xC2-DF UTF8-tail
+ *   UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
+ *                 %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
+ *   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
+ *                 %xF4 %x80-8F 2( UTF8-tail )
+ *   UTF8-tail   = %x80-BF
+ *
   * @param character
   *   The character to validate.
   *   There must be enough space allocated to compare against, as limited by max_width.
diff --git a/level_1/fl_console/c/console.h b/level_1/fl_console/c/console.h

index eb2334a07158e09d8cc2f1d8e49d2ef93d6b1033..93834ce958a6e34aba6323219b4a7dc6c768c51a 100644 (file)
--- a/level_1/fl_console/c/console.h
+++ b/level_1/fl_console/c/console.h
@@ -46,8 +46,6 @@ extern "C" {
   * - May not be grouped and must be separated from any subsequent parameter, such as: "tar extract create file".
   * - Additional parameters must immediately follow the parameter, such as "tar extract file file.tar.gz create".
   *
- * The UTF-8 BOM is not allowed in the parameters.
- *
   * @param arguments
   *   The parameters passed to the process.
   * @param parameters
diff --git a/level_1/fl_fss/c/fss.c b/level_1/fl_fss/c/fss.c

index 0388567436c74c2e439b33bc25d50bc11d70e066..0a00d630a1df2432a96f9fd6ef09e52ec7a306aa 100644 (file)
--- a/level_1/fl_fss/c/fss.c
+++ b/level_1/fl_fss/c/fss.c
@@ -47,27 +47,8 @@ extern "C" {
  
      register f_string_length i = 0;
  
-    // A single UTF-8 BOM is allowed to exist before the valid FSS identifier.
-    if (buffer.used > 3) {
-      f_status status = f_utf_is_bom(buffer.string, 4);
-
-      if (f_status_is_error(status)) {
-        return f_status_set_error(fl_fss_no_header);
-      }
-
-      if (status == f_true) {
-        i = f_utf_bom_length;
-
-        if (buffer.used < 10 + f_utf_bom_length) {
-          return fl_fss_no_header;
-        }
-      }
-      else if (buffer.used < 10) {
-        // "# fss-0000" without UTF-8 BOM is always 10 characters.
-        return fl_fss_no_header;
-      }
-    }
-    else {
+    if (buffer.used < 10) {
+      // "# fss-0000" is always 10 characters.
        return fl_fss_no_header;
      }
  
diff --git a/level_1/fl_fss/c/fss.h b/level_1/fl_fss/c/fss.h

index ccf0bc50885aee2fca866f34242aa106423ae5d5..24b07e1ecd7c0ab575cdd75d49c2441d605c33f8 100644 (file)
--- a/level_1/fl_fss/c/fss.h
+++ b/level_1/fl_fss/c/fss.h
@@ -60,8 +60,6 @@ extern "C" {
  /**
   * Identify FSS type from a buffered string.
   *
- * The UTF-8 BOM is allowed to exist as the first character of the FSS header, but not anywhere else.
- *
   * @param buffer
   *   The string to process.
   * @param header
diff --git a/level_1/fl_string/c/string.h b/level_1/fl_string/c/string.h

index b62862cb5d3128cd6db706b2040219b664bedd82..dc979566d129f396da82ed4f58f66cc7be7df948 100644 (file)
--- a/level_1/fl_string/c/string.h
+++ b/level_1/fl_string/c/string.h
@@ -49,8 +49,6 @@ extern "C" {
  /**
   * Increment buffer location until a graph character (including UTF-8) or an EOL is matched.
   *
- * This will ignore the UTF-8 BOM.
- *
   * @param buffer
   *   The buffer to traverse.
   * @param location
@@ -76,8 +74,6 @@ extern "C" {
  /**
   * Increment buffer location until a non-graph character (including UTF-8) or an EOL is matched.
   *
- * This will ignore the UTF-8 BOM.
- *
   * @param buffer
   *   The buffer to traverse.
   * @param location
diff --git a/level_1/fl_utf/c/utf.h b/level_1/fl_utf/c/utf.h

index 854a3df5983601a7bb3325db0b3e748cfbda3346..41b7d8075228f0155128b94bd85c7fb1edbd21ed 100644 (file)
--- a/level_1/fl_utf/c/utf.h
+++ b/level_1/fl_utf/c/utf.h
@@ -28,8 +28,6 @@ extern "C" {
  /**
   * Increment buffer location until a graph character or an EOL is matched.
   *
- * This will ignore the UTF-8 BOM.
- *
   * @param buffer
   *   The buffer to traverse.
   * @param location
@@ -51,8 +49,6 @@ extern "C" {
  /**
   * Increment buffer location until a non-graph character or an EOL is matched.
   *
- * This will ignore the UTF-8 BOM.
- *
   * @param buffer
   *   The buffer to traverse.
   * @param location
diff --git a/level_3/byte_dump/c/byte_dump.h b/level_3/byte_dump/c/byte_dump.h

index 523d8a470cb3159af3ac00a6542473ae0b13779e..c44ac40cd1400bbb99868ec371da8122221f75ce 100644 (file)
--- a/level_3/byte_dump/c/byte_dump.h
+++ b/level_3/byte_dump/c/byte_dump.h
@@ -109,7 +109,6 @@ extern "C" {
    #define byte_dump_sequence_tab                       "␉"
    #define byte_dump_sequence_tab_vertical              "␋"
    #define byte_dump_sequence_unit_separator            "␟"
-  #define byte_dump_sequence_utf_bom                   "␂"
  
    #define byte_dump_character_wall        "|"
    #define byte_dump_character_placeholder "␣" // other likely choices: (substitute form 1: '␚', substitute form 2: '␦').
diff --git a/level_3/byte_dump/c/private-byte_dump.c b/level_3/byte_dump/c/private-byte_dump.c

index 354b70d9710bbf5d67787d2bbe6019beb46b908e..a47716d636ff31d966fa605dd6d045cfd47955ef 100644 (file)
--- a/level_3/byte_dump/c/private-byte_dump.c
+++ b/level_3/byte_dump/c/private-byte_dump.c
@@ -71,21 +71,6 @@
            found_invalid_utf = f_true;
            invalid[character_current] = 1;
          }
-        // UTF-8 characters with width of 4 cannot have any characters of 0x8f as the first byte.
-        else if (width_utf == 4 && byte == 0x8f) {
-          found_invalid_utf = f_true;
-          invalid[character_current] = width_utf;
-        }
-        // These are not defined in Unicode, and so are considered invalid in UTF-8, regardless of their width_utf.
-        else if (byte >= 0xf5) {
-          found_invalid_utf = f_true;
-          invalid[character_current] = width_utf;
-        }
-        // Sequences that start with 0xc1 are invalid because UTF-8 does not support overlong ASCII.
-        else if (byte == 0xc1) {
-          found_invalid_utf = f_true;
-          invalid[character_current] = width_utf;
-        }
          // Process the UTF-8 character.
          else if (width_utf > 1) {
            position++;
@@ -129,21 +114,14 @@
        }
  
        // At this point: an ASCII character is collected, the entire UTF-8 character sequence is collected, or an invalid UTF-8 was processed.
-      if (!found_invalid_utf && width_utf > 1) {
+      if (!invalid[character_current] && width_utf > 1) {
          if (f_utf_character_is_valid(characters.string[character_current]) == f_false) {
            found_invalid_utf = f_true;
            invalid[character_current] = width_utf;
          }
-        // @todo: remove this check once implemented in f_utf_character_is_valid().
-        // Handle special case invalid situations, 0xc0 and 0xc1 are used for two-byte encoding of a 7-bit ASCII but are considered invalid by UTF-8.
-        // Does not include 0xc0 0x80 because this is considered a overlong NULL in UTF-8, which is a valid NULL.
-        else if (width_utf == 2 && characters.string[character_current] > 0xc0800000 && characters.string[character_current] <= 0xc0ff0000) {
-          found_invalid_utf = f_true;
-          invalid[character_current] = width_utf;
-        }
        }
  
-      if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 1, &previous_bytes, &previous_invalid, &column, &row)) {
+      if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 1, &previous_bytes, &previous_invalid, &column, &row) == f_true) {
          character_reset = f_true;
        }
  
@@ -153,12 +131,12 @@
          }
  
          if (width_utf > 2) {
-          if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 3, &previous_bytes, &previous_invalid, &column, &row)) {
+          if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 3, &previous_bytes, &previous_invalid, &column, &row) == f_true) {
              character_reset = f_true;
            }
  
            if (width_utf > 3) {
-            if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 4, &previous_bytes, &previous_invalid, &column, &row)) {
+            if (byte_dump_print_character_fragment(data, characters, invalid, width_utf, 4, &previous_bytes, &previous_invalid, &column, &row) == f_true) {
                character_reset = f_true;
              }
            }
@@ -525,19 +503,19 @@
        else if (f_utf_character_is_whitespace(characters.string[i]) == f_true) {
          printf("%s", byte_dump_sequence_space);
        }
-      else if (width_utf == 2 && characters.string[i] == 0xc0800000) {
-        // This is an "Overlong Null" and is a valid NULL character.
-        printf("%s", byte_dump_sequence_null);
+      else if (f_utf_character_is_control(characters.string[i]) == f_true) {
+        // print a space (or '.') for control characters.
+        if (data.presentation == byte_dump_presentation_classic) {
+          printf(".");
+        }
+        else {
+          printf(" ");
+        }
        }
        else if (width_utf == 2 && characters.string[i] == 0xd89d0000) {
          // U+061C
          printf(" ");
        }
-      else if (width_utf == 2 && characters.string[i] >= 0xc2800000 && characters.string[i] <= 0xc29f0000) {
-        // Use space to represent unprintable Latin-1 supplement control codes.
-        // 0xc2a00000 happens to be the non-breaking space character and is explicitly handled above.
-        printf(" ");
-      }
        else if (width_utf == 3 && characters.string[i] >= 0xefbfb000 && characters.string[i] <= 0xefbfbc00) {
          // Use space to represent Specials codes.
          // 0xefbfbd00 is excluded because it is printable (and is the "Replacement Character" code).
@@ -563,9 +541,6 @@
          // Use space to represent Supplemental Private Use Area-B codes.
          printf(" ");
        }
-      else if (characters.string[i] == f_utf_character_mask_bom) {
-        fl_color_print(f_standard_output, data.context.warning, data.context.reset, "%s", byte_dump_sequence_utf_bom);
-      }
        else if (width_utf == 1) {
          // print invalid placeholder for invalid UTF-8 widths.
          if (invalid[i]) {
@@ -593,6 +568,7 @@
            }
          }
  
+        // @todo: implement a function in f_utf, such as f_utf_is_combining(), for detecting these combining characters.
          // print a space for combining characters to combine into, thereby allowing it to be safely and readably displayed.
          if (width_utf == 2 && characters.string[i] >= 0xdea60000 && characters.string[i] <= 0xdeb00000) {
            // Thana combining codes: U+07A6 to U+07B0.
diff --git a/level_3/byte_dump/c/private-byte_dump.h b/level_3/byte_dump/c/private-byte_dump.h

index e0c8f06c06086c3fa14acfda6b7f214e8ca6bc50..0be072ceb8edc35cd1a0c299579f76c98815285d 100644 (file)
--- a/level_3/byte_dump/c/private-byte_dump.h
+++ b/level_3/byte_dump/c/private-byte_dump.h
@@ -60,6 +60,10 @@ extern "C" {
   *   The current row that the character is being printed on.
   *   When the max width is reached byte_dump_print_text() is called and this value is incremented.
   *
+ * @return
+ *   f_true is returned to designate that a reset is needed.
+ *   f_false is returned to designate that a reset is not needed.
+ *
   * @see byte_dump_print_text()
   */
  #ifndef _di_byte_dump_print_character_fragment_
author	Kevin Day <thekevinday@gmail.com>
	Wed, 18 Sep 2019 00:09:44 +0000 (19:09 -0500)
committer	Kevin Day <thekevinday@gmail.com>
	Wed, 18 Sep 2019 00:09:44 +0000 (19:09 -0500)
level_0/f_fss/c/fss.h		patch \| blob \| history
level_0/f_utf/c/utf.c		patch \| blob \| history
level_0/f_utf/c/utf.h		patch \| blob \| history
level_1/fl_console/c/console.h		patch \| blob \| history
level_1/fl_fss/c/fss.c		patch \| blob \| history
level_1/fl_fss/c/fss.h		patch \| blob \| history
level_1/fl_string/c/string.h		patch \| blob \| history
level_1/fl_utf/c/utf.h		patch \| blob \| history
level_3/byte_dump/c/byte_dump.h		patch \| blob \| history
level_3/byte_dump/c/private-byte_dump.c		patch \| blob \| history
level_3/byte_dump/c/private-byte_dump.h		patch \| blob \| history