Update: begin improving UTF-8

author Kevin Day <thekevinday@gmail.com>

Sat, 14 Sep 2019 00:38:52 +0000 (19:38 -0500)

committer Kevin Day <thekevinday@gmail.com>

Sat, 14 Sep 2019 00:38:52 +0000 (19:38 -0500)
author Kevin Day <thekevinday@gmail.com>
Sat, 14 Sep 2019 00:38:52 +0000 (19:38 -0500)
committer Kevin Day <thekevinday@gmail.com>
Sat, 14 Sep 2019 00:38:52 +0000 (19:38 -0500)
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c

index 5d4e425966af7713b3bd0bcdb71798742ab44f28..84027534cabda60eb9c6d5425903e043ea45f985 100644 (file)
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -4,809 +4,976 @@
  extern "C" {
  #endif
  
-#ifndef _di_f_utf_is_big_endian_
-  f_return_status f_utf_is_big_endian() {
-    uint16_t test_int = (0x01 << 8) | 0x02;
-    int8_t test_char[2] = {0x01, 0x02};
+#ifndef _di_f_utf_character_is_
+  // Classify a character by the width reported by f_macro_utf_character_width_is().
+  //
+  // Returns:
+  //   f_false when the width is 0.
+  //   f_invalid_utf (wrapped by f_status_set_error()) when the width is 1.
+  //   f_true for every other width.
+  f_return_status f_utf_character_is(const f_utf_character character) {
+    unsigned short width = f_macro_utf_character_width_is(character);
  
-    if (!memcmp(&test_int, test_char, 2)) {
-      return f_true;
+    if (width == 0) {
+      return f_false;
      }
  
-    return f_false;
+    // A width of 1 is what f_utf_character_is_fragment() reports as a fragment.
+    if (width == 1) {
+      // Build an error status here; f_status_is_error() only tests a status and would hide the failure.
+      return f_status_set_error(f_invalid_utf);
+    }
+
+    return f_true;
    }
-#endif // _di_f_utf_is_big_endian_
+#endif // _di_f_utf_character_is_
  
-#ifndef _di_f_utf_is_
-  f_return_status f_utf_is(const f_string character, const unsigned short max_width) {
-    #ifndef _di_level_0_parameter_checking_
-      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
-    #endif // _di_level_0_parameter_checking_
+#ifndef _di_f_utf_character_is_bom_
+  // Report whether the character equals f_utf_character_mask_bom.
+  //
+  // Returns:
+  //   f_true when the character equals f_utf_character_mask_bom.
+  //   f_invalid_utf (wrapped by f_status_set_error()) when the character's width is 1.
+  //   f_false otherwise.
+  f_return_status f_utf_character_is_bom(const f_utf_character character) {
+    if (character == f_utf_character_mask_bom) {
+      return f_true;
+    }
  
-    unsigned short width = f_macro_utf_byte_width_is(*character);
+    unsigned short width = f_macro_utf_character_width_is(character);
  
-    if (width == 0) {
-      return f_false;
-    }
-    else if (width == 1) {
-      return f_status_is_error(f_incomplete_utf);
+    if (width == 1) {
+      // Build an error status here; f_status_is_error() only tests a status and would hide the failure.
+      return f_status_set_error(f_invalid_utf);
      }
  
-    return f_true;
+    return f_false;
    }
-#endif // _di_f_utf_is_
-
-#ifndef _di_f_utf_is_bom_
-  f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width) {
-    #ifndef _di_level_0_parameter_checking_
-      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
-    #endif // _di_level_0_parameter_checking_
+#endif // _di_f_utf_character_is_bom_
  
-    unsigned short width = f_macro_utf_byte_width_is(*character);
+#ifndef _di_f_utf_character_is_control_
+  // Report whether the character is one of the control characters listed below.
+  //
+  // Returns:
+  //   f_true when the character matches one of the checked values or ranges.
+  //   f_invalid_utf (wrapped by f_status_set_error()) when the character's width is 1.
+  //   f_false otherwise.
+  f_return_status f_utf_character_is_control(const f_utf_character character) {
+    unsigned short width = f_macro_utf_character_width_is(character);
  
      if (width == 0) {
+      // Same extraction as f_utf_character_is_graph(): the byte handed to <ctype.h> is taken from the top of the value.
+      int8_t ascii = character >> 24;
+
+      if (iscntrl(ascii)) {
+        return f_true;
+      }
+
        return f_false;
      }
-    else if (width == 1) {
-      return f_status_is_error(f_incomplete_utf);
+
+    if (width == 1) {
+      // Build an error status here; f_status_is_error() only tests a status and would hide the failure.
+      return f_status_set_error(f_invalid_utf);
      }
  
-    if (width > max_width) {
-      return f_status_set_error(f_maybe);
+    if (width == 2) {
+      // Latin-1 Supplement: U+0080 to U+009F.
+      if (character >= 0xc2800000 && character <= 0xc29f0000) {
+        return f_true;
+      }
+
+      return f_false;
      }
  
      if (width == 3) {
-      if (!memcmp(character, f_utf_bom, width)) {
+      // @todo: these might not be "control characters" and instead be "marking characters" or "combining characters".
+      // Special: U+FFF9 to U+FFFB.
+      if (character >= 0xefbfb900 && character <= 0xefbfbb00) {
+        return f_true;
+      }
+
+      return f_false;
+    }
+
+    if (width == 4) {
+      // Tags: U+E0001 and U+E007F.
+      if (character == 0xf3a08081 || character == 0xf3a081bf) {
          return f_true;
        }
      }
  
      return f_false;
    }
-#endif // _di_f_utf_is_bom_
+#endif // _di_f_utf_character_is_control_
  
-#ifndef _di_f_utf_is_character_
-  f_return_status f_utf_is_character(const f_utf_character character) {
+#ifndef _di_f_utf_character_is_control_picture_
+  // Report whether the character is one of the control picture characters listed below.
+  //
+  // Returns:
+  //   f_true when the character matches one of the checked 3-byte values or ranges.
+  //   f_invalid_utf (wrapped by f_status_set_error()) when the character's width is 1.
+  //   f_false otherwise.
+  f_return_status f_utf_character_is_control_picture(const f_utf_character character) {
      unsigned short width = f_macro_utf_character_width_is(character);
  
      if (width == 0) {
+      // There are no control picture characters in ASCII.
        return f_false;
      }
-    else if (width == 1) {
+
+    if (width == 1) {
-      return f_status_is_error(f_invalid_utf);
+      // Build an error status here; f_status_is_error() only tests a status and would hide the failure.
+      return f_status_set_error(f_invalid_utf);
      }
  
-    return f_true;
+    // Every value checked below is a 3-byte sequence.
+    if (width != 3) {
+      return f_false;
+    }
+
+    // Control Pictures: U+2400 to U+2426.
+    if (character >= 0xe2908000 && character <= 0xe290a600) {
+      return f_true;
+    }
+
+    // Specials: U+FFFC to U+FFFD.
+    if (character == 0xefbfbc00 || character == 0xefbfbd00) {
+      return f_true;
+    }
+
+    return f_false;
    }
-#endif // _di_f_utf_is_
+#endif // _di_f_utf_character_is_control_picture_
  
-#ifndef _di_f_utf_is_graph_
-  f_return_status f_utf_is_graph(const f_string character, const unsigned short max_width) {
-    #ifndef _di_level_0_parameter_checking_
-      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
-    #endif // _di_level_0_parameter_checking_
+#ifndef _di_f_utf_character_is_fragment_
+  // Report whether the character is a fragment, meaning f_macro_utf_character_width_is() yields a width of exactly 1.
+  //
+  // Returns:
+  //   f_true when the width is 1.
+  //   f_false for every other width.
+  f_return_status f_utf_character_is_fragment(const f_utf_character character) {
+    unsigned short character_width = f_macro_utf_character_width_is(character);
  
-    if (f_macro_utf_byte_width_is(*character) == 0) {
-      if (isgraph(*character)) {
+    if (character_width != 1) {
+      return f_false;
+    }
+
+    return f_true;
+  }
+#endif // _di_f_utf_character_is_fragment_
+
+#ifndef _di_f_utf_character_is_graph_
+  // Report whether the character is a graph character.
+  //
+  // For width 0 the decision is delegated to isgraph(). For wider characters, anything that is neither a control
+  // character nor whitespace (per the sibling checks called below) is treated as a graph character.
+  //
+  // Returns:
+  //   f_true when the character is considered a graph character.
+  //   f_invalid_utf (wrapped by f_status_set_error()) when the character's width is 1.
+  //   f_false otherwise.
+  f_return_status f_utf_character_is_graph(const f_utf_character character) {
+    unsigned short width = f_macro_utf_character_width_is(character);
+
+    if (width == 0) {
+      // The byte handed to <ctype.h> is taken from the top of the value.
+      int8_t ascii = character >> 24;
+
+      if (isgraph(ascii)) {
          return f_true;
        }
  
        return f_false;
      }
  
-    // For now, just assume that any non-whitespace, non-substitute UTF-8 character is a graph.
-    f_status status = f_utf_is_space(character, max_width);
-
-    if (f_status_is_error(status)) {
-      return status;
+    if (width == 1) {
+      // Build an error status here; f_status_is_error() only tests a status and would hide the failure.
+      return f_status_set_error(f_invalid_utf);
      }
-    else if (status == f_true) {
+
+    if (f_utf_character_is_control(character) == f_true) {
        return f_false;
      }
  
-    if (f_utf_is_bom(character, max_width) == f_true) {
+    if (f_utf_character_is_whitespace(character) == f_true) {
        return f_false;
      }
  
+    // @todo: does this need to check combining and marking characters? or are those still considered graph characters?
+
      return f_true;
    }
-#endif // _di_f_utf_is_graph_
-
-#ifndef _di_f_utf_is_space_
-  f_return_status f_utf_is_space(const f_string character, const unsigned short max_width) {
-    #ifndef _di_level_0_parameter_checking_
-      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
-    #endif // _di_level_0_parameter_checking_
+#endif // _di_f_utf_character_is_graph_
  
-    unsigned short width = f_macro_utf_byte_width_is(*character);
+#ifndef _di_f_utf_character_is_valid_
+  f_return_status f_utf_character_is_valid(const f_utf_character character) {
+    unsigned short width = f_macro_utf_character_width_is(character);
  
-    if (width == 0) {
-      if (isspace(*character)) {
-        return f_true;
-      }
+    if (width == 0) return f_false;
  
-      return f_false;
-    }
-    else if (width == 1) {
-      return f_status_is_error(f_incomplete_utf);
+    if (width == 1) {
+      return f_status_is_error(f_invalid_utf);
      }
  
-    if (width > max_width) {
-      return f_status_set_error(f_maybe);
-    }
+    // @todo: check every single character that is not allowed but is represented in UTF-8 and return false.
  
      if (width == 2) {
-      if (!memcmp(character, f_utf_space_no_break, width)) {
-        return f_true;
+      // Syriac: U+070E, U+074B, U+074C.
+      if (character == 0xdc8e0000 || character == 0xdd8b0000 || character == 0xdd8c0000) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_line_feed_reverse, width)) {
-        return f_true;
+      // Thaana: U+07B2 to U+07BF.
+      if (character >= 0xdeb20000 && character <= 0xdebf0000) {
+        return f_false;
        }
+    }
  
-      if (!memcmp(character, f_utf_space_line_next, width)) {
+    if (width == 3) {
+      // consider all private use codes as valid, U+E000 to U+F8FF.
+      if (character >= 0xee808000 && character <= 0xefa3bf00) {
          return f_true;
        }
  
-      if (!memcmp(character, f_utf_substitute_middle_dot, width)) {
-        return f_true;
+      // Sinhala: U+0D97 to U+0D99.
+      if (character >= 0xe0b69700 && character <= 0xe0b69900) {
+        return f_false;
        }
  
-      return f_false;
-    }
+      // Sinhala: U+0DC7 to U+0DC9.
+      if (character >= 0xe0b78700 && character <= 0xe0b78900) {
+        return f_false;
+      }
  
-    if (width == 3) {
-      if (!memcmp(character, f_utf_space_no_break_narrow, width)) {
-        return f_true;
+      // Sinhala: U+0DCB to U+0DCE.
+      if (character >= 0xe0b78b00 && character <= 0xe0b78e00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_en, width)) {
-        return f_true;
+      // Sinhala: U+0DE0 to U+0DE5.
+      if (character >= 0xe0b7a000 && character <= 0xe0b7a500) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_en_quad, width)) {
-        return f_true;
+      // Sinhala: U+0DF5 to U+0DFF.
+      if (character >= 0xe0b7b500 && character <= 0xe0b7bf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_en_quad, width)) {
-        return f_true;
+      // Sinhala: U+0D80, U+0D81, U+0D84.
+      if (character == 0xe0b68000 || character == 0xe0b68100 || character == 0xe0b68400) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em, width)) {
-        return f_true;
+      // Sinhala: U+0DB2, U+0DBC, U+0DBE.
+      if (character == 0xe0b6b200 || character == 0xe0b6bc00 || character == 0xe0b6be00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em_quad, width)) {
-        return f_true;
+      // Sinhala: U+0DBF, U+0DD5, U+0DD7.
+      if (character == 0xe0b6bf00 || character == 0xe0b79500 || character == 0xe0b79700) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em_per_three, width)) {
-        return f_true;
+      // Sinhala: U+0DF0, U+0DF1.
+      if (character == 0xe0b7b000 || character == 0xe0b7b100) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em_per_four, width)) {
-        return f_true;
+      // Small Form Variants: U+FE6C to U+FE6F.
+      if (character >= 0xefb9ac00 && character <= 0xefb9af00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em_per_six, width)) {
-        return f_true;
+      // Small Form Variants: U+FE53, U+FE67.
+      if (character == 0xefb99300 || character == 0xefb9a700) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_figure, width)) {
-        return f_true;
+      // Sundanese Supplement: U+1CC8 to U+1CCF.
+      if (character >= 0xe1b38800 && character <= 0xe1b38f00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_punctuation, width)) {
-        return f_true;
+      // Superscripts and Subscripts: U+2072, U+2073, U+208F.
+      if (character == 0xe281b200 || character == 0xe281b300 || character == 0xe2828f00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_thin, width)) {
-        return f_true;
+      // Superscripts and Subscripts: U+209D to U+209F.
+      if (character >= 0xe2829d00 && character <= 0xe2829f00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_hair, width)) {
-        return f_true;
+      // Supplemental Punctuation: U+2E45 to U+2E7F.
+      if (character >= 0xe2b98500 && character <= 0xe2b9bf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_separator_line, width)) {
-        return f_true;
+      // Syloti Nagri: U+A82C to U+A82F.
+      if (character >= 0xeaa0ac00 && character <= 0xeaa0af00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_separator_paragraph, width)) {
-        return f_true;
+      // Tagalog: U+1715 to U+171f.
+      if (character >= 0xe19c9500 && character <= 0xe19c9f00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_ogham, width)) {
-        return f_true;
+      // Tagalog: U+170D
+      if (character == 0xe19c8d00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_ideographic, width)) {
-        return f_true;
+      // Tagbanwa: U+1774 to U+177f.
+      if (character >= 0xe19db400 && character <= 0xe19dbf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_medium_mathematical, width)) {
-        return f_true;
+      // Tagbanwa: U+176D, U+1771
+      if (character == 0xe19dad00 || character == 0xe19db100) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_substitute_symbol_blank, width)) {
-        return f_true;
+      // Tai Lee: U+196E, U+196F.
+      if (character == 0xe1a5ae00 || character == 0xe1a5ef00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_substitute_symbol_space, width)) {
-        return f_true;
+      // Tai Lee: U+1975 to U+197F.
+      if (character >= 0xe1a5b500 && character <= 0xe1a5bf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_substitute_open_box, width)) {
-        return f_true;
+      // Tai Tham: U+1A7D to U+1A7E.
+      if (character >= 0xe1a9bd00 && character <= 0xe1a9be00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_substitute_open_box_shouldered, width)) {
-        return f_true;
+      // Tai Tham: U+1A8A to U+1A8F.
+      if (character >= 0xe1aa8a00 && character <= 0xe1aa8f00) {
+        return f_false;
        }
  
-      return f_false;
-    }
+      // Tai Tham: U+1A9A to U+1A9F.
+      if (character >= 0xe1aa9a00 && character <= 0xe1aa9f00) {
+        return f_false;
+      }
  
-    return f_false;
-  }
-#endif // _di_f_utf_is_space_
+      // Tai Tham: U+1AAE to U+1AAF.
+      if (character >= 0xe1aaae00 && character <= 0xe1aaaf00) {
+        return f_false;
+      }
  
-#ifndef _di_f_utf_is_substitute_
-  f_return_status f_utf_is_substitute(const f_string character, const unsigned short max_width) {
-    #ifndef _di_level_0_parameter_checking_
-      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
-    #endif // _di_level_0_parameter_checking_
+      // Tai Tham: U+1A5F.
+      if (character == 0xe1a99f00) {
+        return f_false;
+      }
  
-    unsigned short width = f_macro_utf_byte_width_is(*character);
+      // Tai Viet: U+AAC3 to U+AADA.
+      if (character >= 0xeaab8300 && character <= 0xeaab9a00) {
+        return f_false;
+      }
  
-    if (width == 0) {
-      // there is no substitute character in ASCII.
-      return f_false;
-    }
-    else if (width == 1) {
-      return f_status_is_error(f_incomplete_utf);
-    }
+      // Tamil: U+0B80, U+0B81.
+      if (character == 0xe0ae8000 || character == 0xe0ae8100) {
+        return f_false;
+      }
  
-    if (width > max_width) {
-      return f_status_set_error(f_maybe);
-    }
+      // Tamil: U+0B8B to U+0B8D.
+      if (character >= 0xe0ae8b00 && character <= 0xe0ae8d00) {
+        return f_false;
+      }
  
-    if (width == 2) {
-      if (!memcmp(character, f_utf_substitute_middle_dot, width)) {
-        return f_true;
+      // Tamil: U+0B96 to U+0B98.
+      if (character >= 0xe0ae9600 && character <= 0xe0ae9800) {
+        return f_false;
        }
  
-      return f_false;
-    }
+      // Tamil: U+0BA0 to U+0BA2.
+      if (character >= 0xe0aea000 && character <= 0xe0aea200) {
+        return f_false;
+      }
  
-    if (width == 3) {
-      if (!memcmp(character, f_utf_substitute_symbol_blank, width)) {
-        return f_true;
+      // Tamil: U+0BA5 to U+0BA7.
+      if (character >= 0xe0aea500 && character <= 0xe0aea700) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_substitute_symbol_space, width)) {
-        return f_true;
+      // Tamil: U+0BAB to U+0BAD.
+      if (character >= 0xe0aeab00 && character <= 0xe0aead00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_substitute_open_box, width)) {
-        return f_true;
+      // Tamil: U+0BBA to U+0BBD.
+      if (character >= 0xe0aeba00 && character <= 0xe0aebd00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_substitute_open_box_shouldered, width)) {
-        return f_true;
+      // Tamil: U+0BC3 to U+0BC5.
+      if (character >= 0xe0af8300 && character <= 0xe0af8500) {
+        return f_false;
        }
  
-      return f_false;
-    }
+      // Tamil: U+0BCE, U+0BCF.
+      if (character == 0xe0af8e00 || character == 0xe0af8f00) {
+        return f_false;
+      }
  
-    return f_false;
-  }
-#endif // _di_f_utf_is_substitute_
+      // Tamil: U+0BD1 to U+0BD6.
+      if (character >= 0xe0af9100 && character <= 0xe0af9600) {
+        return f_false;
+      }
  
-#ifndef _di_f_utf_is_whitespace_
-  f_return_status f_utf_is_whitespace(const f_string character, const unsigned short max_width) {
-    #ifndef _di_level_0_parameter_checking_
-      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
-    #endif // _di_level_0_parameter_checking_
+      // Tamil: U+0BD8 to U+0BE5.
+      if (character >= 0xe0af9800 && character <= 0xe0af9800) {
+        return f_false;
+      }
  
-    unsigned short width = f_macro_utf_byte_width_is(*character);
+      // Tamil: U+0BFB to U+0BFF.
+      if (character >= 0xe0afbb00 && character <= 0xe0afbf00) {
+        return f_false;
+      }
  
-    if (width == 0) {
-      if (isspace(*character)) {
-        return f_true;
+      // Tamil: U+0B84, U+0B91, U+0BC9.
+      if (character == 0xe0ae8400 || character == 0xe0ae9100 || character == 0xe0af8900) {
+        return f_false;
        }
  
-      return f_false;
-    }
-    else if (width == 1) {
-      return f_status_is_error(f_incomplete_utf);
-    }
+      // Telugu: U+0C3A to U+0C3C.
+      if (character >= 0xe0b0ba00 && character <= 0xe0b0bc00) {
+        return f_false;
+      }
  
-    if (width > max_width) {
-      return f_status_set_error(f_maybe);
-    }
+      // Telugu: U+0C4E to U+0C54.
+      if (character >= 0xe0b18e00 && character <= 0xe0b19400) {
+        return f_false;
+      }
  
-    if (width == 2) {
-      if (!memcmp(character, f_utf_space_no_break, width)) {
-        return f_true;
+      // Telugu: U+0C5B to U+0C5F.
+      if (character >= 0xe0b19b00 && character <= 0xe0b19f00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_line_feed_reverse, width)) {
-        return f_true;
+      // Telugu: U+0C64, U+0C65.
+      if (character == 0xe0b1a400 || character == 0xe0b1a500) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_line_next, width)) {
-        return f_true;
+      // Telugu: U+0C70 to U+0C77.
+      if (character >= 0xe0b1b000 && character <= 0xe0b1b700) {
+        return f_false;
        }
  
-      return f_false;
-    }
+      // Telugu: U+0C04, U+0C0D, U+0C29.
+      if (character == 0xe0b08400 || character == 0xe0b08d00 || character == 0xe0b0a900) {
+        return f_false;
+      }
  
-    if (width == 3) {
-      if (!memcmp(character, f_utf_space_no_break_narrow, width)) {
-        return f_true;
+      // Telugu: U+0C45, U+0C49, U+0C57.
+      if (character == 0xe0b18500 || character == 0xe0b18900 || character == 0xe0b19700) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_en, width)) {
-        return f_true;
+      // Thai: U+0E5C to U+0E7F.
+      if (character >= 0xe0b99c00 && character <= 0xe0b9bf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_en_quad, width)) {
-        return f_true;
+      // Thai: U+0E3B to U+0E3E.
+      if (character >= 0xe0b8bb00 && character <= 0xe0b8be00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_en_quad, width)) {
-        return f_true;
+      // Thai: U+0E00.
+      if (character == 0xe0b88000) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em, width)) {
-        return f_true;
+      // Tibetan: U+0FDB to U+0FFF.
+      if (character >= 0xe0bf9b00 && character <= 0xe0bfbf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em_quad, width)) {
-        return f_true;
+      // Tibetan: U+0F6D to U+0F70.
+      if (character >= 0xe0bdad00 && character <= 0xe0bdb000) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em_per_three, width)) {
-        return f_true;
+      // Tibetan: U+0F48, U+0F98, U+0FBD, U+0FCD
+      if (character == 0xe0bd8800 || character == 0xe0be9800 || character == 0xe0bebd00 || character == 0xe0bf8d) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em_per_four, width)) {
-        return f_true;
+      // Tifinagh: U+2D68 to U+2D6E.
+      if (character >= 0xe2b5a800 && character <= 0xe2b5ae00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_em_per_six, width)) {
-        return f_true;
+      // Tifinagh: U+2D71 to U+2D7E.
+      if (character >= 0xe2b5b100 && character <= 0xe2b5be00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_figure, width)) {
-        return f_true;
+      // Unified Canadian Aboriginal Syllabics Extended: U+18F6 to U+18FF.
+      if (character >= 0xe1a3b600 && character <= 0xe1a3bf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_punctuation, width)) {
-        return f_true;
+      // Vai: U+A62C to U+A63F.
+      if (character >= 0xea98ac00 && character <= 0xea98bf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_thin, width)) {
-        return f_true;
+      // Vedic Extensions: U+1CF7 and U+1CFA to U+1CFF.
+      if (character == 0xe1b3b700 || character >= 0xe1b3ba00 && character <= 0xe1b3bf00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_hair, width)) {
-        return f_true;
+      // Vertical Forms: U+FE10 to U+FE1F.
+      if (character >= 0xefb89000 && character <= 0xefb89f00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_separator_line, width)) {
-        return f_true;
+      // Yi Radicals: U+A4C7 to U+A4CF.
+      if (character >= 0xea938700 && character <= 0xea938f00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_separator_paragraph, width)) {
-        return f_true;
+      // Yi Syllables: U+A48D to U+A48F.
+      if (character >= 0xea928d00 && character <= 0xea928f00) {
+        return f_false;
        }
  
-      if (!memcmp(character, f_utf_space_ogham, width)) {
-        return f_true;
+      // Specials: U+FFF0 to U+FFF8.
+      if (character >= 0xefbfb000 && character <= 0xefbfb800) {
+        return f_false;
+      }
+
+      // Specials: U+FFFE to U+FFFF.
+      if (character >= 0xefbfbe00 && character <= 0xefbfbf00) {
+        return f_false;
        }
+    }
  
-      if (!memcmp(character, f_utf_space_ideographic, width)) {
+    if (width == 4) {
+      // Consider all private use codes as valid, U+F0000 to U+FFFFF.
+      if (character >= 0xf3b08080 && character <= 0xf3bfbfbf) {
          return f_true;
        }
  
-      if (!memcmp(character, f_utf_space_medium_mathematical, width)) {
+      // Consider all private use codes as valid, U+100000 to U+10FFFF.
+      if (character >= 0xf4808080 && character <= 0xf48fbfbf) {
          return f_true;
        }
  
-      return f_false;
-    }
+      // Sharada: U+111CE, U+111CF.
+      if (character == 0xf091878e || character == 0xf091878f) {
+        return f_false;
+      }
  
-    return f_false;
-  }
-#endif // _di_f_utf_is_whitespace_
+      // Shorthand Format Controls: U+1BCA4 to U+1BCAF.
+      if (character >= 0xf09bb2a4 && character <= 0xf09bb2af) {
+        return f_false;
+      }
  
-#ifndef _di_f_utf_is_bom_character_
-  f_return_status f_utf_is_bom_character(const f_utf_character character) {
-    if (character == f_utf_character_mask_bom) {
-      return f_true;
-    }
+      // Siddham: U+115DE to U+115FF.
+      if (character >= 0xf091979e && character <= 0xf09197bf) {
+        return f_false;
+      }
  
-    return f_false;
-  }
-#endif // _di_f_utf_is_bom_character_
+      // Siddham: U+115B6, U+115B7.
+      if (character == 0xf09196b6 || character == 0xf09196b7) {
+        return f_false;
+      }
  
-#ifndef _di_f_utf_is_graph_character_
-  f_return_status f_utf_is_graph_character(const f_utf_character character) {
-    // for now, just assume that any non-whitespace, non-substitute utf-8 character is a graph.
-    f_status status = f_utf_is_space_character(character);
+      // Sinhala Archaic Numbers: U+111F5 to U+111FF.
+      if (character >= 0xf09187b5 && character <= 0xf09187bf) {
+        return f_false;
+      }
  
-    if (f_status_is_error(status)) {
-      return status;
-    }
-    else if (status == f_true) {
-      return f_false;
-    }
+      // Sinhala Archaic Numbers: U+111E0.
+      if (character == 0xf09187a0) {
+        return f_false;
+      }
  
-    if (f_utf_is_bom_character(character) == f_true) {
-      return f_false;
-    }
-
-    return f_true;
-  }
-#endif // _di_f_utf_is_graph_character_
-
-#ifndef _di_f_utf_is_space_character_
-  f_return_status f_utf_is_space_character(const f_utf_character character) {
-    unsigned short width = f_macro_utf_character_width_is(character);
-
-    if (width == 0) {
-      int8_t ascii = character >> 24;
-
-      if (isspace(ascii)) {
-        return f_true;
+      // Sora Sompeng: U+110E9 to U+110EF.
+      if (character >= 0xf09183a9 && character <= 0xf09183af) {
+        return f_false;
        }
  
-      return f_false;
-    }
-    else if (width == 1) {
-      return f_status_is_error(f_invalid_utf);
-    }
-
-    f_bool is_big_endian = f_utf_is_big_endian();
-
-    if (width == 2) {
-      uint16_t utf = 0;
-      if (is_big_endian) {
-        utf = (uint16_t) (character >> 16);
-      }
-      else {
-        utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character);
+      // Sora Sompeng: U+110FA to U+110FF.
+      if (character >= 0xf09183ba && character <= 0xf09183bf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_no_break, width)) {
-        return f_true;
+      // Supplemental Arrows-C: U+1F80C to U+1F80F.
+      if (character >= 0xf09fa08c && character <= 0xf09fa08f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_line_feed_reverse, width)) {
-        return f_true;
+      // Supplemental Arrows-C: U+1F848 to U+1F84F.
+      if (character >= 0xf09fa188 && character <= 0xf09fa18f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_line_next, width)) {
-        return f_true;
+      // Supplemental Arrows-C: U+1F85A to U+1F85F.
+      if (character >= 0xf09fa19a && character <= 0xf09fa19f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_substitute_middle_dot, width)) {
-        return f_true;
+      // Supplemental Arrows-C: U+1F8AE to U+1F8FF.
+      if (character >= 0xf09fa2ae && character <= 0xf09fa3bf) {
+        return f_false;
        }
  
-      return f_false;
-    }
-
-    if (width == 3) {
-      uint32_t utf = 0;
-      if (is_big_endian) {
-        utf = character;
-      }
-      else {
-        utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | (f_macro_utf_character_to_char_1(character) << 8);
+      // Supplemental Symbols and Pictographs: U+1F900 to U+1F90F.
+      if (character >= 0xf09fa480 && character <= 0xf09fa48f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_no_break_narrow, width)) {
-        return f_true;
+      // Supplemental Symbols and Pictographs: U+1F928 to U+1F92F.
+      if (character >= 0xf09fa4a8 && character <= 0xf09fa4af) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_en, width)) {
-        return f_true;
+      // Supplemental Symbols and Pictographs: U+1F94C to U+1F94F.
+      if (character >= 0xf09fa58c && character <= 0xf09fa58f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_en_quad, width)) {
-        return f_true;
+      // Supplemental Symbols and Pictographs: U+1F960 to U+1F97F.
+      if (character >= 0xf09fa5a0 && character <= 0xf09fa5bf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_en_quad, width)) {
-        return f_true;
+      // Supplemental Symbols and Pictographs: U+1F992 to U+1F9BF.
+      if (character >= 0xf09fa692 && character <= 0xf09fa6bf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_em, width)) {
-        return f_true;
+      // Supplemental Symbols and Pictographs: U+1F9C1 to U+1F9FF.
+      if (character >= 0xf09fa781 && character <= 0xf09fa7bf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_em_quad, width)) {
-        return f_true;
+      // Supplemental Symbols and Pictographs: U+1F91F, U+1F931, U+1F932.
+      if (character == 0xf09fa49f || character == 0xf09fa4b1 || character == 0xf09fa4b2) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_em_per_three, width)) {
-        return f_true;
+      // Supplemental Symbols and Pictographs: U+1F93F, U+1F95F.
+      if (character == 0xf09fa4bf || character == 0xf09fa59f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_em_per_four, width)) {
-        return f_true;
+      // Sutton SignWriting: U+1DA8C to U+1DA9A.
+      if (character >= 0xf09daa8c && character <= 0xf09daa9a) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_em_per_six, width)) {
-        return f_true;
+      // Tags: U+E0000, U+E0002 to U+E001F.
+      if (character == 0xf3a08080 || character >= 0xf3a08082 && character <= 0xf3a081bf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_figure, width)) {
-        return f_true;
+      // Tai Xuan Jing Symbols: U+1D357 to U+1D35F.
+      if (character >= 0xf09d8d97 && character <= 0xf09d8d9f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_punctuation, width)) {
-        return f_true;
+      // Takri: U+116B8 to U+116BF.
+      if (character >= 0xf0919ab8 && character <= 0xf0919abf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_thin, width)) {
-        return f_true;
+      // Takri: U+116CA to U+116CF.
+      if (character >= 0xf0919b8a && character <= 0xf0919b8f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_hair, width)) {
-        return f_true;
+      // Tangut: U+187ED to U+187FF.
+      if (character >= 0xf0989fad && character <= 0xf0989fbf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_separator_line, width)) {
-        return f_true;
+      // Tangut Components: U+18AF3 to U+18AFF.
+      if (character >= 0xf098abb3 && character <= 0xf098abbf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_separator_paragraph, width)) {
-        return f_true;
+      // Tirhuta: U+114C8 to U+114CF.
+      if (character >= 0xf0919388 && character <= 0xf091938f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_ogham, width)) {
-        return f_true;
+      // Tirhuta: U+114DA to U+114DF.
+      if (character >= 0xf091939a && character <= 0xf091939f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_ideographic, width)) {
-        return f_true;
+      // Transport and Map Symbols: U+1F6D3 to U+1F6DF.
+      if (character >= 0xf09f9b93 && character <= 0xf09f9b9f) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_space_medium_mathematical, width)) {
-        return f_true;
+      // Transport and Map Symbols: U+1F6ED to U+1F6EF.
+      if (character >= 0xf09f9bad && character <= 0xf09f9baf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_substitute_symbol_blank, width)) {
-        return f_true;
+      // Transport and Map Symbols: U+1F6F7 to U+1F6FF.
+      if (character >= 0xf09f9bb7 && character <= 0xf09f9bbf) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_substitute_symbol_space, width)) {
-        return f_true;
+      // Ugaritic: U+1039E.
+      if (character == 0xf0908e9e) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_substitute_open_box, width)) {
-        return f_true;
+      // Warang Citi: U+118F3 to U+118FE.
+      if (character >= 0xf091a3b3 && character <= 0xf091a3be) {
+        return f_false;
        }
  
-      if (!memcmp(&utf, f_utf_substitute_open_box_shouldered, width)) {
-        return f_true;
+      // Unicode (and therefore UTF-8) does not support representing any character greater than this (U+10FFFF).
+      if (character > 0xf48fbfbf) {
+        return f_false;
        }
-
-      return f_false;
      }
  
-    return f_false;
+    return f_true;
    }
-#endif // _di_f_utf_is_space_character_
+#endif // _di_f_utf_character_is_value_
  
-#ifndef _di_f_utf_is_substitute_character_
-  f_return_status f_utf_is_substitute_character(const f_utf_character character) {
+#ifndef _di_f_utf_character_is_whitespace_
+  f_return_status f_utf_character_is_whitespace(const f_utf_character character) {
      unsigned short width = f_macro_utf_character_width_is(character);
  
      if (width == 0) {
-      // there is no substitute character in ASCII.
+      int8_t ascii = character >> 24;
+
+      if (isspace(ascii)) {
+        return f_true;
+      }
+
        return f_false;
      }
-    else if (width == 1) {
+
+    if (width == 1) {
        return f_status_is_error(f_invalid_utf);
      }
  
-    f_bool is_big_endian = f_utf_is_big_endian();
+    // Latin-1 Supplement: U+00A0, U+00AD.
+    if (character == 0xc2a00000 || character == 0xc2ad0000) {
+      return f_true;
+    }
  
-    if (width == 2) {
-      uint16_t utf = 0;
-      if (is_big_endian) {
-        utf = (uint16_t) (character >> 16);
-      }
-      else {
-        utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character);
-      }
+    // Tags: U+E0020.
+    if (character == 0xf3a08080) {
+      return f_true;
+    }
  
-      if (!memcmp(&utf, f_utf_substitute_middle_dot, width)) {
-        return f_true;
-      }
+    return f_false;
+  }
+#endif // _di_f_utf_character_is_whitespace_
  
-      return f_false;
+#ifndef _di_f_utf_character_to_char_
+  f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, unsigned short *max_width) {
+    #ifndef _di_level_0_parameter_checking_
+      if (utf_character == 0) return f_status_set_error(f_invalid_parameter);
+      if (max_width == 0 && *character != 0) return f_status_set_error(f_invalid_parameter);
+      if (max_width != 0 && *character == 0) return f_status_set_error(f_invalid_parameter);
+      if (max_width != 0 && *max_width > 4) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    f_status status = f_none;
+
+    unsigned short width = f_macro_utf_character_width_is(utf_character);
+
+    if (max_width == 0) {
+      f_macro_string_new(status, *character, width);
+
+      if (f_status_is_error(status)) return status;
+
+      width = 1;
+      *max_width = 1;
+    }
+    else if (width == 1) {
+      return f_status_is_error(f_invalid_utf);
+    }
+    else if (width > *max_width) {
+      return f_status_set_error(f_failure);
      }
  
-    if (width == 3) {
+    *max_width = width;
+
+    if (f_utf_is_big_endian()) {
+      memcpy(*character, &utf_character, sizeof(int8_t) * width);
+    }
+    else {
        uint32_t utf = 0;
-      if (is_big_endian) {
-        utf = character;
+
+      if (width == 1) {
+        utf = f_macro_utf_character_to_char_1(utf_character) << 24;
        }
-      else {
-        utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | (f_macro_utf_character_to_char_1(character) << 8);
+      else if (width == 2) {
+        utf = (f_macro_utf_character_to_char_2(utf_character) << 24) | (f_macro_utf_character_to_char_1(utf_character) << 16);
        }
-
-      if (!memcmp(&utf, f_utf_substitute_symbol_blank, width)) {
-        return f_true;
+      else if (width == 3) {
+        utf = (f_macro_utf_character_to_char_3(utf_character) << 24) | (f_macro_utf_character_to_char_2(utf_character) << 16) | (f_macro_utf_character_to_char_1(utf_character) << 8);
        }
-
-      if (!memcmp(&utf, f_utf_substitute_symbol_space, width)) {
-        return f_true;
+      else if (width == 4) {
+        utf = (f_macro_utf_character_to_char_4(utf_character) << 24) | (f_macro_utf_character_to_char_3(utf_character) << 16) | (f_macro_utf_character_to_char_2(utf_character) << 8) | f_macro_utf_character_to_char_1(utf_character);
        }
  
-      if (!memcmp(&utf, f_utf_substitute_open_box, width)) {
-        return f_true;
-      }
+      memcpy(*character, &utf, sizeof(int8_t) * width);
+    }
  
-      if (!memcmp(&utf, f_utf_substitute_open_box_shouldered, width)) {
-        return f_true;
-      }
+    return f_none;
+  }
+#endif // _di_f_utf_character_to_char_
  
-      return f_false;
+#ifndef _di_f_utf_is_big_endian_
+  f_return_status f_utf_is_big_endian() {
+    uint16_t test_int = (0x01 << 8) | 0x02;
+    int8_t test_char[2] = {0x01, 0x02};
+
+    if (!memcmp(&test_int, test_char, 2)) {
+      return f_true;
      }
  
      return f_false;
    }
-#endif // _di_f_utf_is_substitute_character_
+#endif // _di_f_utf_is_big_endian_
  
-#ifndef _di_f_utf_is_whitespace_character_
-  f_return_status f_utf_is_whitespace_character(const f_utf_character character) {
-    unsigned short width = f_macro_utf_character_width_is(character);
+#ifndef _di_f_utf_is_
+  f_return_status f_utf_is(const f_string character, const unsigned short max_width) {
+    #ifndef _di_level_0_parameter_checking_
+      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    unsigned short width = f_macro_utf_byte_width_is(*character);
  
      if (width == 0) {
-      int8_t ascii = character >> 24;
+      return f_false;
+    }
  
-      if (isspace(ascii)) {
-        return f_true;
-      }
+    if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
+
+    return f_true;
+  }
+#endif // _di_f_utf_is_
+
+#ifndef _di_f_utf_is_bom_
+  f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width) {
+    #ifndef _di_level_0_parameter_checking_
+      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    unsigned short width = f_macro_utf_byte_width_is(*character);
  
+    if (width == 0) {
        return f_false;
      }
-    else if (width == 1) {
-      return f_status_is_error(f_invalid_utf);
-    }
  
-    f_bool is_big_endian = f_utf_is_big_endian();
+    if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
  
-    if (width == 2) {
-      uint16_t utf = 0;
-      if (is_big_endian) {
-        utf = (uint16_t) (character >> 16);
-      }
-      else {
-        utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character);
-      }
+    if (width > max_width) {
+      return f_status_set_error(f_maybe);
+    }
  
-      if (!memcmp(&utf, f_utf_space_no_break, width)) {
+    if (width == 3) {
+      if (!memcmp(character, f_utf_bom, width)) {
          return f_true;
        }
+    }
  
-      if (!memcmp(&utf, f_utf_space_line_feed_reverse, width)) {
-        return f_true;
-      }
+    return f_false;
+  }
+#endif // _di_f_utf_is_bom_
+
+#ifndef _di_f_utf_is_control_
+  f_return_status f_utf_is_control(const f_string character, const unsigned short max_width) {
+    #ifndef _di_level_0_parameter_checking_
+      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_0_parameter_checking_
  
-      if (!memcmp(&utf, f_utf_space_line_next, width)) {
+    if (f_macro_utf_byte_width_is(*character) == 0) {
+      if (iscntrl(*character)) {
          return f_true;
        }
  
        return f_false;
      }
  
-    if (width == 3) {
-      uint32_t utf = 0;
-      if (is_big_endian) {
-        utf = character;
-      }
-      else {
-        utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | (f_macro_utf_character_to_char_1(character) << 8);
-      }
+    if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
  
-      if (!memcmp(&utf, f_utf_space_no_break_narrow, width)) {
-        return f_true;
-      }
+    f_utf_character character_utf = 0;
+    f_status status = 0;
  
-      if (!memcmp(&utf, f_utf_space_en, width)) {
-        return f_true;
-      }
+    status = f_utf_char_to_control(character, max_width, &character_utf);
  
-      if (!memcmp(&utf, f_utf_space_en_quad, width)) {
-        return f_true;
-      }
+    if (status != f_none) return status;
  
-      if (!memcmp(&utf, f_utf_space_en_quad, width)) {
-        return f_true;
-      }
+    return f_utf_character_is_control(character);
+  }
+#endif // _di_f_utf_is_control_
  
-      if (!memcmp(&utf, f_utf_space_em, width)) {
-        return f_true;
-      }
+#ifndef _di_f_utf_is_control_picture_
+  f_return_status f_utf_is_control_picture(const f_string character, const unsigned short max_width) {
+    #ifndef _di_level_0_parameter_checking_
+      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_0_parameter_checking_
  
-      if (!memcmp(&utf, f_utf_space_em_quad, width)) {
-        return f_true;
-      }
+    // There are no ASCII control pictures.
+    if (f_macro_utf_byte_width_is(*character) == 0) {
+      return f_false;
+    }
  
-      if (!memcmp(&utf, f_utf_space_em_per_three, width)) {
-        return f_true;
-      }
+    if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
  
-      if (!memcmp(&utf, f_utf_space_em_per_four, width)) {
-        return f_true;
-      }
+    f_utf_character character_utf = 0;
+    f_status status = 0;
  
-      if (!memcmp(&utf, f_utf_space_em_per_six, width)) {
-        return f_true;
-      }
+    status = f_utf_char_to_character(character, max_width, &character_utf);
  
-      if (!memcmp(&utf, f_utf_space_figure, width)) {
-        return f_true;
-      }
+    if (status != f_none) return status;
  
-      if (!memcmp(&utf, f_utf_space_punctuation, width)) {
-        return f_true;
-      }
+    return f_utf_character_is_control_picture(character);
+  }
+#endif // _di_f_utf_is_control_picture_
  
-      if (!memcmp(&utf, f_utf_space_thin, width)) {
-        return f_true;
-      }
+#ifndef _di_f_utf_is_graph_
+  f_return_status f_utf_is_graph(const f_string character, const unsigned short max_width) {
+    #ifndef _di_level_0_parameter_checking_
+      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_0_parameter_checking_
  
-      if (!memcmp(&utf, f_utf_space_hair, width)) {
+    if (f_macro_utf_byte_width_is(*character) == 0) {
+      if (isgraph(*character)) {
          return f_true;
        }
  
-      if (!memcmp(&utf, f_utf_space_separator_line, width)) {
-        return f_true;
-      }
+      return f_false;
+    }
  
-      if (!memcmp(&utf, f_utf_space_separator_paragraph, width)) {
-        return f_true;
-      }
+    if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
  
-      if (!memcmp(&utf, f_utf_space_ogham, width)) {
-        return f_true;
-      }
+    f_utf_character character_utf = 0;
+    f_status status = 0;
  
-      if (!memcmp(&utf, f_utf_space_ideographic, width)) {
-        return f_true;
-      }
+    status = f_utf_char_to_character(character, max_width, &character_utf);
+
+    if (status != f_none) return status;
  
-      if (!memcmp(&utf, f_utf_space_medium_mathematical, width)) {
+    return f_utf_character_is_graph(character);
+  }
+#endif // _di_f_utf_is_graph_
+
+#ifndef _di_f_utf_is_whitespace_
+  f_return_status f_utf_is_whitespace(const f_string character, const unsigned short max_width) {
+    #ifndef _di_level_0_parameter_checking_
+      if (max_width < 1) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    unsigned short width = f_macro_utf_byte_width_is(*character);
+
+    if (width == 0) {
+      if (isspace(*character)) {
          return f_true;
        }
  
        return f_false;
      }
  
-    return f_false;
+    if (width == 1) {
+      return f_status_is_error(f_incomplete_utf);
+    }
+
+    f_utf_character character_utf = 0;
+    f_status status = 0;
+
+    status = f_utf_char_to_character(character, max_width, &character_utf);
+
+    if (status != f_none) return status;
+
+    return f_utf_character_is_whitespace(character);
    }
-#endif // _di_f_utf_is_whitespace_character_
+#endif // _di_f_utf_is_whitespace_
  
  #ifndef _di_f_utf_char_to_character_
-  f_return_status f_utf_char_to_character(const f_string character, const unsigned short max_width, f_utf_character *utf_character) {
+  f_return_status f_utf_char_to_character(const f_string character, const unsigned short max_width, f_utf_character *character_utf) {
      #ifndef _di_level_0_parameter_checking_
        if (max_width < 1) return f_status_set_error(f_invalid_parameter);
-      if (utf_character == 0) return f_status_set_error(f_invalid_parameter);
+      if (character_utf == 0) return f_status_set_error(f_invalid_parameter);
      #endif // _di_level_0_parameter_checking_
  
      unsigned short width = f_macro_utf_byte_width_is(*character);
  
      if (width == 0) {
-      *utf_character = f_macro_utf_character_from_char_1(character[0]);
+      *character_utf = f_macro_utf_character_from_char_1(character[0]);
        return f_none;
      }
      else if (width == 1) {
@@ -817,87 +984,31 @@ extern "C" {
        return f_status_set_error(f_failure);
      }
  
-    *utf_character = 0;
-    *utf_character |= f_macro_utf_character_to_char_1(character[0]);
+    *character_utf = 0;
+    *character_utf |= f_macro_utf_character_to_char_1(character[0]);
  
      if (width < 2) {
        return f_none;
      }
  
-    *utf_character |= f_macro_utf_character_to_char_2(character[1]);
+    *character_utf |= f_macro_utf_character_to_char_2(character[1]);
  
      if (width == 2) {
        return f_none;
      }
  
-    *utf_character |= f_macro_utf_character_to_char_3(character[2]);
+    *character_utf |= f_macro_utf_character_to_char_3(character[2]);
  
      if (width == 3) {
        return f_none;
      }
  
-    *utf_character |= f_macro_utf_character_to_char_4(character[3]);
+    *character_utf |= f_macro_utf_character_to_char_4(character[3]);
  
      return f_none;
    }
  #endif // _di_f_utf_char_to_character_
  
-#ifndef _di_f_utf_character_to_char_
-  f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, unsigned short *max_width) {
-    #ifndef _di_level_0_parameter_checking_
-      if (utf_character == 0) return f_status_set_error(f_invalid_parameter);
-      if (max_width == 0 && *character != 0) return f_status_set_error(f_invalid_parameter);
-      if (max_width != 0 && *character == 0) return f_status_set_error(f_invalid_parameter);
-      if (max_width != 0 && *max_width > 4) return f_status_set_error(f_invalid_parameter);
-    #endif // _di_level_0_parameter_checking_
-
-    f_status status = f_none;
-
-    unsigned short width = f_macro_utf_character_width_is(utf_character);
-
-    if (max_width == 0) {
-      f_macro_string_new(status, *character, width);
-
-      if (f_status_is_error(status)) return status;
-
-      width = 1;
-      *max_width = 1;
-    }
-    else if (width == 1) {
-      return f_status_is_error(f_invalid_utf);
-    }
-    else if (width > *max_width) {
-      return f_status_set_error(f_failure);
-    }
-
-    *max_width = width;
-
-    if (f_utf_is_big_endian()) {
-      memcpy(*character, &utf_character, sizeof(int8_t) * width);
-    }
-    else {
-      uint32_t utf = 0;
-
-      if (width == 1) {
-        utf = f_macro_utf_character_to_char_1(utf_character) << 24;
-      }
-      else if (width == 2) {
-        utf = (f_macro_utf_character_to_char_2(utf_character) << 24) | (f_macro_utf_character_to_char_1(utf_character) << 16);
-      }
-      else if (width == 3) {
-        utf = (f_macro_utf_character_to_char_3(utf_character) << 24) | (f_macro_utf_character_to_char_2(utf_character) << 16) | (f_macro_utf_character_to_char_1(utf_character) << 8);
-      }
-      else if (width == 4) {
-        utf = (f_macro_utf_character_to_char_4(utf_character) << 24) | (f_macro_utf_character_to_char_3(utf_character) << 16) | (f_macro_utf_character_to_char_2(utf_character) << 8) | f_macro_utf_character_to_char_1(utf_character);
-      }
-
-      memcpy(*character, &utf, sizeof(int8_t) * width);
-    }
-
-    return f_none;
-  }
-#endif // _di_f_utf_character_to_char_
-
  #ifdef __cplusplus
  } // extern "C"
  #endif
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h

index 93c25a59db0594604fbced932516deb83ab99626..de1bfa93977a5e87b0c0d6195c3700c7ea6fd05f 100644 (file)
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -489,6 +489,8 @@ extern "C" {
   *
   * These are integers representing character codes that represent types of substitute spaces.
   *
+ * Substitute codes are printable representations of the codes, not the codes themselves, so they should not be treated as the actual codes.
+ *
   * This does not provide substitute whitespace codes for standard ascii whitespaces, such as '\t' or '\r'.
   */
  #ifndef _di_f_utf_substitute_
@@ -510,60 +512,124 @@ extern "C" {
  #endif // _di_f_utf_substitute_
  
  /**
- * Helper function for UTF-8 processing code to determine endianess of the system.
+ * Check to see if the entire byte block of the character is a UTF-8 character.
+ *
+ * This does not validate if the UTF-8 character is a valid UTF-8 character, for that use f_utf_character_is_valid().
   *
+ * @param character
+ *   The character to validate.
   *
   * @return
- *   f_true if the system is big-endian.
- *   f_false if the system is little-endian.
+ *   f_true if a UTF-8 character.
+ *   f_false if not a UTF-8 character.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is_valid()
   */
-#ifndef _di_f_utf_is_big_endian_
-  extern f_return_status f_utf_is_big_endian();
-#endif // _di_f_utf_is_big_endian_
+#ifndef _di_f_utf_character_is_
+  extern f_return_status f_utf_character_is(const f_utf_character character);
+#endif // _di_f_utf_character_is_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 character.
+ * Check to see if the entire byte block of the character is a UTF-8 BOM.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   f_true if a UTF-8 BOM.
+ *   f_false if not a UTF-8 BOM.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ */
+#ifndef _di_f_utf_character_is_bom_
+  extern f_return_status f_utf_character_is_bom(const f_utf_character character);
+#endif // _di_f_utf_character_is_bom_
+
+/**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character.
+ *
+ * The UTF-8 BOM is considered a control character.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   f_true if a UTF-8 control character.
+ *   f_false if not a UTF-8 control character.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see iscntrl()
+ */
+#ifndef _di_f_utf_character_is_control_
+  extern f_return_status f_utf_character_is_control(const f_utf_character character);
+#endif // _di_f_utf_character_is_control_
+
+/**
+ * Check to see if the entire byte block of the character is a UTF-8 control picture character.
+ *
+ * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   f_true if a UTF-8 control picture character.
+ *   f_false if not a UTF-8 control picture character.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ */
+#ifndef _di_f_utf_character_is_control_picture_
+  extern f_return_status f_utf_character_is_control_picture(const f_utf_character character);
+#endif // _di_f_utf_character_is_control_picture_
+
+/**
+ * Check to see if the entire byte block of the character is a 1-width UTF-8 character fragment.
+ *
+ * Characters whose width is 1-byte are invalid.
+ * However, the character could have been cut-off, so whether or not this is actually valid should be determined by the caller.
+ *
+ * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid().
   *
   * @param character
   *   The character to validate.
- *   There must be enough space allocated to compare against, as limited by max_width.
- * @param max_width
- *   The maximum width available for checking.
- *   Can be anything greater than 0.
   *
   * @return
   *   f_true if a UTF-8 character.
   *   f_false if not a UTF-8 character.
- *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
- *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is()
+ * @see f_utf_character_is_valid()
   */
-#ifndef _di_f_utf_is_
-  extern f_return_status f_utf_is(const f_string character, const unsigned short max_width);
-#endif // _di_f_utf_is_
+#ifndef _di_f_utf_character_is_fragment_
+  extern f_return_status f_utf_character_is_fragment(const f_utf_character character);
+#endif // _di_f_utf_character_is_fragment_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 BOM.
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 printable character.
   *
   * @param character
   *   The character to validate.
- *   There must be enough space allocated to compare against, as limited by max_width.
- * @param max_width
- *   The maximum width available for checking.
- *   Can be anything greater than 0.
   *
   * @return
- *   f_true if a UTF-8 whitespace or substitute.
- *   f_false if not a UTF-8 whitespace or substitute.
- *   f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
- *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
- *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *   f_true if a UTF-8 graph.
+ *   f_false if not a UTF-8 graph.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see isgraph()
   */
-#ifndef _di_f_utf_is_bom_
-  extern f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width);
-#endif // _di_f_utf_is_bom_
+#ifndef _di_f_utf_character_is_graph_
+  extern f_return_status f_utf_character_is_graph(const f_utf_character character);
+#endif // _di_f_utf_character_is_graph_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 character.
+ * Check to see if the entire byte block of the character is a valid UTF-8 character.
+ *
+ * This does validate if the UTF-8 character is a valid UTF-8 character.
+ * To not do this, use f_utf_character_is().
+ *
+ * This function can be expensive due to how Unicode has invalid codes spread randomly through it.
+ * For simpler error checking, try f_utf_is_fragment(), to just check that the width is valid or not.
+ * (First characters should not have a width of 1, and all other characters should have a width of 1.)
   *
   * @param character
   *   The character to validate.
@@ -571,17 +637,75 @@ extern "C" {
   * @return
   *   f_true if a UTF-8 character.
   *   f_false if not a UTF-8 character.
- *   f_invalid_utf (with error bit) if character is an incomplete UTF-8 fragment.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is()
+ * @see f_utf_character_is_fragment()
+ */
+#ifndef _di_f_utf_character_is_valid_
+  extern f_return_status f_utf_character_is_valid(const f_utf_character character);
+#endif // _di_f_utf_character_is_valid_
+
+/**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space or control character.
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   f_true if a UTF-8 whitespace.
+ *   f_false if not a UTF-8 whitespace.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ */
+#ifndef _di_f_utf_character_is_whitespace_
+  extern f_return_status f_utf_character_is_whitespace(const f_utf_character character);
+#endif // _di_f_utf_character_is_whitespace_
+
+/**
+ * Convert a specialized f_utf_character type to an int8_t, stored as a string (character buffer).
+ *
+ * This will also convert ASCII characters stored in the utf_character array.
+ *
+ * @param utf_character
+ *   The UTF-8 character to convert from.
+ * @param character
+ *   An int8_t representation of the UTF-8 character, stored as a string of width bytes.
+ *   If max_width is 0, then this should not be allocated (set the pointer address to 0).
+ * @param max_width
+ *   The number of bytes the generated character represents.
+ *   If this is set to 0, then the character will be allocated and this will be set to the width of the utf_character.
+ *   If this is set to some value greater than 0 (up to 4), then this represents the size of the character array (no allocations are performed).
+ *   If this is greater than 0, and the utf_character width is larger than this size, then an error is returned.
+ *
+ * @return
+ *   f_none if conversion was successful.
+ *   f_failure (with error bit) if width is not long enough to convert.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
   *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *   f_allocation_error (with error bit) on memory allocation error.
+ *   f_failure (with error bit) if width is not long enough to convert.
   */
-#ifndef _di_f_utf_is_
-  extern f_return_status f_utf_is_character(const f_utf_character character);
-#endif // _di_f_utf_is_
+#ifndef _di_f_utf_character_to_char_
+  extern f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, unsigned short *max_width);
+#endif // _di_f_utf_character_to_char_
+
+/**
+ * Helper function for UTF-8 processing code to determine endianness of the system.
+ *
+ * @todo relocate this outside of f_utf into a more general path, perhaps f_memory (f_memory_is_big_endian).
+ *
+ * @return
+ *   f_true if the system is big-endian.
+ *   f_false if the system is little-endian.
+ */
+#ifndef _di_f_utf_is_big_endian_
+  extern f_return_status f_utf_is_big_endian();
+#endif // _di_f_utf_is_big_endian_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 printable character.
+ * Check to see if the entire byte block of the character is a UTF-8 character.
   *
- * This does not check non-UTF-8 graph.
+ * This does not check the validity of the character, for that instead use f_utf_is_valid().
   *
   * @param character
   *   The character to validate.
@@ -591,20 +715,19 @@ extern "C" {
   *   Can be anything greater than 0.
   *
   * @return
- *   f_true if a UTF-8 graph.
- *   f_false if not a UTF-8 graph.
- *   f_maybe (with error bit) if this could be a graph but width is not long enough.
+ *   f_true if a UTF-8 character.
+ *   f_false if not a UTF-8 character.
   *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
   *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *
+ * @see f_utf_is_valid()
   */
-#ifndef _di_f_utf_is_graph_
-  extern f_return_status f_utf_is_graph(const f_string character, const unsigned short max_width);
-#endif // _di_f_utf_is_graph_
+#ifndef _di_f_utf_is_
+  extern f_return_status f_utf_is(const f_string character, const unsigned short max_width);
+#endif // _di_f_utf_is_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 whitespace or substitute character.
- *
- * This does not check non-UTF-8 whitespace.
+ * Check to see if the entire byte block of the character is a UTF-8 BOM.
   *
   * @param character
   *   The character to validate.
@@ -620,14 +743,14 @@ extern "C" {
   *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
   *   f_invalid_parameter (with error bit) if a parameter is invalid.
   */
-#ifndef _di_f_utf_is_space_
-  extern f_return_status f_utf_is_space(const f_string character, const unsigned short max_width);
-#endif // _di_f_utf_is_space_
+#ifndef _di_f_utf_is_bom_
+  extern f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width);
+#endif // _di_f_utf_is_bom_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 whitespace substitute character.
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character.
   *
- * This does not check non-UTF-8 whitespace.
+ * The UTF-8 BOM is considered a control character.
   *
   * @param character
   *   The character to validate.
@@ -637,20 +760,20 @@ extern "C" {
   *   Can be anything greater than 0.
   *
   * @return
- *   f_true if a UTF-8 substitute.
- *   f_false if not a UTF-8 substitute.
- *   f_maybe (with error bit) if this could be a substitute but width is not long enough.
+ *   f_true if a UTF-8 control character.
+ *   f_false if not a UTF-8 control character.
   *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
- *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *
+ * @see iscntrl()
   */
-#ifndef _di_f_utf_is_substitute_
-  extern f_return_status f_utf_is_substitute(const f_string character, const unsigned short max_width);
-#endif // _di_f_utf_is_substitute_
+#ifndef _di_f_utf_is_control_
+  extern f_return_status f_utf_is_control(const f_string character, const unsigned short max_width);
+#endif // _di_f_utf_is_control_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 general whitespace character.
+ * Check to see if the entire byte block of the character is a UTF-8 control picture character.
   *
- * This does not check non-UTF-8 whitespace.
+ * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters.
   *
   * @param character
   *   The character to validate.
@@ -660,109 +783,119 @@ extern "C" {
   *   Can be anything greater than 0.
   *
   * @return
- *   f_true if a UTF-8 whitespace.
- *   f_false if not a UTF-8 whitespace.
- *   f_maybe (with error bit) if this could be a whitespace but width is not long enough.
+ *   f_true if a UTF-8 control picture character.
+ *   f_false if not a UTF-8 control picture character.
   *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
- *   f_invalid_parameter (with error bit) if a parameter is invalid.
   */
-#ifndef _di_f_utf_is_whitespace_
-  extern f_return_status f_utf_is_whitespace(const f_string character, const unsigned short max_width);
-#endif // _di_f_utf_is_whitespace_
+#ifndef _di_f_utf_is_control_picture_
+  extern f_return_status f_utf_is_control_picture(const f_string character, const unsigned short max_width);
+#endif // _di_f_utf_is_control_picture_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 BOM.
- *
- * @param character
- *   The UTF-8 character to validate.
+ * Check to see if the entire byte block of the character is a 1-width UTF-8 character fragment.
   *
- * @return
- *   f_true if a UTF-8 whitespace or substitute.
- *   f_false if not a UTF-8 whitespace or substitute.
- *   f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
- *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
- *   f_invalid_parameter (with error bit) if a parameter is invalid.
- */
-#ifndef _di_f_utf_is_bom_character_
-  extern f_return_status f_utf_is_bom_character(const f_utf_character character);
-#endif // _di_f_utf_is_bom_character_
-
-/**
- * Check to see if the entire byte block of the character is a UTF-8 printable character.
+ * Characters whose width is 1-byte are invalid.
+ * However, the character could have been cut off, so the caller should determine whether or not it is actually valid.
   *
- * This does not check non-UTF-8 graph.
+ * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid().
   *
   * @param character
   *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by max_width.
+ * @param max_width
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
   *
   * @return
- *   f_true if a UTF-8 graph.
- *   f_false if not a UTF-8 graph.
- *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
- *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *   f_true if a 1-width UTF-8 character fragment.
+ *   f_false if not a 1-width UTF-8 character fragment.
+ *
+ * @see f_utf_character_is()
+ * @see f_utf_character_is_valid()
   */
-#ifndef _di_f_utf_is_graph_character_
-  extern f_return_status f_utf_is_graph_character(const f_utf_character character);
-#endif // _di_f_utf_is_graph_character_
+#ifndef _di_f_utf_is_fragment_
+  extern f_return_status f_utf_is_fragment(const f_string character, const unsigned short max_width);
+#endif // _di_f_utf_is_fragment_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 whitespace or substitute character.
- *
- * This does not check non-UTF-8 whitespace.
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 printable character.
   *
   * @param character
   *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by max_width.
+ * @param max_width
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
   *
   * @return
- *   f_true if a UTF-8 whitespace or substitute.
- *   f_false if not a UTF-8 whitespace or substitute.
- *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *   f_true if a UTF-8 graph.
+ *   f_false if not a UTF-8 graph.
+ *   f_maybe (with error bit) if this could be a graph but width is not long enough.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
   *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *
+ * @see isgraph()
+ * @see iscntrl()
   */
-#ifndef _di_f_utf_is_space_character_
-  extern f_return_status f_utf_is_space_character(const f_utf_character character);
-#endif // _di_f_utf_is_space_character_
+#ifndef _di_f_utf_is_graph_
+  extern f_return_status f_utf_is_graph(const f_string character, const unsigned short max_width);
+#endif // _di_f_utf_is_graph_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 whitespace substitute character.
+ * Check to see if the entire byte block of the character is a UTF-8 character and if that character is a valid UTF-8.
   *
- * This does not check non-UTF-8 whitespace.
+ * This checks the validity of the character; to skip the validity check, use f_utf_is() instead.
+ *
+ * This function can be expensive due to how Unicode has invalid codes spread randomly through it.
+ * For simpler error checking, try f_utf_is_fragment(), which only checks whether or not the width is valid.
+ * (First characters should have a width of not 1, and all other characters should not have a width of 1.)
   *
   * @param character
   *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by max_width.
+ * @param max_width
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
   *
   * @return
- *   f_true if a UTF-8 substitute.
- *   f_false if not a UTF-8 substitute.
- *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *   f_true if a valid UTF-8 character.
+ *   f_false if not a valid UTF-8 character.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
   *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *
+ * @see f_utf_is()
+ * @see f_utf_is_fragment()
   */
-#ifndef _di_f_utf_is_substitute_character_
-  extern f_return_status f_utf_is_substitute_character(const f_utf_character character);
-#endif // _di_f_utf_is_substitute_character_
+#ifndef _di_f_utf_is_valid_
+  extern f_return_status f_utf_is_valid(const f_string character, const unsigned short max_width);
+#endif // _di_f_utf_is_valid_
  
  /**
- * Check to see if the entire byte block of the character is a UTF-8 general whitespace character.
- *
- * This does not check non-UTF-8 whitespace.
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space or control character.
   *
   * @param character
   *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by max_width.
+ * @param max_width
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
   *
   * @return
   *   f_true if a UTF-8 whitespace.
   *   f_false if not a UTF-8 whitespace.
- *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *   f_maybe (with error bit) if this could be a whitespace but width is not long enough.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
   *   f_invalid_parameter (with error bit) if a parameter is invalid.
+ *
+ * @see isspace()
+ * @see iscntrl()
   */
-#ifndef _di_f_utf_is_whitespace_character_
-  extern f_return_status f_utf_is_whitespace_character(const f_utf_character character);
-#endif // _di_f_utf_is_whitespace_character_
+#ifndef _di_f_utf_is_whitespace_
+  extern f_return_status f_utf_is_whitespace(const f_string character, const unsigned short max_width);
+#endif // _di_f_utf_is_whitespace_
  
  /**
- * Convert a UTF-8 character, stored as a string (character buffer), to the specialized f_utf_character type.
- *
- * This will also convert ASCII characters.
+ * Convert an ASCII or UTF-8 character, stored as a string (character buffer), to the specialized f_utf_character type.
   *
   * @param character
   *   The character string to be converted to the f_utf_character type.
@@ -770,7 +903,7 @@ extern "C" {
   * @param max_width
   *   The maximum width available for converting.
   *   Can be anything greater than 0.
- * @param utf_character
+ * @param character_utf
   *   The generated character of type f_utf_character.
   *   This value may be cleared, even on error.
   *
@@ -781,37 +914,9 @@ extern "C" {
   *   f_invalid_parameter (with error bit) if a parameter is invalid.
   */
  #ifndef _di_f_utf_char_to_character_
-  extern f_return_status f_utf_char_to_character(const f_string character, const unsigned short max_width, f_utf_character *utf_character);
+  extern f_return_status f_utf_char_to_character(const f_string character, const unsigned short max_width, f_utf_character *character_utf);
  #endif // _di_f_utf_char_to_character_
  
-/**
- * Convert a specialized f_utf_character type to a int8_t, stored as a string (character buffer).
- *
- * This will also convert ASCII characters stored in the utf_character array.
- *
- * @param utf_character
- *   The UTF-8 characterr to convert from.
- * @param character
- *   A int8_t representation of the UTF-8 character, stored as a string of width bytes.
- *   If max_width is 0, then this should not be allocated (set the pointer address to 0).
- * @param max_width
- *   The number of bytes the generated character represents.
- *   If this is set to 0, then the character will be allocated and this will be set to the width of the utf_character.
- *   If this is set to some value greater than 0 (up to 4), then this represents the size of the character array (no allocations are performed).
- *   If this is greater than 0, and the utf_character width is larger than this size, then an error is returned.
- *
- * @return
- *   f_none if conversion was successful.
- *   f_failure (with error bit) if width is not long enough to convert.
- *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
- *   f_invalid_parameter (with error bit) if a parameter is invalid.
- *   f_allocation_error (with error bit) on memory allocation error.
- *   f_failure (with error bit) if width is not long enough to convert.
- */
-#ifndef _di_f_utf_character_to_char_
-  extern f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, unsigned short *max_width);
-#endif // _di_f_utf_character_to_char_
-
  #ifdef __cplusplus
  } // extern "C"
  #endif
diff --git a/level_1/fl_fss/c/fss.c b/level_1/fl_fss/c/fss.c

index 1f4d877a18746c7982badb94012b81c2ea867cde..0388567436c74c2e439b33bc25d50bc11d70e066 100644 (file)
--- a/level_1/fl_fss/c/fss.c
+++ b/level_1/fl_fss/c/fss.c
@@ -306,7 +306,7 @@ extern "C" {
        max_width = buffer.used - input.start;
      }
  
-    return f_utf_is_space(buffer.string + input.start, max_width);
+    return f_utf_is_whitespace(buffer.string + input.start, max_width);
    }
  #endif // _di_fl_fss_is_space_
  
diff --git a/level_1/fl_string/c/string.c b/level_1/fl_string/c/string.c

index 8b74f4f4c1dcc223e2e4bddda85b01b19a97f2a3..c960bb2b870fb1e5457e1af0ab734505122e7336 100644 (file)
--- a/level_1/fl_string/c/string.c
+++ b/level_1/fl_string/c/string.c
@@ -119,7 +119,7 @@ extern "C" {
        max_width = buffer.used - location->start;
      }
  
-    while (buffer.string[location->start] == placeholder || (status = f_utf_is_space(buffer.string + location->start, max_width)) == f_false) {
+    while (buffer.string[location->start] == placeholder || (status = f_utf_is_whitespace(buffer.string + location->start, max_width)) == f_false) {
        if (f_status_is_error(status)) {
          return status;
        }
diff --git a/level_1/fl_utf/c/utf.c b/level_1/fl_utf/c/utf.c

index 34bce80fca529e558dcb06f61b692fbd262c08e2..f6d2962bcbba0b4eaf3823a5286451c7ad3e8d3e 100644 (file)
--- a/level_1/fl_utf/c/utf.c
+++ b/level_1/fl_utf/c/utf.c
@@ -16,7 +16,7 @@ extern "C" {
  
      f_status status = f_none;
  
-    while (buffer.string[location->start] == placeholder || (status = f_utf_is_graph_character(buffer.string[location->start])) == f_false) {
+    while (buffer.string[location->start] == placeholder || (status = f_utf_character_is_graph(buffer.string[location->start])) == f_false) {
        if (f_status_is_error(status)) {
          return status;
        }
@@ -53,7 +53,7 @@ extern "C" {
  
      f_status status = f_none;
  
-    while (buffer.string[location->start] == placeholder || (status = f_utf_is_space_character(buffer.string[location->start])) == f_false) {
+    while (buffer.string[location->start] == placeholder || (status = f_utf_character_is_space(buffer.string[location->start])) == f_false) {
        if (f_status_is_error(status)) {
          return status;
        }
diff --git a/level_3/byte_dump/c/private-byte_dump.c b/level_3/byte_dump/c/private-byte_dump.c

index be730e5920c3e0a016dfbfee0d5ddc7db1159636..014efcf129df04f492d28fd85635d40532f8d16e 100644 (file)
--- a/level_3/byte_dump/c/private-byte_dump.c
+++ b/level_3/byte_dump/c/private-byte_dump.c
@@ -571,7 +571,7 @@
            printf(".");
          }
        }
-      else if (f_utf_is_whitespace_character(characters.string[i]) == f_true) {
+      else if (f_utf_character_is_whitespace(characters.string[i]) == f_true) {
          printf("%s", byte_dump_sequence_space);
        }
        else if (width_utf == 2 && characters.string[i] == 0xc0800000) {
@@ -616,11 +616,11 @@
          // Use space to represent Vaiation Selectors Supplement codes.
          printf(" ");
        }
-      else if (width_utf == 4 && characters.string[i] >= 0xf09e8080 && characters.string[i] <= 0xf09fbfbf) {
+      else if (width_utf == 4 && characters.string[i] >= 0xf3b08080 && characters.string[i] <= 0xf3bfbfbf) {
          // Use space to represent Supplemental Private Use Area-A codes.
          printf(" ");
        }
-      else if (width_utf == 4 && characters.string[i] >= 0xf0a08080 && characters.string[i] <= 0xf0a1bfbf) {
+      else if (width_utf == 4 && characters.string[i] >= 0xf4808080 && characters.string[i] <= 0xf48fbfbf) {
          // Use space to represent Supplemental Private Use Area-B codes.
          printf(" ");
        }
author	Kevin Day <thekevinday@gmail.com>
	Sat, 14 Sep 2019 00:38:52 +0000 (19:38 -0500)
committer	Kevin Day <thekevinday@gmail.com>
	Sat, 14 Sep 2019 00:38:52 +0000 (19:38 -0500)
level_0/f_utf/c/utf.c		patch \| blob \| history
level_0/f_utf/c/utf.h		patch \| blob \| history
level_1/fl_fss/c/fss.c		patch \| blob \| history
level_1/fl_string/c/string.c		patch \| blob \| history
level_1/fl_utf/c/utf.c		patch \| blob \| history
level_3/byte_dump/c/private-byte_dump.c		patch \| blob \| history