From: Kevin Day Date: Sat, 14 Sep 2019 00:38:52 +0000 (-0500) Subject: Update: begin improving UTF-8 X-Git-Tag: 0.5.0~418 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=a2d806e1f97b1fdd9eabbd59b4562cc2631ced1b;p=fll Update: begin improving UTF-8 I am now moving to perform a more thorough implementation of UTF-8 support. Cleaned up the functions. Due to the sheer size of the changes needed, I am uploading this in stages to ensure nothing gets lost. The work done is incomplete. The functions will need to be reviewed once everything is in place. --- diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 5d4e425..8402753 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -4,809 +4,976 @@ extern "C" { #endif -#ifndef _di_f_utf_is_big_endian_ - f_return_status f_utf_is_big_endian() { - uint16_t test_int = (0x01 << 8) | 0x02; - int8_t test_char[2] = {0x01, 0x02}; +#ifndef _di_f_utf_character_is_ + f_return_status f_utf_character_is(const f_utf_character character) { + unsigned short width = f_macro_utf_character_width_is(character); - if (!memcmp(&test_int, test_char, 2)) { - return f_true; + if (width == 0) { + return f_false; } - return f_false; + if (width == 1) { + return f_status_is_error(f_invalid_utf); + } + + return f_true; } -#endif // _di_f_utf_is_big_endian_ +#endif // _di_f_utf_character_is_ -#ifndef _di_f_utf_is_ - f_return_status f_utf_is(const f_string character, const unsigned short max_width) { - #ifndef _di_level_0_parameter_checking_ - if (max_width < 1) return f_status_set_error(f_invalid_parameter); - #endif // _di_level_0_parameter_checking_ +#ifndef _di_f_utf_character_is_bom_ + f_return_status f_utf_character_is_bom(const f_utf_character character) { + if (character == f_utf_character_mask_bom) { + return f_true; + } - unsigned short width = f_macro_utf_byte_width_is(*character); + unsigned short width = f_macro_utf_character_width_is(character); - if (width == 0) { - return f_false; - } - else if (width == 1) { - 
return f_status_is_error(f_incomplete_utf); + if (width == 1) { + return f_status_is_error(f_invalid_utf); } - return f_true; + return f_false; } -#endif // _di_f_utf_is_ - -#ifndef _di_f_utf_is_bom_ - f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width) { - #ifndef _di_level_0_parameter_checking_ - if (max_width < 1) return f_status_set_error(f_invalid_parameter); - #endif // _di_level_0_parameter_checking_ +#endif // _di_f_utf_character_is_bom_ - unsigned short width = f_macro_utf_byte_width_is(*character); +#ifndef _di_f_utf_character_is_control_ + f_return_status f_utf_character_is_control(const f_utf_character character) { + unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { + if (iscntrl()) { + return f_true; + } + return f_false; } - else if (width == 1) { - return f_status_is_error(f_incomplete_utf); + + if (width == 1) { + return f_status_is_error(f_invalid_utf); } - if (width > max_width) { - return f_status_set_error(f_maybe); + if (width == 2) { + // Latin-1 Supplement: U+0080 to U+009F. + if (character >= 0xc2800000 && character =< 0xc29f0000) { + return f_true; + } + + return f_false; } if (width == 3) { - if (!memcmp(character, f_utf_bom, width)) { + // @todo: these might not be "control characters" and instead be "marking characters" or "combining characters". + // Special: U+FFF9 to U+FFFB. + if (character >= 0xefbfb900 && character =< 0xefbfbb00) { + return f_true; + } + + return f_false; + } + + if (width == 4) { + // Tags: U+E0001 and U+E007F. 
+ if (character == 0xf3a08081 || character == 0xf3a081bf) { return f_true; } } return f_false; } -#endif // _di_f_utf_is_bom_ +#endif // _di_f_utf_character_is_control_ -#ifndef _di_f_utf_is_character_ - f_return_status f_utf_is_character(const f_utf_character character) { +#ifndef _di_f_utf_character_is_control_picture_ + f_return_status f_utf_character_is_control_picture(const f_utf_character character) { unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { + // There are no control picture characters in ASCII. return f_false; } - else if (width == 1) { + + if (width == 1) { return f_status_is_error(f_invalid_utf); } - return f_true; + if (width != 3) { + return f_false; + } + + // Control Pictures: U+2400 to U+2426. + if (character >= 0xe2908000 && character <= 0xe290a600) { + return f_true; + } + + // Specials: U+FFFC to U+FFFD. + if (character == 0xefbfbc00 || character == 0xefbfbd00) { + return f_true; + } + + return f_false; } -#endif // _di_f_utf_is_ +#endif // _di_f_utf_character_is_control_picture_ -#ifndef _di_f_utf_is_graph_ - f_return_status f_utf_is_graph(const f_string character, const unsigned short max_width) { - #ifndef _di_level_0_parameter_checking_ - if (max_width < 1) return f_status_set_error(f_invalid_parameter); - #endif // _di_level_0_parameter_checking_ +#ifndef _di_f_utf_character_is_fragment_ + f_return_status f_utf_character_is_fragment(const f_utf_character character) { + unsigned short width = f_macro_utf_character_width_is(character); - if (f_macro_utf_byte_width_is(*character) == 0) { - if (isgraph(*character)) { + if (width == 1) return f_true; + + return f_false; + } +#endif // _di_f_utf_character_is_fragment_ + +#ifndef _di_f_utf_character_is_graph_ + f_return_status f_utf_character_is_graph(const f_utf_character character) { + unsigned short width = f_macro_utf_character_width_is(character); + + if (width == 0) { + int8_t ascii = character >> 24; + + if (isgraph(ascii)) { return f_true; } return 
f_false; } - // For now, just assume that any non-whitespace, non-substitute UTF-8 character is a graph. - f_status status = f_utf_is_space(character, max_width); - - if (f_status_is_error(status)) { - return status; + if (width == 1) { + return f_status_is_error(f_invalid_utf); } - else if (status == f_true) { + + if (f_utf_character_is_control(character) == f_true) { return f_false; } - if (f_utf_is_bom(character, max_width) == f_true) { + if (f_utf_character_is_whitespace(character) == f_true) { return f_false; } + // @todo: does this need to check combining and marking characters? or are those still considered graph characters? + return f_true; } -#endif // _di_f_utf_is_graph_ - -#ifndef _di_f_utf_is_space_ - f_return_status f_utf_is_space(const f_string character, const unsigned short max_width) { - #ifndef _di_level_0_parameter_checking_ - if (max_width < 1) return f_status_set_error(f_invalid_parameter); - #endif // _di_level_0_parameter_checking_ +#endif // _di_f_utf_character_is_graph_ - unsigned short width = f_macro_utf_byte_width_is(*character); +#ifndef _di_f_utf_character_is_valid_ + f_return_status f_utf_character_is_valid(const f_utf_character character) { + unsigned short width = f_macro_utf_character_width_is(character); - if (width == 0) { - if (isspace(*character)) { - return f_true; - } + if (width == 0) return f_false; - return f_false; - } - else if (width == 1) { - return f_status_is_error(f_incomplete_utf); + if (width == 1) { + return f_status_is_error(f_invalid_utf); } - if (width > max_width) { - return f_status_set_error(f_maybe); - } + // @todo: check every single character that is not allowed but is represented in UTF-8 and return false. if (width == 2) { - if (!memcmp(character, f_utf_space_no_break, width)) { - return f_true; + // Syriac: U+070E, U+074B, U+074C. 
+ if (character == 0xdc8e0000 || character == 0xdd8b0000 || character == 0xdd8c0000) { + return f_false; } - if (!memcmp(character, f_utf_space_line_feed_reverse, width)) { - return f_true; + // Thaana: U+07B2 to U+07BF. + if (character >= 0xdeb20000 && character <= 0xdebf0000) { + return f_false; } + } - if (!memcmp(character, f_utf_space_line_next, width)) { + if (width == 3) { + // consider all private use codes as valid, U+E000 to U+F8FF. + if (character >= 0xee808000 && character <= 0xefa3bf00) { return f_true; } - if (!memcmp(character, f_utf_substitute_middle_dot, width)) { - return f_true; + // Sinhala: U+0D97 to U+0D99. + if (character >= 0xe0b69700 && character <= 0xe0b69900) { + return f_false; } - return f_false; - } + // Sinhala: U+0DC7 to U+0DC9. + if (character >= 0xe0b78700 && character <= 0xe0b78900) { + return f_false; + } - if (width == 3) { - if (!memcmp(character, f_utf_space_no_break_narrow, width)) { - return f_true; + // Sinhala: U+0DCB to U+0DCE. + if (character >= 0xe0b78b00 && character <= 0xe0b78e00) { + return f_false; } - if (!memcmp(character, f_utf_space_en, width)) { - return f_true; + // Sinhala: U+0DE0 to U+0DE5. + if (character >= 0xe0b7a000 && character <= 0xe0b7a500) { + return f_false; } - if (!memcmp(character, f_utf_space_en_quad, width)) { - return f_true; + // Sinhala: U+0DF5 to U+0DFF. + if (character >= 0xe0b7b500 && character <= 0xe0b7bf00) { + return f_false; } - if (!memcmp(character, f_utf_space_en_quad, width)) { - return f_true; + // Sinhala: U+0D80, U+0D81, U+0D84. + if (character == 0xe0b68000 || character == 0xe0b68100 || character == 0xe0b68400) { + return f_false; } - if (!memcmp(character, f_utf_space_em, width)) { - return f_true; + // Sinhala: U+0DB2, U+0DBC, U+0DBE. + if (character == 0xe0b6b200 || character == 0xe0b6bc00 || character == 0xe0b6be00) { + return f_false; } - if (!memcmp(character, f_utf_space_em_quad, width)) { - return f_true; + // Sinhala: U+0DBF, U+0DD5, U+0DD7. 
+ if (character == 0xe0b6bf00 || character == 0xe0b79500 || character == 0xe0b79700) { + return f_false; } - if (!memcmp(character, f_utf_space_em_per_three, width)) { - return f_true; + // Sinhala: U+0DF0, U+0DF1. + if (character == 0xe0b7b000 || character == 0xe0b7b100) { + return f_false; } - if (!memcmp(character, f_utf_space_em_per_four, width)) { - return f_true; + // Small Form Variants: U+FE6C to U+FE6F. + if (character >= 0xefb9ac00 && character <= 0xefb9af00) { + return f_false; } - if (!memcmp(character, f_utf_space_em_per_six, width)) { - return f_true; + // Small Form Variants: U+FE53, U+FE67. + if (character == 0xefb99300 || character == 0xefb9a700) { + return f_false; } - if (!memcmp(character, f_utf_space_figure, width)) { - return f_true; + // Sundanese Supplement: U+1CC8 to U+1CCF. + if (character >= 0xe1b38800 && character <= 0xe1b38f00) { + return f_false; } - if (!memcmp(character, f_utf_space_punctuation, width)) { - return f_true; + // Superscripts and Subscripts: U+2072, U+2073, U+208F. + if (character == 0xe281b200 || character == 0xe281b300 || character == 0xe2828f00) { + return f_false; } - if (!memcmp(character, f_utf_space_thin, width)) { - return f_true; + // Superscripts and Subscripts: U+209D to U+209F. + if (character >= 0xe2829d00 && character <= 0xe2829f00) { + return f_false; } - if (!memcmp(character, f_utf_space_hair, width)) { - return f_true; + // Supplemental Punctuation: U+2E45 to U+2E7F. + if (character >= 0xe2b98500 && character <= 0xe2b9bf00) { + return f_false; } - if (!memcmp(character, f_utf_space_separator_line, width)) { - return f_true; + // Syloti Nagri: U+A82C to U+A82F. + if (character >= 0xeaa0ac00 && character <= 0xeaa0af00) { + return f_false; } - if (!memcmp(character, f_utf_space_separator_paragraph, width)) { - return f_true; + // Tagalog: U+1715 to U+171f. 
+ if (character >= 0xe19c9500 && character <= 0xe19c9f00) { + return f_false; } - if (!memcmp(character, f_utf_space_ogham, width)) { - return f_true; + // Tagalog: U+170D + if (character == 0xe19c8d00) { + return f_false; } - if (!memcmp(character, f_utf_space_ideographic, width)) { - return f_true; + // Tagbanwa: U+1774 to U+177f. + if (character >= 0xe19db400 && character <= 0xe19dbf00) { + return f_false; } - if (!memcmp(character, f_utf_space_medium_mathematical, width)) { - return f_true; + // Tagbanwa: U+176D, U+1771 + if (character == 0xe19dad00 || character == 0xe19db100) { + return f_false; } - if (!memcmp(character, f_utf_substitute_symbol_blank, width)) { - return f_true; + // Tai Lee: U+196E, U+196F. + if (character == 0xe1a5ae00 || character == 0xe1a5ef00) { + return f_false; } - if (!memcmp(character, f_utf_substitute_symbol_space, width)) { - return f_true; + // Tai Lee: U+1975 to U+197F. + if (character >= 0xe1a5b500 && character <= 0xe1a5bf00) { + return f_false; } - if (!memcmp(character, f_utf_substitute_open_box, width)) { - return f_true; + // Tai Tham: U+1A7D to U+1A7E. + if (character >= 0xe1a9bd00 && character <= 0xe1a9be00) { + return f_false; } - if (!memcmp(character, f_utf_substitute_open_box_shouldered, width)) { - return f_true; + // Tai Tham: U+1A8A to U+1A8F. + if (character >= 0xe1aa8a00 && character <= 0xe1aa8f00) { + return f_false; } - return f_false; - } + // Tai Tham: U+1A9A to U+1A9F. + if (character >= 0xe1aa9a00 && character <= 0xe1aa9f00) { + return f_false; + } - return f_false; - } -#endif // _di_f_utf_is_space_ + // Tai Tham: U+1AAE to U+1AAF. 
+ if (character >= 0xe1aaae00 && character <= 0xe1aaaf00) { + return f_false; + } -#ifndef _di_f_utf_is_substitute_ - f_return_status f_utf_is_substitute(const f_string character, const unsigned short max_width) { - #ifndef _di_level_0_parameter_checking_ - if (max_width < 1) return f_status_set_error(f_invalid_parameter); - #endif // _di_level_0_parameter_checking_ + // Tai Tham: U+1A5F. + if (character == 0xe1a99f00) { + return f_false; + } - unsigned short width = f_macro_utf_byte_width_is(*character); + // Tai Viet: U+AAC3 to U+AADA. + if (character >= 0xeaab8300 && character <= 0xeaab9a00) { + return f_false; + } - if (width == 0) { - // there is no substitute character in ASCII. - return f_false; - } - else if (width == 1) { - return f_status_is_error(f_incomplete_utf); - } + // Tamil: U+0B80, U+0B81. + if (character == 0xe0ae8000 || character == 0xe0ae8100) { + return f_false; + } - if (width > max_width) { - return f_status_set_error(f_maybe); - } + // Tamil: U+0B8B to U+0B8D. + if (character >= 0xe0ae8b00 && character <= 0xe0ae8d00) { + return f_false; + } - if (width == 2) { - if (!memcmp(character, f_utf_substitute_middle_dot, width)) { - return f_true; + // Tamil: U+0B96 to U+0B98. + if (character >= 0xe0ae9600 && character <= 0xe0ae9800) { + return f_false; } - return f_false; - } + // Tamil: U+0BA0 to U+0BA2. + if (character >= 0xe0aea000 && character <= 0xe0aea200) { + return f_false; + } - if (width == 3) { - if (!memcmp(character, f_utf_substitute_symbol_blank, width)) { - return f_true; + // Tamil: U+0BA5 to U+0BA7. + if (character >= 0xe0aea500 && character <= 0xe0aea700) { + return f_false; } - if (!memcmp(character, f_utf_substitute_symbol_space, width)) { - return f_true; + // Tamil: U+0BAB to U+0BAD. + if (character >= 0xe0aeab00 && character <= 0xe0aead00) { + return f_false; } - if (!memcmp(character, f_utf_substitute_open_box, width)) { - return f_true; + // Tamil: U+0BBA to U+0BBD. 
+ if (character >= 0xe0aeba00 && character <= 0xe0aebd00) { + return f_false; } - if (!memcmp(character, f_utf_substitute_open_box_shouldered, width)) { - return f_true; + // Tamil: U+0BC3 to U+0BC5. + if (character >= 0xe0af8300 && character <= 0xe0af8500) { + return f_false; } - return f_false; - } + // Tamil: U+0BCE, U+0BCF. + if (character == 0xe0af8e00 || character == 0xe0af8f00) { + return f_false; + } - return f_false; - } -#endif // _di_f_utf_is_substitute_ + // Tamil: U+0BD1 to U+0BD6. + if (character >= 0xe0af9100 && character <= 0xe0af9600) { + return f_false; + } -#ifndef _di_f_utf_is_whitespace_ - f_return_status f_utf_is_whitespace(const f_string character, const unsigned short max_width) { - #ifndef _di_level_0_parameter_checking_ - if (max_width < 1) return f_status_set_error(f_invalid_parameter); - #endif // _di_level_0_parameter_checking_ + // Tamil: U+0BD8 to U+0BE5. + if (character >= 0xe0af9800 && character <= 0xe0af9800) { + return f_false; + } - unsigned short width = f_macro_utf_byte_width_is(*character); + // Tamil: U+0BFB to U+0BFF. + if (character >= 0xe0afbb00 && character <= 0xe0afbf00) { + return f_false; + } - if (width == 0) { - if (isspace(*character)) { - return f_true; + // Tamil: U+0B84, U+0B91, U+0BC9. + if (character == 0xe0ae8400 || character == 0xe0ae9100 || character == 0xe0af8900) { + return f_false; } - return f_false; - } - else if (width == 1) { - return f_status_is_error(f_incomplete_utf); - } + // Telugu: U+0C3A to U+0C3C. + if (character >= 0xe0b0ba00 && character <= 0xe0b0bc00) { + return f_false; + } - if (width > max_width) { - return f_status_set_error(f_maybe); - } + // Telugu: U+0C4E to U+0C54. + if (character >= 0xe0b18e00 && character <= 0xe0b19400) { + return f_false; + } - if (width == 2) { - if (!memcmp(character, f_utf_space_no_break, width)) { - return f_true; + // Telugu: U+0C5B to U+0C5F. 
+ if (character >= 0xe0b19b00 && character <= 0xe0b19f00) { + return f_false; } - if (!memcmp(character, f_utf_space_line_feed_reverse, width)) { - return f_true; + // Telugu: U+0C64, U+0C65. + if (character == 0xe0b1a400 || character == 0xe0b1a500) { + return f_false; } - if (!memcmp(character, f_utf_space_line_next, width)) { - return f_true; + // Telugu: U+0C70 to U+0C77. + if (character >= 0xe0b1b000 && character <= 0xe0b1b700) { + return f_false; } - return f_false; - } + // Telugu: U+0C04, U+0C0D, U+0C29. + if (character == 0xe0b08400 || character == 0xe0b08d00 || character == 0xe0b0a900) { + return f_false; + } - if (width == 3) { - if (!memcmp(character, f_utf_space_no_break_narrow, width)) { - return f_true; + // Telugu: U+0C45, U+0C49, U+0C57. + if (character == 0xe0b18500 || character == 0xe0b18900 || character == 0xe0b19700) { + return f_false; } - if (!memcmp(character, f_utf_space_en, width)) { - return f_true; + // Thai: U+0E5C to U+0E7F. + if (character >= 0xe0b99c00 && character <= 0xe0b9bf00) { + return f_false; } - if (!memcmp(character, f_utf_space_en_quad, width)) { - return f_true; + // Thai: U+0E3B to U+0E3E. + if (character >= 0xe0b8bb00 && character <= 0xe0b8be00) { + return f_false; } - if (!memcmp(character, f_utf_space_en_quad, width)) { - return f_true; + // Thai: U+0E00. + if (character == 0xe0b88000) { + return f_false; } - if (!memcmp(character, f_utf_space_em, width)) { - return f_true; + // Tibetan: U+0FDB to U+0FFF. + if (character >= 0xe0bf9b00 && character <= 0xe0bfbf00) { + return f_false; } - if (!memcmp(character, f_utf_space_em_quad, width)) { - return f_true; + // Tibetan: U+0F6D to U+0F70. 
+ if (character >= 0xe0bdad00 && character <= 0xe0bdb000) { + return f_false; } - if (!memcmp(character, f_utf_space_em_per_three, width)) { - return f_true; + // Tibetan: U+0F48, U+0F98, U+0FBD, U+0FCD + if (character == 0xe0bd8800 || character == 0xe0be9800 || character == 0xe0bebd00 || character == 0xe0bf8d) { + return f_false; } - if (!memcmp(character, f_utf_space_em_per_four, width)) { - return f_true; + // Tifinagh: U+2D68 to U+2D6E. + if (character >= 0xe2b5a800 && character <= 0xe2b5ae00) { + return f_false; } - if (!memcmp(character, f_utf_space_em_per_six, width)) { - return f_true; + // Tifinagh: U+2D71 to U+2D7E. + if (character >= 0xe2b5b100 && character <= 0xe2b5be00) { + return f_false; } - if (!memcmp(character, f_utf_space_figure, width)) { - return f_true; + // Unified Canadian Aboriginal Syllabics Extended: U+18F6 to U+18FF. + if (character >= 0xe1a3b600 && character <= 0xe1a3bf00) { + return f_false; } - if (!memcmp(character, f_utf_space_punctuation, width)) { - return f_true; + // Vai: U+A62C to U+A63F. + if (character >= 0xea98ac00 && character <= 0xea98bf00) { + return f_false; } - if (!memcmp(character, f_utf_space_thin, width)) { - return f_true; + // Vedic Extensions: U+1CF7 and U+1CFA to U+1CFF. + if (character == 0xe1b3b700 || character >= 0xe1b3ba00 && character <= 0xe1b3bf00) { + return f_false; } - if (!memcmp(character, f_utf_space_hair, width)) { - return f_true; + // Vertical Forms: U+FE10 to U+FE1F. + if (character >= 0xefb89000 && character <= 0xefb89f00) { + return f_false; } - if (!memcmp(character, f_utf_space_separator_line, width)) { - return f_true; + // Yi Radicals: U+A4C7 to U+A4CF. + if (character >= 0xea938700 && character <= 0xea938f00) { + return f_false; } - if (!memcmp(character, f_utf_space_separator_paragraph, width)) { - return f_true; + // Yi Syllables: U+A48D to U+A48F. 
+ if (character >= 0xea928d00 && character <= 0xea928f00) { + return f_false; } - if (!memcmp(character, f_utf_space_ogham, width)) { - return f_true; + // Specials: U+FFF0 to U+FFF8. + if (character >= 0xefbfb000 && character <= 0xefbfb800) { + return f_false; + } + + // Specials: U+FFFE to U+FFFF. + if (character >= 0xefbfbe00 && character <= 0xefbfbf00) { + return f_false; } + } - if (!memcmp(character, f_utf_space_ideographic, width)) { + if (width == 4) { + // Consider all private use codes as valid, U+F0000 to U+FFFFF. + if (character >= 0xf3b08080 && character <= 0xf3bfbfbf) { return f_true; } - if (!memcmp(character, f_utf_space_medium_mathematical, width)) { + // Consider all private use codes as valid, U+100000 to U+10FFFF. + if (character >= 0xf4808080 && character <= 0xf48fbfbf) { return f_true; } - return f_false; - } + // Sharada: U+111CE, U+111CF. + if (character == 0xf091878e || character == 0xf091878f) { + return f_false; + } - return f_false; - } -#endif // _di_f_utf_is_whitespace_ + // Shorthand Format Controls: U+1BCA4 to U+1BCAF. + if (character >= 0xf09bb2a4 && character <= 0xf09bb2af) { + return f_false; + } -#ifndef _di_f_utf_is_bom_character_ - f_return_status f_utf_is_bom_character(const f_utf_character character) { - if (character == f_utf_character_mask_bom) { - return f_true; - } + // Siddham: U+115DE to U+115FF. + if (character >= 0xf091979e && character <= 0xf09197bf) { + return f_false; + } - return f_false; - } -#endif // _di_f_utf_is_bom_character_ + // Siddham: U+115B6, U+115B7. + if (character == 0xf09196b6 || character == 0xf09196b7) { + return f_false; + } -#ifndef _di_f_utf_is_graph_character_ - f_return_status f_utf_is_graph_character(const f_utf_character character) { - // for now, just assume that any non-whitespace, non-substitute utf-8 character is a graph. - f_status status = f_utf_is_space_character(character); + // Sinhala Archaic Numbers: U+111F5 to U+111FF. 
+ if (character >= 0xf09187b5 && character <= 0xf09187bf) { + return f_false; + } - if (f_status_is_error(status)) { - return status; - } - else if (status == f_true) { - return f_false; - } + // Sinhala Archaic Numbers: U+1F93F. + if (character == 0xf09187a0) { + return f_false; + } - if (f_utf_is_bom_character(character) == f_true) { - return f_false; - } - - return f_true; - } -#endif // _di_f_utf_is_graph_character_ - -#ifndef _di_f_utf_is_space_character_ - f_return_status f_utf_is_space_character(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); - - if (width == 0) { - int8_t ascii = character >> 24; - - if (isspace(ascii)) { - return f_true; + // Sora Sompeng: U+110E9 to U+110EF. + if (character >= 0xf09183a9 && character <= 0xf09183af) { + return f_false; } - return f_false; - } - else if (width == 1) { - return f_status_is_error(f_invalid_utf); - } - - f_bool is_big_endian = f_utf_is_big_endian(); - - if (width == 2) { - uint16_t utf = 0; - if (is_big_endian) { - utf = (uint16_t) (character >> 16); - } - else { - utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character); + // Sora Sompeng: U+110FA to U+110FF. + if (character >= 0xf09183ba && character <= 0xf09183bf) { + return f_false; } - if (!memcmp(&utf, f_utf_space_no_break, width)) { - return f_true; + // Supplemental Arrows-C: U+1F80C to U+1F80F. + if (character >= 0xf09fa08c && character <= 0xf09fa08f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_line_feed_reverse, width)) { - return f_true; + // Supplemental Arrows-C: U+1F848 to U+1F84F. + if (character >= 0xf09fa188 && character <= 0xf09fa18f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_line_next, width)) { - return f_true; + // Supplemental Arrows-C: U+1F85A to U+1F85F. 
+ if (character >= 0xf09fa19a && character <= 0xf09fa19f) { + return f_false; } - if (!memcmp(&utf, f_utf_substitute_middle_dot, width)) { - return f_true; + // Supplemental Arrows-C: U+1F8AE to U+1F8FF. + if (character >= 0xf09fa2ae && character <= 0xf09fa3bf) { + return f_false; } - return f_false; - } - - if (width == 3) { - uint32_t utf = 0; - if (is_big_endian) { - utf = character; - } - else { - utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | (f_macro_utf_character_to_char_1(character) << 8); + // Supplemental Symbols and Pictographs: U+1F900 to U+1F90F. + if (character >= 0xf09fa480 && character <= 0xf09fa48f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_no_break_narrow, width)) { - return f_true; + // Supplemental Symbols and Pictographs: U+1F928 to U+1F92F. + if (character >= 0xf09fa4a8 && character <= 0xf09fa4af) { + return f_false; } - if (!memcmp(&utf, f_utf_space_en, width)) { - return f_true; + // Supplemental Symbols and Pictographs: U+1F94C to U+1F94F. + if (character >= 0xf09fa58c && character <= 0xf09fa58f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_en_quad, width)) { - return f_true; + // Supplemental Symbols and Pictographs: U+1F960 to U+1F97F. + if (character >= 0xf09fa5a0 && character <= 0xf09fa5bf) { + return f_false; } - if (!memcmp(&utf, f_utf_space_en_quad, width)) { - return f_true; + // Supplemental Symbols and Pictographs: U+1F992 to U+1F9BF. + if (character >= 0xf09fa692 && character <= 0xf09fa6bf) { + return f_false; } - if (!memcmp(&utf, f_utf_space_em, width)) { - return f_true; + // Supplemental Symbols and Pictographs: U+1F9C1 to U+1F9FF. + if (character >= 0xf09fa781 && character <= 0xf09fa7bf) { + return f_false; } - if (!memcmp(&utf, f_utf_space_em_quad, width)) { - return f_true; + // Supplemental Symbols and Pictographs: U+1F91F, U+1F931, U+1F932. 
+ if (character == 0xf09fa49f || character == 0xf09fa4b1 || character == 0xf09fa4b2) { + return f_false; } - if (!memcmp(&utf, f_utf_space_em_per_three, width)) { - return f_true; + // Supplemental Symbols and Pictographs: U+1F93F, U+1F95F. + if (character == 0xf09fa4bf || character == 0xf09fa59f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_em_per_four, width)) { - return f_true; + // Sutton SignWriting: U+1DA8C to U+1DA9A. + if (character >= 0xf09daa8c && character <= 0xf09daa9a) { + return f_false; } - if (!memcmp(&utf, f_utf_space_em_per_six, width)) { - return f_true; + // Tags: U+E0000, U+E0002 to U+E001F. + if (character == 0xf3a08080 || character >= 0xf3a08082 && character <= 0xf3a081bf) { + return f_false; } - if (!memcmp(&utf, f_utf_space_figure, width)) { - return f_true; + // Tai Xuan Jing Symbols: U+1D357 to U+1D35F. + if (character >= 0xf09d8d97 && character <= 0xf09d8d9f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_punctuation, width)) { - return f_true; + // Takri: U+116B8 to U+116BF. + if (character >= 0xf0919ab8 && character <= 0xf0919abf) { + return f_false; } - if (!memcmp(&utf, f_utf_space_thin, width)) { - return f_true; + // Takri: U+116CA to U+116CF. + if (character >= 0xf0919b8a && character <= 0xf0919b8f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_hair, width)) { - return f_true; + // Tangut: U+187ED to U+187FF. + if (character >= 0xf0989fad && character <= 0xf0989fbf) { + return f_false; } - if (!memcmp(&utf, f_utf_space_separator_line, width)) { - return f_true; + // Tangut Components: U+18AF3 to U+18AFF. + if (character >= 0xf098abb3 && character <= 0xf098abbf) { + return f_false; } - if (!memcmp(&utf, f_utf_space_separator_paragraph, width)) { - return f_true; + // Tirhuta: U+114C8 to U+114CF. + if (character >= 0xf0919388 && character <= 0xf091938f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_ogham, width)) { - return f_true; + // Tirhuta: U+114DA to U+114DF. 
+ if (character >= 0xf091939a && character <= 0xf091939f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_ideographic, width)) { - return f_true; + // Transport and Map Symbols: U+1F6D3 to U+1F6DF. + if (character >= 0xf09f9b93 && character <= 0xf09f9b9f) { + return f_false; } - if (!memcmp(&utf, f_utf_space_medium_mathematical, width)) { - return f_true; + // Transport and Map Symbols: U+1F6ED to U+1F6EF. + if (character >= 0xf09f9bad && character <= 0xf09f9baf) { + return f_false; } - if (!memcmp(&utf, f_utf_substitute_symbol_blank, width)) { - return f_true; + // Transport and Map Symbols: U+1F6F7 to U+1F6FF. + if (character >= 0xf09f9bb7 && character <= 0xf09f9bbf) { + return f_false; } - if (!memcmp(&utf, f_utf_substitute_symbol_space, width)) { - return f_true; + // Ugaritic: U+1039E. + if (character == 0xf0908e9e) { + return f_false; } - if (!memcmp(&utf, f_utf_substitute_open_box, width)) { - return f_true; + // Warang Citi: U+118F3 to U+118FE. + if (character >= 0xf091a3b3 && character <= 0xf091a3be) { + return f_false; } - if (!memcmp(&utf, f_utf_substitute_open_box_shouldered, width)) { - return f_true; + // Unicode (and therefore UTF-8) does not support representing any character greater than this (U+10FFFF). + if (character > 0xf48fbfbf) { + return f_false; } - - return f_false; } - return f_false; + return f_true; } -#endif // _di_f_utf_is_space_character_ +#endif // _di_f_utf_character_is_value_ -#ifndef _di_f_utf_is_substitute_character_ - f_return_status f_utf_is_substitute_character(const f_utf_character character) { +#ifndef _di_f_utf_character_is_whitespace_ + f_return_status f_utf_character_is_whitespace(const f_utf_character character) { unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { - // there is no substitute character in ASCII. 
+ int8_t ascii = character >> 24; + + if (isspace(ascii)) { + return f_true; + } + return f_false; } - else if (width == 1) { + + if (width == 1) { return f_status_is_error(f_invalid_utf); } - f_bool is_big_endian = f_utf_is_big_endian(); + // Latin-1 Supplement: U+00A0, U+00AD. + if (character == 0xc2a00000 || character == 0xc2ad0000) { + return f_true; + } - if (width == 2) { - uint16_t utf = 0; - if (is_big_endian) { - utf = (uint16_t) (character >> 16); - } - else { - utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character); - } + // Tags: U+E0020. + if (character == 0xf3a08080) { + return f_true; + } - if (!memcmp(&utf, f_utf_substitute_middle_dot, width)) { - return f_true; - } + return f_false; + } +#endif // _di_f_utf_character_is_whitespace_ - return f_false; +#ifndef _di_f_utf_character_to_char_ + f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, unsigned short *max_width) { + #ifndef _di_level_0_parameter_checking_ + if (utf_character == 0) return f_status_set_error(f_invalid_parameter); + if (max_width == 0 && *character != 0) return f_status_set_error(f_invalid_parameter); + if (max_width != 0 && *character == 0) return f_status_set_error(f_invalid_parameter); + if (max_width != 0 && *max_width > 4) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + f_status status = f_none; + + unsigned short width = f_macro_utf_character_width_is(utf_character); + + if (max_width == 0) { + f_macro_string_new(status, *character, width); + + if (f_status_is_error(status)) return status; + + width = 1; + *max_width = 1; + } + else if (width == 1) { + return f_status_is_error(f_invalid_utf); + } + else if (width > *max_width) { + return f_status_set_error(f_failure); } - if (width == 3) { + *max_width = width; + + if (f_utf_is_big_endian()) { + memcpy(*character, &utf_character, sizeof(int8_t) * width); + } + else { uint32_t utf = 0; - 
if (is_big_endian) { - utf = character; + + if (width == 1) { + utf = f_macro_utf_character_to_char_1(utf_character) << 24; } - else { - utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | (f_macro_utf_character_to_char_1(character) << 8); + else if (width == 2) { + utf = (f_macro_utf_character_to_char_2(utf_character) << 24) | (f_macro_utf_character_to_char_1(utf_character) << 16); } - - if (!memcmp(&utf, f_utf_substitute_symbol_blank, width)) { - return f_true; + else if (width == 3) { + utf = (f_macro_utf_character_to_char_3(utf_character) << 24) | (f_macro_utf_character_to_char_2(utf_character) << 16) | (f_macro_utf_character_to_char_1(utf_character) << 8); } - - if (!memcmp(&utf, f_utf_substitute_symbol_space, width)) { - return f_true; + else if (width == 4) { + utf = (f_macro_utf_character_to_char_4(utf_character) << 24) | (f_macro_utf_character_to_char_3(utf_character) << 16) | (f_macro_utf_character_to_char_2(utf_character) << 8) | f_macro_utf_character_to_char_1(utf_character); } - if (!memcmp(&utf, f_utf_substitute_open_box, width)) { - return f_true; - } + memcpy(*character, &utf, sizeof(int8_t) * width); + } - if (!memcmp(&utf, f_utf_substitute_open_box_shouldered, width)) { - return f_true; - } + return f_none; + } +#endif // _di_f_utf_character_to_char_ - return f_false; +#ifndef _di_f_utf_is_big_endian_ + f_return_status f_utf_is_big_endian() { + uint16_t test_int = (0x01 << 8) | 0x02; + int8_t test_char[2] = {0x01, 0x02}; + + if (!memcmp(&test_int, test_char, 2)) { + return f_true; } return f_false; } -#endif // _di_f_utf_is_substitute_character_ +#endif // _di_f_utf_is_big_endian_ -#ifndef _di_f_utf_is_whitespace_character_ - f_return_status f_utf_is_whitespace_character(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); +#ifndef _di_f_utf_is_ + f_return_status f_utf_is(const f_string character, const unsigned short max_width) { + 
#ifndef _di_level_0_parameter_checking_ + if (max_width < 1) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + unsigned short width = f_macro_utf_byte_width_is(*character); if (width == 0) { - int8_t ascii = character >> 24; + return f_false; + } - if (isspace(ascii)) { - return f_true; - } + if (width == 1) { + return f_status_is_error(f_incomplete_utf); + } + + return f_true; + } +#endif // _di_f_utf_is_ + +#ifndef _di_f_utf_is_bom_ + f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width) { + #ifndef _di_level_0_parameter_checking_ + if (max_width < 1) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + unsigned short width = f_macro_utf_byte_width_is(*character); + if (width == 0) { return f_false; } - else if (width == 1) { - return f_status_is_error(f_invalid_utf); - } - f_bool is_big_endian = f_utf_is_big_endian(); + if (width == 1) { + return f_status_is_error(f_incomplete_utf); + } - if (width == 2) { - uint16_t utf = 0; - if (is_big_endian) { - utf = (uint16_t) (character >> 16); - } - else { - utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character); - } + if (width > max_width) { + return f_status_set_error(f_maybe); + } - if (!memcmp(&utf, f_utf_space_no_break, width)) { + if (width == 3) { + if (!memcmp(character, f_utf_bom, width)) { return f_true; } + } - if (!memcmp(&utf, f_utf_space_line_feed_reverse, width)) { - return f_true; - } + return f_false; + } +#endif // _di_f_utf_is_bom_ + +#ifndef _di_f_utf_is_control_ + f_return_status f_utf_is_control(const f_string character, const unsigned short max_width) { + #ifndef _di_level_0_parameter_checking_ + if (max_width < 1) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ - if (!memcmp(&utf, f_utf_space_line_next, width)) { + if (f_macro_utf_byte_width_is(*character) == 0) { + if 
(iscntrl(*character)) { return f_true; } return f_false; } - if (width == 3) { - uint32_t utf = 0; - if (is_big_endian) { - utf = character; - } - else { - utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | (f_macro_utf_character_to_char_1(character) << 8); - } + if (width == 1) { + return f_status_is_error(f_incomplete_utf); + } - if (!memcmp(&utf, f_utf_space_no_break_narrow, width)) { - return f_true; - } + f_utf_character character_utf = 0; + f_status status = 0; - if (!memcmp(&utf, f_utf_space_en, width)) { - return f_true; - } + status = f_utf_char_to_control(character, max_width, &character_utf); - if (!memcmp(&utf, f_utf_space_en_quad, width)) { - return f_true; - } + if (status != f_none) return status; - if (!memcmp(&utf, f_utf_space_en_quad, width)) { - return f_true; - } + return f_utf_character_is_control(character); + } +#endif // _di_f_utf_is_control_ - if (!memcmp(&utf, f_utf_space_em, width)) { - return f_true; - } +#ifndef _di_f_utf_is_control_picture_ + f_return_status f_utf_is_control_picture(const f_string character, const unsigned short max_width) { + #ifndef _di_level_0_parameter_checking_ + if (max_width < 1) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ - if (!memcmp(&utf, f_utf_space_em_quad, width)) { - return f_true; - } + // There are not ASCII control pictures. 
+ if (f_macro_utf_byte_width_is(*character) == 0) { + return f_false; + } - if (!memcmp(&utf, f_utf_space_em_per_three, width)) { - return f_true; - } + if (width == 1) { + return f_status_is_error(f_incomplete_utf); + } - if (!memcmp(&utf, f_utf_space_em_per_four, width)) { - return f_true; - } + f_utf_character character_utf = 0; + f_status status = 0; - if (!memcmp(&utf, f_utf_space_em_per_six, width)) { - return f_true; - } + status = f_utf_char_to_character(character, max_width, &character_utf); - if (!memcmp(&utf, f_utf_space_figure, width)) { - return f_true; - } + if (status != f_none) return status; - if (!memcmp(&utf, f_utf_space_punctuation, width)) { - return f_true; - } + return f_utf_character_is_control_picture(character); + } +#endif // _di_f_utf_is_control_picture_ - if (!memcmp(&utf, f_utf_space_thin, width)) { - return f_true; - } +#ifndef _di_f_utf_is_graph_ + f_return_status f_utf_is_graph(const f_string character, const unsigned short max_width) { + #ifndef _di_level_0_parameter_checking_ + if (max_width < 1) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ - if (!memcmp(&utf, f_utf_space_hair, width)) { + if (f_macro_utf_byte_width_is(*character) == 0) { + if (isgraph(*character)) { return f_true; } - if (!memcmp(&utf, f_utf_space_separator_line, width)) { - return f_true; - } + return f_false; + } - if (!memcmp(&utf, f_utf_space_separator_paragraph, width)) { - return f_true; - } + if (width == 1) { + return f_status_is_error(f_incomplete_utf); + } - if (!memcmp(&utf, f_utf_space_ogham, width)) { - return f_true; - } + f_utf_character character_utf = 0; + f_status status = 0; - if (!memcmp(&utf, f_utf_space_ideographic, width)) { - return f_true; - } + status = f_utf_char_to_character(character, max_width, &character_utf); + + if (status != f_none) return status; - if (!memcmp(&utf, f_utf_space_medium_mathematical, width)) { + return f_utf_character_is_graph(character); + } +#endif // 
_di_f_utf_is_graph_ + +#ifndef _di_f_utf_is_whitespace_ + f_return_status f_utf_is_whitespace(const f_string character, const unsigned short max_width) { + #ifndef _di_level_0_parameter_checking_ + if (max_width < 1) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + unsigned short width = f_macro_utf_byte_width_is(*character); + + if (width == 0) { + if (isspace(*character)) { return f_true; } return f_false; } - return f_false; + if (width == 1) { + return f_status_is_error(f_incomplete_utf); + } + + f_utf_character character_utf = 0; + f_status status = 0; + + status = f_utf_char_to_character(character, max_width, &character_utf); + + if (status != f_none) return status; + + return f_utf_character_is_whitespace(character); } -#endif // _di_f_utf_is_whitespace_character_ +#endif // _di_f_utf_is_whitespace_ #ifndef _di_f_utf_char_to_character_ - f_return_status f_utf_char_to_character(const f_string character, const unsigned short max_width, f_utf_character *utf_character) { + f_return_status f_utf_char_to_character(const f_string character, const unsigned short max_width, f_utf_character *character_utf) { #ifndef _di_level_0_parameter_checking_ if (max_width < 1) return f_status_set_error(f_invalid_parameter); - if (utf_character == 0) return f_status_set_error(f_invalid_parameter); + if (character_utf == 0) return f_status_set_error(f_invalid_parameter); #endif // _di_level_0_parameter_checking_ unsigned short width = f_macro_utf_byte_width_is(*character); if (width == 0) { - *utf_character = f_macro_utf_character_from_char_1(character[0]); + *character_utf = f_macro_utf_character_from_char_1(character[0]); return f_none; } else if (width == 1) { @@ -817,87 +984,31 @@ extern "C" { return f_status_set_error(f_failure); } - *utf_character = 0; - *utf_character |= f_macro_utf_character_to_char_1(character[0]); + *character_utf = 0; + *character_utf |= f_macro_utf_character_to_char_1(character[0]); if (width < 2) { return 
f_none; } - *utf_character |= f_macro_utf_character_to_char_2(character[1]); + *character_utf |= f_macro_utf_character_to_char_2(character[1]); if (width == 2) { return f_none; } - *utf_character |= f_macro_utf_character_to_char_3(character[2]); + *character_utf |= f_macro_utf_character_to_char_3(character[2]); if (width == 3) { return f_none; } - *utf_character |= f_macro_utf_character_to_char_4(character[3]); + *character_utf |= f_macro_utf_character_to_char_4(character[3]); return f_none; } #endif // _di_f_utf_char_to_character_ -#ifndef _di_f_utf_character_to_char_ - f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, unsigned short *max_width) { - #ifndef _di_level_0_parameter_checking_ - if (utf_character == 0) return f_status_set_error(f_invalid_parameter); - if (max_width == 0 && *character != 0) return f_status_set_error(f_invalid_parameter); - if (max_width != 0 && *character == 0) return f_status_set_error(f_invalid_parameter); - if (max_width != 0 && *max_width > 4) return f_status_set_error(f_invalid_parameter); - #endif // _di_level_0_parameter_checking_ - - f_status status = f_none; - - unsigned short width = f_macro_utf_character_width_is(utf_character); - - if (max_width == 0) { - f_macro_string_new(status, *character, width); - - if (f_status_is_error(status)) return status; - - width = 1; - *max_width = 1; - } - else if (width == 1) { - return f_status_is_error(f_invalid_utf); - } - else if (width > *max_width) { - return f_status_set_error(f_failure); - } - - *max_width = width; - - if (f_utf_is_big_endian()) { - memcpy(*character, &utf_character, sizeof(int8_t) * width); - } - else { - uint32_t utf = 0; - - if (width == 1) { - utf = f_macro_utf_character_to_char_1(utf_character) << 24; - } - else if (width == 2) { - utf = (f_macro_utf_character_to_char_2(utf_character) << 24) | (f_macro_utf_character_to_char_1(utf_character) << 16); - } - else if (width == 3) { - utf = 
(f_macro_utf_character_to_char_3(utf_character) << 24) | (f_macro_utf_character_to_char_2(utf_character) << 16) | (f_macro_utf_character_to_char_1(utf_character) << 8); - } - else if (width == 4) { - utf = (f_macro_utf_character_to_char_4(utf_character) << 24) | (f_macro_utf_character_to_char_3(utf_character) << 16) | (f_macro_utf_character_to_char_2(utf_character) << 8) | f_macro_utf_character_to_char_1(utf_character); - } - - memcpy(*character, &utf, sizeof(int8_t) * width); - } - - return f_none; - } -#endif // _di_f_utf_character_to_char_ - #ifdef __cplusplus } // extern "C" #endif diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 93c25a5..de1bfa9 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -489,6 +489,8 @@ extern "C" { * * These are integers representing character codes that represent types of substitute spaces. * + * Substitute codes are not actual codes and are actually prints of the codes so they should not be treated as the actual codes. + * * This does not provide substitute whitespace codes for standard ascii whitespaces, such as '\t' or '\r'. */ #ifndef _di_f_utf_substitute_ @@ -510,60 +512,124 @@ extern "C" { #endif // _di_f_utf_substitute_ /** - * Helper function for UTF-8 processing code to determine endianess of the system. + * Check to see if the entire byte block of the character is a UTF-8 character. + * + * This does not validate if the UTF-8 character is a valid UTF-8 character, for that use f_utf_character_is_valid(). * + * @param character + * The character to validate. * * @return - * f_true if the system is big-endian. - * f_false if the system is little-endian. + * f_true if a UTF-8 character. + * f_false if not a UTF-8 character. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. 
+ * + * @see f_utf_character_is_valid() */ -#ifndef _di_f_utf_is_big_endian_ - extern f_return_status f_utf_is_big_endian(); -#endif // _di_f_utf_is_big_endian_ +#ifndef _di_f_utf_character_is_ + extern f_return_status f_utf_character_is(const f_utf_character character); +#endif // _di_f_utf_character_is_ /** - * Check to see if the entire byte block of the character is a UTF-8 character. + * Check to see if the entire byte block of the character is a UTF-8 BOM. + * + * @param character + * The character to validate. + * + * @return + * f_true if a UTF-8 BOM. + * f_false if not a UTF-8 BOM. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + */ +#ifndef _di_f_utf_character_is_bom_ + extern f_return_status f_utf_character_is_bom(const f_utf_character character); +#endif // _di_f_utf_character_is_bom_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. + * + * The UTF-8 BOM is considered a control character. + * + * @param character + * The character to validate. + * + * @return + * f_true if a UTF-8 control character. + * f_false if not a UTF-8 control character. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * + * @see iscntrl() + */ +#ifndef _di_f_utf_character_is_control_ + extern f_return_status f_utf_character_is_control(const f_utf_character character); +#endif // _di_f_utf_character_is_control_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 control picture character. + * + * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters. + * + * @param character + * The character to validate. + * + * @return + * f_true if a UTF-8 control picture character. + * f_false if not a UTF-8 control picture character. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. 
+ */ +#ifndef _di_f_utf_character_is_control_picture_ + extern f_return_status f_utf_character_is_control_picture(const f_utf_character character); +#endif // _di_f_utf_character_is_control_picture_ + +/** + * Check to see if the entire byte block of the character is a 1-width UTF-8 character fragment. + * + * Characters whose width is 1-byte are invalid. + * However, the character could have been cut-off, so whether or not this is actually valid should be determined by the caller. + * + * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid(). * * @param character * The character to validate. - * There must be enough space allocated to compare against, as limited by max_width. - * @param max_width - * The maximum width available for checking. - * Can be anything greater than 0. * * @return * f_true if a UTF-8 character. * f_false if not a UTF-8 character. - * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. - * f_invalid_parameter (with error bit) if a parameter is invalid. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * + * @see f_utf_character_is() + * @see f_utf_character_is_valid() */ -#ifndef _di_f_utf_is_ - extern f_return_status f_utf_is(const f_string character, const unsigned short max_width); -#endif // _di_f_utf_is_ +#ifndef _di_f_utf_character_is_fragment_ + extern f_return_status f_utf_character_is_fragment(const f_utf_character character); +#endif // _di_f_utf_character_is_fragment_ /** - * Check to see if the entire byte block of the character is a UTF-8 BOM. + * Check to see if the entire byte block of the character is an ASCII or UTF-8 printable character. * * @param character * The character to validate. - * There must be enough space allocated to compare against, as limited by max_width. - * @param max_width - * The maximum width available for checking. - * Can be anything greater than 0. 
* * @return - * f_true if a UTF-8 whitespace or substitute. - * f_false if not a UTF-8 whitespace or substitute. - * f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough. - * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. - * f_invalid_parameter (with error bit) if a parameter is invalid. + * f_true if a UTF-8 graph. + * f_false if not a UTF-8 graph. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * + * @see isgraph() */ -#ifndef _di_f_utf_is_bom_ - extern f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width); -#endif // _di_f_utf_is_bom_ +#ifndef _di_f_utf_character_is_graph_ + extern f_return_status f_utf_character_is_graph(const f_utf_character character); +#endif // _di_f_utf_character_is_graph_ /** - * Check to see if the entire byte block of the character is a UTF-8 character. + * Check to see if the entire byte block of the character is a valid UTF-8 character. + * + * This does validate if the UTF-8 character is a valid UTF-8 character. + * To not do this, use f_utf_character_is(). + * + * This function can be expensive due to how Unicode has invalid codes spread randomly through it. + * For simpler error checking, try f_utf_character_is_fragment() to just check whether the width is valid. + * (First characters should not have a width of 1, and all other characters should have a width of 1.) * * @param character * The character to validate. @@ -571,17 +637,75 @@ extern "C" { * @return * f_true if a UTF-8 character. * f_false if not a UTF-8 character. - * f_invalid_utf (with error bit) if character is an incomplete UTF-8 fragment. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ * + * @see f_utf_character_is() + * @see f_utf_character_is_fragment() + */ +#ifndef _di_f_utf_character_is_valid_ + extern f_return_status f_utf_character_is_valid(const f_utf_character character); +#endif // _di_f_utf_character_is_valid_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space or control character. + * + * @param character + * The character to validate. + * + * @return + * f_true if a UTF-8 whitespace. + * f_false if not a UTF-8 whitespace. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + */ +#ifndef _di_f_utf_character_is_whitespace_ + extern f_return_status f_utf_character_is_whitespace(const f_utf_character character); +#endif // _di_f_utf_character_is_whitespace_ + +/** + * Convert a specialized f_utf_character type to an int8_t, stored as a string (character buffer). + * + * This will also convert ASCII characters stored in the utf_character array. + * + * @param utf_character + * The UTF-8 character to convert from. + * @param character + * An int8_t representation of the UTF-8 character, stored as a string of width bytes. + * If max_width is 0, then this should not be allocated (set the pointer address to 0). + * @param max_width + * The number of bytes the generated character represents. + * If this is set to 0, then the character will be allocated and this will be set to the width of the utf_character. + * If this is set to some value greater than 0 (up to 4), then this represents the size of the character array (no allocations are performed). + * If this is greater than 0, and the utf_character width is larger than this size, then an error is returned. + * + * @return + * f_none if conversion was successful. + * f_failure (with error bit) if width is not long enough to convert. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * f_invalid_parameter (with error bit) if a parameter is invalid.
+ * f_allocation_error (with error bit) on memory allocation error. + * f_failure (with error bit) if width is not long enough to convert. + */ +#ifndef _di_f_utf_character_to_char_ + extern f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, unsigned short *max_width); +#endif // _di_f_utf_character_to_char_ + +/** + * Helper function for UTF-8 processing code to determine endianness of the system. + * + * @todo relocate this outside of f_utf into a more general path, perhaps f_memory (f_memory_is_big_endian). + * + * @return + * f_true if the system is big-endian. + * f_false if the system is little-endian. + */ +#ifndef _di_f_utf_is_big_endian_ + extern f_return_status f_utf_is_big_endian(); +#endif // _di_f_utf_is_big_endian_ /** - * Check to see if the entire byte block of the character is a UTF-8 printable character. + * Check to see if the entire byte block of the character is a UTF-8 character. * - * This does not check non-UTF-8 graph. + * This does not check the validity of the character; for that, use f_utf_is_valid() instead. * * @param character * The character to validate. @@ -591,20 +715,19 @@ extern "C" { * Can be anything greater than 0. * * @return - * f_true if a UTF-8 graph. - * f_false if not a UTF-8 graph. - * f_maybe (with error bit) if this could be a graph but width is not long enough. + * f_true if a UTF-8 character. + * f_false if not a UTF-8 character. * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid.
+ * + * @see f_utf_is_valid() */ -#ifndef _di_f_utf_is_graph_ - extern f_return_status f_utf_is_graph(const f_string character, const unsigned short max_width); -#endif // _di_f_utf_is_graph_ +#ifndef _di_f_utf_is_ + extern f_return_status f_utf_is(const f_string character, const unsigned short max_width); +#endif // _di_f_utf_is_ /** - * Check to see if the entire byte block of the character is a UTF-8 whitespace or substitute character. - * - * This does not check non-UTF-8 whitespace. + * Check to see if the entire byte block of the character is a UTF-8 BOM. * * @param character * The character to validate. @@ -620,14 +743,14 @@ extern "C" { * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. */ -#ifndef _di_f_utf_is_space_ - extern f_return_status f_utf_is_space(const f_string character, const unsigned short max_width); -#endif // _di_f_utf_is_space_ +#ifndef _di_f_utf_is_bom_ + extern f_return_status f_utf_is_bom(const f_string character, const unsigned short max_width); +#endif // _di_f_utf_is_bom_ /** - * Check to see if the entire byte block of the character is a UTF-8 whitespace substitute character. + * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. * - * This does not check non-UTF-8 whitespace. + * The UTF-8 BOM is considered a control character. * * @param character * The character to validate. @@ -637,20 +760,20 @@ extern "C" { * Can be anything greater than 0. * * @return - * f_true if a UTF-8 substitute. - * f_false if not a UTF-8 substitute. - * f_maybe (with error bit) if this could be a substitute but width is not long enough. + * f_true if a UTF-8 control character. + * f_false if not a UTF-8 control character. * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. - * f_invalid_parameter (with error bit) if a parameter is invalid. 
+ * + * @see iscntrl() */ -#ifndef _di_f_utf_is_substitute_ - extern f_return_status f_utf_is_substitute(const f_string character, const unsigned short max_width); -#endif // _di_f_utf_is_substitute_ +#ifndef _di_f_utf_is_control_ + extern f_return_status f_utf_is_control(const f_string character, const unsigned short max_width); +#endif // _di_f_utf_is_control_ /** - * Check to see if the entire byte block of the character is a UTF-8 general whitespace character. + * Check to see if the entire byte block of the character is a UTF-8 control picture character. * - * This does not check non-UTF-8 whitespace. + * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters. * * @param character * The character to validate. @@ -660,109 +783,119 @@ extern "C" { * Can be anything greater than 0. * * @return - * f_true if a UTF-8 whitespace. - * f_false if not a UTF-8 whitespace. - * f_maybe (with error bit) if this could be a whitespace but width is not long enough. + * f_true if a UTF-8 control picture character. + * f_false if not a UTF-8 control picture character. * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. - * f_invalid_parameter (with error bit) if a parameter is invalid. */ -#ifndef _di_f_utf_is_whitespace_ - extern f_return_status f_utf_is_whitespace(const f_string character, const unsigned short max_width); -#endif // _di_f_utf_is_whitespace_ +#ifndef _di_f_utf_is_control_picture_ + extern f_return_status f_utf_is_control_picture(const f_string character, const unsigned short max_width); +#endif // _di_f_utf_is_control_picture_ /** - * Check to see if the entire byte block of the character is a UTF-8 BOM. - * - * @param character - * The UTF-8 character to validate. + * Check to see if the entire byte block of the character is a 1-width UTF-8 character fragment. * - * @return - * f_true if a UTF-8 whitespace or substitute. 
- * f_false if not a UTF-8 whitespace or substitute. - * f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough. - * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. - * f_invalid_parameter (with error bit) if a parameter is invalid. - */ -#ifndef _di_f_utf_is_bom_character_ - extern f_return_status f_utf_is_bom_character(const f_utf_character character); -#endif // _di_f_utf_is_bom_character_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 printable character. + * Characters whose width is 1-byte are invalid. + * However, the character could have been cut-off, so whether or not this is actually valid should be determined by the caller. * - * This does not check non-UTF-8 graph. + * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid(). * * @param character * The character to validate. + * There must be enough space allocated to compare against, as limited by max_width. + * @param max_width + * The maximum width available for checking. + * Can be anything greater than 0. * * @return - * f_true if a UTF-8 graph. - * f_false if not a UTF-8 graph. - * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. - * f_invalid_parameter (with error bit) if a parameter is invalid. + * f_true if a UTF-8 character. + * f_false if not a UTF-8 character. + * + * @see f_utf_character_is() + * @see f_utf_character_is_valid() */ -#ifndef _di_f_utf_is_graph_character_ - extern f_return_status f_utf_is_graph_character(const f_utf_character character); -#endif // _di_f_utf_is_graph_character_ +#ifndef _di_f_utf_is_fragment_ + extern f_return_status f_utf_is_fragment(const f_string character, const unsigned short max_width); +#endif // _di_f_utf_is_fragment_ /** - * Check to see if the entire byte block of the character is a UTF-8 whitespace or substitute character. - * - * This does not check non-UTF-8 whitespace. 
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 printable character. * * @param character * The character to validate. + * There must be enough space allocated to compare against, as limited by max_width. + * @param max_width + * The maximum width available for checking. + * Can be anything greater than 0. * * @return - * f_true if a UTF-8 whitespace or substitute. - * f_false if not a UTF-8 whitespace or substitute. - * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * f_true if a UTF-8 graph. + * f_false if not a UTF-8 graph. + * f_maybe (with error bit) if this could be a graph but width is not long enough. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. + * + * @see isgraph() + * @see iscntrl() */ -#ifndef _di_f_utf_is_space_character_ - extern f_return_status f_utf_is_space_character(const f_utf_character character); -#endif // _di_f_utf_is_space_character_ +#ifndef _di_f_utf_is_graph_ + extern f_return_status f_utf_is_graph(const f_string character, const unsigned short max_width); +#endif // _di_f_utf_is_graph_ /** - * Check to see if the entire byte block of the character is a UTF-8 whitespace substitute character. + * Check to see if the entire byte block of the character is a UTF-8 character and if that character is a valid UTF-8. * - * This does not check non-UTF-8 whitespace. + * This does check the validity of the character, to not do this use f_utf_is(). + * + * This function can be expensive due to how Unicode has invalid codes spread randomly through it. + * For simpler error checking, try f_utf_is_fragment(), to just check that the width is valid or not. + * (First characters should have a width of not 1, and all other characters should not have a width of 1.) * * @param character * The character to validate. 
+ * There must be enough space allocated to compare against, as limited by max_width. + * @param max_width + * The maximum width available for checking. + * Can be anything greater than 0. * * @return - * f_true if a UTF-8 substitute. - * f_false if not a UTF-8 substitute. - * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * f_true if a valid UTF-8 character. + * f_false if not a valid UTF-8 character. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. + * + * @see f_utf_is() + * @see f_utf_is_fragment() */ -#ifndef _di_f_utf_is_substitute_character_ - extern f_return_status f_utf_is_substitute_character(const f_utf_character character); -#endif // _di_f_utf_is_substitute_character_ +#ifndef _di_f_utf_is_valid_ + extern f_return_status f_utf_is_valid(const f_string character, const unsigned short max_width); +#endif // _di_f_utf_is_valid_ /** - * Check to see if the entire byte block of the character is a UTF-8 general whitespace character. - * - * This does not check non-UTF-8 whitespace. + * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space or control character. * * @param character * The character to validate. + * There must be enough space allocated to compare against, as limited by max_width. + * @param max_width + * The maximum width available for checking. + * Can be anything greater than 0. * * @return * f_true if a UTF-8 whitespace. * f_false if not a UTF-8 whitespace. - * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * f_maybe (with error bit) if this could be a whitespace but width is not long enough. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_invalid_parameter (with error bit) if a parameter is invalid. 
+ * + * @see isspace() + * @see iscntrl() */ -#ifndef _di_f_utf_is_whitespace_character_ - extern f_return_status f_utf_is_whitespace_character(const f_utf_character character); -#endif // _di_f_utf_is_whitespace_character_ +#ifndef _di_f_utf_is_whitespace_ + extern f_return_status f_utf_is_whitespace(const f_string character, const unsigned short max_width); +#endif // _di_f_utf_is_whitespace_ /** - * Convert a UTF-8 character, stored as a string (character buffer), to the specialized f_utf_character type. - * - * This will also convert ASCII characters. + * Convert an ASCII or UTF-8 character, stored as a string (character buffer), to the specialized f_utf_character type. * * @param character * The character string to be converted to the f_utf_character type. @@ -770,7 +903,7 @@ extern "C" { * @param max_width * The maximum width available for converting. * Can be anything greater than 0. - * @param utf_character + * @param character_utf * The generated character of type f_utf_character. * This value may be cleared, even on error. * @@ -781,37 +914,9 @@ extern "C" { * f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_char_to_character_ - extern f_return_status f_utf_char_to_character(const f_string character, const unsigned short max_width, f_utf_character *utf_character); + extern f_return_status f_utf_char_to_character(const f_string character, const unsigned short max_width, f_utf_character *character_utf); #endif // _di_f_utf_char_to_character_ -/** - * Convert a specialized f_utf_character type to a int8_t, stored as a string (character buffer). - * - * This will also convert ASCII characters stored in the utf_character array. - * - * @param utf_character - * The UTF-8 characterr to convert from. - * @param character - * A int8_t representation of the UTF-8 character, stored as a string of width bytes. - * If max_width is 0, then this should not be allocated (set the pointer address to 0). 
- * @param max_width - * The number of bytes the generated character represents. - * If this is set to 0, then the character will be allocated and this will be set to the width of the utf_character. - * If this is set to some value greater than 0 (up to 4), then this represents the size of the character array (no allocations are performed). - * If this is greater than 0, and the utf_character width is larger than this size, then an error is returned. - * - * @return - * f_none if conversion was successful. - * f_failure (with error bit) if width is not long enough to convert. - * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. - * f_invalid_parameter (with error bit) if a parameter is invalid. - * f_allocation_error (with error bit) on memory allocation error. - * f_failure (with error bit) if width is not long enough to convert. - */ -#ifndef _di_f_utf_character_to_char_ - extern f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, unsigned short *max_width); -#endif // _di_f_utf_character_to_char_ - #ifdef __cplusplus } // extern "C" #endif diff --git a/level_1/fl_fss/c/fss.c b/level_1/fl_fss/c/fss.c index 1f4d877..0388567 100644 --- a/level_1/fl_fss/c/fss.c +++ b/level_1/fl_fss/c/fss.c @@ -306,7 +306,7 @@ extern "C" { max_width = buffer.used - input.start; } - return f_utf_is_space(buffer.string + input.start, max_width); + return f_utf_is_whitespace(buffer.string + input.start, max_width); } #endif // _di_fl_fss_is_space_ diff --git a/level_1/fl_string/c/string.c b/level_1/fl_string/c/string.c index 8b74f4f..c960bb2 100644 --- a/level_1/fl_string/c/string.c +++ b/level_1/fl_string/c/string.c @@ -119,7 +119,7 @@ extern "C" { max_width = buffer.used - location->start; } - while (buffer.string[location->start] == placeholder || (status = f_utf_is_space(buffer.string + location->start, max_width)) == f_false) { + while (buffer.string[location->start] == placeholder || (status = 
f_utf_is_whitespace(buffer.string + location->start, max_width)) == f_false) { if (f_status_is_error(status)) { return status; } diff --git a/level_1/fl_utf/c/utf.c b/level_1/fl_utf/c/utf.c index 34bce80..f6d2962 100644 --- a/level_1/fl_utf/c/utf.c +++ b/level_1/fl_utf/c/utf.c @@ -16,7 +16,7 @@ extern "C" { f_status status = f_none; - while (buffer.string[location->start] == placeholder || (status = f_utf_is_graph_character(buffer.string[location->start])) == f_false) { + while (buffer.string[location->start] == placeholder || (status = f_utf_character_is_graph(buffer.string[location->start])) == f_false) { if (f_status_is_error(status)) { return status; } @@ -53,7 +53,7 @@ extern "C" { f_status status = f_none; - while (buffer.string[location->start] == placeholder || (status = f_utf_is_space_character(buffer.string[location->start])) == f_false) { + while (buffer.string[location->start] == placeholder || (status = f_utf_character_is_space(buffer.string[location->start])) == f_false) { if (f_status_is_error(status)) { return status; } diff --git a/level_3/byte_dump/c/private-byte_dump.c b/level_3/byte_dump/c/private-byte_dump.c index be730e5..014efcf 100644 --- a/level_3/byte_dump/c/private-byte_dump.c +++ b/level_3/byte_dump/c/private-byte_dump.c @@ -571,7 +571,7 @@ printf("."); } } - else if (f_utf_is_whitespace_character(characters.string[i]) == f_true) { + else if (f_utf_character_is_whitespace(characters.string[i]) == f_true) { printf("%s", byte_dump_sequence_space); } else if (width_utf == 2 && characters.string[i] == 0xc0800000) { @@ -616,11 +616,11 @@ // Use space to represent Vaiation Selectors Supplement codes. printf(" "); } - else if (width_utf == 4 && characters.string[i] >= 0xf09e8080 && characters.string[i] <= 0xf09fbfbf) { + else if (width_utf == 4 && characters.string[i] >= 0xf3b08080 && characters.string[i] <= 0xf3bfbfbf) { // Use space to represent Supplemental Private Use Area-A codes. 
printf(" "); } - else if (width_utf == 4 && characters.string[i] >= 0xf0a08080 && characters.string[i] <= 0xf0a1bfbf) { + else if (width_utf == 4 && characters.string[i] >= 0xf4808080 && characters.string[i] <= 0xf48fbfbf) { // Use space to represent Supplemental Private Use Area-B codes. printf(" "); }