From 2995ceebac5aec7cde3dd168c10f86f7c95cf1fd Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sat, 13 Nov 2021 22:39:06 -0600 Subject: [PATCH] Update: Improve UTF-8 Control detecting, expanding to distinguish Control Code and Control Format. There seem to be "Control Format". Create functions for "Control Code" and "Control Format" (is_control_code and is_control_format functions). The is_control functions now check for both. --- level_0/f_utf/c/private-utf.c | 214 ++++++++++++++++++++++++++++++++++++++++-- level_0/f_utf/c/private-utf.h | 50 +++++++++- level_0/f_utf/c/utf.c | 101 ++++++++++++++++++++ level_0/f_utf/c/utf.h | 96 ++++++++++++++++++- 4 files changed, 448 insertions(+), 13 deletions(-) diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c index 50b0908..beed62c 100644 --- a/level_0/f_utf/c/private-utf.c +++ b/level_0/f_utf/c/private-utf.c @@ -54,6 +54,7 @@ extern "C" { return F_false; } + // is_control() handles both is_control_code() and is_control_format(). if (private_f_utf_character_is_control(character, width)) { return F_false; } @@ -105,6 +106,7 @@ extern "C" { return F_false; } + // is_control() handles both is_control_code() and is_control_format(). if (private_f_utf_character_is_control(character, width)) { return F_false; } @@ -152,6 +154,7 @@ extern "C" { return F_false; } + // is_control() handles both is_control_code() and is_control_format(). if (private_f_utf_character_is_control(character, width)) { return F_false; } @@ -248,32 +251,110 @@ extern "C" { if (width == 2) { + // Control Codes. + // Latin-1 Supplement: U+0080 to U+009F. if (character >= 0xc2800000 && character <= 0xc29f0000) { return F_true; } + + // Control Formats. + + // Latin-1 Supplement: U+00AD. + if (character == 0xc2ad0000) { + return F_true; + } + + // Arabic: U+0600 to U+0605. + if (character >= 0xd8800000 && character <= 0xd8850000) { + return F_true; + } + + // Arabic: U+061C, U+06DD. 
+ if (character == 0xd89c0000 || character == 0xdb9d0000) { + return F_true; + } + + // Syriac: U+070F. + if (character == 0xdc8f0000) { + return F_true; + } } else if (width == 3) { - // General Punctuation: U+200E and U+200F. - if (character == 0xe2808e00 || character == 0xe2808f00) { + // Control Formats. + + // Arabic Extended-A: U+08E2. + if (character == 0xe0a3a200) { return F_true; } - // General Punctuation: U+2066 to U+2069. - if (character >= 0xe281a600 && character <= 0xe281a900) { + // Mongolian: U+180E. + if (character == 0xe1a08e00) { + return F_true; + } + + // General Punctuation: U+200B to U+200F. + if (character >= 0xe2808b00 && character <= 0xe2808f00) { + return F_true; + } + + // General Punctuation: U+202A to U+202E. + if (character >= 0xe280aa00 && character <= 0xe280ae00) { + return F_true; + } + + // General Punctuation: U+2060 to U+2064. + if (character >= 0xe281a000 && character <= 0xe281a400) { + return F_true; + } + + // General Punctuation: U+2066 to U+206F. + if (character >= 0xe281a600 && character <= 0xe281af00) { + return F_true; + } + + // Arabic Presentation Forms-B: U+FEFF. + if (character == 0xefbbbf00) { return F_true; } - // Special: U+FFF9 to U+FFFB. + // Specials: U+FFF9 to U+FFFB. if (character >= 0xefbfb900 && character <= 0xefbfbb00) { return F_true; } } else if (width == 4) { - // Tags: U+E0001 and U+E007F. - if (character == 0xf3a08081 || character == 0xf3a081bf) { + // Control Formats. + + // Kaithi: U+110BD, U+110CD. + if (character == 0xf09182bd || character == 0xf091838d) { + return F_true; + } + + // Egyptian Hieroglyph Format Controls: U+13430 to U+13438. + if (character >= 0xf09390b0 && character <= 0xf09390b8) { + return F_true; + } + + // Shorthand Format Controls: U+1BCA0 to U+1BCA3. + if (character >= 0xf09bb2a0 && character <= 0xf09bb2a3) { + return F_true; + } + + // Musical Symbols: U+1D173 to U+1D17A. + if (character >= 0xf09d85b3 && character <= 0xf09d85ba) { + return F_true; + } + + // Tags: U+E0001. 
+ if (character == 0xf3a08081) { + return F_true; + } + + // Tags: U+E0020 to U+E007F. + if (character >= 0xf3a080a0 && character <= 0xf3a081bf) { return F_true; } } @@ -282,6 +363,125 @@ extern "C" { } #endif // !defined(_di_f_utf_character_is_control_) || !defined(_di_f_utf_is_control_) +#if !defined(_di_f_utf_character_is_control_code_) || !defined(_di_f_utf_is_control_code_) + f_status_t private_f_utf_character_is_control_code(const f_utf_character_t character, const uint8_t width) { + + if (width == 2) { + + // Latin-1 Supplement: U+0080 to U+009F. + if (character >= 0xc2800000 && character <= 0xc29f0000) { + return F_true; + } + } + + return F_false; + } +#endif // !defined(_di_f_utf_character_is_control_code_) || !defined(_di_f_utf_is_control_code_) + +#if !defined(_di_f_utf_character_is_control_format_) || !defined(_di_f_utf_is_control_format_) + f_status_t private_f_utf_character_is_control_format(const f_utf_character_t character, const uint8_t width) { + + if (width == 2) { + + // Latin-1 Supplement: U+00AD. + if (character == 0xc2ad0000) { + return F_true; + } + + // Arabic: U+0600 to U+0605. + if (character >= 0xd8800000 && character <= 0xd8850000) { + return F_true; + } + + // Arabic: U+061C, U+06DD. + if (character == 0xd89c0000 || character == 0xdb9d0000) { + return F_true; + } + + // Syriac: U+070F. + if (character == 0xdc8f0000) { + return F_true; + } + } + else if (width == 3) { + + // Arabic Extended-A: U+08E2. + if (character == 0xe0a3a200) { + return F_true; + } + + // Mongolian: U+180E. + if (character == 0xe1a08e00) { + return F_true; + } + + // General Punctuation: U+200B to U+200F. + if (character >= 0xe2808b00 && character <= 0xe2808f00) { + return F_true; + } + + // General Punctuation: U+202A to U+202E. + if (character >= 0xe280aa00 && character <= 0xe280ae00) { + return F_true; + } + + // General Punctuation: U+2060 to U+2064. 
+ if (character >= 0xe281a000 && character <= 0xe281a400) { + return F_true; + } + + // General Punctuation: U+2066 to U+206F. + if (character >= 0xe281a600 && character <= 0xe281af00) { + return F_true; + } + + // Arabic Presentation Forms-B: U+FEFF. + if (character == 0xefbbbf00) { + return F_true; + } + + // Specials: U+FFF9 to U+FFFB. + if (character >= 0xefbfb900 && character <= 0xefbfbb00) { + return F_true; + } + } + else if (width == 4) { + + // Kaithi: U+110BD, U+110CD. + if (character == 0xf09182bd || character == 0xf091838d) { + return F_true; + } + + // Egyptian Hieroglyph Format Controls: U+13430 to U+13438. + if (character >= 0xf09390b0 && character <= 0xf09390b8) { + return F_true; + } + + // Shorthand Format Controls: U+1BCA0 to U+1BCA3. + if (character >= 0xf09bb2a0 && character <= 0xf09bb2a3) { + return F_true; + } + + // Musical Symbols: U+1D173 to U+1D17A. + if (character >= 0xf09d85b3 && character <= 0xf09d85ba) { + return F_true; + } + + // Tags: U+E0001. + if (character == 0xf3a08081) { + return F_true; + } + + // Tags: U+E0020 to U+E007F. + if (character >= 0xf3a080a0 && character <= 0xf3a081bf) { + return F_true; + } + } + + return F_false; + } +#endif // !defined(_di_f_utf_character_is_control_format_) || !defined(_di_f_utf_is_control_format_) + #if !defined(_di_f_utf_character_is_control_picture_) || !defined(_di_f_utf_is_control_picture_) f_status_t private_f_utf_character_is_control_picture(const f_utf_character_t character, const uint8_t width) { diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h index f2e0c3e..ff453a8 100644 --- a/level_0/f_utf/c/private-utf.h +++ b/level_0/f_utf/c/private-utf.h @@ -175,8 +175,8 @@ extern "C" { * The number of bytes repesenting the character width. * * @return - * F_true if a UTF-8 control picture character. - * F_false if not a UTF-8 control picture character. + * F_true if a UTF-8 combining character. + * F_false if not a UTF-8 combining character. 
* * F_utf (with error bit) if character is an invalid UTF-8 character. * @@ -211,6 +211,52 @@ extern "C" { #endif // !defined(_di_f_utf_character_is_control_) || !defined(_di_f_utf_is_control_) /** + * Private implementation of f_utf_character_is_control_code(). + * + * Intended to be shared to each of the different implementation variations. + * + * @param character + * The character to validate. + * @param width + * The number of bytes representing the character width. + * + * @return + * F_true if a UTF-8 control code character. + * F_false if not a UTF-8 control code character. + * + * F_utf (with error bit) if character is an invalid UTF-8 character. + * + * @see f_utf_character_is_control_code() + * @see f_utf_is_control_code() + */ +#if !defined(_di_f_utf_character_is_control_code_) || !defined(_di_f_utf_is_control_code_) + extern f_status_t private_f_utf_character_is_control_code(const f_utf_character_t character, const uint8_t width) F_attribute_visibility_internal_d; +#endif // !defined(_di_f_utf_character_is_control_code_) || !defined(_di_f_utf_is_control_code_) + +/** + * Private implementation of f_utf_character_is_control_format(). + * + * Intended to be shared to each of the different implementation variations. + * + * @param character + * The character to validate. + * @param width + * The number of bytes representing the character width. + * + * @return + * F_true if a UTF-8 control format character. + * F_false if not a UTF-8 control format character. + * + * F_utf (with error bit) if character is an invalid UTF-8 character. 
+ * + * @see f_utf_character_is_control_format() + * @see f_utf_is_control_format() + */ +#if !defined(_di_f_utf_character_is_control_format_) || !defined(_di_f_utf_is_control_format_) + extern f_status_t private_f_utf_character_is_control_format(const f_utf_character_t character, const uint8_t width) F_attribute_visibility_internal_d; +#endif // !defined(_di_f_utf_character_is_control_format_) || !defined(_di_f_utf_is_control_format_) + +/** * Private implementation of f_utf_character_is_control_picture(). * * Intended to be shared to each of the different implementation variations. diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 6081392..9c32813 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -229,6 +229,46 @@ extern "C" { } #endif // _di_f_utf_character_is_control_ +#ifndef _di_f_utf_character_is_control_code_ + f_status_t f_utf_character_is_control_code(const f_utf_character_t character) { + + const uint8_t width = macro_f_utf_character_t_width_is(character); + + if (!width) { + if (iscntrl(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } + + if (width == 1) { + return F_status_set_error(F_utf); + } + + return private_f_utf_character_is_control_code(character, width); + } +#endif // _di_f_utf_character_is_control_code_ + +#ifndef _di_f_utf_character_is_control_format_ + f_status_t f_utf_character_is_control_format(const f_utf_character_t character) { + + const uint8_t width = macro_f_utf_character_t_width_is(character); + + if (!width) { + + // There are no control format characters in ASCII. 
+ return F_false; + } + + if (width == 1) { + return F_status_set_error(F_utf); + } + + return private_f_utf_character_is_control_format(character, width); + } +#endif // _di_f_utf_character_is_control_format_ + #ifndef _di_f_utf_character_is_control_picture_ f_status_t f_utf_character_is_control_picture(const f_utf_character_t character) { @@ -1008,6 +1048,63 @@ extern "C" { } #endif // _di_f_utf_is_control_ +#ifndef _di_f_utf_is_control_code_ + f_status_t f_utf_is_control_code(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + const uint8_t width = macro_f_utf_byte_width_is(*character); + + if (!width) { + if (iscntrl(*character)) { + return F_true; + } + + return F_false; + } + + if (width == 1) { + return F_status_set_error(F_complete_not_utf); + } + + f_utf_character_t character_utf = 0; + + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + + return private_f_utf_character_is_control_code(character_utf, width); + } +#endif // _di_f_utf_is_control_code_ + +#ifndef _di_f_utf_is_control_format_ + f_status_t f_utf_is_control_format(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + const uint8_t width = macro_f_utf_byte_width_is(*character); + + // There are no ASCII control formats. 
+ if (!width) { + return F_false; + } + + if (width == 1) { + return F_status_set_error(F_complete_not_utf); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_control_format(character_utf, width); + } +#endif // _di_f_utf_is_control_format_ + #ifndef _di_f_utf_is_control_picture_ f_status_t f_utf_is_control_picture(const f_string_t character, const f_array_length_t width_max) { #ifndef _di_level_0_parameter_checking_ @@ -1025,6 +1122,10 @@ extern "C" { return F_status_is_error(F_complete_not_utf); } + if (width != 3) { + return F_false; + } + f_utf_character_t character_utf = 0; { diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 5d95774..1a0b480 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -224,8 +224,8 @@ extern "C" { * The character to validate. * * @return - * F_true if a UTF-8 control picture character. - * F_false if not a UTF-8 control picture character. + * F_true if a UTF-8 combining character. + * F_false if not a UTF-8 combining character. * * F_utf (with error bit) if character is an invalid UTF-8 character. */ @@ -236,6 +236,8 @@ extern "C" { /** * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. * + * This includes control code and control format characters. + * * @param character * The character to validate. * @@ -252,6 +254,45 @@ extern "C" { #endif // _di_f_utf_character_is_control_ /** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 control code character. + * + * Control Code characters are the traditional control characters, such as "\n" as well as some newer Unicode ones. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 control code character. + * F_false if not a UTF-8 control code character. 
+ * + * F_utf (with error bit) if character is an invalid UTF-8 character. + * + * @see iscntrl() + */ +#ifndef _di_f_utf_character_is_control_code_ + extern f_status_t f_utf_character_is_control_code(const f_utf_character_t character); +#endif // _di_f_utf_character_is_control_code_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 control format character. + * + * Control Format characters are special characters used for formatting. + * These are considered control characters. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 control format character. + * F_false if not a UTF-8 control format character. + * + * F_utf (with error bit) if character is an invalid UTF-8 character. + */ +#ifndef _di_f_utf_character_is_control_format_ + extern f_status_t f_utf_character_is_control_format(const f_utf_character_t character); +#endif // _di_f_utf_character_is_control_format_ + +/** * Check to see if the entire byte block of the character is a UTF-8 control picture character. * * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters. @@ -876,8 +917,8 @@ extern "C" { * Can be anything greater than 0. * * @return - * F_true if a UTF-8 control picture character. - * F_false if not a UTF-8 control picture character. + * F_true if a UTF-8 combining character. + * F_false if not a UTF-8 combining character. * * F_complete_not_utf (with error bit) if character is an incomplete UTF-8 fragment. */ @@ -888,6 +929,8 @@ extern "C" { /** * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. * + * This includes control code and control format characters. + * * @param character * The character to validate. * There must be enough space allocated to compare against, as limited by width_max. 
@@ -908,6 +951,51 @@ extern "C" { #endif // _di_f_utf_is_control_ /** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 control code character. + * + * Control Code characters are the traditional control characters, such as "\n" as well as some newer Unicode ones. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 control code character. + * F_false if not a UTF-8 control code character. + * + * F_complete_not_utf (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_control_code_ + extern f_status_t f_utf_is_control_code(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_control_code_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 control format character. + * + * Control Format characters are special characters used for formatting. + * These are considered control characters. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 control format character. + * F_false if not a UTF-8 control format character. + * + * F_complete_not_utf (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_control_format_ + extern f_status_t f_utf_is_control_format(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_control_format_ + +/** * Check to see if the entire byte block of the character is a UTF-8 control picture character. 
* * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters. -- 1.8.3.1