From: Kevin Day Date: Mon, 29 Jun 2020 03:00:15 +0000 (-0500) Subject: Bugfix: f_utf_is_whitespace_modifier() is incomplete. X-Git-Tag: 0.5.0~116 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=55371cee2e9dd0f3795518e096411c0f0dc66121;p=fll Bugfix: f_utf_is_whitespace_modifier() is incomplete. Remove "@see iscntrl()" which appears all over the place, incorrectly. Make sure appropriate "@see" uses exist. Remove unnecessary "== F_true" tests. Implement is_punctuation and is_symbol discovery for ASCII characters. Some function declarations are missing. Some function implementations are missing. Remove duplicate declaration of f_utf_character_is_word(). There are no whitespace modifiers in ASCII to my knowledge, and isdigit() is clearly from a copy and paste that was never completed. Add notes about phonetic whitespaces and improve handling. --- diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h index 9c9b581..d09dba9 100644 --- a/level_0/f_utf/c/private-utf.h +++ b/level_0/f_utf/c/private-utf.h @@ -32,7 +32,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_alpha() * @see f_utf_is_alpha() */ @@ -55,7 +54,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_control() * @see f_utf_is_control() */ @@ -100,7 +98,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_control() * @see f_utf_is_control() */ @@ -145,7 +142,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. 
* - * @see iscntrl() * @see f_utf_character_is_emoji() * @see f_utf_is_emoji() */ @@ -168,7 +164,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_numeric() * @see f_utf_is_numeric() */ @@ -191,7 +186,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_punctuation() * @see f_utf_is_punctuation() */ @@ -214,7 +208,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_symbol() * @see f_utf_is_symbol() */ @@ -277,11 +270,10 @@ extern "C" { * The number of bytes repesenting the character width. * * @return - * F_true if a UTF-8 control character. - * F_false if not a UTF-8 control character. + * F_true if a UTF-8 phonetic whitespace. + * F_false if not a UTF-8 phonetic whitespace. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_whitespace_modifier() * @see f_utf_is_whitespace_modifier() */ @@ -304,7 +296,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_word() * @see f_utf_is_word() */ @@ -327,7 +318,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_character_is_word_dash() * @see f_utf_is_word_dash() */ @@ -350,7 +340,6 @@ extern "C" { * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. 
* - * @see iscntrl() * @see f_utf_character_is_word_dash() * @see f_utf_is_word_dash() */ diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 25d1ee9..bda828d 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -242,15 +242,15 @@ extern "C" { return F_status_is_error(F_utf); } - if (private_f_utf_character_is_control(character, width) == F_true) { + if (private_f_utf_character_is_control(character, width)) { return F_false; } - if (private_f_utf_character_is_whitespace(character, width) == F_true) { + if (private_f_utf_character_is_whitespace(character, width)) { return F_false; } - if (private_f_utf_character_is_zero_width(character, width) == F_true) { + if (private_f_utf_character_is_zero_width(character, width)) { return F_false; } @@ -283,7 +283,33 @@ extern "C" { unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { - if (isdigit(f_macro_utf_character_to_char_1(character))) { + // ASCII: '!' to '#'. + if (character > 0x20000000 && character < 0x24000000) { + return F_true; + } + + // ASCII: '%' to '*'. + if (character > 0x24000000 && character < 0x2b000000) { + return F_true; + } + + // ASCII: ',' to '/'. + if (character > 0x2b000000 && character < 0x30000000) { + return F_true; + } + + // ASCII: ':', ';', '?', or '@'. + if (character == 0x3a000000 || character == 0x3b000000 || character == 0x3f000000 || character == 0x40000000) { + return F_true; + } + + // ASCII: '[' to ']'. + if (character > 0x5a000000 && character < 0x5e000000) { + return F_true; + } + + // ASCII: '_', '{', or '}'. + if (character == 0x5f000000 || character == 0x7b000000 || character == 0x7d000000) { return F_true; } @@ -298,6 +324,37 @@ extern "C" { } #endif // _di_f_utf_character_is_punctuation_ +#ifndef _di_f_utf_character_is_symbol_ + f_return_status f_utf_character_is_symbol(const f_utf_character character) { + unsigned short width = f_macro_utf_character_width_is(character); + + if (width == 0) { + // ASCII: '$' or '+'.
+ if (character == 0x24000000 || character == 0x2b000000) { + return F_true; + } + + // ASCII: '<' to '>'. + if (character > 0x3b000000 && character < 0x3f000000) { + return F_true; + } + + // ASCII: '^', '`', '|', or '~'. + if (character == 0x5e000000 || character == 0x60000000 || character == 0x7c000000 || character == 0x7e000000) { + return F_true; + } + + return F_false; + } + + if (width == 1) { + return F_status_set_error(F_utf); + } + + return private_f_utf_character_is_symbol(character, width); + } +#endif // _di_f_utf_character_is_symbol_ + #ifndef _di_f_utf_character_is_valid_ f_return_status f_utf_character_is_valid(const f_utf_character character) { unsigned short width = f_macro_utf_character_width_is(character); @@ -330,6 +387,23 @@ extern "C" { } #endif // _di_f_utf_character_is_whitespace_ +#ifndef _di_f_utf_character_is_whitespace_modifier_ + f_return_status f_utf_character_is_whitespace_modifier(const f_utf_character character) { + unsigned short width = f_macro_utf_character_width_is(character); + + if (width == 0) { + // There are no ASCII whitespace modifiers. + return F_false; + } + + if (width == 1) { + return F_status_set_error(F_utf); + } + + return private_f_utf_character_is_whitespace_modifier(character, width); + } +#endif // _di_f_utf_character_is_whitespace_modifier_ + #ifndef _di_f_utf_character_is_word_ f_return_status f_utf_character_is_word(const f_utf_character character) { unsigned short width = f_macro_utf_character_width_is(character); @@ -755,16 +829,16 @@ extern "C" { if (status != F_none) return status; } - if (private_f_utf_character_is_control(character_utf, width) == F_true) { + if (private_f_utf_character_is_control(character_utf, width)) { return F_false; } - if (private_f_utf_character_is_whitespace(character_utf, width) == F_true) { + if (private_f_utf_character_is_whitespace(character_utf, width)) { return F_false; } // This test is in isolation so zero-width characters must be treated as a non-graph.
- if (private_f_utf_character_is_zero_width(character_utf, width) == F_true) { + if (private_f_utf_character_is_zero_width(character_utf, width)) { return F_false; } @@ -815,7 +889,33 @@ extern "C" { uint8_t width = f_macro_utf_byte_width_is(*character); if (width == 0) { - if (isdigit(*character)) { + // ASCII: '!' to '#'. + if (character[0] > 0x20 && character[0] < 0x24) { + return F_true; + } + + // ASCII: '%' to '*'. + if (character[0] > 0x24 && character[0] < 0x2b) { + return F_true; + } + + // ASCII: ',' to '/'. + if (character[0] > 0x2b && character[0] < 0x30) { + return F_true; + } + + // ASCII: ':', ';', '?', or '@'. + if (character[0] == 0x3a || character[0] == 0x3b || character[0] == 0x3f || character[0] == 0x40) { + return F_true; + } + + // ASCII: '[' to ']'. + if (character[0] > 0x5a && character[0] < 0x5e) { + return F_true; + } + + // ASCII: '_', '{', or '}'. + if (character[0] == 0x5f || character[0] == 0x7b || character[0] == 0x7d) { return F_true; } @@ -840,6 +940,51 @@ extern "C" { } #endif // _di_f_utf_is_punctuation_ +#ifndef _di_f_utf_is_symbol_ + f_return_status f_utf_is_symbol(const f_string character, const f_string_length width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + uint8_t width = f_macro_utf_byte_width_is(*character); + + if (width == 0) { + // ASCII: '$' or '+'. + if (character[0] == 0x24 || character[0] == 0x2b) { + return F_true; + } + + // ASCII: '<' to '>'. + if (character[0] > 0x3b && character[0] < 0x3f) { + return F_true; + } + + // ASCII: '^', '`', '|', or '~'.
+ if (character[0] == 0x5e || character[0] == 0x60 || character[0] == 0x7c || character[0] == 0x7e) { + return F_true; + } + + return F_false; + } + + if (width == 1) { + return F_status_set_error(F_incomplete_utf); + } + + f_utf_character character_utf = 0; + + { + f_status status = 0; + + status = f_utf_char_to_character(character, width_max, &character_utf); + + if (status != F_none) return status; + } + + return private_f_utf_character_is_symbol(character_utf, width); + } +#endif // _di_f_utf_is_symbol_ + #ifndef _di_f_utf_is_valid_ f_return_status f_utf_is_valid(const f_string character, const f_string_length width_max) { #ifndef _di_level_0_parameter_checking_ @@ -909,10 +1054,7 @@ extern "C" { uint8_t width = f_macro_utf_byte_width_is(*character); if (width == 0) { - if (isdigit(*character)) { - return F_true; - } - + // There are no ASCII whitespace modifiers. return F_false; } diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 4f28179..443eef9 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -142,7 +142,7 @@ extern "C" { * F_false if not a UTF-8 alphabet character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() + * @see isalpha() */ #ifndef _di_f_utf_character_is_alpha_ extern f_return_status f_utf_character_is_alpha(const f_utf_character character); @@ -159,7 +159,7 @@ extern "C" { * F_false if not a UTF-8 alpha-numeric character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() + * @see isalnum() */ #ifndef _di_f_utf_character_is_alpha_numeric_ extern f_return_status f_utf_character_is_alpha_numeric(const f_utf_character character); @@ -226,8 +226,6 @@ extern "C" { * F_true if a UTF-8 emoji character. * F_false if not a UTF-8 emoji character. * F_utf (with error bit) if character is an invalid UTF-8 character.
- * - * @see iscntrl() */ #ifndef _di_f_utf_character_is_emoji_ extern f_return_status f_utf_character_is_emoji(const f_utf_character character); @@ -296,7 +294,7 @@ extern "C" { * F_false if not a UTF-8 numeric character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() + * @see isdigit() * @see f_utf_is_numeric() */ #ifndef _di_f_utf_character_is_numeric_ @@ -316,7 +314,6 @@ extern "C" { * F_false if not a UTF-8 punctuation character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_is_punctuation() */ #ifndef _di_f_utf_character_is_punctuation_ @@ -336,7 +333,6 @@ extern "C" { * F_false if not a UTF-8 symbol character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() * @see f_utf_is_symbol() */ #ifndef _di_f_utf_character_is_symbol_ @@ -363,7 +359,6 @@ extern "C" { * * @see f_utf_character_is() * @see f_utf_character_is_fragment() - * @see f_utf_is_valid() */ #ifndef _di_f_utf_character_is_valid_ extern f_return_status f_utf_character_is_valid(const f_utf_character character); @@ -374,6 +369,10 @@ extern "C" { * * Non-printing or zero-width characters are not considered whitespace. * This does include line separators like '\n'. + * This does not include phonetic spaces, like whitespace modifiers. + * + * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. + * However, because they are not rendered as whitespace, they are technically not whitespace. * * @param character * The character to validate. @@ -383,7 +382,7 @@ extern "C" { * F_false if not a UTF-8 whitespace. * F_utf (with error bit) if character is an invalid UTF-8 character.
* - * @see f_utf_is_whitespace() + * @see isspace() */ #ifndef _di_f_utf_character_is_whitespace_ extern f_return_status f_utf_character_is_whitespace(const f_utf_character character); @@ -392,6 +391,11 @@ extern "C" { /** * Check to see if the entire byte block of the character is an ASCII or UTF-8 whitespace modifier character. * + * These are phonetic spaces. + * + * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. + * Therefore, these are valid spaces in the technical sense, even if they are not visibly whitespace. + * * @param character * The character to validate. * @@ -399,9 +403,6 @@ extern "C" { * F_true if a UTF-8 modifier character. * F_false if not a UTF-8 modifier character. * F_utf (with error bit) if character is an invalid UTF-8 character. - * - * @see iscntrl() - * @see f_utf_is_whitespace_modifier() */ #ifndef _di_f_utf_character_is_whitespace_modifier_ extern f_return_status f_utf_character_is_whitespace_modifier(const f_utf_character character); @@ -422,7 +423,7 @@ extern "C" { * F_false if not a UTF-8 word character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() + * @see isalnum() * @see f_utf_is_word() */ #ifndef _di_f_utf_character_is_word_ @@ -444,7 +445,7 @@ extern "C" { * F_false if not a UTF-8 word or dash character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() + * @see isalnum() * @see f_utf_is_word_dash() */ #ifndef _di_f_utf_character_is_word_dash_ @@ -466,7 +467,7 @@ extern "C" { * F_false if not a UTF-8 word or dash character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see iscntrl() + * @see isalnum() * @see f_utf_is_word_dash() */ #ifndef _di_f_utf_character_is_word_dash_plus_ @@ -485,36 +486,12 @@ extern "C" { * F_true if a UTF-8 non-printing or zero-width character. * F_false if not a UTF-8 non-printing or zero-width character. 
* F_utf (with error bit) if character is an invalid UTF-8 character. - * - * @see f_utf_is_zero_width() */ #ifndef _di_f_utf_character_is_zero_width_ extern f_return_status f_utf_character_is_zero_width(const f_utf_character character); #endif // _di_f_utf_character_is_zero_width_ /** - * Check to see if the entire byte block of the character is an word character. - * - * A word character is alphanumeric or underscore '_'. - * - * @todo Incomplete, UTF-8 codes not yet checked! - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 alpha character. - * F_false if not a UTF-8 alpha character. - * F_utf (with error bit) if character is an invalid UTF-8 character. - * - * @see iscntrl() - * @see f_utf_is_word() - */ -#ifndef _di_f_utf_character_is_word_ - extern f_return_status f_utf_character_is_word(const f_utf_character character); -#endif // _di_f_utf_character_is_word_ - -/** * Convert a specialized f_utf_character type to a int8_t, stored as a string (character buffer). * * This will also convert ASCII characters stored in the utf_character array. @@ -592,7 +569,7 @@ extern "C" { * F_false if not a UTF-8 alphabet character. * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * - * @see iscntrl() + * @see isalpha() */ #ifndef _di_f_utf_is_alpha_ extern f_return_status f_utf_is_alpha(const f_string character, const f_string_length width_max); @@ -613,7 +590,7 @@ extern "C" { * F_false if not a UTF-8 alpha-numeric character.x * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * - * @see iscntrl() + * @see isalnum() */ #ifndef _di_f_utf_is_alpha_numeric_ extern f_return_status f_utf_is_alpha_numeric(const f_string character, const f_string_length width_max); @@ -654,7 +631,6 @@ extern "C" { * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. 
* * @see iscntrl() - * @see f_utf_character_is_control() */ #ifndef _di_f_utf_is_control_ extern f_return_status f_utf_is_control(const f_string character, const f_string_length width_max); @@ -697,8 +673,6 @@ extern "C" { * F_true if a UTF-8 emoji character. * F_false if not a UTF-8 emoji character. * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see iscntrl() */ #ifndef _di_f_utf_is_emoji_ extern f_return_status f_utf_is_emoji(const f_string character, const f_string_length width_max); @@ -756,7 +730,6 @@ extern "C" { * F_parameter (with error bit) if a parameter is invalid. * * @see isgraph() - * @see iscntrl() */ #ifndef _di_f_utf_is_graph_ extern f_return_status f_utf_is_graph(const f_string character, const f_string_length width_max); @@ -777,7 +750,7 @@ extern "C" { * F_false if not a UTF-8 numeric character. * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * - * @see iscntrl() + * @see isdigit() */ #ifndef _di_f_utf_is_numeric_ extern f_return_status f_utf_is_numeric(const f_string character, const f_string_length width_max); @@ -799,8 +772,6 @@ extern "C" { * F_true if a UTF-8 punctuation character. * F_false if not a UTF-8 punctuation character. * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see iscntrl() */ #ifndef _di_f_utf_is_punctuation_ extern f_return_status f_utf_is_punctuation(const f_string character, const f_string_length width_max); @@ -822,8 +793,6 @@ extern "C" { * F_true if a UTF-8 symbol character. * F_false if not a UTF-8 symbol character. * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see iscntrl() */ #ifndef _di_f_utf_is_symbol_ extern f_return_status f_utf_is_symbol(const f_string character, const f_string_length width_max); @@ -858,6 +827,13 @@ extern "C" { /** * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space character. 
* + * Non-printing or zero-width characters are not considered whitespace. + * This does include line separators like '\n'. + * This does not include phonetic spaces, like whitespace modifiers. + * + * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. + * However, because they are not rendered as whitespace, they are technically not whitespace. + * * @param character * The character to validate. * There must be enough space allocated to compare against, as limited by width_max. @@ -873,13 +849,38 @@ extern "C" { * F_parameter (with error bit) if a parameter is invalid. * * @see isspace() - * @see iscntrl() */ #ifndef _di_f_utf_is_whitespace_ extern f_return_status f_utf_is_whitespace(const f_string character, const f_string_length width_max); #endif // _di_f_utf_is_whitespace_ /** + * Check to see if the entire byte block of the character is a UTF-8 whitespace modifier character. + * + * These are phonetic spaces. + * + * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. + * Therefore, these are valid spaces in the technical sense, even if they are not visibly whitespace. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 whitespace modifier. + * F_false if not a UTF-8 whitespace modifier. + * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. + * F_maybe (with error bit) if this could be a whitespace modifier but width is not long enough. + * F_parameter (with error bit) if a parameter is invalid.
+ */ +#ifndef _di_f_utf_is_whitespace_modifier_ + extern f_return_status f_utf_is_whitespace_modifier(const f_string character, const f_string_length width_max); +#endif // _di_f_utf_is_whitespace_modifier_ + +/** * Check to see if the entire byte block of the character is an ASCII or UTF-8 word character. * * A word character is alpha-numeric or an underscore '_'. @@ -898,7 +899,7 @@ extern "C" { * F_false if not a UTF-8 word character. * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * - * @see iscntrl() + * @see isalnum() */ #ifndef _di_f_utf_is_word_ extern f_return_status f_utf_is_word(const f_string character, const f_string_length width_max); @@ -923,7 +924,7 @@ extern "C" { * F_false if not a UTF-8 word or dash character. * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * - * @see iscntrl() + * @see isalnum() */ #ifndef _di_f_utf_is_word_dash_ extern f_return_status f_utf_is_word_dash(const f_string character, const f_string_length width_max); @@ -948,7 +949,7 @@ extern "C" { * F_false if not a UTF-8 word or dash character. * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * - * @see iscntrl() + * @see isalnum() */ #ifndef _di_f_utf_is_word_dash_plus_ extern f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max); @@ -972,9 +973,6 @@ extern "C" { * F_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * F_maybe (with error bit) if this could be a whitespace but width is not long enough. * F_parameter (with error bit) if a parameter is invalid. - * - * @see isspace() - * @see iscntrl() */ #ifndef _di_f_utf_is_zero_width_ extern f_return_status f_utf_is_zero_width(const f_string character, const f_string_length width_max);