From e416f05addf8a742a3ae312f24ecde88a2651337 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sat, 2 May 2020 00:10:23 -0500 Subject: [PATCH] Feature: add word, dash, and plus UTF-8 character checker The use of the '+' operator is more common than in the past. Add support for detecting if a character is a word character, a dash character, or a plus character. --- level_0/f_utf/c/private-utf.c | 9 ++++++++ level_0/f_utf/c/private-utf.h | 23 ++++++++++++++++++ level_0/f_utf/c/utf.c | 54 +++++++++++++++++++++++++++++++++++++++++++ level_0/f_utf/c/utf.h | 48 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 134 insertions(+) diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c index 752a77b..0529a99 100644 --- a/level_0/f_utf/c/private-utf.c +++ b/level_0/f_utf/c/private-utf.c @@ -2470,6 +2470,15 @@ extern "C" { } #endif // !defined(_di_f_utf_character_is_word_dash_) || !defined(_di_f_utf_is_word_dash_) +#if !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_) + f_return_status private_f_utf_character_is_word_dash_plus(const f_utf_character character, const uint8_t width) { + + // @todo: handle all Unicode "word_dash_plus"; until implemented, every character passed here is reported as f_false. + + return f_false; + } +#endif // !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_) + #if !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_) f_return_status private_f_utf_character_is_zero_width(const f_utf_character character) { // reduce the number of checks by grouping checks by first byte. diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h index e89c277..8a2afb4 100644 --- a/level_0/f_utf/c/private-utf.h +++ b/level_0/f_utf/c/private-utf.h @@ -218,6 +218,29 @@ extern "C" { #endif // !defined(_di_f_utf_character_is_word_dash_) || !defined(_di_f_utf_is_word_dash_) /** + * Private implementation of f_utf_character_is_word_dash_plus(). 
+ * + * Intended to be shared to each of the different implementation variations. + * + * @param character + * The character to validate. + * @param width + * The number of bytes representing the character width. + * + * @return + * f_true if a UTF-8 word, dash, or plus character. + * f_false if not a UTF-8 word, dash, or plus character. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * + * @see isalnum() + * @see f_utf_character_is_word_dash_plus() + * @see f_utf_is_word_dash_plus() + */ +#if !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_) + extern f_return_status private_f_utf_character_is_word_dash_plus(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal; +#endif // !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_) + +/** * Private implementation of f_utf_character_is_zero_width(). * * Intended to be shared to each of the different implementation variations. diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 5573aea..3e82b6c 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -237,6 +237,26 @@ extern "C" { } #endif // _di_f_utf_character_is_word_dash_ +#ifndef _di_f_utf_character_is_word_dash_plus_ + f_return_status f_utf_character_is_word_dash_plus(const f_utf_character character) { + unsigned short width = f_macro_utf_character_width_is(character); + + if (width == 0) { + if (isalnum(f_macro_utf_character_to_char_1(character)) || character == '_' || character == '-' || character == '+') { + return f_true; + } + + return f_false; + } + + if (width == 1) { + return f_status_set_error(f_invalid_utf); + } + + return private_f_utf_character_is_word_dash_plus(character, width); + } +#endif // _di_f_utf_character_is_word_dash_plus_ + #ifndef _di_f_utf_character_is_zero_width_ f_return_status f_utf_character_is_zero_width(const f_utf_character character) { if (f_macro_utf_character_width_is(character) == 1) { @@ -678,6 
+698,40 @@ extern "C" { } #endif // _di_f_utf_is_word_dash_ +#ifndef _di_f_utf_is_word_dash_plus_ + f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + uint8_t width = f_macro_utf_byte_width_is(*character); + + if (width == 0) { + if (isalnum(*character) || *character == '_' || *character == '-' || *character == '+') { + return f_true; + } + + return f_false; + } + + if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } + + f_utf_character character_utf = 0; + + { + f_status status = 0; + + status = f_utf_char_to_character(character, width_max, &character_utf); + + if (status != f_none) return status; + } + + return private_f_utf_character_is_word_dash_plus(character_utf, width); + } +#endif // _di_f_utf_is_word_dash_plus_ + #ifndef _di_f_utf_is_zero_width_ f_return_status f_utf_is_zero_width(const f_string character, const f_string_length width_max) { #ifndef _di_level_0_parameter_checking_ diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 1acf354..a4f9ad0 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -749,6 +749,28 @@ extern "C" { #endif // _di_f_utf_character_is_word_dash_ /** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character. + * + * A word, dash, or plus character is alpha-numeric, an underscore '_', a dash '-', or a plus '+'. + * + * @todo Incomplete, UTF-8 codes not yet checked! + * + * @param character + * The character to validate. + * + * @return + * f_true if a UTF-8 word, dash, or plus character. + * f_false if not a UTF-8 word, dash, or plus character. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. 
+ * + * @see isalnum() + * @see f_utf_is_word_dash_plus() + */ +#ifndef _di_f_utf_character_is_word_dash_plus_ + extern f_return_status f_utf_character_is_word_dash_plus(const f_utf_character character); +#endif // _di_f_utf_character_is_word_dash_plus_ + +/** * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character. * * Only characters that do not print, which are generally called zero-width. @@ -1143,6 +1165,32 @@ extern "C" { #endif // _di_f_utf_is_word_dash_ /** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character. + * + * A word, dash, or plus character is alpha-numeric, an underscore '_', a dash '-', or a plus '+'. + * + * @todo Incomplete, UTF-8 codes not yet checked! + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * f_true if a UTF-8 word, dash, or plus character. + * f_false if not a UTF-8 word, dash, or plus character. + * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + * @see f_utf_character_is_word_dash_plus() + */ +#ifndef _di_f_utf_is_word_dash_plus_ + extern f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max); +#endif // _di_f_utf_is_word_dash_plus_ + +/** * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character. * * Only characters that do not print, which are generally called zero-width. -- 1.8.3.1