From 8b5d185d80fda982ad26ec2e50b5c3771d34b25a Mon Sep 17 00:00:00 2001
From: Kevin Day
Date: Mon, 6 Dec 2021 21:51:02 -0600
Subject: [PATCH] Update: Add missing function in f_utf needed for completeness and reduce repeated code.

As per my completeness principle, the f_utf_unicode_string_to() must have the f_utf_character_unicode_string_to() complement.
This function only allows for ASCII characters to represent the number and returns errors as appropriate for non-ASCII values.
Unicode number values are not treated as the ASCII numbers for representing a Unicode code sequence.

The f_utf_character_unicode_to() and f_utf_unicode_to() now have their code reduced by utilizing private_f_utf_character_unicode_to().
---
 level_0/f_utf/c/private-utf.c |  41 ++++++++++++
 level_0/f_utf/c/private-utf.h |  27 ++++++++
 level_0/f_utf/c/utf.c         | 149 ++++++++++++++++++++++--------------------
 level_0/f_utf/c/utf.h         |  29 +++++++-
 4 files changed, 174 insertions(+), 72 deletions(-)

diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c
index 1af2b53..ff15202 100644
--- a/level_0/f_utf/c/private-utf.c
+++ b/level_0/f_utf/c/private-utf.c
@@ -3596,6 +3596,47 @@ extern "C" {
   }
 #endif // !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_)
 
+#if !defined(_di_f_utf_unicode_to_) || !defined(_di_f_utf_character_unicode_to_)
+  f_status_t private_f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode) {
+
+    if (macro_f_utf_character_t_width_is(character) == 1) {
+      return F_status_set_error(F_utf_fragment);
+    }
+
+    if (private_f_utf_character_is_valid(character) == F_false) {
+      return F_status_set_error(F_utf);
+    }
+
+    // U+0000 -> U+007F (ASCII).
+    if (macro_f_utf_character_t_width(character) == 1) {
+      *unicode = macro_f_utf_character_t_to_char_1(character) & 0x7f;
+    }
+
+    // U+0080 -> U+07FF.
+    else if (macro_f_utf_character_t_width(character) == 2) {
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x1f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_2(character) & 0x3f;
+    }
+
+    // U+0800 -> U+FFFF.
+    else if (macro_f_utf_character_t_width(character) == 3) {
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0xf) << 12;
+      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_3(character) & 0x3f;
+    }
+
+    // U+10000 -> U+10FFFF.
+    else if (macro_f_utf_character_t_width(character) == 4) {
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x7) << 18;
+      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 12;
+      *unicode |= (macro_f_utf_character_t_to_char_3(character) & 0x3f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_4(character) & 0x3f;
+    }
+
+    return F_none;
+  }
+#endif // !defined(_di_f_utf_unicode_to_) || !defined(_di_f_utf_character_unicode_to_)
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h
index 5b3336c..a2dbf7d 100644
--- a/level_0/f_utf/c/private-utf.h
+++ b/level_0/f_utf/c/private-utf.h
@@ -633,6 +633,33 @@ extern "C" {
   extern f_status_t private_f_utf_character_is_zero_width(const f_utf_character_t character) F_attribute_visibility_internal_d;
 #endif // !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_)
 
+/**
+ * Private implementation of f_utf_character_unicode_to().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The (UTF-8) character to convert to the Unicode representation.
+ *   The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged.
+ * @param unicode
+ *   A 32-bit integer representing the Unicode (such as U+0001).
+ *   Does not need to be interpreted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF).
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_failure (with error bit) if width is not long enough to convert.
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_utf (with error bit) if unicode is an invalid Unicode character.
+ *   F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment.
+ *
+ * @see f_utf_character_unicode_to()
+ * @see f_utf_unicode_to()
+ */
+#if !defined(_di_f_utf_character_unicode_to_) || !defined(_di_f_utf_unicode_to_)
+  extern f_status_t private_f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode) F_attribute_visibility_internal_d;
+#endif // !defined(_di_f_utf_character_unicode_to_) || !defined(_di_f_utf_unicode_to_)
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c
index 9122fa7..80a9e8a 100644
--- a/level_0/f_utf/c/utf.c
+++ b/level_0/f_utf/c/utf.c
@@ -720,41 +720,7 @@ extern "C" {
       if (!unicode) return F_status_set_error(F_parameter);
     #endif // _di_level_0_parameter_checking_
 
-    // ASCII.
-    if (!macro_f_utf_character_t_width_is(character)) {
-      *unicode = macro_f_utf_character_t_to_char_1(character) & 0x7f;
-    }
-
-    if (macro_f_utf_character_t_width_is(character) == 1) {
-      return F_status_set_error(F_utf_fragment);
-    }
-
-    if (private_f_utf_character_is_valid(character) == F_false) {
-      return F_status_set_error(F_utf);
-    }
-
-    // U+0080 -> U+07FF.
-    if (macro_f_utf_character_t_width_is(character) == 2) {
-      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x1f) << 6;
-      *unicode |= macro_f_utf_character_t_to_char_2(character) & 0x3f;
-    }
-
-    // U+0800 -> U+FFFF.
-    else if (macro_f_utf_character_t_width_is(character) == 3) {
-      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0xf) << 12;
-      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6;
-      *unicode |= macro_f_utf_character_t_to_char_3(character) & 0x3f;
-    }
-
-    // U+10000 -> U+10FFFF.
-    else if (macro_f_utf_character_t_width_is(character) == 4) {
-      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x7) << 18;
-      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 12;
-      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6;
-      *unicode |= macro_f_utf_character_t_to_char_4(character) & 0x3f;
-    }
-
-    return F_none;
+    return private_f_utf_character_unicode_to(character, unicode);
   }
 #endif // _di_f_utf_character_unicode_to_
 
@@ -801,6 +767,81 @@ extern "C" {
   }
 #endif // _di_f_utf_character_unicode_from_
 
+#ifndef _di_f_utf_character_unicode_string_to_
+  f_status_t f_utf_character_unicode_string_to(const f_utf_string_t string, const f_array_length_t length, uint32_t *unicode) {
+    #ifndef _di_level_0_parameter_checking_
+      if (!string) return F_status_set_error(F_parameter);
+      if (!unicode) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    f_array_length_t i = 0;
+
+    while (i < length && !string[i]) {
+      ++i;
+    } // while
+
+    if (i < length) {
+      if (macro_f_utf_character_t_width_is(string[i])) {
+        i = length;
+      }
+      else {
+        if (macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_u_s[0] || macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_U_s[0]) {
+          do {
+            ++i;
+          } while (i < length && !string[i]);
+
+          if (i < length && !macro_f_utf_character_t_width_is(string[i]) && macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_plus_s[0]) {
+            ++i;
+          }
+          else {
+            i = length;
+          }
+        }
+        else {
+          i = length;
+        }
+      }
+    }
+
+    if (i == length) {
+      return F_status_set_error(F_valid_not);
+    }
+
+    uint32_t value = 0;
+    uint8_t character = 0;
+
+    for (; i < length; ++i) {
+
+      if (!string[i]) continue;
+
+      // Only ASCII characters are allowed to represent the Unicode sequence.
+      if (macro_f_utf_character_t_width_is(string[i])) {
+        return F_status_set_error(F_valid_not);
+      }
+
+      value *= 16;
+      character = macro_f_utf_character_t_to_char_1(string[i]);
+
+      if (character > 0x2f && character < 0x3a) {
+        value += character - 0x30;
+      }
+      else if (character > 0x40 && character < 0x47) {
+        value += (character - 0x41) + 10;
+      }
+      else if (character > 0x60 && character < 0x67) {
+        value += (character - 0x61) + 10;
+      }
+      else {
+        return F_status_set_error(F_valid_not);
+      }
+    } // for
+
+    *unicode = value;
+
+    return F_none;
+  }
+#endif // _di_f_utf_character_unicode_string_to_
+
 #ifndef _di_f_utf_is_
   f_status_t f_utf_is(const f_string_t character) {
 
@@ -1806,48 +1847,14 @@ extern "C" {
       if (!unicode) return F_status_set_error(F_parameter);
     #endif // _di_level_0_parameter_checking_
 
-    if (macro_f_utf_byte_width_is(*character) == 1) {
-      return F_status_set_error(F_utf_fragment);
-    }
+    f_utf_character_t character_utf = 0;
 
     {
-      f_utf_character_t character_utf = 0;
-
       const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
       if (F_status_is_error(status)) return status;
-
-      if (private_f_utf_character_is_valid(character_utf) == F_false) {
-        return F_status_set_error(F_utf);
-      }
-    }
-
-    // U+0000 -> U+007F.
-    if (macro_f_utf_byte_width(*character) == 1) {
-      *unicode = ((uint8_t) character[0]) & 0x7f;
-    }
-
-    // U+0080 -> U+07FF.
-    else if (macro_f_utf_byte_width(*character) == 2) {
-      *unicode = (((uint8_t) character[0]) & 0x1f) << 6;
-      *unicode |= ((uint8_t) character[1]) & 0x3f;
     }
 
-    // U+0800 -> U+FFFF.
-    else if (macro_f_utf_byte_width(*character) == 3) {
-      *unicode = (((uint8_t) character[0]) & 0xf) << 12;
-      *unicode |= (((uint8_t) character[1]) & 0x3f) << 6;
-      *unicode |= ((uint8_t) character[2]) & 0x3f;
-    }
-
-    // U+10000 -> U+10FFFF.
-    else if (macro_f_utf_byte_width(*character) == 4) {
-      *unicode = (((uint8_t) character[0]) & 0x7) << 18;
-      *unicode |= (((uint8_t) character[1]) & 0x3f) << 12;
-      *unicode |= (((uint8_t) character[2]) & 0x3f) << 6;
-      *unicode |= ((uint8_t) character[3]) & 0x3f;
-    }
-
-    return F_none;
+    return private_f_utf_character_unicode_to(character_utf, unicode);
   }
 #endif // _di_f_utf_unicode_to_
 
diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h
index 477537c..389bead 100644
--- a/level_0/f_utf/c/utf.h
+++ b/level_0/f_utf/c/utf.h
@@ -836,6 +836,33 @@ extern "C" {
 #endif // _di_f_utf_character_unicode_from_
 
 /**
+ * Convert a string of the format "U+FFFF" into the codepoint value.
+ *
+ * This ignores NULL characters.
+ * The string may only contain "U+" followed by hexadecimal digits, upper or lower case.
+ * The "U+" prefix is required; a string without it is rejected with F_valid_not.
+ * Only ASCII characters are allowed to represent the Unicode sequence string.
+ *
+ * @param string
+ *   The string representing a Unicode sequence.
+ * @param length
+ *   The maximum number of characters.
+ * @param unicode
+ *   A 32-bit integer representing the Unicode (such as U+0001).
+ *   Does not need to be interpreted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF).
+ *
+ * @return
+ *   F_none on success.
+ *
+ *   F_failure (with error bit) if width_max is not long enough to convert.
+ *   F_parameter (with error bit) if a parameter is invalid.
+ *   F_valid_not (with error bit) if string is not a valid Unicode string.
+ */
+#ifndef _di_f_utf_character_unicode_string_to_
+  extern f_status_t f_utf_character_unicode_string_to(const f_utf_string_t string, const f_array_length_t length, uint32_t *unicode);
+#endif // _di_f_utf_character_unicode_string_to_
+
+/**
  * Check to see if the entire byte block of the character is a non-ASCII UTF-8 character.
  *
  * This does not check the validity of the character, for that instead use f_utf_is_valid().
@@ -1672,7 +1699,7 @@ extern "C" {
  *   F_parameter (with error bit) if a parameter is invalid.
  *   F_valid_not (with error bit) if string is not a valid Unicode string.
  */
-#ifndef _di_f_utf_unicode_string_to_f_
+#ifndef _di_f_utf_unicode_string_to_
   extern f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, uint32_t *unicode);
 #endif // _di_f_utf_unicode_string_to_
 
-- 
1.8.3.1