From 4937a92977dd4e46ffc726101c9c1f6770578ca4 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Tue, 25 May 2021 18:27:30 -0500 Subject: [PATCH] Feature: Add Unicode to/from UTF-8 translations and fix usage of function that should be private. This logic has not actually been tested yet. This needs to be reviewed for endianess correctness. All of the UTF-8 processing code, in fact, needs to be reviewed for endianess so I decided to not test this further until I can review and correct the big vs little endianness support. --- level_0/f_utf/c/private-utf.c | 42 ++++++ level_0/f_utf/c/private-utf.h | 54 ++++++++ level_0/f_utf/c/utf-common.h | 7 +- level_0/f_utf/c/utf.c | 300 +++++++++++++++++++++++++++++++++--------- level_0/f_utf/c/utf.h | 102 ++++++++++++++ 5 files changed, 444 insertions(+), 61 deletions(-) diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c index 031ad85..52d8f64 100644 --- a/level_0/f_utf/c/private-utf.c +++ b/level_0/f_utf/c/private-utf.c @@ -5,6 +5,48 @@ extern "C" { #endif +#if !defined(_di_f_utf_char_to_character_) || !defined(_di_f_utf_is_alpha_) || !defined(_di_f_utf_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_numeric_) || !defined(_di_f_utf_is_ascii_) || !defined(_di_f_utf_is_combining_) || !defined(_di_f_utf_is_control_) || !defined(_di_f_utf_is_control_picture_) || !defined(_di_f_utf_is_digit_) || !defined(_di_f_utf_is_emoji_) || !defined(_di_f_utf_is_graph_) || !defined(_di_f_utf_is_numeric_) || !defined(_di_f_utf_is_phonetic_) || !defined(_di_f_utf_is_private_) || !defined(_di_f_utf_is_punctuation_) || !defined(_di_f_utf_is_symbol_) || !defined(_di_f_utf_is_unassigned_) || !defined(_di_f_utf_is_valid_) || !defined(_di_f_utf_is_whitespace_) || !defined(_di_f_utf_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_other_) || !defined(_di_f_utf_is_word_) || !defined(_di_f_utf_is_word_dash_) || !defined(_di_f_utf_is_word_dash_plus_) || !defined(_di_f_utf_is_zero_width_) || !defined(_di_f_utf_unicode_to_) + 
f_status_t private_f_utf_char_to_character(const f_string_t character, const f_array_length_t width_max, f_utf_character_t *character_utf) { + + const uint8_t width = macro_f_utf_byte_width_is(*character); + + if (!width) { + *character_utf = macro_f_utf_character_t_from_char_1(character[0]); + + return F_none; + } + else if (width == 1) { + return F_status_set_error(F_utf); + } + + if (width > width_max) { + return F_status_set_error(F_failure); + } + + *character_utf = macro_f_utf_character_t_from_char_1(character[0]); + + if (width < 2) { + return F_none; + } + + *character_utf |= macro_f_utf_character_t_from_char_2(character[1]); + + if (width == 2) { + return F_none; + } + + *character_utf |= macro_f_utf_character_t_from_char_3(character[2]); + + if (width == 3) { + return F_none; + } + + *character_utf |= macro_f_utf_character_t_from_char_4(character[3]); + + return F_none; + } +#endif // !defined(_di_f_utf_char_to_character_) || !defined(_di_f_utf_is_alpha_) || !defined(_di_f_utf_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_numeric_) || !defined(_di_f_utf_is_ascii_) || !defined(_di_f_utf_is_combining_) || !defined(_di_f_utf_is_control_) || !defined(_di_f_utf_is_control_picture_) || !defined(_di_f_utf_is_digit_) || !defined(_di_f_utf_is_emoji_) || !defined(_di_f_utf_is_graph_) || !defined(_di_f_utf_is_numeric_) || !defined(_di_f_utf_is_phonetic_) || !defined(_di_f_utf_is_private_) || !defined(_di_f_utf_is_punctuation_) || !defined(_di_f_utf_is_symbol_) || !defined(_di_f_utf_is_unassigned_) || !defined(_di_f_utf_is_valid_) || !defined(_di_f_utf_is_whitespace_) || !defined(_di_f_utf_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_other_) || !defined(_di_f_utf_is_word_) || !defined(_di_f_utf_is_word_dash_) || !defined(_di_f_utf_is_word_dash_plus_) || !defined(_di_f_utf_is_zero_width_) || !defined(_di_f_utf_unicode_to_) + #if !defined(_di_f_utf_character_is_alpha_) || !defined(_di_f_utf_is_alpha_) f_status_t private_f_utf_character_is_alpha(const 
f_utf_character_t character, const uint8_t width) { diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h index 91ace79..1b3a1df 100644 --- a/level_0/f_utf/c/private-utf.h +++ b/level_0/f_utf/c/private-utf.h @@ -18,6 +18,60 @@ extern "C" { #endif /** + * Private implementation of f_utf_char_to_character(). + * + * Intended to be shared to each of the different implementation variations. + * + * @param character + * The character string to be converted to the f_utf_character_t type. + * There must be enough space allocated to convert against, as limited by width_max. + * @param width_max + * The maximum width available for converting. + * Can be anything greater than 0. + * @param character_utf + * The generated character of type f_utf_character_t. + * This value may be cleared, even on error. + * + * @return + * F_none if conversion was successful. + * + * F_failure (with error bit) if width is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if character is an invalid UTF-8 character. 
+ * + * @see f_utf_char_to_character() + * @see f_utf_character_is_valid() + * @see f_utf_is_valid() + * @see f_utf_is_alpha() + * @see f_utf_is_alpha_digit() + * @see f_utf_is_alpha_numeric() + * @see f_utf_is_ascii() + * @see f_utf_is_combining(), f_utf_is_control() + * @see f_utf_is_control_picture() + * @see f_utf_is_digit() + * @see f_utf_is_emoji() + * @see f_utf_is_graph() + * @see f_utf_is_numeric() + * @see f_utf_is_phonetic() + * @see f_utf_is_private() + * @see f_utf_is_punctuation() + * @see f_utf_is_symbol() + * @see f_utf_is_unassigned() + * @see f_utf_is_valid() + * @see f_utf_is_whitespace() + * @see f_utf_is_whitespace_modifier() + * @see f_utf_is_whitespace_other() + * @see f_utf_is_word() + * @see f_utf_is_word_dash() + * @see f_utf_is_word_dash_plus() + * @see f_utf_is_zero_width() + * @see f_utf_unicode_to() + */ +#if !defined(_di_f_utf_char_to_character_) || !defined(_di_f_utf_is_alpha_) || !defined(_di_f_utf_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_numeric_) || !defined(_di_f_utf_is_ascii_) || !defined(_di_f_utf_is_combining_) || !defined(_di_f_utf_is_control_) || !defined(_di_f_utf_is_control_picture_) || !defined(_di_f_utf_is_digit_) || !defined(_di_f_utf_is_emoji_) || !defined(_di_f_utf_is_graph_) || !defined(_di_f_utf_is_numeric_) || !defined(_di_f_utf_is_phonetic_) || !defined(_di_f_utf_is_private_) || !defined(_di_f_utf_is_punctuation_) || !defined(_di_f_utf_is_symbol_) || !defined(_di_f_utf_is_unassigned_) || !defined(_di_f_utf_is_valid_) || !defined(_di_f_utf_is_whitespace_) || !defined(_di_f_utf_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_other_) || !defined(_di_f_utf_is_word_) || !defined(_di_f_utf_is_word_dash_) || !defined(_di_f_utf_is_word_dash_plus_) || !defined(_di_f_utf_is_zero_width_) || !defined(_di_f_utf_unicode_to_) + extern f_status_t private_f_utf_char_to_character(const f_string_t character, const f_array_length_t width_max, f_utf_character_t *character_utf) f_attribute_visibility_internal; +#endif 
// !defined(_di_f_utf_char_to_character_) || !defined(_di_f_utf_is_alpha_) || !defined(_di_f_utf_is_alpha_digit_) || !defined(_di_f_utf_is_alpha_numeric_) || !defined(_di_f_utf_is_ascii_) || !defined(_di_f_utf_is_combining_) || !defined(_di_f_utf_is_control_) || !defined(_di_f_utf_is_control_picture_) || !defined(_di_f_utf_is_digit_) || !defined(_di_f_utf_is_emoji_) || !defined(_di_f_utf_is_graph_) || !defined(_di_f_utf_is_numeric_) || !defined(_di_f_utf_is_phonetic_) || !defined(_di_f_utf_is_private_) || !defined(_di_f_utf_is_punctuation_) || !defined(_di_f_utf_is_symbol_) || !defined(_di_f_utf_is_unassigned_) || !defined(_di_f_utf_is_valid_) || !defined(_di_f_utf_is_whitespace_) || !defined(_di_f_utf_is_whitespace_modifier_) || !defined(_di_f_utf_is_whitespace_other_) || !defined(_di_f_utf_is_word_) || !defined(_di_f_utf_is_word_dash_) || !defined(_di_f_utf_is_word_dash_plus_) || !defined(_di_f_utf_is_zero_width_) || !defined(_di_f_utf_unicode_to_) + +/** * Private implementation of f_utf_character_is_alpha(). * * Intended to be shared to each of the different implementation variations. diff --git a/level_0/f_utf/c/utf-common.h b/level_0/f_utf/c/utf-common.h index 9a3139c..b2110bf 100644 --- a/level_0/f_utf/c/utf-common.h +++ b/level_0/f_utf/c/utf-common.h @@ -7,6 +7,8 @@ * * Defines common data to be used for/by project utf. * + * @fixme this code probably only works on little-endian only as-is, this needs to be checked for and possibly redesign to support both big or little. + * * This is auto-included by utf.h and should not need to be explicitly included. */ #ifndef _F_utf_common_h @@ -31,7 +33,7 @@ extern "C" { * The macro_f_utf_byte_is_* macros are used to determine a width of the character (either 1, 2, 3, or 4, respectively). * * The macro_f_utf_byte_width macro determines a width of the character. - * The macro_f_utf_byte_width_is is identical to macro_f_utf_byte_width, except it returns 0 when character is not UTF-8. 
+ * The macro_f_utf_byte_width_is is identical to macro_f_utf_byte_width, except it returns 0 when character is ASCII. */ #ifndef _di_f_utf_byte_ #define f_utf_byte_1 0x80 // 1000 0000 @@ -166,6 +168,9 @@ extern "C" { * The macro_f_utf_character_t_width is used to determine the width of the UTF-8 character based on macro_f_utf_byte_width. * The macro_f_utf_character_t_width_is is used to determine the width of the UTF-8 character based on macro_f_utf_byte_width_is. * + * The macro_f_utf_character_t_width macro determines a width of the UTF-8 character based on macro_f_utf_byte_width. + * The macro_f_utf_character_t_width_is is identical to macro_f_utf_character_t_width, except it returns 0 when character is ASCII. + * * @see f_utf_is_big_endian() */ #ifndef _di_f_utf_character_t_ diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index a6e2d86..893fc6f 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -89,42 +89,7 @@ extern "C" { if (!character_utf) return F_status_set_error(F_parameter); #endif // _di_level_0_parameter_checking_ - const uint8_t width = macro_f_utf_byte_width_is(*character); - - if (!width) { - *character_utf = macro_f_utf_character_t_from_char_1(character[0]); - - return F_none; - } - else if (width == 1) { - return F_status_is_error(F_utf); - } - - if (width > width_max) { - return F_status_set_error(F_failure); - } - - *character_utf = macro_f_utf_character_t_from_char_1(character[0]); - - if (width < 2) { - return F_none; - } - - *character_utf |= macro_f_utf_character_t_from_char_2(character[1]); - - if (width == 2) { - return F_none; - } - - *character_utf |= macro_f_utf_character_t_from_char_3(character[2]); - - if (width == 3) { - return F_none; - } - - *character_utf |= macro_f_utf_character_t_from_char_4(character[3]); - - return F_none; + return private_f_utf_char_to_character(character, width_max, character_utf); } #endif // _di_f_utf_char_to_character_ @@ -749,6 +714,93 @@ extern "C" { } #endif // 
_di_f_utf_character_to_char_ +#ifndef _di_f_utf_character_unicode_to_ + f_status_t f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode) { + #ifndef _di_level_0_parameter_checking_ + if (!unicode) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + const uint8_t width = macro_f_utf_character_t_width(character); + + if (private_f_utf_character_is_valid(character, width) == F_false) { + return F_status_set_error(F_utf); + } + + if (width < 2) { + + // U+0000 -> U+007F + *unicode = macro_f_utf_character_t_to_char_1(character) & 0x7f; + } + else if (width == 2) { + + // U+0080 -> U+07FF + *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x1f) << 6; + *unicode |= macro_f_utf_character_t_to_char_2(character) & 0x3f; + } + else if (width == 3) { + + // U+0800 -> U+FFFF + *unicode = (macro_f_utf_character_t_to_char_1(character) & 0xf) << 12; + *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6; + *unicode |= macro_f_utf_character_t_to_char_3(character) & 0x3f; + } + else if (width == 4) { + + // U+10000 -> U+10FFFF + *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x7) << 18; + *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 12; + *unicode |= (macro_f_utf_character_t_to_char_3(character) & 0x3f) << 6; + *unicode |= macro_f_utf_character_t_to_char_4(character) & 0x3f; + } + + return F_none; + } +#endif // _di_f_utf_character_unicode_to_ + +#ifndef _di_f_utf_character_unicode_from_ + f_status_t f_utf_character_unicode_from(const uint32_t unicode, f_utf_character_t *character) { + #ifndef _di_level_0_parameter_checking_ + if (!character) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (unicode > 0x10ffff) { + return F_status_set_error(F_utf); + } + + if (unicode < 0x80) { + + // U+0000 -> U+007F + *character = unicode; + } + else if (unicode < 0x800) { + + // U+0080 -> U+07FF + *character = (unicode & 0x7c0) << 
2; + *character |= unicode & 0x3f; + *character |= 0xc080; + } + else if (unicode < 0x10000) { + + // U+0800 -> U+FFFF + *character = (unicode & 0xf000) << 4; + *character |= (unicode & 0xfc0) << 2; + *character |= unicode & 0x3f; + *character |= 0xe08080; + } + else { + + // U+10000 -> U+10FFFF + *character = (unicode & 0x1c0000) << 6; + *character |= (unicode & 0x3f000) << 4; + *character |= (unicode & 0xfc0) << 2; + *character |= unicode & 0x3f; + *character |= 0xf0808080; + } + + return F_none; + } +#endif // _di_f_utf_character_unicode_from_ + #ifndef _di_f_utf_is_big_endian_ f_status_t f_utf_is_big_endian() { uint16_t test_int = (0x01 << 8) | 0x02; @@ -805,7 +857,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -836,7 +888,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -867,7 +919,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -890,7 +942,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -918,7 +970,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = 
f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -949,7 +1001,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -977,7 +1029,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1008,7 +1060,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1039,7 +1091,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1086,7 +1138,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1130,7 +1182,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = 
private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1159,7 +1211,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1188,7 +1240,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1246,7 +1298,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1289,7 +1341,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1312,7 +1364,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1335,7 +1387,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ 
-1366,7 +1418,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1395,7 +1447,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1424,7 +1476,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1455,7 +1507,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1486,7 +1538,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1517,7 +1569,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1556,7 +1608,7 @@ extern "C" { f_utf_character_t character_utf = 0; { - const f_status_t status = 
f_utf_char_to_character(character, width_max, &character_utf); + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); if (F_status_is_error(status)) return status; } @@ -1930,6 +1982,134 @@ extern "C" { } #endif // _di_f_utf_string_seek_to_ +#ifndef _di_f_utf_unicode_to_ + f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + if (!unicode) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + const uint8_t width = macro_f_utf_byte_width(*character); + + { + f_utf_character_t character_utf = 0; + + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + + if (private_f_utf_character_is_valid(character_utf, width) == F_false) { + return F_status_set_error(F_utf); + } + } + + // @fixme the code here needs to be reviewed for endianess accuracy for both big and little endian. 
+ if (width == 1) { + + // U+0000 -> U+007F + *unicode = character[0] & 0x7f; + } + else if (width == 2) { + + // U+0080 -> U+07FF + *unicode = (character[0] & 0x1f) << 6; + *unicode |= character[1] & 0x3f; + } + else if (width == 3) { + + // U+0800 -> U+FFFF + *unicode = (character[0] & 0xf) << 12; + *unicode |= (character[1] & 0x3f) << 6; + *unicode |= character[2] & 0x3f; + } + else if (width == 4) { + + // U+10000 -> U+10FFFF + *unicode = (character[0] & 0x7) << 18; + *unicode |= (character[1] & 0x3f) << 12; + *unicode |= (character[2] & 0x3f) << 6; + *unicode |= character[3] & 0x3f; + } + + return F_none; + } +#endif // _di_f_utf_unicode_to_ + +#ifndef _di_f_utf_unicode_from_ + f_status_t f_utf_unicode_from(const uint32_t unicode, const f_array_length_t width_max, f_string_t *character) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + if (!character) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + // @fixme the code here needs to be reviewed for endianess accuracy for both big and little endian. 
+ if (unicode > 0x10ffff) { + return F_status_set_error(F_utf); + } + + if (unicode < 0x80) { + + // U+0000 -> U+007F + (*character)[0] = (char) unicode; + + if (width_max > 1) { + (*character)[1] = 0; + + if (width_max > 2) { + (*character)[2] = 0; + + if (width_max > 3) { + (*character)[3] = 0; + } + } + } + } + else if (unicode < 0x800) { + if (width_max < 2) { + return F_status_set_error(F_failure); + } + + // U+0080 -> U+07FF + (*character)[0] = f_utf_byte_2 | ((char) ((unicode & 0x7c0) >> 6)); + (*character)[1] = f_utf_byte_1 | ((char) (unicode & 0x3f)); + + if (width_max > 2) { + (*character)[2] = 0; + + if (width_max > 3) { + (*character)[3] = 0; + } + } + } + else if (unicode < 0x10000) { + if (width_max < 3) { + return F_status_set_error(F_failure); + } + + // U+0800 -> U+FFFF + (*character)[0] = f_utf_byte_3 | ((char) ((unicode & 0xf000) >> 12)); + (*character)[1] = f_utf_byte_1 | ((char) ((unicode & 0xfc0) >> 6)); + (*character)[2] = f_utf_byte_1 | ((char) (unicode & 0x3f)); + + if (width_max > 3) { + (*character)[3] = 0; + } + } + else { + if (width_max < 4) { + return F_status_set_error(F_failure); + } + + // U+10000 -> U+10FFFF + (*character)[0] = f_utf_byte_4 | ((char) ((unicode & 0x1c0000) >> 18)); + (*character)[1] = f_utf_byte_1 | ((char) ((unicode & 0x3f000) >> 12)); + (*character)[2] = f_utf_byte_1 | ((char) ((unicode & 0xfc0) >> 6)); + (*character)[3] = f_utf_byte_1 | ((char) (unicode & 0x3f)); + } + + return F_none; + } +#endif // _di_f_utf_unicode_from_ + #ifdef __cplusplus } // extern "C" #endif diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 1a90547..f5f35f0 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -701,6 +701,52 @@ extern "C" { #endif // _di_f_utf_character_to_char_ /** + * Convert a given (UTF-8) character into Unicode. + * + * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. + * The Unicode is a 32-bit integer representing the Unicode (such as U+0001). 
+ * The Unicode does not need to be interpreted like UTF-8, it simply is a sequence of numbers from 0 up to the max supported Unicode integer value (U+10FFFF). + * + * @param character + * The (UTF-8) character. + * @param unicode + * The Unicode number. + * + * @return + * F_none on success. + * + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if character is an invalid UTF-8 character. + * + * @see f_utf_character_is_valid() + */ +#ifndef _di_f_utf_character_unicode_to_ + extern f_status_t f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode); +#endif // _di_f_utf_character_unicode_to_ + +/** + * Convert a given Unicode into (UTF-8) character. + * + * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. + * The Unicode is a 32-bit integer representing the Unicode (such as U+0001). + * The Unicode does not need to be interpreted like UTF-8, it simply is a sequence of numbers from 0 up to the max supported Unicode integer value (U+10FFFF). + * + * @param unicode + * The Unicode number. + * @param character + * The (UTF-8) character. + * + * @return + * F_none on success. + * + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + */ +#ifndef _di_f_utf_character_unicode_from_ + extern f_status_t f_utf_character_unicode_from(const uint32_t unicode, f_utf_character_t *character); +#endif // _di_f_utf_character_unicode_from_ + +/** * Helper function for UTF-8 processing code to determine endianess of the system. * * @todo relocate this outside of f_utf into a more general path, perhaps f_memory (f_memory_is_big_endian). @@ -1778,6 +1824,62 @@ extern "C" { extern f_status_t f_utf_string_seek_to(const f_utf_string_t string, const uint8_t seek_to, f_utf_string_range_t *range); #endif // _di_f_utf_string_seek_to_ +/** + * Convert a given string block representing a single character into Unicode. 
+ * + * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. + * The Unicode is a 32-bit integer representing the Unicode (such as U+0001). + * The Unicode does not need to be interpreted like UTF-8, it simply is a sequence of numbers from 0 up to the max supported Unicode integer value (U+10FFFF). + * + * @param character + * The (UTF-8) character to convert to the Unicode representation. + * @param width_max + * The max width available for representing the UTF-8 character. + * There must be enough space in the character buffer to handle the Unicode width. + * It is recommended to always have 4 characters (4 uint8_t) of space available in character. + * @param unicode + * The Unicode number. + * + * @return + * F_none on success. + * + * F_failure (with error bit) if width is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if character is an invalid UTF-8 character. + * + * @see f_utf_character_is_valid() + */ +#ifndef _di_f_utf_unicode_to_ + extern f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode); +#endif // _di_f_utf_unicode_to_ + +/** + * Convert a given Unicode into a string block representing a single character. + * + * The character is a string block that receives the UTF-8 sequence, one byte per position. + * The Unicode is a 32-bit integer representing the Unicode (such as U+0001). + * The Unicode does not need to be interpreted like UTF-8, it simply is a sequence of numbers from 0 up to the max supported Unicode integer value (U+10FFFF). + * + * @param unicode + * The Unicode number. + * @param width_max + * The max width available for representing the UTF-8 character. + * There must be enough space in the character buffer to handle the Unicode width. + * It is recommended to always have 4 characters (4 uint8_t) of space available in character. 
+ * @param character + * The string block the (UTF-8) character is written to. + * + * @return + * F_none on success. 
+ * + * F_failure (with error bit) if width is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + */ +#ifndef _di_f_utf_unicode_from_ + extern f_status_t f_utf_unicode_from(const uint32_t unicode, const f_array_length_t width_max, f_string_t *character); +#endif // _di_f_utf_unicode_from_ + #ifdef __cplusplus } // extern "C" #endif -- 1.8.3.1