From ac9378e6659574958c103f506d432cc45e37f28f Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sun, 27 Mar 2022 19:56:35 -0500 Subject: [PATCH] Update: Restructure parts of f_utf project, and use f_utf_t, and add 'u' to hexdigits in common.h. Move the relevant functions into utf/convert.h, utf/is.h, and utf/is_character.h. Implement f_utf_t as a uint32_t type. This should allow that type to be customized if need be. It also provides a more explicit context. Add 'u' to designate that the hexdigits are unsigned (this likely means, more specifically, that each is an unsigned int). This might be removed if it becomes a problem on architectures where "u" represents 16 bits or fewer. Due to the size of the work, I am avoiding making this change in all of the Unicode comparison functions. There are quite a lot of hexdigits in use. --- build/level_0/settings | 4 +- build/monolithic/settings | 4 +- level_0/f_utf/c/utf.c | 1869 ------------------------------------ level_0/f_utf/c/utf.h | 1602 +------------------------------ level_0/f_utf/c/utf/common.h | 61 +- level_0/f_utf/c/utf/convert.c | 375 ++++++++ level_0/f_utf/c/utf/convert.h | 231 +++++ level_0/f_utf/c/utf/dynamic.h | 18 - level_0/f_utf/c/utf/is.c | 963 +++++++++++++++++++ level_0/f_utf/c/utf/is.h | 777 +++++++++++++++ level_0/f_utf/c/utf/is_character.c | 567 +++++++++++ level_0/f_utf/c/utf/is_character.h | 660 +++++++++++++ level_0/f_utf/data/build/settings | 4 +- 13 files changed, 3618 insertions(+), 3517 deletions(-) create mode 100644 level_0/f_utf/c/utf/convert.c create mode 100644 level_0/f_utf/c/utf/convert.h create mode 100644 level_0/f_utf/c/utf/is.c create mode 100644 level_0/f_utf/c/utf/is.h create mode 100644 level_0/f_utf/c/utf/is_character.c create mode 100644 level_0/f_utf/c/utf/is_character.h diff --git a/build/level_0/settings b/build/level_0/settings index bf18cfc..ccdc938 100644 --- a/build/level_0/settings +++ b/build/level_0/settings @@ -56,7 +56,7 @@ build_sources_library status_string.c 
build_sources_library string.c private-string.c string/common.c string/private-dynamic.c string/private-map.c string/private-map_multi.c string/private-quantity.c string/private-range.c string/private-triple.c string/dynamic.c string/map.c string/map_multi.c string/quantity.c string/range.c string/static.c string/triple.c build_sources_library type_array/array_length.c type_array/cell.c type_array/fll_id.c type_array/int8.c type_array/int16.c type_array/int32.c type_array/int64.c type_array/int128.c type_array/state.c type_array/status.c type_array/uint8.c type_array/uint16.c type_array/uint32.c type_array/uint64.c type_array/uint128.c build_sources_library type_array/private-array_length.c type_array/private-cell.c type_array/private-fll_id.c type_array/private-int8.c type_array/private-int16.c type_array/private-int32.c type_array/private-int64.c type_array/private-int128.c type_array/private-state.c type_array/private-status.c type_array/private-uint8.c type_array/private-uint16.c type_array/private-uint32.c type_array/private-uint64.c type_array/private-uint128.c -build_sources_library utf.c private-utf.c utf/common.c utf/dynamic.c utf/map.c utf/private-is_unassigned.c utf/private-string.c utf/string.c utf/triple.c +build_sources_library utf.c private-utf.c utf/common.c utf/convert.c utf/dynamic.c utf/is.c utf/is_character.c utf/map.c utf/private-is_unassigned.c utf/private-string.c utf/string.c utf/triple.c build_sources_library-level thread.c private-thread.c build_sources_library_shared build_sources_library_static @@ -92,7 +92,7 @@ build_sources_headers string.h string/common.h string/dynamic.h string/map.h str build_sources_headers type.h build_sources_headers type_array.h type_array/common.h build_sources_headers type_array/array_length.h type_array/cell.h type_array/fll_id.h type_array/int8.h type_array/int16.h type_array/int32.h type_array/int64.h type_array/int128.h type_array/state.h type_array/status.h type_array/uint8.h type_array/uint16.h 
type_array/uint32.h type_array/uint64.h type_array/uint128.h -build_sources_headers utf.h utf/common.h utf/dynamic.h utf/map.h utf/string.h utf/triple.h +build_sources_headers utf.h utf/common.h utf/convert.h utf/dynamic.h utf/is.h utf/is_character.h utf/map.h utf/string.h utf/triple.h build_sources_headers-level thread.h thread/common.h build_sources_headers_shared build_sources_headers_static diff --git a/build/monolithic/settings b/build/monolithic/settings index 6984a24..2301b60 100644 --- a/build/monolithic/settings +++ b/build/monolithic/settings @@ -56,7 +56,7 @@ build_sources_library level_0/status_string.c build_sources_library level_0/string.c level_0/private-string.c level_0/string/common.c level_0/string/private-dynamic.c level_0/string/private-map.c level_0/string/private-map_multi.c level_0/string/private-quantity.c level_0/string/private-range.c level_0/string/private-triple.c level_0/string/dynamic.c level_0/string/map.c level_0/string/map_multi.c level_0/string/quantity.c level_0/string/range.c level_0/string/static.c level_0/string/triple.c build_sources_library level_0/type_array/array_length.c level_0/type_array/cell.c level_0/type_array/fll_id.c level_0/type_array/int8.c level_0/type_array/int16.c level_0/type_array/int32.c level_0/type_array/int64.c level_0/type_array/int128.c level_0/type_array/state.c level_0/type_array/status.c level_0/type_array/uint8.c level_0/type_array/uint16.c level_0/type_array/uint32.c level_0/type_array/uint64.c level_0/type_array/uint128.c build_sources_library level_0/type_array/private-array_length.c level_0/type_array/private-cell.c level_0/type_array/private-fll_id.c level_0/type_array/private-int8.c level_0/type_array/private-int16.c level_0/type_array/private-int32.c level_0/type_array/private-int64.c level_0/type_array/private-int128.c level_0/type_array/private-state.c level_0/type_array/private-status.c level_0/type_array/private-uint8.c level_0/type_array/private-uint16.c 
level_0/type_array/private-uint32.c level_0/type_array/private-uint64.c level_0/type_array/private-uint128.c -build_sources_library level_0/utf.c level_0/private-utf.c level_0/utf/common.c level_0/utf/dynamic.c level_0/utf/map.c level_0/utf/string.c level_0/utf/triple.c level_0/utf/private-is_unassigned.c level_0/utf/private-string.c +build_sources_library level_0/utf.c level_0/private-utf.c level_0/utf/common.c level_0/utf/convert.c level_0/utf/dynamic.c level_0/utf/is.c level_0/utf/is_character.c level_0/utf/map.c level_0/utf/string.c level_0/utf/triple.c level_0/utf/private-is_unassigned.c level_0/utf/private-string.c build_sources_library level_1/control_group.c build_sources_library level_1/conversion.c level_1/private-conversion.c build_sources_library level_1/directory.c level_1/private-directory.c @@ -114,7 +114,7 @@ build_sources_headers level_0/string.h level_0/string/common.h level_0/string/dy build_sources_headers level_0/type.h build_sources_headers level_0/type_array.h level_0/type_array/common.h build_sources_headers level_0/type_array/array_length.h level_0/type_array/cell.h level_0/type_array/fll_id.h level_0/type_array/int8.h level_0/type_array/int16.h level_0/type_array/int32.h level_0/type_array/int64.h level_0/type_array/int128.h level_0/type_array/state.h level_0/type_array/status.h level_0/type_array/uint8.h level_0/type_array/uint16.h level_0/type_array/uint32.h level_0/type_array/uint64.h level_0/type_array/uint128.h -build_sources_headers level_0/utf.h level_0/utf/common.h level_0/utf/dynamic.h level_0/utf/map.h level_0/utf/string.h level_0/utf/triple.h +build_sources_headers level_0/utf.h level_0/utf/common.h level_0/utf/convert.h level_0/utf/dynamic.h level_0/utf/is.h level_0/utf/is_character.h level_0/utf/map.h level_0/utf/string.h level_0/utf/triple.h build_sources_headers level_1/control_group.h build_sources_headers level_1/conversion.h build_sources_headers level_1/directory.h level_1/directory/common.h diff --git 
a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 02b17de..dcf4344 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -85,1875 +85,6 @@ extern "C" { } #endif // _di_f_utf_buffer_increment_ -#ifndef _di_f_utf_char_to_character_ - f_status_t f_utf_char_to_character(const f_string_t character, const f_array_length_t width_max, f_utf_character_t *character_utf) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - if (!character_utf) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_char_to_character(character, width_max, character_utf); - } -#endif // _di_f_utf_char_to_character_ - -#ifndef _di_f_utf_character_is_ - f_status_t f_utf_character_is(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_utf_fragment; - } - - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_ - -#ifndef _di_f_utf_character_is_alpha_ - f_status_t f_utf_character_is_alpha(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_alpha(character); - } - - if (isalpha(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_alpha_ - -#ifndef _di_f_utf_character_is_alpha_digit_ - f_status_t f_utf_character_is_alpha_digit(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return 
F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_alpha_digit(character); - } - - if (isalnum(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_alpha_digit_ - -#ifndef _di_f_utf_character_is_alpha_numeric_ - f_status_t f_utf_character_is_alpha_numeric(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_alpha_numeric(character); - } - - if (isalnum(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_alpha_numeric_ - -#ifndef _di_f_utf_character_is_ascii_ - f_status_t f_utf_character_is_ascii(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - return F_false; - } - - return F_true; - } -#endif // _di_f_utf_character_is_ascii_ - -#ifndef _di_f_utf_character_is_combining_ - f_status_t f_utf_character_is_combining(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_combining(character); - } - - // There are no combining characters in ASCII. 
- return F_false; - } -#endif // _di_f_utf_character_is_combining_ - -#ifndef _di_f_utf_character_is_control_ - f_status_t f_utf_character_is_control(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_control(character); - } - - if (iscntrl(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_control_ - -#ifndef _di_f_utf_character_is_control_code_ - f_status_t f_utf_character_is_control_code(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_control_code(character); - } - - if (iscntrl(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_control_code_ - -#ifndef _di_f_utf_character_is_control_picture_ - f_status_t character_is_control_format(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_control_format(character); - } - - // There are no control format characters in ASCII. - return F_false; - } -#endif // _di_f_utf_character_is_control_format_ - -#ifndef _di_f_utf_character_is_control_picture_ - f_status_t f_utf_character_is_control_picture(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_control_picture(character); - } - - // There are no control picture characters in ASCII. 
- return F_false; - } -#endif // _di_f_utf_character_is_control_picture_ - -#ifndef _di_f_utf_character_is_digit_ - f_status_t f_utf_character_is_digit(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_digit(character); - } - - if (isdigit(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_digit_ - -#ifndef _di_f_utf_character_is_emoji_ - f_status_t f_utf_character_is_emoji(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_emoji(character); - } - - if (isdigit(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_emoji_ - -#ifndef _di_f_utf_character_is_fragment_ - f_status_t f_utf_character_is_fragment(const f_utf_character_t character) { - - return macro_f_utf_character_t_width_is(character) == 1; - } -#endif // _di_f_utf_character_is_fragment_ - -#ifndef _di_f_utf_character_is_graph_ - f_status_t f_utf_character_is_graph(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - if (private_f_utf_character_is_control(character)) { - return F_false; - } - - if (private_f_utf_character_is_whitespace(character)) { - return F_false; - } - - if (private_f_utf_character_is_zero_width(character)) { - return F_false; - } - - return F_true; - } - - if (isgraph(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_graph_ - -#ifndef 
_di_f_utf_character_is_numeric_ - f_status_t f_utf_character_is_numeric(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_numeric(character); - } - - if (isdigit(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_numeric_ - -#ifndef _di_f_utf_character_is_phonetic_ - f_status_t f_utf_character_is_phonetic(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_phonetic(character); - } - - // There are no ASCII phonetic characters. - return F_false; - } -#endif // _di_f_utf_character_is_phonetic_ - -#ifndef _di_f_utf_character_is_private_ - f_status_t f_utf_character_is_private(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_private(character); - } - - // There are no ASCII private characters. - return F_false; - } -#endif // _di_f_utf_character_is_phonetic_ - -#ifndef _di_f_utf_character_is_punctuation_ - f_status_t f_utf_character_is_punctuation(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_punctuation(character); - } - - // ASCII: '!' to '#'. - if (character > 0x20000000 && character < 0x24000000) { - return F_true; - } - - // ASCII: '%' to '*'. - if (character > 0x24000000 && character < 0x2b000000) { - return F_true; - } - - // ASCII: ',' to '/'. 
- if (character > 0x2b000000 && character < 0x30000000) { - return F_true; - } - - // ASCII: ':', ';', '?', or '@'. - if (character == 0x3a000000 || character == 0x3b000000 || character == 0x3f000000 || character == 0x40000000) { - return F_true; - } - - // ASCII: '[' to ']'. - if (character > 0x5a000000 && character < 0x5d000000) { - return F_true; - } - - // ASCII: '_', '{', or '}'. - if (character == 0x5f000000 || character == 0x7b000000 || character == 0x7d000000) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_punctuation_ - -#ifndef _di_f_utf_character_is_symbol_ - f_status_t f_utf_character_is_symbol(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_symbol(character); - } - - // ASCII: '$' or '+'. - if (character == 0x24000000 || character == 0x2b000000) { - return F_true; - } - - // ASCII: '<' to '>'. - if (character > 0x3c000000 && character < 0x3e000000) { - return F_true; - } - - // ASCII: '^', '`', '|', or '~'. 
- if (character == 0x5e000000 || character == 0x60000000 || character == 0x7c000000 || character == 0x7e000000) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_symbol_ - -#ifndef _di_f_utf_character_is_unassigned_ - f_status_t f_utf_character_is_unassigned(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_unassigned(character); - } - - return F_false; - } -#endif // _di_f_utf_character_is_unassigned_ - -#ifndef _di_f_utf_character_is_valid_ - f_status_t f_utf_character_is_valid(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_valid(character); - } - - return F_true; - } -#endif // _di_f_utf_character_is_valid_ - -#ifndef _di_f_utf_character_is_whitespace_ - f_status_t f_utf_character_is_whitespace(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_whitespace(character); - } - - if (isspace(macro_f_utf_character_t_to_char_1(character))) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_whitespace_ - -#ifndef _di_f_utf_character_is_whitespace_modifier_ - f_status_t f_utf_character_is_whitespace_modifier(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_whitespace_modifier(character); - } - - // There are no ASCII whitespace modifiers. 
- return F_false; - } -#endif // _di_f_utf_character_is_whitespace_modifier_ - -#ifndef _di_f_utf_character_is_whitespace_other_ - f_status_t f_utf_character_is_whitespace_other(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_whitespace_other(character); - } - - // There are no ASCII whitespace other. - return F_false; - } -#endif // _di_f_utf_character_is_whitespace_other_ - -#ifndef _di_f_utf_character_is_wide_ - f_status_t f_utf_character_is_wide(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_wide(character); - } - - // There are no wide ASCII characters. - return F_false; - } -#endif // _di_f_utf_character_is_wide_ - -#ifndef _di_f_utf_character_is_word_ - f_status_t f_utf_character_is_word(const f_utf_character_t character, const bool strict) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_word(character, strict); - } - - if (isalnum(macro_f_utf_character_t_to_char_1(character)) || character == f_string_ascii_underscore_s.string[0]) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_word_ - -#ifndef _di_f_utf_character_is_word_dash_ - f_status_t f_utf_character_is_word_dash(const f_utf_character_t character, const bool strict) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_word_dash(character, strict); - } - - if 
(isalnum(macro_f_utf_character_t_to_char_1(character)) || character == f_string_ascii_underscore_s.string[0] || character == f_string_ascii_minus_s.string[0]) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_word_dash_ - -#ifndef _di_f_utf_character_is_word_dash_plus_ - f_status_t f_utf_character_is_word_dash_plus(const f_utf_character_t character, const bool strict) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_word_dash_plus(character, strict); - } - - if (isalnum(macro_f_utf_character_t_to_char_1(character)) || character == f_string_ascii_underscore_s.string[0] || character == f_string_ascii_minus_s.string[0] || character == f_string_ascii_plus_s.string[0]) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_word_dash_plus_ - -#ifndef _di_f_utf_character_is_zero_width_ - f_status_t f_utf_character_is_zero_width(const f_utf_character_t character) { - - if (macro_f_utf_character_t_width_is(character)) { - if (macro_f_utf_character_t_width_is(character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return private_f_utf_character_is_zero_width(character); - } - - const uint8_t ascii = macro_f_utf_character_t_to_char_1(character); - - // These control characters are considered zero-width spaces. 
- if (ascii >= 0x00 && ascii <= 0x08) { - return F_true; - } - else if (ascii == 0x0a) { - return F_true; - } - else if (ascii >= 0x0c && ascii <= 0x1f) { - return F_true; - } - else if (ascii == 0x7f) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_character_is_zero_width_ - -#ifndef _di_f_utf_character_to_char_ - f_status_t f_utf_character_to_char(const f_utf_character_t utf_character, f_string_t *character, f_array_length_t *width_max) { - #ifndef _di_level_0_parameter_checking_ - if (!utf_character) return F_status_set_error(F_parameter); - if (!character) return F_status_set_error(F_parameter); - if (!width_max) return F_status_set_error(F_parameter); - if (!*width_max) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_character_t_width_is(utf_character)) { - if (macro_f_utf_character_t_width_is(utf_character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - #if __BYTE_ORDER == __LITTLE_ENDIAN - uint32_t utf = 0; - - switch (macro_f_utf_character_t_width_is(utf_character)) { - case 1: - utf = macro_f_utf_character_t_to_char_1(utf_character) << 24; - break; - case 2: - utf = (macro_f_utf_character_t_to_char_2(utf_character) << 24) | (macro_f_utf_character_t_to_char_1(utf_character) << 16); - break; - case 3: - utf = (macro_f_utf_character_t_to_char_3(utf_character) << 24) | (macro_f_utf_character_t_to_char_2(utf_character) << 16) | (macro_f_utf_character_t_to_char_1(utf_character) << 8); - break; - case 4: - utf = (macro_f_utf_character_t_to_char_4(utf_character) << 24) | (macro_f_utf_character_t_to_char_3(utf_character) << 16) | (macro_f_utf_character_t_to_char_2(utf_character) << 8) | macro_f_utf_character_t_to_char_1(utf_character); - break; - default: - return F_status_set_error(F_failure); - } - - memcpy(*character, &utf, sizeof(f_char_t) * macro_f_utf_character_t_width_is(utf_character)); - #else - memcpy(*character, &utf_character, sizeof(f_char_t) * 
macro_f_utf_character_t_width_is(utf_character)); - #endif // __BYTE_ORDER == __LITTLE_ENDIAN - - return F_none; - } - - #if __BYTE_ORDER == __LITTLE_ENDIAN - uint32_t utf = macro_f_utf_character_t_to_char_1(utf_character) << 24; - - memcpy(*character, &utf, sizeof(f_char_t)); - #else - memcpy(*character, &utf_character, sizeof(f_char_t)); - #endif // __BYTE_ORDER == __LITTLE_ENDIAN - - return F_none; - } -#endif // _di_f_utf_character_to_char_ - -#ifndef _di_f_utf_character_unicode_to_ - f_status_t f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode) { - #ifndef _di_level_0_parameter_checking_ - if (!unicode) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - return private_f_utf_character_unicode_to(character, unicode); - } -#endif // _di_f_utf_character_unicode_to_ - -#ifndef _di_f_utf_character_unicode_from_ - f_status_t f_utf_character_unicode_from(const uint32_t unicode, f_utf_character_t *character) { - #ifndef _di_level_0_parameter_checking_ - if (!character) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (unicode > 0x10ffff) { - return F_status_set_error(F_utf); - } - - // U+0000 -> U+007F. - if (unicode < 0x80) { - *character = unicode; - } - - // U+0080 -> U+07FF. - else if (unicode < 0x800) { - *character = (unicode & 0x7c0) << 2; - *character |= unicode & 0x3f; - *character |= 0xc080; - } - - // U+0800 -> U+FFFF. - else if (unicode < 0x10000) { - *character = (unicode & 0xf000) << 4; - *character |= (unicode & 0xfc0) << 2; - *character |= unicode & 0x3f; - *character |= 0xe08080; - } - - // U+100000 -> U+10FFFF. 
- else { - *character = (unicode & 0x1c0000) << 6; - *character |= (unicode & 0x3f000) << 4; - *character |= (unicode & 0xfc0) << 2; - *character |= unicode & 0x3f; - *character |= 0xe0808080; - } - - return F_none; - } -#endif // _di_f_utf_character_unicode_from_ - -#ifndef _di_f_utf_character_unicode_string_to_ - f_status_t f_utf_character_unicode_string_to(const f_utf_string_t string, const f_array_length_t length, uint32_t *unicode) { - #ifndef _di_level_0_parameter_checking_ - if (!string) return F_status_set_error(F_parameter); - if (!unicode) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - f_array_length_t i = 0; - - while (i < length && !string[i]) { - ++i; - } // while - - if (i < length) { - if (macro_f_utf_character_t_width_is(string[i])) { - i = length; - } - else { - if (macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_u_s.string[0] || macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_U_s.string[0]) { - do { - ++i; - } while (i < length && !string[i]); - - if (i < length && !macro_f_utf_character_t_width_is(string[i]) && macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_plus_s.string[0]) { - ++i; - } - else { - i = length; - } - } - else { - i = length; - } - } - } - - if (i == length) { - return F_status_set_error(F_valid_not); - } - - uint32_t value = 0; - uint8_t character = 0; - - for (; i < length; ++i) { - - if (!string[i]) continue; - - // Only ASCII character numbers are allowed to represent - if (macro_f_utf_character_t_width_is(string[i])) { - return F_status_set_error(F_valid_not); - } - - value *= 16; - character = macro_f_utf_character_t_to_char_1(string[i]); - - if (character > 0x2f && character < 0x3a) { - value += character - 0x30; - } - else if (character > 0x40 && character < 0x47) { - value += (character - 0x41) + 10; - } - else if (character > 0x60 && character < 0x67) { - value += (character - 0x61) + 10; - } - else { - return 
F_status_set_error(F_valid_not); - } - } // for - - *unicode = value; - - return F_none; - } -#endif // _di_f_utf_character_unicode_string_to_ - -#ifndef _di_f_utf_is_ - f_status_t f_utf_is(const f_string_t character) { - - return macro_f_utf_byte_width_is(*character); - } -#endif // _di_f_utf_is_ - -#ifndef _di_f_utf_is_alpha_ - f_status_t f_utf_is_alpha(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_alpha(character_utf); - } - - if (isalpha(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_alpha_ - -#ifndef _di_f_utf_is_alpha_digit_ - f_status_t f_utf_is_alpha_digit(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_alpha_digit(character_utf); - } - 
- if (isalnum(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_alpha_digit_ - -#ifndef _di_f_utf_is_alpha_numeric_ - f_status_t f_utf_is_alpha_numeric(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_alpha_numeric(character_utf); - } - - if (isalnum(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_alpha_numeric_ - -#ifndef _di_f_utf_is_ascii_ - f_status_t f_utf_is_ascii(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - return F_false; - } - - return F_true; - } -#endif // _di_f_utf_is_ascii_ - -#ifndef _di_f_utf_is_combining_ - f_status_t f_utf_is_combining(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if 
(macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_combining(character_utf); - } - - // There are no ASCII combining characters. - return F_false; - } -#endif // _di_f_utf_is_combining_ - -#ifndef _di_f_utf_is_control_ - f_status_t f_utf_is_control(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_control(character_utf); - } - - return iscntrl(*character); - } -#endif // _di_f_utf_is_control_ - -#ifndef _di_f_utf_is_control_code - f_status_t f_utf_is_control_code(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t 
character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_control_code(character_utf); - } - - if (iscntrl(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_control_code_ - -#ifndef _di_f_utf_is_control_format_ - f_status_t f_utf_is_control_format(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_control_format(character_utf); - } - - // There are no ASCII control formats. 
- return F_false; - } -#endif // _di_f_utf_is_control_format_ - -#ifndef _di_f_utf_is_control_picture_ - f_status_t f_utf_is_control_picture(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - if (macro_f_utf_byte_width_is(*character) != 3) { - return F_false; - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_control_picture(character_utf); - } - - // There are no ASCII control pictures. - return F_false; - } -#endif // _di_f_utf_is_control_picture_ - -#ifndef _di_f_utf_is_digit_ - f_status_t f_utf_is_digit(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_digit(character_utf); - } - - if (isdigit(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_digit_ - -#ifndef _di_f_utf_is_emoji_ - f_status_t 
f_utf_is_emoji(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_emoji(character_utf); - } - - if (isdigit(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_emoji_ - -#ifndef _di_f_utf_is_fragment_ - f_status_t f_utf_is_fragment(const f_string_t character) { - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_fragment_ - -#ifndef _di_f_utf_is_graph_ - f_status_t f_utf_is_graph(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - if (private_f_utf_character_is_control(character_utf)) { - return F_false; - } - - if (private_f_utf_character_is_whitespace(character_utf)) { - return F_false; - } - - // Zero-width characters 
are be treated as a non-graph. - if (private_f_utf_character_is_zero_width(character_utf)) { - return F_false; - } - - return F_true; - } - - if (isgraph(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_graph_ - -#ifndef _di_f_utf_is_numeric_ - f_status_t f_utf_is_numeric(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_numeric(character_utf); - } - - if (isdigit(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_numeric_ - -#ifndef _di_f_utf_is_phonetic_ - f_status_t f_utf_is_phonetic(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_phonetic(character_utf); - } - - // There are no ASCII phonetic characters. 
- return F_false; - } -#endif // _di_f_utf_is_phonetic_ - -#ifndef _di_f_utf_is_private_ - f_status_t f_utf_is_private(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_private(character_utf); - } - - // There are no ASCII private characters. - return F_false; - } -#endif // _di_f_utf_is_private_ - -#ifndef _di_f_utf_is_punctuation_ - f_status_t f_utf_is_punctuation(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_punctuation(character_utf); - } - - // ASCII: '!' to '#'. - if (character[0] > 0x20 && character[0] < 0x24) { - return F_true; - } - - // ASCII: '%' to '*'. - if (character[0] > 0x24 && character[0] < 0x2b) { - return F_true; - } - - // ASCII: ',' to '/'. 
- if (character[0] > 0x2b && character[0] < 0x30) { - return F_true; - } - - // ASCII: ':', ';', '?', or '@'. - if (character[0] == 0x3a || character[0] == 0x3b || character[0] == 0x3f || character[0] == 0x40) { - return F_true; - } - - // ASCII: '[' to ']'. - if (character[0] > 0x5a && character[0] < 0x5d) { - return F_true; - } - - // ASCII: '_', '{', or '}'. - if (character[0] == 0x5f || character[0] == 0x7b || character[0] == 0x7d) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_punctuation_ - -#ifndef _di_f_utf_is_symbol_ - f_status_t f_utf_is_symbol(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_symbol(character_utf); - } - - // ASCII: '$' or '+'. - if (character[0] == 0x24 || character[0] == 0x2b) { - return F_true; - } - - // ASCII: '<' to '>'. - if (character[0] > 0x3c && character[0] < 0x3e) { - return F_true; - } - - // ASCII: '^', '`', '|', or '~'. 
- if (character[0] == 0x5e || character[0] == 0x60 || character[0] == 0x7c || character[0] == 0x7e) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_symbol_ - -#ifndef _di_f_utf_is_surrogate_ - f_status_t f_utf_is_surrogate(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_surrogate(character_utf); - } - - // ASCII are never surrogate. - return F_false; - } -#endif // _di_f_utf_is_surrogate_ - -#ifndef _di_f_utf_is_unassigned_ - f_status_t f_utf_is_unassigned(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_unassigned(character_utf); - } - - // ASCII are never unassigned. 
- return F_false; - } -#endif // _di_f_utf_is_unassigned_ - -#ifndef _di_f_utf_is_valid_ - f_status_t f_utf_is_valid(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_valid(character_utf); - } - - // ASCII are valid. - return F_true; - } -#endif // _di_f_utf_is_valid_ - -#ifndef _di_f_utf_is_whitespace_ - f_status_t f_utf_is_whitespace(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_whitespace(character_utf); - } - - if (isspace(*character)) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_whitespace_ - -#ifndef _di_f_utf_is_whitespace_modifier_ - f_status_t f_utf_is_whitespace_modifier(const f_string_t character, const f_array_length_t width_max) { - #ifndef 
_di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_whitespace_modifier(character_utf); - } - - // There are no ASCII whitespace modifiers. - return F_false; - } -#endif // _di_f_utf_is_whitespace_modifier_ - -#ifndef _di_f_utf_is_whitespace_other_ - f_status_t f_utf_is_whitespace_other(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_whitespace_other(character_utf); - } - - // There are no ASCII whitespace other. 
- return F_false; - } -#endif // _di_f_utf_is_whitespace_other_ - -#ifndef _di_f_utf_is_wide_ - f_status_t f_utf_is_wide(const f_string_t character, const f_array_length_t width_max) { - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_wide(character_utf); - } - - // There are no wide ASCII characters. - return F_false; - } -#endif // _di_f_utf_is_wide_ - -#ifndef _di_f_utf_is_word_ - f_status_t f_utf_is_word(const f_string_t character, const f_array_length_t width_max, const bool strict) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_word(character_utf, strict); - } - - if (isalnum(*character) || *character == f_string_ascii_underscore_s.string[0]) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_word_ - -#ifndef _di_f_utf_is_word_dash_ - f_status_t f_utf_is_word_dash(const f_string_t character, const f_array_length_t width_max, const bool strict) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return 
F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_word_dash(character_utf, strict); - } - - if (isalnum(*character) || *character == f_string_ascii_underscore_s.string[0] || *character == f_string_ascii_minus_s.string[0]) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_word_dash_ - -#ifndef _di_f_utf_is_word_dash_plus_ - f_status_t f_utf_is_word_dash_plus(const f_string_t character, const f_array_length_t width_max, const bool strict) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_word_dash_plus(character_utf, strict); - } - - if (isalnum(*character) || *character == f_string_ascii_underscore_s.string[0] || *character == f_string_ascii_minus_s.string[0] || *character == f_string_ascii_plus_s.string[0]) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_word_dash_plus_ - -#ifndef _di_f_utf_is_zero_width_ - f_status_t 
f_utf_is_zero_width(const f_string_t character, const f_array_length_t width_max) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - if (macro_f_utf_byte_width_is(*character)) { - if (macro_f_utf_byte_width_is(*character) > width_max) { - return F_status_set_error(F_failure); - } - - if (macro_f_utf_byte_width_is(*character) == 1) { - return F_status_set_error(F_utf_fragment); - } - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_is_zero_width(character_utf); - } - - // These control characters are considered zero-width spaces. - if (*character >= 0x00 && *character <= 0x08) { - return F_true; - } - else if (*character >= 0x0c && *character <= 0x1f) { - return F_true; - } - else if (*character == 0x7f) { - return F_true; - } - - return F_false; - } -#endif // _di_f_utf_is_zero_width_ - -#ifndef _di_f_utf_unicode_from_ - f_status_t f_utf_unicode_from(const uint32_t unicode, const f_array_length_t width_max, f_string_t *character) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - if (!unicode) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - // @fixme the code here needs to be reviewed for endianess accuracy for both big and little endian. 
- if (unicode > 0x10ffff) { - return F_status_set_error(F_utf); - } - - if (unicode < 0x80) { - - // U+0000 -> U+007F - (*character)[0] = (uint8_t) unicode; - - if (width_max > 1) { - (*character)[1] = 0; - - if (width_max > 2) { - (*character)[2] = 0; - - if (width_max > 3) { - (*character)[3] = 0; - } - } - } - } - else if (unicode < 0x800) { - if (width_max < 2) { - return F_status_set_error(F_utf); - } - - // U+0080 -> U+07FF - (*character)[0] = F_utf_byte_2_d | ((uint8_t) ((unicode & 0x7c0) >> 6)); - (*character)[1] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); - - if (width_max > 2) { - (*character)[2] = 0; - - if (width_max > 2) { - (*character)[2] = 0; - } - } - } - else if (unicode < 0x10000) { - if (width_max < 3) { - return F_status_set_error(F_utf); - } - - // U+0800 -> U+FFFF - (*character)[0] = F_utf_byte_3_d | ((uint8_t) ((unicode & 0xf000) >> 12)); - (*character)[1] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0xfc0) >> 6)); - (*character)[2] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); - - if (width_max > 3) { - character[3] = 0; - } - } - else { - if (width_max < 4) { - return F_status_set_error(F_utf); - } - - // U+10000 -> U+10FFFF - (*character)[0] = F_utf_byte_4_d | ((uint8_t) ((unicode & 0x1c0000) >> 18)); - (*character)[1] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0x3f000) >> 12)); - (*character)[2] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0xfc0) >> 6)); - (*character)[3] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); - } - - return F_none; - } -#endif // _di_f_utf_unicode_from_ - -#ifndef _di_f_utf_unicode_to_ - f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode) { - #ifndef _di_level_0_parameter_checking_ - if (width_max < 1) return F_status_set_error(F_parameter); - if (!unicode) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - f_utf_character_t character_utf = 0; - - { - const f_status_t status = 
private_f_utf_char_to_character(character, width_max, &character_utf); - if (F_status_is_error(status)) return status; - } - - return private_f_utf_character_unicode_to(character_utf, unicode); - } -#endif // _di_f_utf_unicode_to_ - -#ifndef _di_f_utf_unicode_string_to_f_ - f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, uint32_t *unicode) { - #ifndef _di_level_0_parameter_checking_ - if (!unicode) return F_status_set_error(F_parameter); - #endif // _di_level_0_parameter_checking_ - - f_array_length_t i = 0; - - while (i < length && !string[i]) { - ++i; - } // while - - if (i < length) { - if (string[i] == f_string_ascii_u_s.string[0] || string[i] == f_string_ascii_U_s.string[0]) { - do { - ++i; - } while (i < length && !string[i]); - - if (i < length && string[i] == f_string_ascii_plus_s.string[0]) { - ++i; - } - else { - i = length; - } - } - else { - i = length; - } - } - - if (i == length) { - return F_status_set_error(F_valid_not); - } - - uint32_t value = 0; - - for (; i < length; ++i) { - - if (!string[i]) continue; - - value *= 16; - - if (string[i] > 0x2f && string[i] < 0x3a) { - value += string[i] - 0x30; - } - else if (string[i] > 0x40 && string[i] < 0x47) { - value += (string[i] - 0x41) + 10; - } - else if (string[i] > 0x60 && string[i] < 0x67) { - value += (string[i] - 0x61) + 10; - } - else { - return F_status_set_error(F_valid_not); - } - } // for - - if (value > 0x10ffff) { - return F_status_set_error(F_valid_not); - } - - *unicode = value; - - return F_none; - } -#endif // _di_f_utf_unicode_string_to_ - #ifdef __cplusplus } // extern "C" #endif diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 3c14279..87061af 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -47,7 +47,10 @@ // FLL-0 utf includes. 
#include +#include #include +#include +#include #include #include #include @@ -121,1605 +124,6 @@ extern "C" { extern f_status_t f_utf_buffer_increment(const f_string_static_t buffer, f_string_range_t *range, const f_array_length_t step); #endif // _di_f_utf_buffer_increment_ -/** - * Check to see if the entire byte block of the character is a non-ASCII UTF-8 character. - * - * This does not validate if the UTF-8 character is a valid UTF-8 character, for that use f_utf_character_is_valid(). - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 character. - * F_false if not a UTF-8 character. - * F_utf_fragment if this is a UTF-8 character fragment. - * - * @see f_utf_character_is_valid() - */ -#ifndef _di_f_utf_character_is_ - extern f_status_t f_utf_character_is(const f_utf_character_t character); -#endif // _di_f_utf_character_is_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet character. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 alphabet character. - * F_false if not a UTF-8 alphabet character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalpha() - */ -#ifndef _di_f_utf_character_is_alpha_ - extern f_status_t f_utf_character_is_alpha(const f_utf_character_t character); -#endif // _di_f_utf_character_is_alpha_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabetic or digit character. - * - * Digit characters are decimal digits and letter numbers. - * - * This does not include number-like, such as 1/2 (½) or superscript 2 (²). - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 alpha-digit character. - * F_false if not a UTF-8 alpha-digit character. 
- * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_character_is_alpha_digit_ - extern f_status_t f_utf_character_is_alpha_digit(const f_utf_character_t character); -#endif // _di_f_utf_character_is_alpha_digit_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabetic or numeric character. - * - * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²). - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 alpha-numeric character. - * F_false if not a UTF-8 alpha-numeric character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_character_is_alpha_numeric_ - extern f_status_t f_utf_character_is_alpha_numeric(const f_utf_character_t character); -#endif // _di_f_utf_character_is_alpha_numeric_ - -/** - * Check to see if the entire byte block of the character is an ASCII character. - * - * This does not validate whether the UTF-8 character is valid or not. - * - * @param character - * The character to validate. - * - * @return - * F_true if an ASCII character. - * F_false if not an ASCII character. - */ -#ifndef _di_f_utf_character_is_ascii_ - extern f_status_t f_utf_character_is_ascii(const f_utf_character_t character); -#endif // _di_f_utf_character_is_ascii_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 combining character. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 combining character. - * F_false if not a UTF-8 combining character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. 
- * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_character_is_combining_ - extern f_status_t f_utf_character_is_combining(const f_utf_character_t character); -#endif // _di_f_utf_character_is_combining_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. - * - * This includes control code and control format characters. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 control character. - * F_false if not a UTF-8 control character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see iscntrl() - */ -#ifndef _di_f_utf_character_is_control_ - extern f_status_t f_utf_character_is_control(const f_utf_character_t character); -#endif // _di_f_utf_character_is_control_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 control code character. - * - * Control Code characters are the traditional control characters, such as "\n" as well as some newer Unicode ones. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 control code character. - * F_false if not a UTF-8 control code character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see iscntrl() - */ -#ifndef _di_f_utf_character_is_control_code_ - extern f_status_t f_utf_character_is_control_code(const f_utf_character_t character); -#endif // _di_f_utf_character_is_control_code_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 control format character. - * - * Control Format characters are special characters used for formatting. - * These are considered control characters. - * - * @param character - * The character to validate. 
- * - * @return - * F_true if a UTF-8 control format character. - * F_false if not a UTF-8 control format character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_character_is_control_format_ - extern f_status_t f_utf_character_is_control_format(const f_utf_character_t character); -#endif // _di_f_utf_character_is_control_format_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 control picture character. - * - * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 control picture character. - * F_false if not a UTF-8 control picture character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_character_is_control_picture_ - extern f_status_t f_utf_character_is_control_picture(const f_utf_character_t character); -#endif // _di_f_utf_character_is_control_picture_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 digit character. - * - * Digit characters are decimal digits and letter numbers. - * - * This does not include number-like, such as 1/2 (½) or superscript 2 (²). - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 digit character. - * F_false if not a UTF-8 digit character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- * - * @see isdigit() - */ -#ifndef _di_f_utf_character_is_digit_ - extern f_status_t f_utf_character_is_digit(const f_utf_character_t character); -#endif // _di_f_utf_character_is_digit_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 emoji character. - * - * @todo Incomplete, UTF-8 codes not yet checked! - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 emoji character. - * F_false if not a UTF-8 emoji character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_character_is_emoji_ - extern f_status_t f_utf_character_is_emoji(const f_utf_character_t character); -#endif // _di_f_utf_character_is_emoji_ - -/** - * Check to see if the entire byte block of the character is a 1-width UTF-8 character fragment. - * - * Characters whose width is 1-byte are invalid. - * However, the character could have been cut-off, so whether or not this is actually valid should be determined by the caller. - * - * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid(). - * - * According to rfc3629, the valid octect sequences for UTF-8 are: - * UTF8-octets = *( UTF8-char ) - * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 - * UTF8-1 = %x00-7F - * UTF8-2 = %xC2-DF UTF8-tail - * UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - * %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - * %xF4 %x80-8F 2( UTF8-tail ) - * UTF8-tail = %x80-BF - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 character. - * F_false if not a UTF-8 character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- * - * @see f_utf_character_is() - * @see f_utf_character_is_valid() - */ -#ifndef _di_f_utf_character_is_fragment_ - extern f_status_t f_utf_character_is_fragment(const f_utf_character_t character); -#endif // _di_f_utf_character_is_fragment_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 printable character. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 graph. - * F_false if not a UTF-8 graph. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isgraph() - */ -#ifndef _di_f_utf_character_is_graph_ - extern f_status_t f_utf_character_is_graph(const f_utf_character_t character); -#endif // _di_f_utf_character_is_graph_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 numeric character. - * - * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²). - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 numeric character. - * F_false if not a UTF-8 numeric character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isdigit() - */ -#ifndef _di_f_utf_character_is_numeric_ - extern f_status_t f_utf_character_is_numeric(const f_utf_character_t character); -#endif // _di_f_utf_character_is_numeric_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 phonetic character. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 phonetic character. - * F_false if not a UTF-8 phonetic character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_character_is_phonetic_ - extern f_status_t f_utf_character_is_phonetic(const f_utf_character_t character); -#endif // _di_f_utf_character_is_phonetic_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 private character. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 private character. - * F_false if not a UTF-8 private character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_character_is_private_ - extern f_status_t f_utf_character_is_private(const f_utf_character_t character); -#endif // _di_f_utf_character_is_private_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 punctuation character. - * - * @todo Incomplete, UTF-8 codes not yet checked! - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 punctuation character. - * F_false if not a UTF-8 punctuation character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_character_is_punctuation_ - extern f_status_t f_utf_character_is_punctuation(const f_utf_character_t character); -#endif // _di_f_utf_character_is_punctuation_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 symbol character. - * - * @todo Incomplete, UTF-8 codes not yet checked! - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 symbol character. - * F_false if not a UTF-8 symbol character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_character_is_symbol_ - extern f_status_t f_utf_character_is_symbol(const f_utf_character_t character); -#endif // _di_f_utf_character_is_symbol_ - -/** - * Check to see if the entire byte block of the character is a unassigned (well-formed) UTF-8 character. - * - * The Surrogates and Private Use are not considered unassigned. - * - * This does validate if the UTF-8 character is a unassigned UTF-8 character. - * To not do this, use f_utf_character_is(). - * - * @param character - * The character to unassignedate. - * - * @return - * F_true if a UTF-8 unassigned character. - * F_false if not a UTF-8 unassigned character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see f_utf_character_is() - * @see f_utf_character_is_fragment() - */ -#ifndef _di_f_utf_character_is_unassigned_ - extern f_status_t f_utf_character_is_unassigned(const f_utf_character_t character); -#endif // _di_f_utf_character_is_value_ - -/** - * Check to see if the entire byte block of the character is a valid (well-formed) UTF-8 character. - * - * This does validate if the UTF-8 character is a valid UTF-8 character. - * To not do this, use f_utf_character_is(). - * - * ASCII character codes are considered valid by this function. - * - * Codes U+FDD0 to U+FDEF and any character ending in FFFE or FFFF are non-characters, and are therefore invalid. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 character. - * F_false if not a UTF-8 character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- * - * @see f_utf_character_is() - * @see f_utf_character_is_fragment() - */ -#ifndef _di_f_utf_character_is_valid_ - extern f_status_t f_utf_character_is_valid(const f_utf_character_t character); -#endif // _di_f_utf_character_is_value_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space character. - * - * Non-printing or zero-width characters are not considered whitespace. - * This does include line separators like '\n'. - * This does not include phonetic spaces, like whitespace modifiers. - * This does not include non-true whitespace characters, such as Ogham Space Mark ( ). - * - * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. - * However, because they are not renderred as whitespace, they are technically not white space. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 whitespace. - * F_false if not a UTF-8 whitespace. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isspace() - */ -#ifndef _di_f_utf_character_is_whitespace_ - extern f_status_t f_utf_character_is_whitespace(const f_utf_character_t character); -#endif // _di_f_utf_character_is_whitespace_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 whitespace modifier character. - * - * These are phonetic spaces. - * - * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. - * Therefore, these are valid spaces in the technical sense, even if they are not visibly whitespace. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 modifier character. - * F_false if not a UTF-8 modifier character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. 
- * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_character_is_whitespace_modifier_ - extern f_status_t f_utf_character_is_whitespace_modifier(const f_utf_character_t character); -#endif // _di_f_utf_character_is_whitespace_modifier_ - -/** - * Check to see if the entire byte block of the character is an other type of UTF-8 space character. - * - * This is a list of whitespace that are not actual whitespace (because they are graph characters) but are considered whitespace, such as Ogham Space Mark ( ). - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 (other) whitespace. - * F_false if not a UTF-8 (other) whitespace. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isspace() - */ -#ifndef _di_f_utf_character_is_whitespace_other_ - extern f_status_t f_utf_character_is_whitespace_other(const f_utf_character_t character); -#endif // _di_f_utf_character_is_whitespace_other_ - -/** - * Get whether or not the UTF-8 character is a wide character on display. - * - * This is not the wide as in width in bytes that the codepoint takes up in UTF-8. - * Instead, this is the width in characters on the screen the character takes up. - * When "wide" characters that take up either 2 characters on render. - * When "narrow" characters that take up either 1 character on render. - * - * @param character - * The (UTF-8) character. - * - * @return - * F_none on success. - * - * F_failure (with error bit) if width is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_character_is_wide_ - extern f_status_t f_utf_character_is_wide(const f_utf_character_t character); -#endif // _di_f_utf_character_is_wide_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 word character. - * - * A word character is alpha-numeric or an underscore '_'. - * - * @param character - * The character to validate. - * @param strict - * When TRUE, include all appropriate characters by type as per Unicode. - * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). - * When FALSE, zero-width punctuation characters are not considered a character. - * - * @return - * F_true if a UTF-8 word character. - * F_false if not a UTF-8 word character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_character_is_word_ - extern f_status_t f_utf_character_is_word(const f_utf_character_t character, const bool strict); -#endif // _di_f_utf_character_is_word_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 word or dash character. - * - * A word dash character is alpha-numeric, an underscore '_' or a dash '-'. - * - * Unicode appears to refer to dashes that connect words as a hyphen. - * Therefore, only these hyphens are considered dashes for the purposes of this function. - * All other dash-like Unicode characters are not considered a dash here. - * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". - * - * @param character - * The character to validate. - * @param strict - * When TRUE, include all appropriate characters by type as per Unicode. - * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). - * When FALSE, zero-width punctuation characters are not considered a character. 
- * - * @return - * F_true if a UTF-8 word or dash character. - * F_false if not a UTF-8 word or dash character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_character_is_word_dash_ - extern f_status_t f_utf_character_is_word_dash(const f_utf_character_t character, const bool strict); -#endif // _di_f_utf_character_is_word_dash_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character. - * - * A word dash plus character is alpha-digit, an underscore '_', a dash '-', or a plus '+'. - * - * Unicode appears to refer to dashes that connect words as a hyphen. - * Therefore, only these hyphens are considered dashes for the purposes of this function. - * All other dash-like Unicode characters are not considered a dash here. - * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". - * - * This does not include zero-width punctuation, such as "invisible plus" (U+2064) (even in strict mode). - * - * @param character - * The character to validate. - * @param strict - * When TRUE, include all appropriate characters by type as per Unicode. - * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). - * When FALSE, zero-width punctuation characters are not considered a character. - * - * @return - * F_true if a UTF-8 word or dash character. - * F_false if not a UTF-8 word or dash character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- * - * @see isalnum() - */ -#ifndef _di_f_utf_character_is_word_dash_plus_ - extern f_status_t f_utf_character_is_word_dash_plus(const f_utf_character_t character, const bool strict); -#endif // _di_f_utf_character_is_word_dash_plus_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character. - * - * Only characters that do not print, which are generally called zero-width. - * - * @param character - * The character to validate. - * - * @return - * F_true if a UTF-8 non-printing or zero-width character. - * F_false if not a UTF-8 non-printing or zero-width character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_character_is_zero_width_ - extern f_status_t f_utf_character_is_zero_width(const f_utf_character_t character); -#endif // _di_f_utf_character_is_zero_width_ - -/** - * Convert a specialized f_utf_character_t type to a uint8_t, stored as a string (character buffer). - * - * This will also convert ASCII characters stored in the utf_character array. - * This will not resize character. - * - * @param utf_character - * The UTF-8 character to convert from. - * @param character - * A uint8_t representation of the UTF-8 character, stored as a string of width bytes. - * If width_max is 0, then this should be set to 0. - * @param width_max - * This is set to the max number of bytes available. - * This is then updated to represent the max bytes used if enough space is available. - * - * @return - * F_none if conversion was successful. - * - * F_failure (with error bit) if width is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_character_to_char_ - extern f_status_t f_utf_character_to_char(const f_utf_character_t utf_character, f_string_t *character, f_array_length_t *width_max); -#endif // _di_f_utf_character_to_char_ - -/** - * Convert a given (UTF-8) character into Unicode. - * - * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. - * The Unicode is a 32-bit integer representing the Unicode (such as U+0001). - * The Unciode does not need to be interpretted like UTF-8, it simple is a sequence of number from 0 onto max supported Unicode integer value (U+10FFFF). - * - * @param character - * The (UTF-8) character. - * @param unicode - * The Unicode number. - * - * @return - * F_none on success. - * - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see f_utf_character_is_valid() - */ -#ifndef _di_f_utf_character_unicode_to_ - extern f_status_t f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode); -#endif // _di_f_utf_character_unicode_to_ - -/** - * Convert a given Unicode into (UTF-8) character. - * - * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. - * The Unicode is a 32-bit integer representing the Unicode (such as U+0001). - * The Unciode does not need to be interpretted like UTF-8, it simple is a sequence of number from 0 onto max supported Unicode integer value (U+10FFFF). - * - * @param unicode - * The Unicode number. - * @param character - * The (UTF-8) character. - * - * @return - * F_none on success. - * - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_character_unicode_from_ - extern f_status_t f_utf_character_unicode_from(const uint32_t unicode, f_utf_character_t *character); -#endif // _di_f_utf_character_unicode_from_ - -/** - * Convert a string of the format "U+FFFF" into the codepoint value. - * - * This ignores NULL characters. - * The string may only contain "U+" followed by a hexidecimal digit, upper or lower case. - * The "U+" prefix is optional. - * Only ASCII characters are allowed to represent the Unicode sequence string. - * - * @param string - * The string representing a Unicode sequence. - * @param length - * The maximum number of characters. - * @param unicode - * A 32-bit integer representing the Unicode (such as U+0001). - * Does not need to be interpretted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). - * - * @return - * F_none on success. - * - * F_failure (with error bit) if width_max is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_valid_not (with error bit) if string is not a valid Unicode string. - */ -#ifndef _di_f_utf_character_unicode_string_to_ - extern f_status_t f_utf_character_unicode_string_to(const f_utf_string_t string, const f_array_length_t length, uint32_t *unicode); -#endif // _di_f_utf_character_unicode_string_to_ - -/** - * Check to see if the entire byte block of the character is a non-ASCII UTF-8 character. - * - * This does not check the validity of the character, for that instead use f_utf_is_valid(). - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * - * @return - * F_true if a UTF-8 character. - * F_false if not a UTF-8 character. - */ -#ifndef _di_f_utf_is_ - extern f_status_t f_utf_is(const f_string_t character); -#endif // _di_f_utf_is_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet character. 
- * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 alphabet character. - * F_false if not a UTF-8 alphabet character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalpha() - */ -#ifndef _di_f_utf_is_alpha_ - extern f_status_t f_utf_is_alpha(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_alpha_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet or digit character. - * - * Digit characters are decimal digits and letter numbers. - * - * This does not include number-like, such as 1/2 (½) or superscript 2 (²). - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 alphabet character. - * F_false if not a UTF-8 alpha-numeric character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_is_alpha_digit_ - extern f_status_t f_utf_is_alpha_digit(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_alpha_digit_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet or numeric character. - * - * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²). - * - * @param character - * The character to validate. 
- * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 alphabet character. - * F_false if not a UTF-8 alpha-numeric character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_is_alpha_numeric_ - extern f_status_t f_utf_is_alpha_numeric(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_alpha_numeric_ - -/** - * Check to see if the entire byte block of the character is an ASCII character. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if an ASCII character. - * F_false if not an ASCII character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_ascii_ - extern f_status_t f_utf_is_ascii(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_ascii_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 combining character. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 combining character. - * F_false if not a UTF-8 combining character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. 
- * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_combining_ - extern f_status_t f_utf_is_combining(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_combining_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. - * - * This includes control code and control format characters. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 control character. - * F_false if not a UTF-8 control character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see iscntrl() - */ -#ifndef _di_f_utf_is_control_ - extern f_status_t f_utf_is_control(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_control_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 control code character. - * - * Control Code characters are the traditional control characters, such as "\n" as well as some newer Unicode ones. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 control code character. - * F_false if not a UTF-8 control code character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_is_control_code_ - extern f_status_t f_utf_is_control_code(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_control_code_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 control format character. - * - * Control Format characters are special characters used for formatting. - * These are considered control characters. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 control format character. - * F_false if not a UTF-8 control format character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_control_format_ - extern f_status_t f_utf_is_control_format(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_control_format_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 control picture character. - * - * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 control picture character. - * F_false if not a UTF-8 control picture character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_is_control_picture_ - extern f_status_t f_utf_is_control_picture(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_control_picture_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 digit character. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 digit character. - * F_false if not a UTF-8 digit character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isdigit() - */ -#ifndef _di_f_utf_is_digit_ - extern f_status_t f_utf_is_digit(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_digit_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 emoji character. - * - * @todo Incomplete, UTF-8 codes not yet checked! - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 emoji character. - * F_false if not a UTF-8 emoji character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_emoji_ - extern f_status_t f_utf_is_emoji(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_emoji_ - -/** - * Check to see if the entire byte block of the character is a 1-width UTF-8 character fragment. - * - * Characters whose width is 1-byte are invalid. 
- * However, the character could have been cut-off, so whether or not this is actually valid should be determined by the caller. - * - * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid(). - * - * According to rfc3629, the valid octect sequences for UTF-8 are: - * UTF8-octets = *( UTF8-char ) - * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 - * UTF8-1 = %x00-7F - * UTF8-2 = %xC2-DF UTF8-tail - * UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - * %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) - * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - * %xF4 %x80-8F 2( UTF8-tail ) - * UTF8-tail = %x80-BF - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * - * @return - * F_true if a UTF-8 character. - * F_false if not a UTF-8 character. - */ -#ifndef _di_f_utf_is_fragment_ - extern f_status_t f_utf_is_fragment(const f_string_t character); -#endif // _di_f_utf_is_fragment_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 printable character. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 graph. - * F_false if not a UTF-8 graph. - * - * F_maybe (with error bit) if this could be a graph but width is not long enough. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- * - * @see isgraph() - */ -#ifndef _di_f_utf_is_graph_ - extern f_status_t f_utf_is_graph(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_graph_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 numeric character. - * - * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²). - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 numeric character. - * F_false if not a UTF-8 numeric character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isdigit() - */ -#ifndef _di_f_utf_is_numeric_ - extern f_status_t f_utf_is_numeric(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_numeric_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 phonetic character. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 phonetic character. - * F_false if not a UTF-8 phonetic character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_phonetic_ - extern f_status_t f_utf_is_phonetic(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_phonetic_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 private character. 
- * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 punctuation character. - * F_false if not a UTF-8 punctuation character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_private_ - extern f_status_t f_utf_is_private(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_private_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 punctuation character. - * - * @todo Incomplete, UTF-8 codes not yet checked! - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 punctuation character. - * F_false if not a UTF-8 punctuation character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_punctuation_ - extern f_status_t f_utf_is_punctuation(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_punctuation_ - -/** - * Check to see if the entire byte block of the character is a surrogate UTF-8 character. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 symbol character. - * F_false if not a UTF-8 symbol character. 
- * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_surrogate_ - extern f_status_t f_utf_is_surrogate(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_surrogate_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 symbol character. - * - * @todo Incomplete, UTF-8 codes not yet checked! - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 symbol character. - * F_false if not a UTF-8 symbol character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_symbol_ - extern f_status_t f_utf_is_symbol(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_symbol_ - -/** - * Check to see if the entire byte block of the character is a unassigned UTF-8 character. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if an unassigned UTF-8 character. - * F_false if not an unassigned UTF-8 character. - * - * F_parameter (with error bit) if a parameter is inunassigned. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_is_unassigned_ - extern f_status_t f_utf_is_unassigned(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_unassigned_ - -/** - * Check to see if the entire byte block of the character is a valid (well-formed) UTF-8 character. - * - * This does validate if the UTF-8 character is a valid UTF-8 character. - * To not do this, use f_utf_is(). - * - * Valid ASCII character codes are considered valid by this function. - * - * Codes U+FDD0 to U+FDEF and any character ending in FFFE or FFFF are non-characters, and are therefore invalid. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a valid UTF-8 character or is an ASCII character. - * F_false if not a valid UTF-8 character. - * - * F_failure (with error bit) if width_max is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_valid_ - extern f_status_t f_utf_is_valid(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_valid_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space character. - * - * Non-printing or zero-width characters are not considered whitespace. - * This does include line separators like '\n'. - * This does not include phonetic spaces, like whitespace modifiers. - * This does not include non-true whitespace characters, such as Ogham Space Mark ( ). - * - * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. - * However, because they are not renderred as whitespace, they are technically not white space. - * - * @param character - * The character to validate. 
- * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 whitespace. - * F_false if not a UTF-8 whitespace. - * - * F_maybe (with error bit) if this could be a whitespace but width is not long enough. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isspace() - */ -#ifndef _di_f_utf_is_whitespace_ - extern f_status_t f_utf_is_whitespace(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_whitespace_ - -/** - * Check to see if the entire byte block of the character is a UTF-8 whitespace modifier character. - * - * These are phonetic spaces. - * - * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. - * Therefore, these are valid spaces in the technical sense, even if they are not visibly whitespace. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 whitespace. - * F_false if not a UTF-8 whitespace. - * - * F_maybe (with error bit) if this could be a whitespace but width is not long enough. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_is_whitespace_modifier_ - extern f_status_t f_utf_is_whitespace_modifier(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_whitespace_modifier_ - -/** - * Check to see if the entire byte block of the character is an other type of UTF-8 space character. - * - * This is a list of whitespace that are not actual whitespace (because they are graph characters) but are considered whitespace, such as Ogham Space Mark ( ). - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 whitespace. - * F_false if not a UTF-8 whitespace. - * - * F_maybe (with error bit) if this could be a whitespace but width is not long enough. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_whitespace_other_ - extern f_status_t f_utf_is_whitespace_other(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_whitespace_other_ - -/** - * Get whether or not the UTF-8 character is a wide character on display. - * - * This is not the wide as in width in bytes that the codepoint takes up in UTF-8. - * Instead, this is the width in characters on the screen the character takes up. - * When "wide" characters that take up either 2 characters on render. - * When "narrow" characters that take up either 1 character on render. - * - * @param character - * The (UTF-8) character. - * @param width_max - * The max width available for representing the UTF-8 character. - * There must be enough space in the character buffer to handle the Unicode width. 
- * It is recommended to always have 4 characters (4 uint8_t) of space available in character. - * This is the width in bytes the codepoint takes up in UTF-8. - * - * @return - * F_none on success. - * - * F_failure (with error bit) if width_max is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_is_wide_ - extern f_status_t f_utf_is_wide(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_wide_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 word character. - * - * A word character is alpha-digit or an underscore '_'. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * @param strict - * When TRUE, include all appropriate characters by type as per Unicode. - * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). - * When FALSE, zero-width punctuation characters are not considered a character. - * - * @return - * F_true if a UTF-8 word character. - * F_false if not a UTF-8 word character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_is_word_ - extern f_status_t f_utf_is_word(const f_string_t character, const f_array_length_t width_max, const bool strict); -#endif // _di_f_utf_is_word_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 word or dash character. - * - * A word dash character is alpha-digit, an underscore '_' or a dash '-'. 
- * - * Unicode appears to refer to dashes that connect words as a hyphen. - * Therefore, only these hyphens are considered dashes for the purposes of this function. - * All other dash-like Unicode characters are not considered a dash here. - * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * @param strict - * When TRUE, include all appropriate characters by type as per Unicode. - * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). - * When FALSE, zero-width punctuation characters are not considered a character. - * - * @return - * F_true if a UTF-8 word or dash character. - * F_false if not a UTF-8 word or dash character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_is_word_dash_ - extern f_status_t f_utf_is_word_dash(const f_string_t character, const f_array_length_t width_max, const bool strict); -#endif // _di_f_utf_is_word_dash_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character. - * - * A word dash plus character is alpha-digit, an underscore '_', a dash '-', or a plus '+'. - * - * Unicode appears to refer to dashes that connect words as a hyphen. - * Therefore, only these hyphens are considered dashes for the purposes of this function. - * All other dash-like Unicode characters are not considered a dash here. - * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". 
- * - * This does not include zero-width punctuation, such as "invisible plus" (U+2064) (even in strict mode). - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * @param strict - * When TRUE, include all appropriate characters by type as per Unicode. - * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). - * When FALSE, zero-width punctuation characters are not considered a character. - * - * @return - * F_true if a UTF-8 word or dash character. - * F_false if not a UTF-8 word or dash character. - * - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see isalnum() - */ -#ifndef _di_f_utf_is_word_dash_plus_ - extern f_status_t f_utf_is_word_dash_plus(const f_string_t character, const f_array_length_t width_max, const bool strict); -#endif // _di_f_utf_is_word_dash_plus_ - -/** - * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character. - * - * Only characters that do not print, which are generally called zero-width. - * - * @param character - * The character to validate. - * There must be enough space allocated to compare against, as limited by width_max. - * @param width_max - * The maximum width available for checking. - * Can be anything greater than 0. - * - * @return - * F_true if a UTF-8 whitespace. - * F_false if not a UTF-8 whitespace. - * - * F_maybe (with error bit) if this could be a whitespace but width is not long enough. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
- */ -#ifndef _di_f_utf_is_zero_width_ - extern f_status_t f_utf_is_zero_width(const f_string_t character, const f_array_length_t width_max); -#endif // _di_f_utf_is_zero_width_ - -/** - * Convert an ASCII or UTF-8 character, stored as a string (character buffer), to the specialized f_utf_character_t type. - * - * @param character - * The character string to be converted to the f_utf_character_t type. - * There must be enough space allocated to convert against, as limited by width_max. - * @param width_max - * The maximum width available for converting. - * Can be anything greater than 0. - * @param character_utf - * The generated character of type f_utf_character_t. - * This value may be cleared, even on error. - * - * @return - * F_none if conversion was successful. - * - * F_failure (with error bit) if width is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_char_to_character_ - extern f_status_t f_utf_char_to_character(const f_string_t character, const f_array_length_t width_max, f_utf_character_t *character_utf); -#endif // _di_f_utf_char_to_character_ - -/** - * Convert a given Unicode into a string block representing a single character. - * - * @param character - * The (UTF-8) character. - * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. - * @param width_max - * The max width available for representing the UTF-8 character. - * There must be enough space in the character buffer to handle the Unicode width. - * It is recommended to always have 4 characters (4 uint8_t) of space available in character. - * @param unicode - * A 32-bit integer representing the Unicode (such as U+0001). - * Does not need to be interpretted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). 
- * - * @return - * F_none on success. - * - * F_failure (with error bit) if width_max is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - */ -#ifndef _di_f_utf_unicode_from_ - extern f_status_t f_utf_unicode_from(const uint32_t unicode, const f_array_length_t width_max, f_string_t *character); -#endif // _di_f_utf_unicode_from_ - -/** - * Convert a given string block representing a single character into Unicode. - * - * @param character - * The (UTF-8) character to convert to the Unicode representation. - * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. - * @param width_max - * The max width available for representing the UTF-8 character. - * There must be enough space in the character buffer to handle the Unicode width. - * It is recommended to always have 4 characters (4 uint8_t) of space available in character. - * @param unicode - * A 32-bit integer representing the Unicode (such as U+0001). - * Does not need to be interpretted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). - * - * @return - * F_none on success. - * - * F_failure (with error bit) if width is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_utf (with error bit) if unicode is an invalid Unicode character. - * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. - * - * @see f_utf_character_is_valid() - */ -#ifndef _di_f_utf_unicode_to_ - extern f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode); -#endif // _di_f_utf_unicode_to_ - -/** - * Convert a string of the format "U+FFFF" into the codepoint value. - * - * This ignores NULL characters. 
- * The string may only contain "U+" followed by a hexidecimal digit, upper or lower case. - * The "U+" prefix is optional. - * Only ASCII characters are allowed to represent the Unicode sequence string. - * - * @param string - * The string representing a Unicode sequence. - * @param length - * The maximum number of characters. - * @param unicode - * A 32-bit integer representing the Unicode (such as U+0001). - * Does not need to be interpretted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). - * - * @return - * F_none on success. - * - * F_failure (with error bit) if width_max is not long enough to convert. - * F_parameter (with error bit) if a parameter is invalid. - * F_valid_not (with error bit) if string is not a valid Unicode string. - */ -#ifndef _di_f_utf_unicode_string_to_ - extern f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, uint32_t *unicode); -#endif // _di_f_utf_unicode_string_to_ - #ifdef __cplusplus } // extern "C" #endif diff --git a/level_0/f_utf/c/utf/common.h b/level_0/f_utf/c/utf/common.h index e3f395d..1d6fb5f 100644 --- a/level_0/f_utf/c/utf/common.h +++ b/level_0/f_utf/c/utf/common.h @@ -36,15 +36,15 @@ extern "C" { * The macro_f_utf_byte_width_is is identical to macro_f_utf_byte_width, except it returns 0 when character is ASCII. 
*/ #ifndef _di_f_utf_byte_ - #define F_utf_byte_1_d 0x80 // 1000 0000 - #define F_utf_byte_2_d 0xc0 // 1100 0000 - #define F_utf_byte_3_d 0xe0 // 1110 0000 - #define F_utf_byte_4_d 0xf0 // 1111 0000 + #define F_utf_byte_1_d 0x80u // 1000 0000 + #define F_utf_byte_2_d 0xc0u // 1100 0000 + #define F_utf_byte_3_d 0xe0u // 1110 0000 + #define F_utf_byte_4_d 0xf0u // 1111 0000 - #define F_utf_byte_off_1_d 0xc0 // 1100 0000 - #define F_utf_byte_off_2_d 0xe0 // 1110 0000 - #define F_utf_byte_off_3_d 0xf0 // 1111 0000 - #define F_utf_byte_off_4_d 0xf8 // 1111 1000 + #define F_utf_byte_off_1_d 0xc0u // 1100 0000 + #define F_utf_byte_off_2_d 0xe0u // 1110 0000 + #define F_utf_byte_off_3_d 0xf0u // 1111 0000 + #define F_utf_byte_off_4_d 0xf8u // 1111 1000 #define macro_f_utf_byte_is(character) ((character) & F_utf_byte_1_d) @@ -148,6 +148,17 @@ extern "C" { #endif // _di_f_utf_substitute_ /** + * Defines type for representing the UTF-8 code as a 32-bit unsigned integer. + */ +#ifndef _di_f_utf_t_ + typedef uint32_t f_utf_t; + + #define f_utf_t_initialize 0 + + #define macro_f_utf_initialize(code) code +#endif // _di_f_utf_t_ + +/** * Provide a basic UTF-8 character as a single 4-byte variable. * * This is intended to be used when a single variable is desired to represent a 1-byte, 2-byte, 3-byte, or even 4-byte character. 
@@ -176,24 +187,24 @@ extern "C" { #ifndef _di_f_utf_character_t_ typedef uint32_t f_utf_character_t; - #define F_utf_character_mask_byte_1_d 0xff000000 // 1111 1111, 0000 0000, 0000 0000, 0000 0000 - #define F_utf_character_mask_byte_2_d 0xffff0000 // 1111 1111, 1111 1111, 0000 0000, 0000 0000 - #define F_utf_character_mask_byte_3_d 0xffffff00 // 1111 1111, 1111 1111, 1111 1111, 0000 0000 - #define F_utf_character_mask_byte_4_d 0xffffffff // 1111 1111, 1111 1111, 1111 1111, 1111 1111 + #define F_utf_character_mask_byte_1_d 0xff000000u // 1111 1111, 0000 0000, 0000 0000, 0000 0000 + #define F_utf_character_mask_byte_2_d 0xffff0000u // 1111 1111, 1111 1111, 0000 0000, 0000 0000 + #define F_utf_character_mask_byte_3_d 0xffffff00u // 1111 1111, 1111 1111, 1111 1111, 0000 0000 + #define F_utf_character_mask_byte_4_d 0xffffffffu // 1111 1111, 1111 1111, 1111 1111, 1111 1111 - #define F_utf_character_mask_char_1_d 0xff000000 // 1111 1111, 0000 0000, 0000 0000, 0000 0000 - #define F_utf_character_mask_char_2_d 0x00ff0000 // 0000 0000, 1111 1111, 0000 0000, 0000 0000 - #define F_utf_character_mask_char_3_d 0x0000ff00 // 0000 0000, 0000 0000, 1111 1111, 0000 0000 - #define F_utf_character_mask_char_4_d 0x000000ff // 0000 0000, 0000 0000, 0000 0000, 1111 1111 + #define F_utf_character_mask_char_1_d 0xff000000u // 1111 1111, 0000 0000, 0000 0000, 0000 0000 + #define F_utf_character_mask_char_2_d 0x00ff0000u // 0000 0000, 1111 1111, 0000 0000, 0000 0000 + #define F_utf_character_mask_char_3_d 0x0000ff00u // 0000 0000, 0000 0000, 1111 1111, 0000 0000 + #define F_utf_character_mask_char_4_d 0x000000ffu // 0000 0000, 0000 0000, 0000 0000, 1111 1111 - #define macro_f_utf_character_t_to_char_1(character) (((character) & F_utf_character_mask_char_1_d) >> 24) // grab first byte. - #define macro_f_utf_character_t_to_char_2(character) (((character) & F_utf_character_mask_char_2_d) >> 16) // grab second byte. 
- #define macro_f_utf_character_t_to_char_3(character) (((character) & F_utf_character_mask_char_3_d) >> 8) // grab third byte. + #define macro_f_utf_character_t_to_char_1(character) (((character) & F_utf_character_mask_char_1_d) >> 24u) // grab first byte. + #define macro_f_utf_character_t_to_char_2(character) (((character) & F_utf_character_mask_char_2_d) >> 16u) // grab second byte. + #define macro_f_utf_character_t_to_char_3(character) (((character) & F_utf_character_mask_char_3_d) >> 8u) // grab third byte. #define macro_f_utf_character_t_to_char_4(character) ((character) & F_utf_character_mask_char_4_d) // grab fourth byte. - #define macro_f_utf_character_t_from_char_1(character) (((character) << 24) & F_utf_character_mask_char_1_d) // shift to first byte. - #define macro_f_utf_character_t_from_char_2(character) (((character) << 16) & F_utf_character_mask_char_2_d) // shift to second byte. - #define macro_f_utf_character_t_from_char_3(character) (((character) << 8) & F_utf_character_mask_char_3_d) // shift to third byte. + #define macro_f_utf_character_t_from_char_1(character) (((character) << 24u) & F_utf_character_mask_char_1_d) // shift to first byte. + #define macro_f_utf_character_t_from_char_2(character) (((character) << 16u) & F_utf_character_mask_char_2_d) // shift to second byte. + #define macro_f_utf_character_t_from_char_3(character) (((character) << 8u) & F_utf_character_mask_char_3_d) // shift to third byte. #define macro_f_utf_character_t_from_char_4(character) ((character) & F_utf_character_mask_char_4_d) // shift to fourth byte. 
#define macro_f_utf_character_t_width(character) (macro_f_utf_byte_width(macro_f_utf_character_t_to_char_1(character))) @@ -201,9 +212,9 @@ extern "C" { #endif // _di_f_utf_character_t_ #ifndef _di_f_utf_character_t_codes_ - #define F_utf_character_t_eol_d 0x0a000000 // 0000 1010, 0000 0000, 0000 0000, 0000 0000 - #define F_utf_character_t_eos_d 0x00000000 // 0000 0000, 0000 0000, 0000 0000, 0000 0000 - #define F_utf_character_t_placeholder_d 0x00000000 // 0000 0000, 0000 0000, 0000 0000, 0000 0000 + #define F_utf_character_t_eol_d 0x0a000000u // 0000 1010, 0000 0000, 0000 0000, 0000 0000 + #define F_utf_character_t_eos_d 0x00000000u // 0000 0000, 0000 0000, 0000 0000, 0000 0000 + #define F_utf_character_t_placeholder_d 0x00000000u // 0000 0000, 0000 0000, 0000 0000, 0000 0000 #endif // _di_f_utf_character_t_codes_ /** diff --git a/level_0/f_utf/c/utf/convert.c b/level_0/f_utf/c/utf/convert.c new file mode 100644 index 0000000..15b6f9a --- /dev/null +++ b/level_0/f_utf/c/utf/convert.c @@ -0,0 +1,375 @@ +#include "../utf.h" +#include "../private-utf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _di_f_utf_char_to_character_ + f_status_t f_utf_char_to_character(const f_string_t character, const f_array_length_t width_max, f_utf_character_t *character_utf) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + if (!character_utf) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_char_to_character(character, width_max, character_utf); + } +#endif // _di_f_utf_char_to_character_ + +#ifndef _di_f_utf_character_to_char_ + f_status_t f_utf_character_to_char(const f_utf_character_t utf_character, f_string_t *character, f_array_length_t *width_max) { 
+ #ifndef _di_level_0_parameter_checking_ + if (!utf_character) return F_status_set_error(F_parameter); + if (!character) return F_status_set_error(F_parameter); + if (!width_max) return F_status_set_error(F_parameter); + if (!*width_max) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_character_t_width_is(utf_character)) { + if (macro_f_utf_character_t_width_is(utf_character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + #if __BYTE_ORDER == __LITTLE_ENDIAN + f_utf_t utf = 0; // On little endian memcpy() copies the least significant byte first, so the first UTF-8 byte must sit in the lowest byte of utf. + + switch (macro_f_utf_character_t_width_is(utf_character)) { + case 1: + utf = macro_f_utf_character_t_to_char_1(utf_character); + break; + + case 2: + utf = (macro_f_utf_character_t_to_char_2(utf_character) << 8) | macro_f_utf_character_t_to_char_1(utf_character); + break; + + case 3: + utf = (macro_f_utf_character_t_to_char_3(utf_character) << 16) | (macro_f_utf_character_t_to_char_2(utf_character) << 8) | macro_f_utf_character_t_to_char_1(utf_character); + break; + + case 4: + utf = (macro_f_utf_character_t_to_char_4(utf_character) << 24) | (macro_f_utf_character_t_to_char_3(utf_character) << 16) | (macro_f_utf_character_t_to_char_2(utf_character) << 8) | macro_f_utf_character_t_to_char_1(utf_character); + break; + + default: + return F_status_set_error(F_failure); + } + + memcpy(*character, &utf, sizeof(f_char_t) * macro_f_utf_character_t_width_is(utf_character)); + #else + memcpy(*character, &utf_character, sizeof(f_char_t) * macro_f_utf_character_t_width_is(utf_character)); + #endif // __BYTE_ORDER == __LITTLE_ENDIAN + + return F_none; + } + + #if __BYTE_ORDER == __LITTLE_ENDIAN + f_utf_t utf = macro_f_utf_character_t_to_char_1(utf_character); // ASCII: the single byte must be the lowest byte so that memcpy() of one byte copies it. + + memcpy(*character, &utf, sizeof(f_char_t)); + #else + memcpy(*character, &utf_character, sizeof(f_char_t)); + #endif // __BYTE_ORDER == __LITTLE_ENDIAN + + return F_none; + } +#endif // _di_f_utf_character_to_char_ + +#ifndef
_di_f_utf_character_unicode_to_ + f_status_t f_utf_character_unicode_to(const f_utf_character_t character, f_utf_t *unicode) { + #ifndef _di_level_0_parameter_checking_ + if (!unicode) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + return private_f_utf_character_unicode_to(character, unicode); + } +#endif // _di_f_utf_character_unicode_to_ + +#ifndef _di_f_utf_character_unicode_from_ + f_status_t f_utf_character_unicode_from(const f_utf_t unicode, f_utf_character_t *character) { + #ifndef _di_level_0_parameter_checking_ + if (!character) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (unicode > 0x10ffff) { + return F_status_set_error(F_utf); + } + + // U+0000 -> U+007F. NOTE(review): the branches below U+10000 leave the sequence in the low-order bytes, while macro_f_utf_character_t_to_char_1() reads the lead byte from the top byte; confirm the intended layout. + if (unicode < 0x80) { + *character = unicode; + } + + // U+0080 -> U+07FF. + else if (unicode < 0x800) { + *character = (unicode & 0x7c0) << 2; + *character |= unicode & 0x3f; + *character |= 0xc080; + } + + // U+0800 -> U+FFFF. + else if (unicode < 0x10000) { + *character = (unicode & 0xf000) << 4; + *character |= (unicode & 0xfc0) << 2; + *character |= unicode & 0x3f; + *character |= 0xe08080; + } + + // U+10000 -> U+10FFFF.
+ else { + *character = (unicode & 0x1c0000) << 6; + *character |= (unicode & 0x3f000) << 4; + *character |= (unicode & 0xfc0) << 2; + *character |= unicode & 0x3f; + *character |= 0xf0808080; // A 4-byte sequence uses the 1111 0xxx lead byte (0xf0), not the 3-byte 0xe0 marker. + } + + return F_none; + } +#endif // _di_f_utf_character_unicode_from_ + +#ifndef _di_f_utf_character_unicode_string_to_ + f_status_t f_utf_character_unicode_string_to(const f_utf_string_t string, const f_array_length_t length, f_utf_t *unicode) { + #ifndef _di_level_0_parameter_checking_ + if (!string) return F_status_set_error(F_parameter); + if (!unicode) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + f_array_length_t i = 0; + + while (i < length && !string[i]) { + ++i; + } // while + + if (i < length) { + if (macro_f_utf_character_t_width_is(string[i])) { + i = length; + } + else { + if (macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_u_s.string[0] || macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_U_s.string[0]) { + do { + ++i; + } while (i < length && !string[i]); + + if (i < length && !macro_f_utf_character_t_width_is(string[i]) && macro_f_utf_character_t_to_char_1(string[i]) == f_string_ascii_plus_s.string[0]) { + ++i; + } + else { + i = length; + } + } + else { + i = length; + } + } + } + + if (i == length) { + return F_status_set_error(F_valid_not); + } + + f_utf_t value = 0; + uint8_t character = 0; + + for (; i < length; ++i) { + + if (!string[i]) continue; + + // Only ASCII character numbers are allowed to represent the Unicode sequence string. + if (macro_f_utf_character_t_width_is(string[i])) { + return F_status_set_error(F_valid_not); + } + + value *= 16; + character = macro_f_utf_character_t_to_char_1(string[i]); + + if (character > 0x2f && character < 0x3a) { + value += character - 0x30; + } + else if (character > 0x40 && character < 0x47) { + value += (character - 0x41) + 10; + } + else if (character > 0x60 && character < 0x67) { + value += (character - 0x61) + 10; + } + else { + return
F_status_set_error(F_valid_not); + } + } // for + + *unicode = value; + + return F_none; + } +#endif // _di_f_utf_character_unicode_string_to_ + +#ifndef _di_f_utf_unicode_from_ + f_status_t f_utf_unicode_from(const f_utf_t unicode, const f_array_length_t width_max, f_string_t *character) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + if (!character) return F_status_set_error(F_parameter); // unicode is passed by value and U+0000 is valid; the output pointer is what must not be NULL. + #endif // _di_level_0_parameter_checking_ + + // @fixme the code here needs to be reviewed for endianess accuracy for both big and little endian. + if (unicode > 0x10ffff) { + return F_status_set_error(F_utf); + } + + if (unicode < 0x80) { + + // U+0000 -> U+007F + (*character)[0] = (uint8_t) unicode; + + if (width_max > 1) { + (*character)[1] = 0; + + if (width_max > 2) { + (*character)[2] = 0; + + if (width_max > 3) { + (*character)[3] = 0; + } + } + } + } + else if (unicode < 0x800) { + if (width_max < 2) { + return F_status_set_error(F_utf); + } + + // U+0080 -> U+07FF + (*character)[0] = F_utf_byte_2_d | ((uint8_t) ((unicode & 0x7c0) >> 6)); + (*character)[1] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); + + if (width_max > 2) { + (*character)[2] = 0; + + if (width_max > 3) { + (*character)[3] = 0; + } + } + } + else if (unicode < 0x10000) { + if (width_max < 3) { + return F_status_set_error(F_utf); + } + + // U+0800 -> U+FFFF + (*character)[0] = F_utf_byte_3_d | ((uint8_t) ((unicode & 0xf000) >> 12)); + (*character)[1] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0xfc0) >> 6)); + (*character)[2] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); + + if (width_max > 3) { + (*character)[3] = 0; + } + } + else { + if (width_max < 4) { + return F_status_set_error(F_utf); + } + + // U+10000 -> U+10FFFF + (*character)[0] = F_utf_byte_4_d | ((uint8_t) ((unicode & 0x1c0000) >> 18)); + (*character)[1] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0x3f000) >> 12)); + (*character)[2] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0xfc0) >>
6)); + (*character)[3] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); + } + + return F_none; + } +#endif // _di_f_utf_unicode_from_ + +#ifndef _di_f_utf_unicode_to_ + f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, f_utf_t *unicode) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + if (!unicode) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_unicode_to(character_utf, unicode); + } +#endif // _di_f_utf_unicode_to_ + +#ifndef _di_f_utf_unicode_string_to_ + f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, f_utf_t *unicode) { + #ifndef _di_level_0_parameter_checking_ + if (!unicode) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + f_array_length_t i = 0; + + while (i < length && !string[i]) { + ++i; + } // while + + if (i < length) { + if (string[i] == f_string_ascii_u_s.string[0] || string[i] == f_string_ascii_U_s.string[0]) { + do { + ++i; + } while (i < length && !string[i]); + + if (i < length && string[i] == f_string_ascii_plus_s.string[0]) { + ++i; + } + else { + i = length; + } + } + else { + i = length; + } + } + + if (i == length) { + return F_status_set_error(F_valid_not); + } + + f_utf_t value = 0; + + for (; i < length; ++i) { + + if (!string[i]) continue; + + value *= 16; + + if (string[i] > 0x2f && string[i] < 0x3a) { + value += string[i] - 0x30; + } + else if (string[i] > 0x40 && string[i] < 0x47) { + value += (string[i] - 0x41) + 10; + } + else if (string[i] > 0x60 && string[i] < 0x67) { + value += (string[i] - 0x61) + 10; + } + else { + return F_status_set_error(F_valid_not); + } + } // for + + if (value >
0x10ffff) { + return F_status_set_error(F_valid_not); + } + + *unicode = value; + + return F_none; + } +#endif // _di_f_utf_unicode_string_to_ + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/level_0/f_utf/c/utf/convert.h b/level_0/f_utf/c/utf/convert.h new file mode 100644 index 0000000..007da72 --- /dev/null +++ b/level_0/f_utf/c/utf/convert.h @@ -0,0 +1,231 @@ +/** + * FLL - Level 0 + * + * Project: UTF + * API Version: 0.5 + * Licenses: lgplv2.1 + * + * Defines UTF-8 "convert" functions. + * + * This is auto-included by utf.h and should not need to be explicitly included. + */ +#ifndef _F_utf_convert_h +#define _F_utf_convert_h + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Convert a specialized f_utf_character_t type to a uint8_t, stored as a string (character buffer). + * + * This will also convert ASCII characters stored in the utf_character array. + * This will not resize character. + * + * @param utf_character + * The UTF-8 character to convert from. + * @param character + * A uint8_t representation of the UTF-8 character, stored as a string of width bytes. + * If width_max is 0, then this should be set to 0. + * @param width_max + * This is set to the max number of bytes available. + * This is then updated to represent the max bytes used if enough space is available. + * + * @return + * F_none if conversion was successful. + * + * F_failure (with error bit) if width is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_to_char_ + extern f_status_t f_utf_character_to_char(const f_utf_character_t utf_character, f_string_t *character, f_array_length_t *width_max); +#endif // _di_f_utf_character_to_char_ + +/** + * Convert a given (UTF-8) character into Unicode. 
+ * + * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. + * The Unicode is a 32-bit integer representing the Unicode (such as U+0001). + * The Unicode does not need to be interpreted like UTF-8, it simply is a sequence of numbers from 0 onto max supported Unicode integer value (U+10FFFF). + * + * @param character + * The (UTF-8) character. + * @param unicode + * The Unicode number. + * + * @return + * F_none on success. + * + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see f_utf_character_is_valid() + */ +#ifndef _di_f_utf_character_unicode_to_ + extern f_status_t f_utf_character_unicode_to(const f_utf_character_t character, f_utf_t *unicode); +#endif // _di_f_utf_character_unicode_to_ + +/** + * Convert a given Unicode into (UTF-8) character. + * + * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. + * The Unicode is a 32-bit integer representing the Unicode (such as U+0001). + * The Unicode does not need to be interpreted like UTF-8, it simply is a sequence of numbers from 0 onto max supported Unicode integer value (U+10FFFF). + * + * @param unicode + * The Unicode number. + * @param character + * The (UTF-8) character. + * + * @return + * F_none on success. + * + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_unicode_from_ + extern f_status_t f_utf_character_unicode_from(const f_utf_t unicode, f_utf_character_t *character); +#endif // _di_f_utf_character_unicode_from_ + +/** + * Convert a string of the format "U+FFFF" into the codepoint value. + * + * This ignores NULL characters.
+ * The string may only contain "U+" followed by hexadecimal digits, upper or lower case. + * The "U+" (or "u+") prefix is required. + * Only ASCII characters are allowed to represent the Unicode sequence string. + * + * @param string + * The string representing a Unicode sequence. + * @param length + * The maximum number of characters. + * @param unicode + * A 32-bit integer representing the Unicode (such as U+0001). + * Does not need to be interpreted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). + * + * @return + * F_none on success. + * + * F_failure (with error bit) if width_max is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_valid_not (with error bit) if string is not a valid Unicode string. + */ +#ifndef _di_f_utf_character_unicode_string_to_ + extern f_status_t f_utf_character_unicode_string_to(const f_utf_string_t string, const f_array_length_t length, f_utf_t *unicode); +#endif // _di_f_utf_character_unicode_string_to_ + +/** + * Convert an ASCII or UTF-8 character, stored as a string (character buffer), to the specialized f_utf_character_t type. + * + * @param character + * The character string to be converted to the f_utf_character_t type. + * There must be enough space allocated to convert against, as limited by width_max. + * @param width_max + * The maximum width available for converting. + * Can be anything greater than 0. + * @param character_utf + * The generated character of type f_utf_character_t. + * This value may be cleared, even on error. + * + * @return + * F_none if conversion was successful. + * + * F_failure (with error bit) if width is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment.
+ */ +#ifndef _di_f_utf_char_to_character_ + extern f_status_t f_utf_char_to_character(const f_string_t character, const f_array_length_t width_max, f_utf_character_t *character_utf); +#endif // _di_f_utf_char_to_character_ + +/** + * Convert a given Unicode into a string block representing a single character. + * + * @param unicode + * A 32-bit integer representing the Unicode (such as U+0001). + * Does not need to be interpreted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). + * @param width_max + * The max width available for representing the UTF-8 character. + * There must be enough space in the character buffer to handle the Unicode width. + * It is recommended to always have 4 characters (4 uint8_t) of space available in character. + * @param character + * The string (character buffer) that receives the UTF-8 byte sequence. + * This buffer is written to in place and is not resized. + * + * @return + * F_none on success. + * + * F_failure (with error bit) if width_max is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_unicode_from_ + extern f_status_t f_utf_unicode_from(const f_utf_t unicode, const f_array_length_t width_max, f_string_t *character); +#endif // _di_f_utf_unicode_from_ + +/** + * Convert a given string block representing a single character into Unicode. + * + * @param character + * The (UTF-8) character to convert to the Unicode representation. + * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged. + * @param width_max + * The max width available for representing the UTF-8 character. + * There must be enough space in the character buffer to handle the Unicode width. + * It is recommended to always have 4 characters (4 uint8_t) of space available in character.
+ * @param unicode + * A 32-bit integer representing the Unicode (such as U+0001). + * Does not need to be interpreted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). + * + * @return + * F_none on success. + * + * F_failure (with error bit) if width is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see f_utf_character_is_valid() + */ +#ifndef _di_f_utf_unicode_to_ + extern f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, f_utf_t *unicode); +#endif // _di_f_utf_unicode_to_ + +/** + * Convert a string of the format "U+FFFF" into the codepoint value. + * + * This ignores NULL characters. + * The string may only contain "U+" followed by hexadecimal digits, upper or lower case. + * The "U+" (or "u+") prefix is required. + * Only ASCII characters are allowed to represent the Unicode sequence string. + * + * @param string + * The string representing a Unicode sequence. + * @param length + * The maximum number of characters. + * @param unicode + * A 32-bit integer representing the Unicode (such as U+0001). + * Does not need to be interpreted like UTF-8, this is a number from 0 onto max supported Unicode integer value (U+10FFFF). + * + * @return + * F_none on success. + * + * F_failure (with error bit) if width_max is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_valid_not (with error bit) if string is not a valid Unicode string.
+ */ +#ifndef _di_f_utf_unicode_string_to_ + extern f_status_t f_utf_unicode_string_to(const f_string_t string, const f_array_length_t length, f_utf_t *unicode); +#endif // _di_f_utf_unicode_string_to_ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // _F_utf_convert_h diff --git a/level_0/f_utf/c/utf/dynamic.h b/level_0/f_utf/c/utf/dynamic.h index 13aa609..255f2c8 100644 --- a/level_0/f_utf/c/utf/dynamic.h +++ b/level_0/f_utf/c/utf/dynamic.h @@ -151,24 +151,6 @@ extern "C" { #endif // _di_f_utf_string_dynamic_adjust_ /** - * Resize the dynamic string. - * - * @param length - * The new size to use. - * @param dynamic - * The string to resize. - * - * @return - * F_none on success. - * - * F_memory_not (with error bit) on out of memory. - * F_parameter (with error bit) if a parameter is invalid. - */ -#ifndef _di_f_utf_string_dynamic_adjust_ - extern f_status_t f_utf_string_dynamic_adjust(const f_array_length_t length, f_utf_string_dynamic_t *dynamic); -#endif // _di_f_utf_string_dynamic_adjust_ - -/** * Append the source string onto the destination.
* * @param source diff --git a/level_0/f_utf/c/utf/is.c b/level_0/f_utf/c/utf/is.c new file mode 100644 index 0000000..114dba0 --- /dev/null +++ b/level_0/f_utf/c/utf/is.c @@ -0,0 +1,963 @@ +#include "../utf.h" +#include "../private-utf.h" +#include "private-is_unassigned.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _di_f_utf_is_ + f_status_t f_utf_is(const f_string_t character) { + + return macro_f_utf_byte_width_is(*character); + } +#endif // _di_f_utf_is_ + +#ifndef _di_f_utf_is_alpha_ + f_status_t f_utf_is_alpha(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_alpha(character_utf); + } + + if (isalpha(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_alpha_ + +#ifndef _di_f_utf_is_alpha_digit_ + f_status_t f_utf_is_alpha_digit(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = 
private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_alpha_digit(character_utf); + } + + if (isalnum(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_alpha_digit_ + +#ifndef _di_f_utf_is_alpha_numeric_ + f_status_t f_utf_is_alpha_numeric(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_alpha_numeric(character_utf); + } + + if (isalnum(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_alpha_numeric_ + +#ifndef _di_f_utf_is_ascii_ + f_status_t f_utf_is_ascii(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return F_false; + } + + return F_true; + } +#endif // _di_f_utf_is_ascii_ + +#ifndef _di_f_utf_is_combining_ + f_status_t f_utf_is_combining(const f_string_t character, const f_array_length_t width_max) { + #ifndef 
_di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_combining(character_utf); + } + + // There are no ASCII combining characters. + return F_false; + } +#endif // _di_f_utf_is_combining_ + +#ifndef _di_f_utf_is_control_ + f_status_t f_utf_is_control(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_control(character_utf); + } + + return iscntrl(*character) ? F_true : F_false; // iscntrl() only guarantees a non-zero value, so map it onto the status codes the sibling functions return. + } +#endif // _di_f_utf_is_control_ + +#ifndef _di_f_utf_is_control_code_ + f_status_t f_utf_is_control_code(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if
(macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_control_code(character_utf); + } + + if (iscntrl(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_control_code_ + +#ifndef _di_f_utf_is_control_format_ + f_status_t f_utf_is_control_format(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_control_format(character_utf); + } + + // There are no ASCII control formats. 
+ return F_false; + } +#endif // _di_f_utf_is_control_format_ + +#ifndef _di_f_utf_is_control_picture_ + f_status_t f_utf_is_control_picture(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + if (macro_f_utf_byte_width_is(*character) != 3) { + return F_false; + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_control_picture(character_utf); + } + + // There are no ASCII control pictures. + return F_false; + } +#endif // _di_f_utf_is_control_picture_ + +#ifndef _di_f_utf_is_digit_ + f_status_t f_utf_is_digit(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_digit(character_utf); + } + + if (isdigit(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_digit_ + +#ifndef _di_f_utf_is_emoji_ + f_status_t 
f_utf_is_emoji(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_emoji(character_utf); + } + + if (isdigit(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_emoji_ + +#ifndef _di_f_utf_is_fragment_ + f_status_t f_utf_is_fragment(const f_string_t character) { + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_fragment_ + +#ifndef _di_f_utf_is_graph_ + f_status_t f_utf_is_graph(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + if (private_f_utf_character_is_control(character_utf)) { + return F_false; + } + + if (private_f_utf_character_is_whitespace(character_utf)) { + return F_false; + } + + // Zero-width characters 
are treated as a non-graph. + if (private_f_utf_character_is_zero_width(character_utf)) { + return F_false; + } + + return F_true; + } + + if (isgraph(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_graph_ + +#ifndef _di_f_utf_is_numeric_ + f_status_t f_utf_is_numeric(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_numeric(character_utf); + } + + if (isdigit(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_numeric_ + +#ifndef _di_f_utf_is_phonetic_ + f_status_t f_utf_is_phonetic(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_phonetic(character_utf); + } + + // There are no ASCII phonetic characters.
+ return F_false; + } +#endif // _di_f_utf_is_phonetic_ + +#ifndef _di_f_utf_is_private_ + f_status_t f_utf_is_private(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_private(character_utf); + } + + // There are no ASCII private characters. + return F_false; + } +#endif // _di_f_utf_is_private_ + +#ifndef _di_f_utf_is_punctuation_ + f_status_t f_utf_is_punctuation(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_punctuation(character_utf); + } + + // ASCII: '!' to '#'. + if (character[0] > 0x20 && character[0] < 0x24) { + return F_true; + } + + // ASCII: '%' to '*'. + if (character[0] > 0x24 && character[0] < 0x2b) { + return F_true; + } + + // ASCII: ',' to '/'. 
+ if (character[0] > 0x2b && character[0] < 0x30) { + return F_true; + } + + // ASCII: ':', ';', '?', or '@'. + if (character[0] == 0x3a || character[0] == 0x3b || character[0] == 0x3f || character[0] == 0x40) { + return F_true; + } + + // ASCII: '[' to ']'. + if (character[0] > 0x5a && character[0] < 0x5e) { + return F_true; + } + + // ASCII: '_', '{', or '}'. + if (character[0] == 0x5f || character[0] == 0x7b || character[0] == 0x7d) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_punctuation_ + +#ifndef _di_f_utf_is_symbol_ + f_status_t f_utf_is_symbol(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_symbol(character_utf); + } + + // ASCII: '$' or '+'. + if (character[0] == 0x24 || character[0] == 0x2b) { + return F_true; + } + + // ASCII: '<' to '>'. + if (character[0] > 0x3b && character[0] < 0x3f) { + return F_true; + } + + // ASCII: '^', '`', '|', or '~'.
+ if (character[0] == 0x5e || character[0] == 0x60 || character[0] == 0x7c || character[0] == 0x7e) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_symbol_ + +#ifndef _di_f_utf_is_surrogate_ + f_status_t f_utf_is_surrogate(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_surrogate(character_utf); + } + + // ASCII are never surrogate. + return F_false; + } +#endif // _di_f_utf_is_surrogate_ + +#ifndef _di_f_utf_is_unassigned_ + f_status_t f_utf_is_unassigned(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_unassigned(character_utf); + } + + // ASCII are never unassigned. 
+ return F_false; + } +#endif // _di_f_utf_is_unassigned_ + +#ifndef _di_f_utf_is_valid_ + f_status_t f_utf_is_valid(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_valid(character_utf); + } + + // ASCII are valid. + return F_true; + } +#endif // _di_f_utf_is_valid_ + +#ifndef _di_f_utf_is_whitespace_ + f_status_t f_utf_is_whitespace(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_whitespace(character_utf); + } + + if (isspace(*character)) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_whitespace_ + +#ifndef _di_f_utf_is_whitespace_modifier_ + f_status_t f_utf_is_whitespace_modifier(const f_string_t character, const f_array_length_t width_max) { + #ifndef 
_di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_whitespace_modifier(character_utf); + } + + // There are no ASCII whitespace modifiers. + return F_false; + } +#endif // _di_f_utf_is_whitespace_modifier_ + +#ifndef _di_f_utf_is_whitespace_other_ + f_status_t f_utf_is_whitespace_other(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_whitespace_other(character_utf); + } + + // There are no ASCII whitespace other. 
+ return F_false; + } +#endif // _di_f_utf_is_whitespace_other_ + +#ifndef _di_f_utf_is_wide_ + f_status_t f_utf_is_wide(const f_string_t character, const f_array_length_t width_max) { + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_wide(character_utf); + } + + // There are no wide ASCII characters. + return F_false; + } +#endif // _di_f_utf_is_wide_ + +#ifndef _di_f_utf_is_word_ + f_status_t f_utf_is_word(const f_string_t character, const f_array_length_t width_max, const bool strict) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_word(character_utf, strict); + } + + if (isalnum(*character) || *character == f_string_ascii_underscore_s.string[0]) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_word_ + +#ifndef _di_f_utf_is_word_dash_ + f_status_t f_utf_is_word_dash(const f_string_t character, const f_array_length_t width_max, const bool strict) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return 
F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_word_dash(character_utf, strict); + } + + if (isalnum(*character) || *character == f_string_ascii_underscore_s.string[0] || *character == f_string_ascii_minus_s.string[0]) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_word_dash_ + +#ifndef _di_f_utf_is_word_dash_plus_ + f_status_t f_utf_is_word_dash_plus(const f_string_t character, const f_array_length_t width_max, const bool strict) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_word_dash_plus(character_utf, strict); + } + + if (isalnum(*character) || *character == f_string_ascii_underscore_s.string[0] || *character == f_string_ascii_minus_s.string[0] || *character == f_string_ascii_plus_s.string[0]) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_word_dash_plus_ + +#ifndef _di_f_utf_is_zero_width_ + f_status_t 
f_utf_is_zero_width(const f_string_t character, const f_array_length_t width_max) { + #ifndef _di_level_0_parameter_checking_ + if (width_max < 1) return F_status_set_error(F_parameter); + #endif // _di_level_0_parameter_checking_ + + if (macro_f_utf_byte_width_is(*character)) { + if (macro_f_utf_byte_width_is(*character) > width_max) { + return F_status_set_error(F_failure); + } + + if (macro_f_utf_byte_width_is(*character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + f_utf_character_t character_utf = 0; + + { + const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf); + if (F_status_is_error(status)) return status; + } + + return private_f_utf_character_is_zero_width(character_utf); + } + + // These control characters are considered zero-width spaces. + if (*character >= 0x00 && *character <= 0x08) { + return F_true; + } + else if (*character >= 0x0c && *character <= 0x1f) { + return F_true; + } + else if (*character == 0x7f) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_is_zero_width_ + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/level_0/f_utf/c/utf/is.h b/level_0/f_utf/c/utf/is.h new file mode 100644 index 0000000..099bfe3 --- /dev/null +++ b/level_0/f_utf/c/utf/is.h @@ -0,0 +1,777 @@ +/** + * FLL - Level 0 + * + * Project: UTF + * API Version: 0.5 + * Licenses: lgplv2.1 + * + * Defines UTF-8 "is" functions. + * + * This is auto-included by utf.h and should not need to be explicitly included. + */ +#ifndef _F_utf_is_h +#define _F_utf_is_h + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Check to see if the entire byte block of the character is a non-ASCII UTF-8 character. + * + * This does not check the validity of the character, for that instead use f_utf_is_valid(). + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * + * @return + * F_true if a UTF-8 character. 
+ * F_false if not a UTF-8 character. + */ +#ifndef _di_f_utf_is_ + extern f_status_t f_utf_is(const f_string_t character); +#endif // _di_f_utf_is_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet character. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 alphabet character. + * F_false if not a UTF-8 alphabet character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalpha() + */ +#ifndef _di_f_utf_is_alpha_ + extern f_status_t f_utf_is_alpha(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_alpha_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet or digit character. + * + * Digit characters are decimal digits and letter numbers. + * + * This does not include number-like, such as 1/2 (½) or superscript 2 (²). + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if an ASCII or UTF-8 alphabet or digit character. + * F_false if not an ASCII or UTF-8 alphabet or digit character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment.
+ * + * @see isalnum() + */ +#ifndef _di_f_utf_is_alpha_digit_ + extern f_status_t f_utf_is_alpha_digit(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_alpha_digit_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet or numeric character. + * + * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²). + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if an ASCII or UTF-8 alphabet or numeric character. + * F_false if not an ASCII or UTF-8 alphabet or numeric character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + */ +#ifndef _di_f_utf_is_alpha_numeric_ + extern f_status_t f_utf_is_alpha_numeric(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_alpha_numeric_ + +/** + * Check to see if the entire byte block of the character is an ASCII character. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if an ASCII character. + * F_false if not an ASCII character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment.
+ */ +#ifndef _di_f_utf_is_ascii_ + extern f_status_t f_utf_is_ascii(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_ascii_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 combining character. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 combining character. + * F_false if not a UTF-8 combining character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_combining_ + extern f_status_t f_utf_is_combining(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_combining_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. + * + * This includes control code and control format characters. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 control character. + * F_false if not a UTF-8 control character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see iscntrl() + */ +#ifndef _di_f_utf_is_control_ + extern f_status_t f_utf_is_control(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_control_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 control code character. 
+ * + * Control Code characters are the traditional control characters, such as "\n" as well as some newer Unicode ones. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 control code character. + * F_false if not a UTF-8 control code character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_control_code_ + extern f_status_t f_utf_is_control_code(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_control_code_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 control format character. + * + * Control Format characters are special characters used for formatting. + * These are considered control characters. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 control format character. + * F_false if not a UTF-8 control format character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_control_format_ + extern f_status_t f_utf_is_control_format(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_control_format_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 control picture character. 
+ * + * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 control picture character. + * F_false if not a UTF-8 control picture character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_control_picture_ + extern f_status_t f_utf_is_control_picture(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_control_picture_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 digit character. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 digit character. + * F_false if not a UTF-8 digit character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isdigit() + */ +#ifndef _di_f_utf_is_digit_ + extern f_status_t f_utf_is_digit(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_digit_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 emoji character. + * + * @todo Incomplete, UTF-8 codes not yet checked! + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. 
+ * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 emoji character. + * F_false if not a UTF-8 emoji character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_emoji_ + extern f_status_t f_utf_is_emoji(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_emoji_ + +/** + * Check to see if the entire byte block of the character is a 1-width UTF-8 character fragment. + * + * Characters whose width is 1-byte are invalid. + * However, the character could have been cut-off, so whether or not this is actually valid should be determined by the caller. + * + * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid(). + * + * According to rfc3629, the valid octet sequences for UTF-8 are: + * UTF8-octets = *( UTF8-char ) + * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 + * UTF8-1 = %x00-7F + * UTF8-2 = %xC2-DF UTF8-tail + * UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + * %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + * %xF4 %x80-8F 2( UTF8-tail ) + * UTF8-tail = %x80-BF + * + * @param character + * The character to validate. + * Only the first byte is examined, so at least one byte must be available. + * + * @return + * F_true if a 1-width UTF-8 character fragment. + * F_false if not a 1-width UTF-8 character fragment. + */ +#ifndef _di_f_utf_is_fragment_ + extern f_status_t f_utf_is_fragment(const f_string_t character); +#endif // _di_f_utf_is_fragment_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 printable character. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max.
+ * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 graph. + * F_false if not a UTF-8 graph. + * + * F_maybe (with error bit) if this could be a graph but width is not long enough. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isgraph() + */ +#ifndef _di_f_utf_is_graph_ + extern f_status_t f_utf_is_graph(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_graph_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 numeric character. + * + * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²). + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 numeric character. + * F_false if not a UTF-8 numeric character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isdigit() + */ +#ifndef _di_f_utf_is_numeric_ + extern f_status_t f_utf_is_numeric(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_numeric_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 phonetic character. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 phonetic character. 
+ * F_false if not a UTF-8 phonetic character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_phonetic_ + extern f_status_t f_utf_is_phonetic(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_phonetic_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 private character. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 private character. + * F_false if not a UTF-8 private character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_private_ + extern f_status_t f_utf_is_private(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_private_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 punctuation character. + * + * @todo Incomplete, UTF-8 codes not yet checked! + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 punctuation character. + * F_false if not a UTF-8 punctuation character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment.
+ */ +#ifndef _di_f_utf_is_punctuation_ + extern f_status_t f_utf_is_punctuation(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_punctuation_ + +/** + * Check to see if the entire byte block of the character is a surrogate UTF-8 character. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 surrogate character. + * F_false if not a UTF-8 surrogate character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_surrogate_ + extern f_status_t f_utf_is_surrogate(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_surrogate_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 symbol character. + * + * @todo Incomplete, UTF-8 codes not yet checked! + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 symbol character. + * F_false if not a UTF-8 symbol character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_symbol_ + extern f_status_t f_utf_is_symbol(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_symbol_ + +/** + * Check to see if the entire byte block of the character is an unassigned UTF-8 character. + * + * @param character + * The character to validate.
+ * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if an unassigned UTF-8 character. + * F_false if not an unassigned UTF-8 character. + * + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_unassigned_ + extern f_status_t f_utf_is_unassigned(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_unassigned_ + +/** + * Check to see if the entire byte block of the character is a valid (well-formed) UTF-8 character. + * + * This validates whether the UTF-8 character is a valid UTF-8 character. + * To skip this validation, use f_utf_is(). + * + * Valid ASCII character codes are considered valid by this function. + * + * Codes U+FDD0 to U+FDEF and any character ending in FFFE or FFFF are non-characters, and are therefore invalid. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a valid UTF-8 character or is an ASCII character. + * F_false if not a valid UTF-8 character. + * + * F_failure (with error bit) if width_max is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_valid_ + extern f_status_t f_utf_is_valid(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_valid_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space character.
+ * + * Non-printing or zero-width characters are not considered whitespace. + * This does include line separators like '\n'. + * This does not include phonetic spaces, like whitespace modifiers. + * This does not include non-true whitespace characters, such as Ogham Space Mark ( ). + * + * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. + * However, because they are not renderred as whitespace, they are technically not white space. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 whitespace. + * F_false if not a UTF-8 whitespace. + * + * F_maybe (with error bit) if this could be a whitespace but width is not long enough. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isspace() + */ +#ifndef _di_f_utf_is_whitespace_ + extern f_status_t f_utf_is_whitespace(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_whitespace_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 whitespace modifier character. + * + * These are phonetic spaces. + * + * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. + * Therefore, these are valid spaces in the technical sense, even if they are not visibly whitespace. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 whitespace. + * F_false if not a UTF-8 whitespace. 
+ * + * F_maybe (with error bit) if this could be a whitespace but width is not long enough. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_whitespace_modifier_ + extern f_status_t f_utf_is_whitespace_modifier(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_whitespace_modifier_ + +/** + * Check to see if the entire byte block of the character is an other type of UTF-8 space character. + * + * This is a list of whitespace that are not actual whitespace (because they are graph characters) but are considered whitespace, such as Ogham Space Mark ( ). + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 whitespace. + * F_false if not a UTF-8 whitespace. + * + * F_maybe (with error bit) if this could be a whitespace but width is not long enough. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_whitespace_other_ + extern f_status_t f_utf_is_whitespace_other(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_whitespace_other_ + +/** + * Get whether or not the UTF-8 character is a wide character on display. + * + * This is not the wide as in width in bytes that the codepoint takes up in UTF-8. + * Instead, this is the width in characters on the screen the character takes up. + * When "wide" characters that take up either 2 characters on render. 
+ * When "narrow" characters that take up either 1 character on render. + * + * @param character + * The (UTF-8) character. + * @param width_max + * The max width available for representing the UTF-8 character. + * There must be enough space in the character buffer to handle the Unicode width. + * It is recommended to always have 4 characters (4 uint8_t) of space available in character. + * This is the width in bytes the codepoint takes up in UTF-8. + * + * @return + * F_none on success. + * + * F_failure (with error bit) if width_max is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_wide_ + extern f_status_t f_utf_is_wide(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_wide_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 word character. + * + * A word character is alpha-digit or an underscore '_'. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. + * + * @return + * F_true if a UTF-8 word character. + * F_false if not a UTF-8 word character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
+ * + * @see isalnum() + */ +#ifndef _di_f_utf_is_word_ + extern f_status_t f_utf_is_word(const f_string_t character, const f_array_length_t width_max, const bool strict); +#endif // _di_f_utf_is_word_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 word or dash character. + * + * A word dash character is alpha-digit, an underscore '_' or a dash '-'. + * + * Unicode appears to refer to dashes that connect words as a hyphen. + * Therefore, only these hyphens are considered dashes for the purposes of this function. + * All other dash-like Unicode characters are not considered a dash here. + * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. + * + * @return + * F_true if a UTF-8 word or dash character. + * F_false if not a UTF-8 word or dash character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + */ +#ifndef _di_f_utf_is_word_dash_ + extern f_status_t f_utf_is_word_dash(const f_string_t character, const f_array_length_t width_max, const bool strict); +#endif // _di_f_utf_is_word_dash_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character. + * + * A word dash plus character is alpha-digit, an underscore '_', a dash '-', or a plus '+'. 
+ * + * Unicode appears to refer to dashes that connect words as a hyphen. + * Therefore, only these hyphens are considered dashes for the purposes of this function. + * All other dash-like Unicode characters are not considered a dash here. + * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". + * + * This does not include zero-width punctuation, such as "invisible plus" (U+2064) (even in strict mode). + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. + * + * @return + * F_true if a UTF-8 word, dash, or plus character. + * F_false if not a UTF-8 word, dash, or plus character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + */ +#ifndef _di_f_utf_is_word_dash_plus_ + extern f_status_t f_utf_is_word_dash_plus(const f_string_t character, const f_array_length_t width_max, const bool strict); +#endif // _di_f_utf_is_word_dash_plus_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character. + * + * Only characters that do not print, which are generally called zero-width. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by width_max. + * @param width_max + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * F_true if a UTF-8 whitespace.
+ * F_false if not a UTF-8 whitespace. + * + * F_maybe (with error bit) if this could be a whitespace but width is not long enough. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_is_zero_width_ + extern f_status_t f_utf_is_zero_width(const f_string_t character, const f_array_length_t width_max); +#endif // _di_f_utf_is_zero_width_ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // _F_utf_is_h diff --git a/level_0/f_utf/c/utf/is_character.c b/level_0/f_utf/c/utf/is_character.c new file mode 100644 index 0000000..d7f9bc1 --- /dev/null +++ b/level_0/f_utf/c/utf/is_character.c @@ -0,0 +1,567 @@ +#include "../utf.h" +#include "../private-utf.h" +#include "private-is_unassigned.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _di_f_utf_character_is_ + f_status_t f_utf_character_is(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_utf_fragment; + } + + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_ + +#ifndef _di_f_utf_character_is_alpha_ + f_status_t f_utf_character_is_alpha(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_alpha(character); + } + + if (isalpha(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_alpha_ + +#ifndef _di_f_utf_character_is_alpha_digit_ + f_status_t f_utf_character_is_alpha_digit(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return 
F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_alpha_digit(character); + } + + if (isalnum(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_alpha_digit_ + +#ifndef _di_f_utf_character_is_alpha_numeric_ + f_status_t f_utf_character_is_alpha_numeric(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_alpha_numeric(character); + } + + if (isalnum(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_alpha_numeric_ + +#ifndef _di_f_utf_character_is_ascii_ + f_status_t f_utf_character_is_ascii(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + return F_false; + } + + return F_true; + } +#endif // _di_f_utf_character_is_ascii_ + +#ifndef _di_f_utf_character_is_combining_ + f_status_t f_utf_character_is_combining(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_combining(character); + } + + // There are no combining characters in ASCII. 
+ return F_false; + } +#endif // _di_f_utf_character_is_combining_ + +#ifndef _di_f_utf_character_is_control_ + f_status_t f_utf_character_is_control(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_control(character); + } + + if (iscntrl(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_control_ + +#ifndef _di_f_utf_character_is_control_code_ + f_status_t f_utf_character_is_control_code(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_control_code(character); + } + + if (iscntrl(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_control_code_ + +#ifndef _di_f_utf_character_is_control_format_ + f_status_t f_utf_character_is_control_format(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_control_format(character); + } + + // There are no control format characters in ASCII. + return F_false; + } +#endif // _di_f_utf_character_is_control_format_ + +#ifndef _di_f_utf_character_is_control_picture_ + f_status_t f_utf_character_is_control_picture(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_control_picture(character); + } + + // There are no control picture characters in ASCII.
+ return F_false; + } +#endif // _di_f_utf_character_is_control_picture_ + +#ifndef _di_f_utf_character_is_digit_ + f_status_t f_utf_character_is_digit(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_digit(character); + } + + if (isdigit(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_digit_ + +#ifndef _di_f_utf_character_is_emoji_ + f_status_t f_utf_character_is_emoji(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_emoji(character); + } + + if (isdigit(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_emoji_ + +#ifndef _di_f_utf_character_is_fragment_ + f_status_t f_utf_character_is_fragment(const f_utf_character_t character) { + + return macro_f_utf_character_t_width_is(character) == 1; + } +#endif // _di_f_utf_character_is_fragment_ + +#ifndef _di_f_utf_character_is_graph_ + f_status_t f_utf_character_is_graph(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + if (private_f_utf_character_is_control(character)) { + return F_false; + } + + if (private_f_utf_character_is_whitespace(character)) { + return F_false; + } + + if (private_f_utf_character_is_zero_width(character)) { + return F_false; + } + + return F_true; + } + + if (isgraph(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_graph_ + +#ifndef 
_di_f_utf_character_is_numeric_ + f_status_t f_utf_character_is_numeric(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_numeric(character); + } + + if (isdigit(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_numeric_ + +#ifndef _di_f_utf_character_is_phonetic_ + f_status_t f_utf_character_is_phonetic(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_phonetic(character); + } + + // There are no ASCII phonetic characters. + return F_false; + } +#endif // _di_f_utf_character_is_phonetic_ + +#ifndef _di_f_utf_character_is_private_ + f_status_t f_utf_character_is_private(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_private(character); + } + + // There are no ASCII private characters. + return F_false; + } +#endif // _di_f_utf_character_is_private_ + +#ifndef _di_f_utf_character_is_punctuation_ + f_status_t f_utf_character_is_punctuation(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_punctuation(character); + } + + // ASCII: '!' to '#'. + if (character > 0x20000000 && character < 0x24000000) { + return F_true; + } + + // ASCII: '%' to '*'. + if (character > 0x24000000 && character < 0x2b000000) { + return F_true; + } + + // ASCII: ',' to '/'.
+ if (character > 0x2b000000 && character < 0x30000000) { + return F_true; + } + + // ASCII: ':', ';', '?', or '@'. + if (character == 0x3a000000 || character == 0x3b000000 || character == 0x3f000000 || character == 0x40000000) { + return F_true; + } + + // ASCII: '[' to ']'. + if (character > 0x5a000000 && character < 0x5e000000) { + return F_true; + } + + // ASCII: '_', '{', or '}'. + if (character == 0x5f000000 || character == 0x7b000000 || character == 0x7d000000) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_punctuation_ + +#ifndef _di_f_utf_character_is_symbol_ + f_status_t f_utf_character_is_symbol(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_symbol(character); + } + + // ASCII: '$' or '+'. + if (character == 0x24000000 || character == 0x2b000000) { + return F_true; + } + + // ASCII: '<' to '>'. + if (character > 0x3b000000 && character < 0x3f000000) { + return F_true; + } + + // ASCII: '^', '`', '|', or '~'.
+ if (character == 0x5e000000 || character == 0x60000000 || character == 0x7c000000 || character == 0x7e000000) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_symbol_ + +#ifndef _di_f_utf_character_is_unassigned_ + f_status_t f_utf_character_is_unassigned(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_unassigned(character); + } + + return F_false; + } +#endif // _di_f_utf_character_is_unassigned_ + +#ifndef _di_f_utf_character_is_valid_ + f_status_t f_utf_character_is_valid(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_valid(character); + } + + return F_true; + } +#endif // _di_f_utf_character_is_valid_ + +#ifndef _di_f_utf_character_is_whitespace_ + f_status_t f_utf_character_is_whitespace(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_whitespace(character); + } + + if (isspace(macro_f_utf_character_t_to_char_1(character))) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_whitespace_ + +#ifndef _di_f_utf_character_is_whitespace_modifier_ + f_status_t f_utf_character_is_whitespace_modifier(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_whitespace_modifier(character); + } + + // There are no ASCII whitespace modifiers. 
+ return F_false; + } +#endif // _di_f_utf_character_is_whitespace_modifier_ + +#ifndef _di_f_utf_character_is_whitespace_other_ + f_status_t f_utf_character_is_whitespace_other(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_whitespace_other(character); + } + + // There are no ASCII whitespace other. + return F_false; + } +#endif // _di_f_utf_character_is_whitespace_other_ + +#ifndef _di_f_utf_character_is_wide_ + f_status_t f_utf_character_is_wide(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_wide(character); + } + + // There are no wide ASCII characters. + return F_false; + } +#endif // _di_f_utf_character_is_wide_ + +#ifndef _di_f_utf_character_is_word_ + f_status_t f_utf_character_is_word(const f_utf_character_t character, const bool strict) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_word(character, strict); + } + + if (isalnum(macro_f_utf_character_t_to_char_1(character)) || macro_f_utf_character_t_to_char_1(character) == f_string_ascii_underscore_s.string[0]) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_word_ + +#ifndef _di_f_utf_character_is_word_dash_ + f_status_t f_utf_character_is_word_dash(const f_utf_character_t character, const bool strict) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_word_dash(character, strict); + } + + if
(isalnum(macro_f_utf_character_t_to_char_1(character)) || character == f_string_ascii_underscore_s.string[0] || character == f_string_ascii_minus_s.string[0]) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_word_dash_ + +#ifndef _di_f_utf_character_is_word_dash_plus_ + f_status_t f_utf_character_is_word_dash_plus(const f_utf_character_t character, const bool strict) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_word_dash_plus(character, strict); + } + + if (isalnum(macro_f_utf_character_t_to_char_1(character)) || character == f_string_ascii_underscore_s.string[0] || character == f_string_ascii_minus_s.string[0] || character == f_string_ascii_plus_s.string[0]) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_word_dash_plus_ + +#ifndef _di_f_utf_character_is_zero_width_ + f_status_t f_utf_character_is_zero_width(const f_utf_character_t character) { + + if (macro_f_utf_character_t_width_is(character)) { + if (macro_f_utf_character_t_width_is(character) == 1) { + return F_status_set_error(F_utf_fragment); + } + + return private_f_utf_character_is_zero_width(character); + } + + const uint8_t ascii = macro_f_utf_character_t_to_char_1(character); + + // These control characters are considered zero-width spaces. 
+ if (ascii >= 0x00 && ascii <= 0x08) { + return F_true; + } + else if (ascii == 0x0a) { + return F_true; + } + else if (ascii >= 0x0c && ascii <= 0x1f) { + return F_true; + } + else if (ascii == 0x7f) { + return F_true; + } + + return F_false; + } +#endif // _di_f_utf_character_is_zero_width_ + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/level_0/f_utf/c/utf/is_character.h b/level_0/f_utf/c/utf/is_character.h new file mode 100644 index 0000000..a56870d --- /dev/null +++ b/level_0/f_utf/c/utf/is_character.h @@ -0,0 +1,660 @@ +/** + * FLL - Level 0 + * + * Project: UTF + * API Version: 0.5 + * Licenses: lgplv2.1 + * + * Defines UTF-8 "character_is" functions. + * + * This is auto-included by utf.h and should not need to be explicitly included. + */ +#ifndef _F_utf_is_character_h +#define _F_utf_is_character_h + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Check to see if the entire byte block of the character is a non-ASCII UTF-8 character. + * + * This does not validate if the UTF-8 character is a valid UTF-8 character, for that use f_utf_character_is_valid(). + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 character. + * F_false if not a UTF-8 character. + * F_utf_fragment if this is a UTF-8 character fragment. + * + * @see f_utf_character_is_valid() + */ +#ifndef _di_f_utf_character_is_ + extern f_status_t f_utf_character_is(const f_utf_character_t character); +#endif // _di_f_utf_character_is_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabet character. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 alphabet character. + * F_false if not a UTF-8 alphabet character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. 
+ * + * @see isalpha() + */ +#ifndef _di_f_utf_character_is_alpha_ + extern f_status_t f_utf_character_is_alpha(const f_utf_character_t character); +#endif // _di_f_utf_character_is_alpha_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabetic or digit character. + * + * Digit characters are decimal digits and letter numbers. + * + * This does not include number-like, such as 1/2 (½) or superscript 2 (²). + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 alpha-digit character. + * F_false if not a UTF-8 alpha-digit character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + */ +#ifndef _di_f_utf_character_is_alpha_digit_ + extern f_status_t f_utf_character_is_alpha_digit(const f_utf_character_t character); +#endif // _di_f_utf_character_is_alpha_digit_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 alphabetic or numeric character. + * + * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²). + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 alpha-numeric character. + * F_false if not a UTF-8 alpha-numeric character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + */ +#ifndef _di_f_utf_character_is_alpha_numeric_ + extern f_status_t f_utf_character_is_alpha_numeric(const f_utf_character_t character); +#endif // _di_f_utf_character_is_alpha_numeric_ + +/** + * Check to see if the entire byte block of the character is an ASCII character. + * + * This does not validate whether the UTF-8 character is valid or not. + * + * @param character + * The character to validate. 
+ * + * @return + * F_true if an ASCII character. + * F_false if not an ASCII character. + */ +#ifndef _di_f_utf_character_is_ascii_ + extern f_status_t f_utf_character_is_ascii(const f_utf_character_t character); +#endif // _di_f_utf_character_is_ascii_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 combining character. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 combining character. + * F_false if not a UTF-8 combining character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_combining_ + extern f_status_t f_utf_character_is_combining(const f_utf_character_t character); +#endif // _di_f_utf_character_is_combining_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 control character. + * + * This includes control code and control format characters. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 control character. + * F_false if not a UTF-8 control character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see iscntrl() + */ +#ifndef _di_f_utf_character_is_control_ + extern f_status_t f_utf_character_is_control(const f_utf_character_t character); +#endif // _di_f_utf_character_is_control_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 control code character. + * + * Control Code characters are the traditional control characters, such as "\n" as well as some newer Unicode ones. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 control code character. + * F_false if not a UTF-8 control code character. 
+ * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see iscntrl() + */ +#ifndef _di_f_utf_character_is_control_code_ + extern f_status_t f_utf_character_is_control_code(const f_utf_character_t character); +#endif // _di_f_utf_character_is_control_code_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 control format character. + * + * Control Format characters are special characters used for formatting. + * These are considered control characters. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 control format character. + * F_false if not a UTF-8 control format character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_control_format_ + extern f_status_t f_utf_character_is_control_format(const f_utf_character_t character); +#endif // _di_f_utf_character_is_control_format_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 control picture character. + * + * Control Picture characters are placeholders for special ASCII characters and therefore there are no ASCII Control Picture characters. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 control picture character. + * F_false if not a UTF-8 control picture character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_control_picture_ + extern f_status_t f_utf_character_is_control_picture(const f_utf_character_t character); +#endif // _di_f_utf_character_is_control_picture_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 digit character. 
+ * + * Digit characters are decimal digits and letter numbers. + * + * This does not include number-like, such as 1/2 (½) or superscript 2 (²). + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 digit character. + * F_false if not a UTF-8 digit character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isdigit() + */ +#ifndef _di_f_utf_character_is_digit_ + extern f_status_t f_utf_character_is_digit(const f_utf_character_t character); +#endif // _di_f_utf_character_is_digit_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 emoji character. + * + * @todo Incomplete, UTF-8 codes not yet checked! + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 emoji character. + * F_false if not a UTF-8 emoji character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_emoji_ + extern f_status_t f_utf_character_is_emoji(const f_utf_character_t character); +#endif // _di_f_utf_character_is_emoji_ + +/** + * Check to see if the entire byte block of the character is a 1-width UTF-8 character fragment. + * + * Characters whose width is 1-byte are invalid. + * However, the character could have been cut-off, so whether or not this is actually valid should be determined by the caller. + * + * For normal validation functions, try using f_utf_character_is() or f_utf_character_is_valid(). 
+ * + * According to rfc3629, the valid octet sequences for UTF-8 are: + * UTF8-octets = *( UTF8-char ) + * UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 + * UTF8-1 = %x00-7F + * UTF8-2 = %xC2-DF UTF8-tail + * UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + * %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + * UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + * %xF4 %x80-8F 2( UTF8-tail ) + * UTF8-tail = %x80-BF + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 character. + * F_false if not a UTF-8 character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see f_utf_character_is() + * @see f_utf_character_is_valid() + */ +#ifndef _di_f_utf_character_is_fragment_ + extern f_status_t f_utf_character_is_fragment(const f_utf_character_t character); +#endif // _di_f_utf_character_is_fragment_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 printable character. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 graph. + * F_false if not a UTF-8 graph. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isgraph() + */ +#ifndef _di_f_utf_character_is_graph_ + extern f_status_t f_utf_character_is_graph(const f_utf_character_t character); +#endif // _di_f_utf_character_is_graph_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 numeric character. + * + * Numeric characters are decimal digits, letter numbers, and number-like, such as 1/2 (½) or superscript 2 (²). + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 numeric character. + * F_false if not a UTF-8 numeric character.
+ * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isdigit() + */ +#ifndef _di_f_utf_character_is_numeric_ + extern f_status_t f_utf_character_is_numeric(const f_utf_character_t character); +#endif // _di_f_utf_character_is_numeric_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 phonetic character. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 phonetic character. + * F_false if not a UTF-8 phonetic character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_phonetic_ + extern f_status_t f_utf_character_is_phonetic(const f_utf_character_t character); +#endif // _di_f_utf_character_is_phonetic_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 private character. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 private character. + * F_false if not a UTF-8 private character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_private_ + extern f_status_t f_utf_character_is_private(const f_utf_character_t character); +#endif // _di_f_utf_character_is_private_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 punctuation character. + * + * @todo Incomplete, UTF-8 codes not yet checked! + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 punctuation character. + * F_false if not a UTF-8 punctuation character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. 
+ * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_punctuation_ + extern f_status_t f_utf_character_is_punctuation(const f_utf_character_t character); +#endif // _di_f_utf_character_is_punctuation_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 symbol character. + * + * @todo Incomplete, UTF-8 codes not yet checked! + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 symbol character. + * F_false if not a UTF-8 symbol character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_symbol_ + extern f_status_t f_utf_character_is_symbol(const f_utf_character_t character); +#endif // _di_f_utf_character_is_symbol_ + +/** + * Check to see if the entire byte block of the character is an unassigned (well-formed) UTF-8 character. + * + * The Surrogates and Private Use are not considered unassigned. + * + * This does check whether the UTF-8 character is an unassigned UTF-8 character. + * To not do this, use f_utf_character_is(). + * + * @param character + * The character to check. + * + * @return + * F_true if a UTF-8 unassigned character. + * F_false if not a UTF-8 unassigned character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see f_utf_character_is() + * @see f_utf_character_is_fragment() + */ +#ifndef _di_f_utf_character_is_unassigned_ + extern f_status_t f_utf_character_is_unassigned(const f_utf_character_t character); +#endif // _di_f_utf_character_is_unassigned_ + +/** + * Check to see if the entire byte block of the character is a valid (well-formed) UTF-8 character. + * + * This does validate if the UTF-8 character is a valid UTF-8 character.
+ * To not do this, use f_utf_character_is(). + * + * ASCII character codes are considered valid by this function. + * + * Codes U+FDD0 to U+FDEF and any character ending in FFFE or FFFF are non-characters, and are therefore invalid. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 character. + * F_false if not a UTF-8 character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see f_utf_character_is() + * @see f_utf_character_is_fragment() + */ +#ifndef _di_f_utf_character_is_valid_ + extern f_status_t f_utf_character_is_valid(const f_utf_character_t character); +#endif // _di_f_utf_character_is_valid_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 general space character. + * + * Non-printing or zero-width characters are not considered whitespace. + * This does include line separators like '\n'. + * This does not include phonetic spaces, like whitespace modifiers. + * This does not include non-true whitespace characters, such as Ogham Space Mark ( ). + * + * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. + * However, because they are not rendered as whitespace, they are technically not whitespace. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 whitespace. + * F_false if not a UTF-8 whitespace. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isspace() + */ +#ifndef _di_f_utf_character_is_whitespace_ + extern f_status_t f_utf_character_is_whitespace(const f_utf_character_t character); +#endif // _di_f_utf_character_is_whitespace_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 whitespace modifier character.
+ * + * These are phonetic spaces. + * + * Phonetic spaces are whitespaces with additional phonetic meaning associated with them. + * Therefore, these are valid spaces in the technical sense, even if they are not visibly whitespace. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 modifier character. + * F_false if not a UTF-8 modifier character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_whitespace_modifier_ + extern f_status_t f_utf_character_is_whitespace_modifier(const f_utf_character_t character); +#endif // _di_f_utf_character_is_whitespace_modifier_ + +/** + * Check to see if the entire byte block of the character is another type of UTF-8 space character. + * + * These are characters that are not actual whitespace (because they are graph characters) but are considered whitespace, such as Ogham Space Mark ( ). + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 (other) whitespace. + * F_false if not a UTF-8 (other) whitespace. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isspace() + */ +#ifndef _di_f_utf_character_is_whitespace_other_ + extern f_status_t f_utf_character_is_whitespace_other(const f_utf_character_t character); +#endif // _di_f_utf_character_is_whitespace_other_ + +/** + * Get whether or not the UTF-8 character is a wide character on display. + * + * This is not "wide" as in the width in bytes that the codepoint takes up in UTF-8. + * Instead, this is the width in characters on the screen the character takes up. + * A "wide" character takes up 2 characters on render. + * A "narrow" character takes up 1 character on render.
+ * + * @param character + * The (UTF-8) character. + * + * @return + * F_true if a wide character, F_false otherwise (ASCII characters are never wide). + * + * F_failure (with error bit) if width is not long enough to convert. + * F_parameter (with error bit) if a parameter is invalid. + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_wide_ + extern f_status_t f_utf_character_is_wide(const f_utf_character_t character); +#endif // _di_f_utf_character_is_wide_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 word character. + * + * A word character is alpha-numeric or an underscore '_'. + * + * @param character + * The character to validate. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. + * + * @return + * F_true if a UTF-8 word character. + * F_false if not a UTF-8 word character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + */ +#ifndef _di_f_utf_character_is_word_ + extern f_status_t f_utf_character_is_word(const f_utf_character_t character, const bool strict); +#endif // _di_f_utf_character_is_word_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 word or dash character. + * + * A word dash character is alpha-numeric, an underscore '_' or a dash '-'. + * + * Unicode appears to refer to dashes that connect words as a hyphen. + * Therefore, only these hyphens are considered dashes for the purposes of this function. + * All other dash-like Unicode characters are not considered a dash here.
+ * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". + * + * @param character + * The character to validate. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. + * + * @return + * F_true if a UTF-8 word or dash character. + * F_false if not a UTF-8 word or dash character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + */ +#ifndef _di_f_utf_character_is_word_dash_ + extern f_status_t f_utf_character_is_word_dash(const f_utf_character_t character, const bool strict); +#endif // _di_f_utf_character_is_word_dash_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character. + * + * A word dash plus character is alpha-digit, an underscore '_', a dash '-', or a plus '+'. + * + * Unicode appears to refer to dashes that connect words as a hyphen. + * Therefore, only these hyphens are considered dashes for the purposes of this function. + * All other dash-like Unicode characters are not considered a dash here. + * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". + * + * This does not include zero-width punctuation, such as "invisible plus" (U+2064) (even in strict mode). + * + * @param character + * The character to validate. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. + * + * @return + * F_true if a UTF-8 word, dash, or plus character.
+ * F_false if not a UTF-8 word, dash, or plus character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + * + * @see isalnum() + */ +#ifndef _di_f_utf_character_is_word_dash_plus_ + extern f_status_t f_utf_character_is_word_dash_plus(const f_utf_character_t character, const bool strict); +#endif // _di_f_utf_character_is_word_dash_plus_ + +/** + * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character. + * + * This only matches characters that do not print, which are generally called zero-width. + * + * @param character + * The character to validate. + * + * @return + * F_true if a UTF-8 non-printing or zero-width character. + * F_false if not a UTF-8 non-printing or zero-width character. + * + * F_utf (with error bit) if unicode is an invalid Unicode character. + * F_utf_fragment (with error bit) if character is an incomplete UTF-8 fragment. + */ +#ifndef _di_f_utf_character_is_zero_width_ + extern f_status_t f_utf_character_is_zero_width(const f_utf_character_t character); +#endif // _di_f_utf_character_is_zero_width_ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // _F_utf_is_character_h diff --git a/level_0/f_utf/data/build/settings b/level_0/f_utf/data/build/settings index 465b699..26c3e41 100644 --- a/level_0/f_utf/data/build/settings +++ b/level_0/f_utf/data/build/settings @@ -31,7 +31,7 @@ build_objects_library_static build_objects_program build_objects_program_shared build_objects_program_static -build_sources_library utf.c private-utf.c utf/common.c utf/dynamic.c utf/map.c utf/private-is_unassigned.c utf/private-string.c utf/string.c utf/triple.c +build_sources_library utf.c private-utf.c utf/common.c utf/convert.c utf/dynamic.c utf/is.c utf/is_character.c utf/map.c utf/private-is_unassigned.c utf/private-string.c utf/string.c utf/triple.c build_sources_library_shared build_sources_library_static
build_sources_object @@ -40,7 +40,7 @@ build_sources_object_static build_sources_program build_sources_program_shared build_sources_program_static -build_sources_headers utf.h utf/common.h utf/dynamic.h utf/map.h utf/string.h utf/triple.h +build_sources_headers utf.h utf/common.h utf/convert.h utf/dynamic.h utf/is.h utf/is_character.h utf/map.h utf/string.h utf/triple.h build_sources_headers_shared build_sources_headers_static build_sources_script -- 1.8.3.1