From: Kevin Day Date: Sun, 1 Sep 2019 07:13:01 +0000 (-0500) Subject: Update: implement utf strings, ensure endianess, and add isgraph()/isspace() methods... X-Git-Tag: 0.5.0~461 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=e1fc481c707a2b87b6205f4ea5228e148ec7521a;p=fll Update: implement utf strings, ensure endianess, and add isgraph()/isspace() methods to UTF-8 equivalents Expand the UTF-8 character type (a 4-byte wide character represented as a big-endian 32-bit integer) to work like f_string and f_dynamic_string. Provide all similar functionality. I have decided that the isgraph(), isspace(), etc. functions for UTF-8 should also call the ASCII equivalents. Update all related code. Use memcmp() and memcpy() for comparing the UTF-8 character class (4-byte integer) to the UTF-8 char strings (multiple 1-byte chars). When doing this, make sure to do so with the proper endianness. Add missing f_utf_character_to_char() function. Wrap some of the macro parameters in parentheses for safety reasons. Add f_utf_is_big_endian() and document its use. Provide custom EOL, EOS, and placeholder defines for UTF characters (4-byte integers). 
--- diff --git a/build/level_1/settings b/build/level_1/settings index dae68c9..ccb2097 100644 --- a/build/level_1/settings +++ b/build/level_1/settings @@ -11,9 +11,9 @@ build_compiler gcc build_linker ar build_libraries -lc build_libraries_fll -lfll_0 -build_sources_library level_1/colors.c level_1/console.c level_1/directory.c level_1/status.c level_1/file.c level_1/fss.c level_1/fss_basic.c level_1/fss_basic_list.c level_1/fss_extended.c level_1/program.c level_1/serialized.c level_1/strings.c +build_sources_library level_1/colors.c level_1/console.c level_1/directory.c level_1/status.c level_1/file.c level_1/fss.c level_1/fss_basic.c level_1/fss_basic_list.c level_1/fss_extended.c level_1/program.c level_1/serialized.c level_1/strings.c level_1/utf.c build_sources_program -build_sources_headers level_1/colors.h level_1/console.h level_1/directory.h level_1/status.h level_1/file.h level_1/fss.h level_1/fss_basic.h level_1/fss_basic_list.h level_1/fss_status.h level_1/fss_extended.h level_1/fss_macro.h level_1/program.h level_1/serialized.h level_1/strings.h +build_sources_headers level_1/colors.h level_1/console.h level_1/directory.h level_1/status.h level_1/file.h level_1/fss.h level_1/fss_basic.h level_1/fss_basic_list.h level_1/fss_status.h level_1/fss_extended.h level_1/fss_macro.h level_1/program.h level_1/serialized.h level_1/strings.h level_1/utf.h build_shared yes build_static yes diff --git a/build/monolithic/settings b/build/monolithic/settings index 8adf823..043100c 100644 --- a/build/monolithic/settings +++ b/build/monolithic/settings @@ -11,9 +11,9 @@ build_compiler gcc build_linker ar build_libraries -lc build_libraries_fll -build_sources_library level_0/console.c level_0/conversion.c level_0/file.c level_0/memory.c level_0/pipe.c level_0/print.c level_0/utf.c level_1/colors.c level_1/console.c level_1/directory.c level_1/status.c level_1/file.c level_1/fss.c level_1/fss_basic.c level_1/fss_basic_list.c level_1/fss_extended.c level_1/program.c 
level_1/serialized.c level_1/strings.c level_2/colors.c level_2/execute.c level_2/status.c +build_sources_library level_0/console.c level_0/conversion.c level_0/file.c level_0/memory.c level_0/pipe.c level_0/print.c level_0/utf.c level_1/colors.c level_1/console.c level_1/directory.c level_1/status.c level_1/file.c level_1/fss.c level_1/fss_basic.c level_1/fss_basic_list.c level_1/fss_extended.c level_1/program.c level_1/serialized.c level_1/strings.c level_1/utf.c level_2/colors.c level_2/execute.c level_2/status.c build_sources_program -build_sources_headers level_0/colors.h level_0/console.h level_0/conversion.h level_0/status.h level_0/file.h level_0/fss.h level_0/memory.h level_0/fll_paths.h level_0/filesystem_paths.h level_0/pipe.h level_0/print.h level_0/serialized.h level_0/strings.h level_0/types.h level_0/types_array.h level_0/utf.h level_1/colors.h level_1/console.h level_1/directory.h level_1/status.h level_1/file.h level_1/fss.h level_1/fss_basic.h level_1/fss_basic_list.h level_1/fss_status.h level_1/fss_extended.h level_1/fss_macro.h level_1/program.h level_1/serialized.h level_1/strings.h level_2/colors.h level_2/execute.h level_2/status.h level_2/fss_basic.h level_2/fss_basic_list.h level_2/fss_extended.h level_2/fss_status.h +build_sources_headers level_0/colors.h level_0/console.h level_0/conversion.h level_0/status.h level_0/file.h level_0/fss.h level_0/memory.h level_0/fll_paths.h level_0/filesystem_paths.h level_0/pipe.h level_0/print.h level_0/serialized.h level_0/strings.h level_0/types.h level_0/types_array.h level_0/utf.h level_1/colors.h level_1/console.h level_1/directory.h level_1/status.h level_1/file.h level_1/fss.h level_1/fss_basic.h level_1/fss_basic_list.h level_1/fss_status.h level_1/fss_extended.h level_1/fss_macro.h level_1/program.h level_1/serialized.h level_1/strings.h level_1/utf.h level_2/colors.h level_2/execute.h level_2/status.h level_2/fss_basic.h level_2/fss_basic_list.h level_2/fss_extended.h level_2/fss_status.h 
build_sources_bash build_sources_settings build_shared yes diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 8557a39..6105c5a 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -4,6 +4,19 @@ extern "C" { #endif +#ifndef _di_f_utf_is_big_endian_ + f_return_status f_utf_is_big_endian() { + uint16_t test_int = (0x01 << 8) | 0x02; + char test_char[2] = {0x01, 0x02}; + + if (!memcmp(&test_int, test_char, 2)) { + return f_true; + } + + return f_false; + } +#endif // _di_f_utf_is_big_endian_ + #ifndef _di_f_utf_is_ f_return_status f_utf_is(const f_string character, const f_u_short max_width) { #ifndef _di_level_0_parameter_checking_ @@ -60,7 +73,7 @@ extern "C" { return f_false; } else if (width == 1) { - return f_status_is_error(f_incomplete_utf); + return f_status_is_error(f_invalid_utf); } return f_true; @@ -73,27 +86,21 @@ extern "C" { if (max_width < 1) return f_status_set_error(f_invalid_parameter); #endif // _di_level_0_parameter_checking_ - f_u_short width = f_macro_utf_byte_width_is(*character); + if (f_macro_utf_byte_width_is(*character) == 0) { + if (isgraph(*character)) { + return f_true; + } - if (width == 0) { return f_false; } - else if (width == 1) { - return f_status_is_error(f_incomplete_utf); - } - // Do not operate on UTF-8 fragments that are not the first byte of the character. - if (width == 1) { - return f_status_set_error(f_incomplete_utf); - } + // For now, just assume that any non-whitespace, non-substitute UTF-8 character is a graph. + f_status status = f_utf_is_space(character, max_width); - if (width > max_width) { - return f_status_set_error(f_maybe); + if (f_status_is_error(status)) { + return status; } - - // for now, just assume that any non-whitespace, non-substitute utf-8 character is a graph. 
- - if (f_utf_is_space(character, max_width) == f_true) { + else if (status == f_true) { return f_false; } @@ -114,6 +121,10 @@ extern "C" { f_u_short width = f_macro_utf_byte_width_is(*character); if (width == 0) { + if (isspace(*character)) { + return f_true; + } + return f_false; } else if (width == 1) { @@ -249,6 +260,7 @@ extern "C" { f_u_short width = f_macro_utf_byte_width_is(*character); if (width == 0) { + // there is no substitute character in ASCII. return f_false; } else if (width == 1) { @@ -300,6 +312,10 @@ extern "C" { f_u_short width = f_macro_utf_byte_width_is(*character); if (width == 0) { + if (isspace(*character)) { + return f_true; + } + return f_false; } else if (width == 1) { @@ -418,18 +434,13 @@ extern "C" { #ifndef _di_f_utf_is_graph_character_ f_return_status f_utf_is_graph_character(const f_utf_character character) { - f_u_short width = f_macro_utf_character_width_is(character); - - if (width == 0) { - return f_false; - } - else if (width == 1) { - return f_status_is_error(f_incomplete_utf); - } - // for now, just assume that any non-whitespace, non-substitute utf-8 character is a graph. 
+ f_status status = f_utf_is_space_character(character); - if (f_utf_is_space_character(character) == f_true) { + if (f_status_is_error(status)) { + return status; + } + else if (status == f_true) { return f_false; } @@ -446,28 +457,42 @@ extern "C" { f_u_short width = f_macro_utf_character_width_is(character); if (width == 0) { + char ascii = character >> 24; + + if (isspace(ascii)) { + return f_true; + } + return f_false; } else if (width == 1) { - return f_status_is_error(f_incomplete_utf); + return f_status_is_error(f_invalid_utf); } + f_bool is_big_endian = f_utf_is_big_endian(); + if (width == 2) { - char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) }; + uint16_t utf = 0; + if (is_big_endian) { + utf = (uint16_t) (character >> 16); + } + else { + utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character); + } - if (utf[0] == f_utf_space_no_break[0] && utf[1] == f_utf_space_no_break[1]) { + if (!memcmp(&utf, f_utf_space_no_break, width)) { return f_true; } - if (utf[0] == f_utf_space_line_feed_reverse[0] && utf[1] == f_utf_space_line_feed_reverse[1]) { + if (!memcmp(&utf, f_utf_space_line_feed_reverse, width)) { return f_true; } - if (utf[0] == f_utf_space_line_next[0] && utf[1] == f_utf_space_line_next[1]) { + if (!memcmp(&utf, f_utf_space_line_next, width)) { return f_true; } - if (utf[0] == f_utf_substitute_middle_dot[0] && utf[1] == f_utf_substitute_middle_dot[1]) { + if (!memcmp(&utf, f_utf_substitute_middle_dot, width)) { return f_true; } @@ -475,93 +500,99 @@ extern "C" { } if (width == 3) { - char utf[3] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character), f_macro_utf_character_to_char_3(character) }; + uint32_t utf = 0; + if (is_big_endian) { + utf = character; + } + else { + utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | 
(f_macro_utf_character_to_char_1(character) << 8); + } - if (utf[0] == f_utf_space_no_break_narrow[0] && utf[1] == f_utf_space_no_break_narrow[1] && utf[2] == f_utf_space_no_break_narrow[2]) { + if (!memcmp(&utf, f_utf_space_no_break_narrow, width)) { return f_true; } - if (utf[0] == f_utf_space_en[0] && utf[1] == f_utf_space_en[1] && utf[2] == f_utf_space_en[2]) { + if (!memcmp(&utf, f_utf_space_en, width)) { return f_true; } - if (utf[0] == f_utf_space_en_quad[0] && utf[1] == f_utf_space_en_quad[1] && utf[2] == f_utf_space_en_quad[2]) { + if (!memcmp(&utf, f_utf_space_en_quad, width)) { return f_true; } - if (utf[0] == f_utf_space_en_quad[0] && utf[1] == f_utf_space_en_quad[1] && utf[2] == f_utf_space_en_quad[2]) { + if (!memcmp(&utf, f_utf_space_en_quad, width)) { return f_true; } - if (utf[0] == f_utf_space_em[0] && utf[1] == f_utf_space_em[1] && utf[2] == f_utf_space_em[2]) { + if (!memcmp(&utf, f_utf_space_em, width)) { return f_true; } - if (utf[0] == f_utf_space_em_quad[0] && utf[1] == f_utf_space_em_quad[1] && utf[2] == f_utf_space_em_quad[2]) { + if (!memcmp(&utf, f_utf_space_em_quad, width)) { return f_true; } - if (utf[0] == f_utf_space_em_per_three[0] && utf[1] == f_utf_space_em_per_three[1] && utf[2] == f_utf_space_em_per_three[2]) { + if (!memcmp(&utf, f_utf_space_em_per_three, width)) { return f_true; } - if (utf[0] == f_utf_space_em_per_four[0] && utf[1] == f_utf_space_em_per_four[1] && utf[2] == f_utf_space_em_per_four[2]) { + if (!memcmp(&utf, f_utf_space_em_per_four, width)) { return f_true; } - if (utf[0] == f_utf_space_em_per_six[0] && utf[1] == f_utf_space_em_per_six[1] && utf[2] == f_utf_space_em_per_six[2]) { + if (!memcmp(&utf, f_utf_space_em_per_six, width)) { return f_true; } - if (utf[0] == f_utf_space_figure[0] && utf[1] == f_utf_space_figure[1] && utf[2] == f_utf_space_figure[2]) { + if (!memcmp(&utf, f_utf_space_figure, width)) { return f_true; } - if (utf[0] == f_utf_space_punctuation[0] && utf[1] == f_utf_space_punctuation[1] && 
utf[2] == f_utf_space_punctuation[2]) { + if (!memcmp(&utf, f_utf_space_punctuation, width)) { return f_true; } - if (utf[0] == f_utf_space_thin[0] && utf[1] == f_utf_space_thin[1] && utf[2] == f_utf_space_thin[2]) { + if (!memcmp(&utf, f_utf_space_thin, width)) { return f_true; } - if (utf[0] == f_utf_space_hair[0] && utf[1] == f_utf_space_hair[1] && utf[2] == f_utf_space_hair[2]) { + if (!memcmp(&utf, f_utf_space_hair, width)) { return f_true; } - if (utf[0] == f_utf_space_separator_line[0] && utf[1] == f_utf_space_separator_line[1] && utf[2] == f_utf_space_separator_line[2]) { + if (!memcmp(&utf, f_utf_space_separator_line, width)) { return f_true; } - if (utf[0] == f_utf_space_separator_paragraph[0] && utf[1] == f_utf_space_separator_paragraph[1] && utf[2] == f_utf_space_separator_paragraph[2]) { + if (!memcmp(&utf, f_utf_space_separator_paragraph, width)) { return f_true; } - if (utf[0] == f_utf_space_ogham[0] && utf[1] == f_utf_space_ogham[1] && utf[2] == f_utf_space_ogham[2]) { + if (!memcmp(&utf, f_utf_space_ogham, width)) { return f_true; } - if (utf[0] == f_utf_space_ideographic[0] && utf[1] == f_utf_space_ideographic[1] && utf[2] == f_utf_space_ideographic[2]) { + if (!memcmp(&utf, f_utf_space_ideographic, width)) { return f_true; } - if (utf[0] == f_utf_space_medium_mathematical[0] && utf[1] == f_utf_space_medium_mathematical[1] && utf[2] == f_utf_space_medium_mathematical[2]) { + if (!memcmp(&utf, f_utf_space_medium_mathematical, width)) { return f_true; } - if (utf[0] == f_utf_substitute_symbol_blank[0] && utf[1] == f_utf_substitute_symbol_blank[1] && utf[2] == f_utf_substitute_symbol_blank[2]) { + if (!memcmp(&utf, f_utf_substitute_symbol_blank, width)) { return f_true; } - if (utf[0] == f_utf_substitute_symbol_space[0] && utf[1] == f_utf_substitute_symbol_space[1] && utf[2] == f_utf_substitute_symbol_space[2]) { + if (!memcmp(&utf, f_utf_substitute_symbol_space, width)) { return f_true; } - if (utf[0] == f_utf_substitute_open_box[0] && utf[1] == 
f_utf_substitute_open_box[1] && utf[2] == f_utf_substitute_open_box[2]) { + if (!memcmp(&utf, f_utf_substitute_open_box, width)) { return f_true; } - if (utf[0] == f_utf_substitute_open_box_shouldered[0] && utf[1] == f_utf_substitute_open_box_shouldered[1] && utf[2] == f_utf_substitute_open_box_shouldered[2]) { + if (!memcmp(&utf, f_utf_substitute_open_box_shouldered, width)) { return f_true; } @@ -577,16 +608,25 @@ extern "C" { f_u_short width = f_macro_utf_character_width_is(character); if (width == 0) { + // there is no substitute character in ASCII. return f_false; } else if (width == 1) { - return f_status_is_error(f_incomplete_utf); + return f_status_is_error(f_invalid_utf); } + f_bool is_big_endian = f_utf_is_big_endian(); + if (width == 2) { - char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) }; + uint16_t utf = 0; + if (is_big_endian) { + utf = (uint16_t) (character >> 16); + } + else { + utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character); + } - if (utf[0] == f_utf_substitute_middle_dot[0] && utf[1] == f_utf_substitute_middle_dot[1]) { + if (!memcmp(&utf, f_utf_substitute_middle_dot, width)) { return f_true; } @@ -594,21 +634,27 @@ extern "C" { } if (width == 3) { - char utf[3] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character), f_macro_utf_character_to_char_3(character) }; + uint32_t utf = 0; + if (is_big_endian) { + utf = character; + } + else { + utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | (f_macro_utf_character_to_char_1(character) << 8); + } - if (utf[0] == f_utf_substitute_symbol_blank[0] && utf[1] == f_utf_substitute_symbol_blank[1] && utf[2] == f_utf_substitute_symbol_blank[2]) { + if (!memcmp(&utf, f_utf_substitute_symbol_blank, width)) { return f_true; } - if (utf[0] == f_utf_substitute_symbol_space[0] && utf[1] == 
f_utf_substitute_symbol_space[1] && utf[2] == f_utf_substitute_symbol_space[2]) { + if (!memcmp(&utf, f_utf_substitute_symbol_space, width)) { return f_true; } - if (utf[0] == f_utf_substitute_open_box[0] && utf[1] == f_utf_substitute_open_box[1] && utf[2] == f_utf_substitute_open_box[2]) { + if (!memcmp(&utf, f_utf_substitute_open_box, width)) { return f_true; } - if (utf[0] == f_utf_substitute_open_box_shouldered[0] && utf[1] == f_utf_substitute_open_box_shouldered[1] && utf[2] == f_utf_substitute_open_box_shouldered[2]) { + if (!memcmp(&utf, f_utf_substitute_open_box_shouldered, width)) { return f_true; } @@ -624,29 +670,38 @@ extern "C" { f_u_short width = f_macro_utf_character_width_is(character); if (width == 0) { + char ascii = character >> 24; + + if (isspace(ascii)) { + return f_true; + } + return f_false; } else if (width == 1) { - return f_status_is_error(f_incomplete_utf); + return f_status_is_error(f_invalid_utf); } - // Do not operate on UTF-8 fragments that are not the first byte of the character. 
- if (width == 1) { - return f_status_set_error(f_incomplete_utf); - } + f_bool is_big_endian = f_utf_is_big_endian(); if (width == 2) { - char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) }; + uint16_t utf = 0; + if (is_big_endian) { + utf = (uint16_t) (character >> 16); + } + else { + utf = (f_macro_utf_character_to_char_2(character) << 8) | f_macro_utf_character_to_char_1(character); + } - if (utf[0] == f_utf_space_no_break[0] && utf[1] == f_utf_space_no_break[1]) { + if (!memcmp(&utf, f_utf_space_no_break, width)) { return f_true; } - if (utf[0] == f_utf_space_line_feed_reverse[0] && utf[1] == f_utf_space_line_feed_reverse[1]) { + if (!memcmp(&utf, f_utf_space_line_feed_reverse, width)) { return f_true; } - if (utf[0] == f_utf_space_line_next[0] && utf[1] == f_utf_space_line_next[1]) { + if (!memcmp(&utf, f_utf_space_line_next, width)) { return f_true; } @@ -654,77 +709,83 @@ extern "C" { } if (width == 3) { - char utf[3] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character), f_macro_utf_character_to_char_3(character) }; + uint32_t utf = 0; + if (is_big_endian) { + utf = character; + } + else { + utf = (f_macro_utf_character_to_char_3(character) << 24) | (f_macro_utf_character_to_char_2(character) << 16) | (f_macro_utf_character_to_char_1(character) << 8); + } - if (utf[0] == f_utf_space_no_break_narrow[0] && utf[1] == f_utf_space_no_break_narrow[1] && utf[2] == f_utf_space_no_break_narrow[2]) { + if (!memcmp(&utf, f_utf_space_no_break_narrow, width)) { return f_true; } - if (utf[0] == f_utf_space_en[0] && utf[1] == f_utf_space_en[1] && utf[2] == f_utf_space_en[2]) { + if (!memcmp(&utf, f_utf_space_en, width)) { return f_true; } - if (utf[0] == f_utf_space_en_quad[0] && utf[1] == f_utf_space_en_quad[1] && utf[2] == f_utf_space_en_quad[2]) { + if (!memcmp(&utf, f_utf_space_en_quad, width)) { return f_true; } - if (utf[0] == f_utf_space_en_quad[0] && utf[1] == 
f_utf_space_en_quad[1] && utf[2] == f_utf_space_en_quad[2]) { + if (!memcmp(&utf, f_utf_space_en_quad, width)) { return f_true; } - if (utf[0] == f_utf_space_em[0] && utf[1] == f_utf_space_em[1] && utf[2] == f_utf_space_em[2]) { + if (!memcmp(&utf, f_utf_space_em, width)) { return f_true; } - if (utf[0] == f_utf_space_em_quad[0] && utf[1] == f_utf_space_em_quad[1] && utf[2] == f_utf_space_em_quad[2]) { + if (!memcmp(&utf, f_utf_space_em_quad, width)) { return f_true; } - if (utf[0] == f_utf_space_em_per_three[0] && utf[1] == f_utf_space_em_per_three[1] && utf[2] == f_utf_space_em_per_three[2]) { + if (!memcmp(&utf, f_utf_space_em_per_three, width)) { return f_true; } - if (utf[0] == f_utf_space_em_per_four[0] && utf[1] == f_utf_space_em_per_four[1] && utf[2] == f_utf_space_em_per_four[2]) { + if (!memcmp(&utf, f_utf_space_em_per_four, width)) { return f_true; } - if (utf[0] == f_utf_space_em_per_six[0] && utf[1] == f_utf_space_em_per_six[1] && utf[2] == f_utf_space_em_per_six[2]) { + if (!memcmp(&utf, f_utf_space_em_per_six, width)) { return f_true; } - if (utf[0] == f_utf_space_figure[0] && utf[1] == f_utf_space_figure[1] && utf[2] == f_utf_space_figure[2]) { + if (!memcmp(&utf, f_utf_space_figure, width)) { return f_true; } - if (utf[0] == f_utf_space_punctuation[0] && utf[1] == f_utf_space_punctuation[1] && utf[2] == f_utf_space_punctuation[2]) { + if (!memcmp(&utf, f_utf_space_punctuation, width)) { return f_true; } - if (utf[0] == f_utf_space_thin[0] && utf[1] == f_utf_space_thin[1] && utf[2] == f_utf_space_thin[2]) { + if (!memcmp(&utf, f_utf_space_thin, width)) { return f_true; } - if (utf[0] == f_utf_space_hair[0] && utf[1] == f_utf_space_hair[1] && utf[2] == f_utf_space_hair[2]) { + if (!memcmp(&utf, f_utf_space_hair, width)) { return f_true; } - if (utf[0] == f_utf_space_separator_line[0] && utf[1] == f_utf_space_separator_line[1] && utf[2] == f_utf_space_separator_line[2]) { + if (!memcmp(&utf, f_utf_space_separator_line, width)) { return f_true; } - if 
(utf[0] == f_utf_space_separator_paragraph[0] && utf[1] == f_utf_space_separator_paragraph[1] && utf[2] == f_utf_space_separator_paragraph[2]) { + if (!memcmp(&utf, f_utf_space_separator_paragraph, width)) { return f_true; } - if (utf[0] == f_utf_space_ogham[0] && utf[1] == f_utf_space_ogham[1] && utf[2] == f_utf_space_ogham[2]) { + if (!memcmp(&utf, f_utf_space_ogham, width)) { return f_true; } - if (utf[0] == f_utf_space_ideographic[0] && utf[1] == f_utf_space_ideographic[1] && utf[2] == f_utf_space_ideographic[2]) { + if (!memcmp(&utf, f_utf_space_ideographic, width)) { return f_true; } - if (utf[0] == f_utf_space_medium_mathematical[0] && utf[1] == f_utf_space_medium_mathematical[1] && utf[2] == f_utf_space_medium_mathematical[2]) { + if (!memcmp(&utf, f_utf_space_medium_mathematical, width)) { return f_true; } @@ -749,15 +810,14 @@ extern "C" { return f_none; } else if (width == 1) { - return f_status_is_error(f_incomplete_utf); + return f_status_is_error(f_invalid_utf); } if (width > max_width) { return f_status_set_error(f_failure); } - memset(utf_character, 0, sizeof(f_utf_character)); - + *utf_character = 0; *utf_character |= f_macro_utf_character_to_char_1(character[0]); if (width < 2) { @@ -782,6 +842,62 @@ extern "C" { } #endif // _di_f_utf_char_to_character_ +#ifndef _di_f_utf_character_to_char_ + f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, f_u_short *max_width) { + #ifndef _di_level_0_parameter_checking_ + if (utf_character == 0) return f_status_set_error(f_invalid_parameter); + if (max_width == 0 && *character != 0) return f_status_set_error(f_invalid_parameter); + if (max_width != 0 && *character == 0) return f_status_set_error(f_invalid_parameter); + if (max_width != 0 && *max_width > 4) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + f_status status = f_none; + + f_u_short width = f_macro_utf_character_width_is(utf_character); + + if (max_width == 
0) { + f_new_string(status, *character, width); + + if (f_status_is_error(status)) return status; + + width = 1; + *max_width = 1; + } + else if (width == 1) { + return f_status_is_error(f_invalid_utf); + } + else if (width > *max_width) { + return f_status_set_error(f_failure); + } + + *max_width = width; + + if (f_utf_is_big_endian()) { + memcpy(*character, &utf_character, sizeof(char) * width); + } + else { + uint32_t utf = 0; + + if (width == 1) { + utf = f_macro_utf_character_to_char_1(utf_character) << 24; + } + else if (width == 2) { + utf = (f_macro_utf_character_to_char_2(utf_character) << 24) | (f_macro_utf_character_to_char_1(utf_character) << 16); + } + else if (width == 3) { + utf = (f_macro_utf_character_to_char_3(utf_character) << 24) | (f_macro_utf_character_to_char_2(utf_character) << 16) | (f_macro_utf_character_to_char_1(utf_character) << 8); + } + else if (width == 4) { + utf = (f_macro_utf_character_to_char_4(utf_character) << 24) | (f_macro_utf_character_to_char_3(utf_character) << 16) | (f_macro_utf_character_to_char_2(utf_character) << 8) | f_macro_utf_character_to_char_1(utf_character); + } + + memcpy(*character, &utf, sizeof(char) * width); + } + + return f_none; + } +#endif // _di_f_utf_character_to_char_ + #ifdef __cplusplus } // extern "C" #endif diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 5fc452e..68fcbb4 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -35,6 +35,7 @@ #define _F_utf_h // libc includes +#include #include // fll includes @@ -88,12 +89,12 @@ extern "C" { #define f_utf_byte_off_3 0xf0 // 1111 0000 #define f_utf_byte_off_4 0xf8 // 1111 1000 - #define f_macro_utf_byte_is(character) (character & f_utf_byte_1) + #define f_macro_utf_byte_is(character) ((character) & f_utf_byte_1) - #define f_macro_utf_byte_is_1(character) ((character & f_utf_byte_off_1) == f_utf_byte_1) // (10xx xxxx & 1100 0000) == 1000 0000 - #define f_macro_utf_byte_is_2(character) ((character & f_utf_byte_off_2) == 
f_utf_byte_2) // (110x xxxx & 1110 0000) == 1100 0000 - #define f_macro_utf_byte_is_3(character) ((character & f_utf_byte_off_3) == f_utf_byte_3) // (1110 xxxx & 1111 0000) == 1110 0000 - #define f_macro_utf_byte_is_4(character) ((character & f_utf_byte_off_4) == f_utf_byte_4) // (1111 0xxx & 1111 1000) == 1111 0000 + #define f_macro_utf_byte_is_1(character) (((character) & f_utf_byte_off_1) == f_utf_byte_1) // (10xx xxxx & 1100 0000) == 1000 0000 + #define f_macro_utf_byte_is_2(character) (((character) & f_utf_byte_off_2) == f_utf_byte_2) // (110x xxxx & 1110 0000) == 1100 0000 + #define f_macro_utf_byte_is_3(character) (((character) & f_utf_byte_off_3) == f_utf_byte_3) // (1110 xxxx & 1111 0000) == 1110 0000 + #define f_macro_utf_byte_is_4(character) (((character) & f_utf_byte_off_4) == f_utf_byte_4) // (1111 0xxx & 1111 1000) == 1111 0000 #define f_macro_utf_byte_width(character) ((!f_macro_utf_byte_is(character) || f_macro_utf_byte_is_1(character)) ? 1 : (f_macro_utf_byte_is_2(character) ? 2 : (f_macro_utf_byte_is_3(character) ? 3 : 4))) #define f_macro_utf_byte_width_is(character) (f_macro_utf_byte_is(character) ? (f_macro_utf_byte_is_1(character) ? 1 : (f_macro_utf_byte_is_2(character) ? 2 : (f_macro_utf_byte_is_3(character) ? 3 : 4))) : 0) @@ -104,6 +105,9 @@ extern "C" { * * This is intended to be used when a single variable is desired to represent a 1-byte, 2-byte, 3-byte, or even 4-byte character. * + * This "character" type is stored as a big-endian 4-byte integer (32-bits). + * A helper function, f_utf_is_big_endian(), is provided to detect system endianness so that character arrays (char []) can be correctly processed. + * * The byte structure is intended to be read left to right. * * The f_macro_utf_character_mask_byte_* are used to get the entire character set fo a given width. @@ -116,6 +120,8 @@ extern "C" { * * The f_macro_utf_character_width is used to determine the width of the UTF-8 character based on f_macro_utf_byte_width. 
* The f_macro_utf_character_width_is is used to determine the width of the UTF-8 character based on f_macro_utf_byte_width_is. + * + * @see f_utf_is_big_endian() */ #ifndef _di_f_utf_character_ typedef uint32_t f_utf_character; @@ -132,20 +138,306 @@ extern "C" { #define f_utf_character_mask_char_3 0x0000ff00 // 0000 0000, 0000 0000, 1111 1111, 0000 0000 #define f_utf_character_mask_char_4 0x000000ff // 0000 0000, 0000 0000, 0000 0000, 1111 1111 - #define f_macro_utf_character_to_char_1(character) ((f_utf_character_mask_char_1 & character) >> 24) // grab first byte. - #define f_macro_utf_character_to_char_2(character) ((f_utf_character_mask_char_2 & character) >> 16) // grab second byte. - #define f_macro_utf_character_to_char_3(character) ((f_utf_character_mask_char_3 & character) >> 8) // grab third byte. - #define f_macro_utf_character_to_char_4(character) (f_utf_character_mask_char_4 & character) // grab fourth byte. + #define f_macro_utf_character_to_char_1(character) (((character) & f_utf_character_mask_char_1) >> 24) // grab first byte. + #define f_macro_utf_character_to_char_2(character) (((character) & f_utf_character_mask_char_2) >> 16) // grab second byte. + #define f_macro_utf_character_to_char_3(character) (((character) & f_utf_character_mask_char_3) >> 8) // grab third byte. + #define f_macro_utf_character_to_char_4(character) ((character) & f_utf_character_mask_char_4) // grab fourth byte. - #define f_macro_utf_character_from_char_1(character) (character << 24) // shift the first byte. - #define f_macro_utf_character_from_char_2(character) (character << 16) // shift the second byte. - #define f_macro_utf_character_from_char_3(character) (character << 8) // shift the third byte. - #define f_macro_utf_character_from_char_4(character) (character) // shift the fourth byte. + #define f_macro_utf_character_from_char_1(character) ((character) << 24) // shift the first byte. 
+ #define f_macro_utf_character_from_char_2(character) ((character) << 16) // shift the second byte. + #define f_macro_utf_character_from_char_3(character) ((character) << 8) // shift the third byte. + #define f_macro_utf_character_from_char_4(character) ((character)) // shift the fourth byte. #define f_macro_utf_character_width(character) (f_macro_utf_byte_width(f_macro_utf_character_to_char_1(character))) #define f_macro_utf_character_width_is(character) (f_macro_utf_byte_width_is(f_macro_utf_character_to_char_1(character))) #endif // _di_f_utf_character_ +#ifndef _di_f_utf_character_have_eol_ + #define f_utf_character_eol 0x0a000000 // 0000 1010, 0000 0000, 0000 0000, 0000 0000 +#endif // _di_f_utf_character_have_eol_ + +#ifndef _di_f_utf_character_have_eos_ + #define f_utf_character_eos 0x00000000 // 0000 0000, 0000 0000, 0000 0000, 0000 0000 +#endif // _di_f_utf_character_have_eos_ + +#ifndef _di_f_utf_character_have_placeholder_ + #define f_utf_character_placeholder 0x00000000 // 0000 0000, 0000 0000, 0000 0000, 0000 0000 +#endif // _di_f_utf_character_have_placeholder_ + +/** + * Provide a UTF-8 characters set to 4-bits wide as a string. 
+ */ +#ifndef _di_f_utf_string_ + typedef f_utf_character *f_utf_string; + + #define f_utf_string_max_size f_signed_long_size + #define f_utf_string_initialize f_eos + + #define f_new_utf_char(status, string, length) status = f_new_array((void **) & string, sizeof(f_utf_string), length) + #define f_delete_utf_char(status, string, size) status = f_delete((void **) & string, sizeof(f_utf_string), size) + #define f_destroy_utf_char(status, string, size) status = f_destroy((void **) & string, sizeof(f_utf_string), size) + + #define f_resize_utf_char(status, string, old_length, new_length) \ + status = f_resize((void **) & string, sizeof(f_utf_string), old_length, new_length) + + #define f_adjust_utf_char(status, string, old_length, new_length) \ + status = f_adjust((void **) & string, sizeof(f_utf_string), old_length, new_length) +#endif // _di_f_utf_string_ + +/** + * Provide a type specifically for UTF-8 strings. + */ +#ifndef _di_f_utf_string_length_ + typedef f_s_long f_utf_string_length; + + #define f_new_utf_string_length(status, string, length) status = f_new_array((void **) & string, sizeof(f_utf_string_length), length) + #define f_delete_utf_string_length(status, string, length) status = f_delete((void **) & string, sizeof(f_utf_string_length), length) + #define f_destroy_utf_string_length(status, string, size) status = f_destroy((f_void_P *) & string, sizeof(f_utf_string_length), size) + + #define f_resize_utf_string_length(status, length, old_length, new_length) \ + status = f_resize((void **) & length, sizeof(f_utf_string_length), old_length, new_length) + + #define f_adjust_utf_string_length(status, length, old_length, new_length) \ + status = f_adjust((void **) & length, sizeof(f_utf_string_length), old_length, new_length) +#endif // _di_f_utf_string_length_ + +/** + * size: total amount of allocated space. + * used: total number of allocated spaces used. 
+ */ +#ifndef _di_f_utf_string_lengths_ + typedef struct { + f_utf_string_length *array; + f_array_length size; + f_array_length used; + } f_utf_string_lengths; + + #define f_utf_string_lengths_initialize { 0, 0, 0 } + + #define f_new_utf_string_lengths(status, lengths) \ + f_new_structure(status, lengths, f_utf_string_length) + + #define f_delete_utf_string_lengths(status, lengths) \ + f_delete_structure(status, lengths, f_utf_string_length) + + #define f_destroy_utf_string_lengths(status, lengths) \ + f_destroy_structure(status, lengths, f_utf_string_length) + + #define f_resize_utf_string_lengths(status, lengths, new_length) \ + f_resize_structure(status, lengths, f_utf_string_length, new_length) + + #define f_adjust_utf_string_lengths(status, lengths, new_length) \ + f_adjust_structure(status, lengths, f_utf_string_length, new_length) +#endif // _di_f_utf_string_lengths_ + +/** + * designates a start and stop position that represents a sub-string inside of some parent string. + * use this to avoid resizing, restructuring, and reallocating the parent string to separate the sub-string. 
+ */
+#ifndef _di_f_utf_string_location_
+  // start and stop are both positions in the parent string; the seek functions in this patch treat them as inclusive.
+  typedef struct {
+    f_utf_string_length start;
+    f_utf_string_length stop;
+  } f_utf_string_location;
+
+  // The initializer sets start (1) greater than stop (0); functions that check "stop < start" reject such a range.
+  // Presumably this marks an unset/empty location - confirm against callers.
+  #define f_utf_string_location_initialize { 1, 0 }
+
+  // Allocation helpers for a raw array of f_utf_string_location; each assigns the result of the memory call to status.
+  #define f_new_utf_string_location(status, utf_string_location, length)   status = f_new_array((void **) & utf_string_location, sizeof(f_utf_string_location), length)
+  #define f_delete_utf_string_location(status, utf_string_location, size)  status = f_delete((void **) & utf_string_location, sizeof(f_utf_string_location), size)
+  #define f_destroy_utf_string_location(status, utf_string_location, size) status = f_destroy((void **) & utf_string_location, sizeof(f_utf_string_location), size)
+
+  #define f_resize_utf_string_location(status, utf_string_location, old_length, new_length) \
+    status = f_resize((void **) & utf_string_location, sizeof(f_utf_string_location), old_length, new_length)
+
+  #define f_adjust_utf_string_location(status, utf_string_location, old_length, new_length) \
+    status = f_adjust((void **) & utf_string_location, sizeof(f_utf_string_location), old_length, new_length)
+#endif // _di_f_utf_string_location_
+
+/**
+ * an array of string locations.
+ *
+ * size: total amount of allocated space.
+ * used: total number of allocated spaces used.
+ */
+#ifndef _di_f_utf_string_locations_
+  // An array of f_utf_string_location values together with its allocated size and used count.
+  typedef struct {
+    f_utf_string_location *array;
+    f_array_length size;
+    f_array_length used;
+  } f_utf_string_locations;
+
+  #define f_utf_string_locations_initialize {0, 0, 0}
+
+  // Each macro below forwards to the generic f_*_structure macro, using f_utf_string_location as the element type
+  // where an element type is required.
+  #define f_clear_utf_string_locations(locations) \
+    f_clear_structure(locations)
+
+  #define f_new_utf_string_locations(status, locations, length) \
+    f_new_structure(status, locations, f_utf_string_location, length)
+
+  #define f_delete_utf_string_locations(status, locations) \
+    f_delete_structure(status, locations, f_utf_string_location)
+
+  #define f_destroy_utf_string_locations(status, locations) \
+    f_destroy_structure(status, locations, f_utf_string_location)
+
+  #define f_resize_utf_string_locations(status, locations, new_length) \
+    f_resize_structure(status, locations, f_utf_string_location, new_length)
+
+  #define f_adjust_utf_string_locations(status, locations, new_length) \
+    f_adjust_structure(status, locations, f_utf_string_location, new_length)
+#endif // _di_f_utf_string_locations_
+
+/**
+ * a string that contains a size attribute to handle dynamic allocations and deallocations.
+ * save the string size along with the string, so that strlen(..) commands can be avoided as much as possible.
+ *
+ * size: total amount of allocated space.
+ * used: total number of allocated spaces used.
+ */
+#ifndef _di_f_utf_string_dynamic_
+  typedef struct {
+    f_utf_string string;
+    f_utf_string_length size;
+    f_utf_string_length used;
+  } f_utf_string_dynamic;
+
+  #define f_utf_string_dynamic_initialize { f_utf_string_initialize, 0, 0 }
+
+  // All macros below expand to several statements and are not wrapped in do { } while (0);
+  // do not use them as the body of an unbraced if/else/loop.
+  // NOTE(review): the element size passed to the memory calls is sizeof(f_utf_string), which is the size of
+  // the pointer type rather than of one f_utf_character - confirm whether that is intended.
+
+  // Reset all three members to 0 without releasing anything.
+  #define f_clear_utf_string_dynamic(dynamic) \
+    dynamic.string = 0; \
+    dynamic.size = 0; \
+    dynamic.used = 0;
+
+  // Clear the structure, then allocate new_length elements; size is only recorded when status is f_none.
+  #define f_new_utf_string_dynamic(status, dynamic, new_length) \
+    f_clear_utf_string_dynamic(dynamic) \
+    status = f_new_array((void **) & dynamic.string, sizeof(f_utf_string), new_length); \
+    if (status == f_none) { \
+      dynamic.size = new_length; \
+      dynamic.used = 0; \
+    }
+
+  // Release the string through f_delete; size and used are zeroed only when status is f_none.
+  #define f_delete_utf_string_dynamic(status, dynamic) \
+    status = f_delete((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \
+    if (status == f_none) { \
+      dynamic.size = 0; \
+      dynamic.used = 0; \
+    }
+
+  // Same as the delete macro, but releases through f_destroy.
+  #define f_destroy_utf_string_dynamic(status, dynamic) \
+    status = f_destroy((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \
+    if (status == f_none) { \
+      dynamic.size = 0; \
+      dynamic.used = 0; \
+    }
+
+  // Change the allocation to new_length through f_resize; on success used is clamped to the new size.
+  #define f_resize_utf_string_dynamic(status, dynamic, new_length) \
+    status = f_resize((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size, new_length); \
+    if (status == f_none) { \
+      dynamic.size = new_length; \
+      if (dynamic.used > dynamic.size) dynamic.used = new_length; \
+    }
+
+  // Same as the resize macro, but changes the allocation through f_adjust.
+  #define f_adjust_utf_string_dynamic(status, dynamic, new_length) \
+    status = f_adjust((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size, new_length); \
+    if (status == f_none) { \
+      dynamic.size = new_length; \
+      if (dynamic.used > dynamic.size) dynamic.used = new_length; \
+    }
+#endif // _di_f_utf_string_dynamic_
+
+/**
+ * an array of dynamic utf_strings.
+ *
+ * size: total amount of allocated space.
+ * used: total number of allocated spaces used.
+ */ +#ifndef _di_f_utf_string_dynamics_ + typedef struct { + f_utf_string_dynamic *array; + f_utf_string_length size; + f_utf_string_length used; + } f_utf_string_dynamics; + + #define f_utf_string_dynamics_initialize { 0, 0, 0 } + + #define f_clear_utf_string_dynamics(dynamics) \ + dynamics.array = 0; \ + dynamics.size = 0; \ + dynamics.used = 0; + + #define f_new_utf_string_dynamics(status, dynamics, length) \ + dynamics.array = 0; \ + dynamics.size = 0; \ + dynamics.used = 0; \ + status = f_new_array((void **) & dynamics.array, sizeof(f_utf_string_dynamic), length); \ + if (status == f_none) { \ + dynamics.size = length; \ + dynamics.used = 0; \ + } + + #define f_delete_utf_string_dynamics(status, dynamics) \ + status = f_none; \ + while (dynamics.size > 0) { \ + --dynamics.size; \ + f_destroy_utf_string_dynamic(status, dynamics.array[dynamics.size]); \ + if (status != f_none) break; \ + } \ + if (status == f_none) status = f_delete((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size); \ + if (status == f_none) dynamics.used = 0; + + #define f_destroy_utf_string_dynamics(status, dynamics) \ + status = f_none; \ + while (dynamics.size > 0) { \ + --dynamics.size; \ + f_destroy_utf_string_dynamic(status, dynamics.array[dynamics.size]); \ + if (status != f_none) break; \ + } \ + if (status == f_none) status = f_destroy((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size); \ + if (status == f_none) dynamics.used = 0; + + #define f_resize_utf_string_dynamics(status, dynamics, new_length) \ + status = f_none; \ + if (new_length < dynamics.size) { \ + f_utf_string_length i = dynamics.size - new_length; \ + for (; i < dynamics.size; ++i) { \ + f_destroy_utf_string_dynamic(status, dynamics.array[i]); \ + if (status != f_none) break; \ + } \ + } \ + if (status == f_none) status = f_resize((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size, new_length); \ + if (status == f_none) { \ + if (new_length > 
dynamics.size) { \ + f_utf_string_length i = dynamics.size; \ + for (; i < new_length; ++i) { \ + memset(&dynamics.array[i], 0, sizeof(f_utf_string_dynamic)); \ + } \ + } \ + dynamics.size = new_length; \ + if (dynamics.used > dynamics.size) dynamics.used = new_length; \ + } + + #define f_adjust_utf_string_dynamics(status, dynamics, new_length) \ + status = f_none; \ + if (new_length < dynamics.size) { \ + f_utf_string_length i = dynamics.size - new_length; \ + for (; i < dynamics.size; ++i) { \ + f_destroy_utf_string_dynamic(status, dynamics.array[i], f_utf_string_dynamic); \ + if (status != f_none) break; \ + } \ + } \ + if (status == f_none) status = f_adjust((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size, new_length); \ + if (status == f_none) { \ + if (new_length > dynamics.size) { \ + f_utf_string_length i = dynamics.size; \ + for (; i < new_length; ++i) { \ + memset(&dynamics.array[i], 0, sizeof(f_utf_string_dynamic)); \ + } \ + } \ + dynamics.size = new_length; \ + if (dynamics.used > dynamics.size) dynamics.used = new_length; \ + } +#endif // _di_f_utf_string_dynamic_ + /** * Define the UTF-8 general whitespace codes. * @@ -235,6 +527,18 @@ extern "C" { #endif // _di_f_utf_substitute_ /** + * Helper function for UTF-8 processing code to determine endianess of the system. + * + * + * @return + * f_true if the system is big-endian. + * f_false if the system is little-endian. + */ +#ifndef _di_f_utf_is_big_endian_ + extern f_return_status f_utf_is_big_endian(); +#endif // _di_f_utf_is_big_endian_ + +/** * Check to see if the entire byte block of the character is a UTF-8 character. * * @param character @@ -284,7 +588,7 @@ extern "C" { * @return * f_true if a UTF-8 character. * f_false if not a UTF-8 character. - * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. + * f_invalid_utf (with error bit) if character is an incomplete UTF-8 fragment. 
* f_invalid_parameter (with error bit) if a parameter is invalid. */ #ifndef _di_f_utf_is_ @@ -497,6 +801,34 @@ extern "C" { extern f_return_status f_utf_char_to_character(const f_string character, const f_u_short max_width, f_utf_character *utf_character); #endif // _di_f_utf_char_to_character_ +/** + * Convert a specialized f_utf_character type to a char, stored as a string (character buffer). + * + * This will also convert ASCII characters stored in the utf_character array. + * + * @param utf_character + * The UTF-8 character to convert from. + * @param character + * A char representation of the UTF-8 character, stored as a string of width bytes. + * If max_width is 0, then this should not be allocated (set the pointer address to 0). + * @param max_width + * The number of bytes the generated character represents. + * If this is set to 0, then the character will be allocated and this will be set to the width of the utf_character. + * If this is set to some value greater than 0 (up to 4), then this represents the size of the character array (no allocations are performed). + * If this is greater than 0, and the utf_character width is larger than this size, then an error is returned. + * + * @return + * f_none if conversion was successful. + * f_failure (with error bit) if width is not long enough to convert. + * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. + * f_invalid_parameter (with error bit) if a parameter is invalid. + * f_allocation_error (with error bit) on memory allocation error. + * f_failure (with error bit) if width is not long enough to convert. 
+ */ +#ifndef _di_f_utf_character_to_char_ + extern f_return_status f_utf_character_to_char(const f_utf_character utf_character, f_string *character, f_u_short *max_width); +#endif // _di_f_utf_character_to_char_ + #ifdef __cplusplus } // extern "C" #endif diff --git a/level_1/fl_fss/c/fss.c b/level_1/fl_fss/c/fss.c index 72523c4..006607f 100644 --- a/level_1/fl_fss/c/fss.c +++ b/level_1/fl_fss/c/fss.c @@ -281,33 +281,13 @@ extern "C" { if (input.start >= buffer.used) return f_status_set_error(f_invalid_parameter); #endif // _di_level_1_parameter_checking_ - f_u_short utf_width = f_macro_utf_byte_width_is(buffer.string[input.start]); - - if (utf_width == 0) { - if (isgraph(buffer.string[input.start])) { - return f_true; - } - - return f_false; - } - f_string_length max_width = (input.stop - input.start) + 1; if (max_width > buffer.used - input.start) { max_width = buffer.used - input.start; } - f_status status = f_utf_is_space(buffer.string + input.start, max_width); - - if (f_status_is_error(status)) { - return status; - } - - if (status == f_true) { - return f_false; - } - - return f_true; + return f_utf_is_graph(buffer.string + input.start, max_width); } #endif // _di_fl_fss_is_graph_ @@ -320,33 +300,13 @@ extern "C" { if (input.start >= buffer.used) return f_status_set_error(f_invalid_parameter); #endif // _di_level_1_parameter_checking_ - f_u_short utf_width = f_macro_utf_byte_width_is(buffer.string[input.start]); - - if (utf_width == 0) { - if (isspace(buffer.string[input.start])) { - return f_true; - } - - return f_false; - } - f_string_length max_width = (input.stop - input.start) + 1; if (max_width > buffer.used - input.start) { max_width = buffer.used - input.start; } - f_status status = f_utf_is_space(buffer.string + input.start, max_width); - - if (f_status_is_error(status)) { - return status; - } - - if (status == f_true) { - return f_true; - } - - return f_false; + return f_utf_is_space(buffer.string + input.start, max_width); } #endif // 
_di_fl_fss_is_space_ @@ -354,32 +314,58 @@ extern "C" { f_return_status fl_fss_skip_past_whitespace(const f_dynamic_string buffer, f_string_location *input) { #ifndef _di_level_1_parameter_checking_ if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter); + if (input == 0) return f_status_set_error(f_invalid_parameter); if (input->start < 0) return f_status_set_error(f_invalid_parameter); if (input->stop < input->start) return f_status_set_error(f_invalid_parameter); if (input->start >= buffer.used) return f_status_set_error(f_invalid_parameter); #endif // _di_level_1_parameter_checking_ f_status status = f_none; - f_u_short max_width = 0; + f_u_short width = 0; + + f_string_length max_width = (input->stop - input->start) + 1; + + if (max_width > buffer.used - input->start) { + max_width = buffer.used - input->start; + } - while (input->start < buffer.used && input->start > input->stop) { - if (isgraph(buffer.string[input->start])) break; + while (buffer.string[input->start] == f_eos || (status = f_utf_is_graph(buffer.string + input->start, max_width)) == f_false) { + if (f_status_is_error(status)) { + return status; + } - if (buffer.string[input->start] == f_eol) break; + if (buffer.string[input->start] == f_eol) return f_none_on_eol; - if (buffer.string[input->start] != f_fss_delimit_placeholder) { - max_width = (input->stop - input->start) + 1; + width = f_macro_utf_byte_width_is(buffer.string[input->start]); - if (f_utf_is_space(buffer.string +input->start, max_width) != f_true) { - if (f_utf_is_bom(buffer.string + input->start, max_width) != f_true) { - break; - } - } + if (width == 0) { + width = 1; } + // Do not operate on UTF-8 fragments that are not the first byte of the character. 
+ else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } + else { + if (input->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos); + if (input->start + width > input->stop) return f_status_set_error(f_incomplete_utf_on_stop); + } + + input->start += width; + + if (input->start >= buffer.used) return f_none_on_eos; + if (input->start > input->stop) return f_none_on_stop; - input->start++; + max_width = (input->stop - input->start) + 1; + + if (max_width > buffer.used - input->start) { + max_width = buffer.used - input->start; + } } // while + if (f_status_is_error(status)) { + return status; + } + return f_none; } #endif // _di_fl_fss_skip_past_whitespace_ @@ -388,30 +374,56 @@ extern "C" { f_return_status fl_fss_skip_past_all_whitespace(const f_dynamic_string buffer, f_string_location *input) { #ifndef _di_level_1_parameter_checking_ if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter); + if (input == 0) return f_status_set_error(f_invalid_parameter); if (input->start < 0) return f_status_set_error(f_invalid_parameter); if (input->stop < input->start) return f_status_set_error(f_invalid_parameter); if (input->start >= buffer.used) return f_status_set_error(f_invalid_parameter); #endif // _di_level_1_parameter_checking_ f_status status = f_none; - f_u_short max_width = 0; + f_u_short width = 0; - while (input->start < buffer.used && input->start > input->stop) { - if (isgraph(buffer.string[input->start])) break; + f_string_length max_width = (input->stop - input->start) + 1; - if (buffer.string[input->start] != f_fss_delimit_placeholder) { - max_width = (input->stop - input->start) + 1; + if (max_width > buffer.used - input->start) { + max_width = buffer.used - input->start; + } - if (f_utf_is_space(buffer.string + input->start, max_width) != f_true) { - if (f_utf_is_bom(buffer.string + input->start, max_width) != f_true) { - break; - } - } + while (buffer.string[input->start] == f_eos || (status = 
f_utf_is_graph(buffer.string + input->start, max_width)) == f_false) { + if (f_status_is_error(status)) { + return status; + } + + width = f_macro_utf_byte_width_is(buffer.string[input->start]); + + if (width == 0) { + width = 1; } + // Do not operate on UTF-8 fragments that are not the first byte of the character. + else if (width == 1) { + return f_status_set_error(f_incomplete_utf); + } + else { + if (input->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos); + if (input->start + width > input->stop) return f_status_set_error(f_incomplete_utf_on_stop); + } + + input->start += width; + + if (input->start >= buffer.used) return f_none_on_eos; + if (input->start > input->stop) return f_none_on_stop; - input->start++; + max_width = (input->stop - input->start) + 1; + + if (max_width > buffer.used - input->start) { + max_width = buffer.used - input->start; + } } // while + if (f_status_is_error(status)) { + return status; + } + return f_none; } #endif // _di_fl_fss_skip_past_all_whitespace_ diff --git a/level_1/fl_fss/data/build/settings b/level_1/fl_fss/data/build/settings index 557f3e0..5abd997 100644 --- a/level_1/fl_fss/data/build/settings +++ b/level_1/fl_fss/data/build/settings @@ -12,7 +12,7 @@ build_linker ar build_libraries -lc build_libraries_fll -lf_conversion -lf_file -lf_memory -lf_utf build_sources_library fss.c fss_basic.c fss_basic_list.c fss_extended.c -build_sources_program +build_sources_program build_sources_headers fss.h fss_basic.h fss_basic_list.h fss_status.h fss_extended.h fss_macro.h build_sources_bash build_sources_settings diff --git a/level_1/fl_strings/c/strings.c b/level_1/fl_strings/c/strings.c index 58a6fa0..6413553 100644 --- a/level_1/fl_strings/c/strings.c +++ b/level_1/fl_strings/c/strings.c @@ -59,7 +59,11 @@ extern "C" { max_width = buffer.used - location->start; } - while (buffer.string[location->start] == placeholder || (!isgraph(buffer.string[location->start]) && (status = 
f_utf_is_graph(buffer.string + location->start, max_width)) == f_false)) { + while (buffer.string[location->start] == placeholder || (status = f_utf_is_graph(buffer.string + location->start, max_width)) == f_false) { + if (f_status_is_error(status)) { + return status; + } + if (buffer.string[location->start] == f_eol) return f_none_on_eol; width = f_macro_utf_byte_width_is(buffer.string[location->start]); @@ -115,7 +119,11 @@ extern "C" { max_width = buffer.used - location->start; } - while (buffer.string[location->start] == placeholder || (isgraph(buffer.string[location->start]) && (status = f_utf_is_space(buffer.string + location->start, max_width)) == f_false)) { + while (buffer.string[location->start] == placeholder || (status = f_utf_is_space(buffer.string + location->start, max_width)) == f_false) { + if (f_status_is_error(status)) { + return status; + } + if (buffer.string[location->start] == f_eol) return f_none_on_eol; width = f_macro_utf_byte_width_is(buffer.string[location->start]); @@ -175,8 +183,8 @@ extern "C" { } #endif // _di_fl_seek_line_to_ -#ifndef _di_fl_seek_line_to_character_ - f_return_status fl_seek_line_to_character(const f_dynamic_string buffer, f_string_location *location, const f_utf_character seek_to_this) { +#ifndef _di_fl_seek_line_to_utf_character_ + f_return_status fl_seek_line_to_utf_character(const f_dynamic_string buffer, f_string_location *location, const f_utf_character seek_to_this) { #ifndef _di_level_1_parameter_checking_ if (location == 0) return f_status_set_error(f_invalid_parameter); if (location->start < 0) return f_status_set_error(f_invalid_parameter); @@ -240,7 +248,7 @@ extern "C" { return f_none_on_eos; } -#endif // _di_fl_seek_line_to_character_ +#endif // _di_fl_seek_line_to_utf_character_ #ifndef _di_fl_seek_to_ f_return_status fl_seek_to(const f_dynamic_string buffer, f_string_location *location, const char seek_to_this) { @@ -263,8 +271,8 @@ extern "C" { } #endif // _di_fl_seek_to_ -#ifndef 
_di_fl_seek_to_character_ - f_return_status fl_seek_to_character(const f_dynamic_string buffer, f_string_location *location, const f_utf_character seek_to_this) { +#ifndef _di_fl_seek_to_utf_character_ + f_return_status fl_seek_to_utf_character(const f_dynamic_string buffer, f_string_location *location, const f_utf_character seek_to_this) { #ifndef _di_level_1_parameter_checking_ if (location == 0) return f_status_set_error(f_invalid_parameter); if (location->start < 0) return f_status_set_error(f_invalid_parameter); @@ -326,7 +334,7 @@ extern "C" { return f_none_on_eos; } -#endif // _di_fl_seek_to_character_ +#endif // _di_fl_seek_to_utf_character_ #ifndef _di_fl_compare_strings_ f_return_status fl_compare_strings(const f_string string1, const f_string string2, const f_string_length length1, const f_string_length length2) { diff --git a/level_1/fl_strings/c/strings.h b/level_1/fl_strings/c/strings.h index 7e82cbd..6bbf5d3 100644 --- a/level_1/fl_strings/c/strings.h +++ b/level_1/fl_strings/c/strings.h @@ -119,7 +119,7 @@ extern "C" { * f_none_on_stop on success, but stopped stop location. * f_invalid_parameter (with error bit) if a parameter is invalid. * - * @see: fl_seek_line_to_character() + * @see fl_seek_line_to_utf_character() */ #ifndef _di_fl_seek_line_to_ extern f_return_status fl_seek_line_to(const f_dynamic_string buffer, f_string_location *location, const char seek_to_this); @@ -146,11 +146,11 @@ extern "C" { * f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. 
* - * @see: fl_seek_line_to() + * @see fl_seek_line_to() */ -#ifndef _di_fl_seek_line_to_character_ - extern f_return_status fl_seek_line_to_character(const f_dynamic_string buffer, f_string_location *location, const f_utf_character seek_to_this); -#endif // _di_fl_seek_line_to_character_ +#ifndef _di_fl_seek_line_to_utf_character_ + extern f_return_status fl_seek_line_to_utf_character(const f_dynamic_string buffer, f_string_location *location, const f_utf_character seek_to_this); +#endif // _di_fl_seek_line_to_utf_character_ /** * Seek the buffer location forward until the character (1-byte wide) is reached. @@ -172,14 +172,14 @@ extern "C" { * f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. * - * @see: fl_seek_to_character() + * @see fl_seek_to_utf_character() */ #ifndef _di_fl_seek_to_ extern f_return_status fl_seek_to(const f_dynamic_string buffer, f_string_location *location, const char seek_to_this); #endif // _di_fl_seek_to_ /** - * Seek the buffer location forward until the character (up to 4-byte wide) is reached. + * Seek the buffer location forward until the UTF-8 character (up to 4-byte wide) is reached. * * @param buffer * The buffer to traverse. @@ -192,17 +192,18 @@ extern "C" { * @return * f_none on success. * f_none_on_eos on success, but stopped at end of buffer. + * f_none_on_stop on success, but stopped stop location. * f_invalid_utf (with error bit) if character is an invalid UTF-8 character. * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment. * f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed. * f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed. * f_invalid_parameter (with error bit) if a parameter is invalid. 
* - * @see: fl_seek_to() + * @see fl_seek_to() */ -#ifndef _di_fl_seek_to_character_ - extern f_return_status fl_seek_to_character(const f_dynamic_string buffer, f_string_location *location, const f_utf_character seek_to_this); -#endif // _di_fl_seek_to_character_ +#ifndef _di_fl_seek_to_utf_character_ + extern f_return_status fl_seek_to_utf_character(const f_dynamic_string buffer, f_string_location *location, const f_utf_character seek_to_this); +#endif // _di_fl_seek_to_utf_character_ /** * Compare two strings, similar to strncmp(). @@ -224,8 +225,8 @@ extern "C" { * f_not_equal_to when both strings do not equal. * f_invalid_parameter (with error bit) if a parameter is invalid. * - * @see: fl_compare_dynamic_strings() - * @see: fl_compare_dynamic_strings_partial() + * @see fl_compare_dynamic_strings() + * @see fl_compare_dynamic_strings_partial() */ #ifndef _di_fl_compare_strings_ extern f_return_status fl_compare_strings(const f_string string1, const f_string string2, const f_string_length length1, const f_string_length length2); @@ -247,8 +248,8 @@ extern "C" { * f_not_equal_to when both strings do not equal. * f_invalid_parameter (with error bit) if a parameter is invalid. * - * @see: fl_compare_strings() - * @see: fl_compare_dynamic_strings_partial() + * @see fl_compare_strings() + * @see fl_compare_dynamic_strings_partial() */ #ifndef _di_fl_compare_dynamic_strings_ extern f_return_status fl_compare_dynamic_strings(const f_dynamic_string string1, const f_dynamic_string string2); @@ -274,8 +275,8 @@ extern "C" { * f_not_equal_to when both strings do not equal. * f_invalid_parameter (with error bit) if a parameter is invalid. 
* - * @see: fl_compare_strings() - * @see: fl_compare_dynamic_strings() + * @see fl_compare_strings() + * @see fl_compare_dynamic_strings() */ #ifndef _di_fl_compare_partial_dynamic_strings_ extern f_return_status fl_compare_dynamic_strings_partial(const f_dynamic_string string1, const f_dynamic_string string2, const f_string_location offset1, const f_string_location offset2); diff --git a/level_1/fl_utf/c/utf.c b/level_1/fl_utf/c/utf.c new file mode 100644 index 0000000..1afe76b --- /dev/null +++ b/level_1/fl_utf/c/utf.c @@ -0,0 +1,347 @@ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _di_fl_rip_utf_string_ + f_return_status fl_rip_utf_string(const f_utf_string_dynamic buffer, const f_utf_string_location location, f_utf_string_dynamic *result) { + #ifndef _di_level_1_parameter_checking_ + if (location.start < 0) return f_status_set_error(f_invalid_parameter); + if (location.stop < location.start) return f_status_set_error(f_invalid_parameter); + if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter); + if (location.start >= buffer.used) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_1_parameter_checking_ + + // the start and stop point are inclusive locations, and therefore start - stop is actually 1 too few locations + f_utf_string_length size = (location.stop - location.start) + 1; + + if (size > 0) { + f_status status = f_none; + + if (result == 0) { + f_new_utf_string_dynamic(status, (*result), size); + } + else { + f_resize_utf_string_dynamic(status, (*result), size); + } + + if (f_status_is_error(status)) { + return status; + } + + memcpy(result->string, buffer.string + location.start, sizeof(f_utf_character) * size); + result->used = size; + + return f_none; + } + + return f_no_data; + } +#endif // _di_fl_rip_utf_string_ + +#ifndef _di_fl_utf_seek_line_until_graph_ + f_return_status fl_utf_seek_line_until_graph(const f_utf_string_dynamic buffer, f_utf_string_location *location, const f_utf_character 
placeholder) {
+    // Advances location->start while the current character equals placeholder or is reported as
+    // not a graph character by f_utf_is_graph_character().
+    #ifndef _di_level_1_parameter_checking_
+      if (location == 0) return f_status_set_error(f_invalid_parameter);
+      if (location->start < 0) return f_status_set_error(f_invalid_parameter);
+      if (location->stop < location->start) return f_status_set_error(f_invalid_parameter);
+      if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter);
+      if (location->start >= buffer.used) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_1_parameter_checking_
+
+    f_status status = f_none;
+
+    // status is only assigned when the placeholder comparison is false, so it may still hold the previous iteration's value.
+    while (buffer.string[location->start] == placeholder || (status = f_utf_is_graph_character(buffer.string[location->start])) == f_false) {
+      if (f_status_is_error(status)) {
+        return status;
+      }
+
+      // stop at the end of the line without moving past it.
+      if (buffer.string[location->start] == f_utf_character_eol) return f_none_on_eol;
+
+      if (f_macro_utf_character_width_is(buffer.string[location->start]) == 1) {
+        return f_status_set_error(f_invalid_utf);
+      }
+
+      location->start++;
+
+      if (location->start >= buffer.used) return f_none_on_eos;
+      if (location->start > location->stop) return f_none_on_stop;
+    } // while
+
+    // the loop also ends when the check itself returned an error status.
+    if (f_status_is_error(status)) {
+      return status;
+    }
+
+    return f_none;
+  }
+#endif // _di_fl_utf_seek_line_until_graph_
+
+// Advances location->start while the current character equals placeholder or is reported as
+// not a space character by f_utf_is_space_character(); otherwise mirrors fl_utf_seek_line_until_graph().
+#ifndef _di_fl_utf_seek_line_until_non_graph_
+  f_return_status fl_utf_seek_line_until_non_graph(const f_utf_string_dynamic buffer, f_utf_string_location *location, const f_utf_character placeholder) {
+    #ifndef _di_level_1_parameter_checking_
+      if (location == 0) return f_status_set_error(f_invalid_parameter);
+      if (location->start < 0) return f_status_set_error(f_invalid_parameter);
+      if (location->stop < location->start) return f_status_set_error(f_invalid_parameter);
+      if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter);
+      if (location->start >= buffer.used) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_1_parameter_checking_
+
+    f_status status = f_none;
+
+    while (buffer.string[location->start] == placeholder || (status = f_utf_is_space_character(buffer.string[location->start])) == f_false) {
+      if (f_status_is_error(status)) {
+        return status;
+      }
+
+      // stop at the end of the line without moving past it.
+      if (buffer.string[location->start] == f_utf_character_eol) return f_none_on_eol;
+
+      if (f_macro_utf_character_width_is(buffer.string[location->start]) == 1) {
+        return f_status_set_error(f_invalid_utf);
+      }
+
+      location->start++;
+
+      if (location->start >= buffer.used) return f_none_on_eos;
+      if (location->start > location->stop) return f_none_on_stop;
+    } // while
+
+    // the loop also ends when the check itself returned an error status.
+    if (f_status_is_error(status)) {
+      return status;
+    }
+
+    return f_none;
+  }
+#endif // _di_fl_utf_seek_line_until_non_graph_
+
+// Advances location->start until the current character equals seek_to_this, returning early with
+// f_none_on_eol, f_none_on_eos or f_none_on_stop when the line, the used range or the stop position ends first.
+#ifndef _di_fl_utf_seek_line_to_
+  f_return_status fl_utf_seek_line_to(const f_utf_string_dynamic buffer, f_utf_string_location *location, const f_utf_character seek_to_this) {
+    #ifndef _di_level_1_parameter_checking_
+      if (location == 0) return f_status_set_error(f_invalid_parameter);
+      if (location->start < 0) return f_status_set_error(f_invalid_parameter);
+      if (location->stop < location->start) return f_status_set_error(f_invalid_parameter);
+      if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter);
+      if (location->start >= buffer.used) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_1_parameter_checking_
+
+    while (buffer.string[location->start] != seek_to_this) {
+      if (buffer.string[location->start] == f_utf_character_eol) return f_none_on_eol;
+
+      if (f_macro_utf_character_width_is(buffer.string[location->start]) == 1) {
+        return f_status_set_error(f_invalid_utf);
+      }
+
+      location->start++;
+
+      if (location->start >= buffer.used) return f_none_on_eos;
+      if (location->start > location->stop) return f_none_on_stop;
+    } // while
+
+    return f_none;
+  }
+#endif // _di_fl_utf_seek_line_to_
+
+#ifndef _di_fl_utf_seek_line_to_char_
+  f_return_status fl_utf_seek_line_to_char(const f_utf_string_dynamic buffer, f_utf_string_location *location, const char seek_to_this) {
+    #ifndef _di_level_1_parameter_checking_
+ if (location == 0) return f_status_set_error(f_invalid_parameter); + if (location->start < 0) return f_status_set_error(f_invalid_parameter); + if (location->stop < location->start) return f_status_set_error(f_invalid_parameter); + if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter); + if (location->start >= buffer.used) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_1_parameter_checking_ + + f_utf_character seek_to_character = seek_to_this << 24; + + while (buffer.string[location->start] != seek_to_character) { + if (buffer.string[location->start] == f_utf_character_eol) return f_none_on_eol; + + if (f_macro_utf_character_width_is(buffer.string[location->start]) == 1) { + return f_status_set_error(f_invalid_utf); + } + + location->start++; + + if (location->start >= buffer.used) return f_none_on_eos; + if (location->start > location->stop) return f_none_on_stop; + } // while + + return f_none; + } +#endif // _di_fl_utf_seek_line_to_character_ + +#ifndef _di_fl_utf_string_seek_to_ + f_return_status fl_utf_string_seek_to(const f_utf_string_dynamic buffer, f_utf_string_location *location, const f_utf_character seek_to_this) { + #ifndef _di_level_1_parameter_checking_ + if (location == 0) return f_status_set_error(f_invalid_parameter); + if (location->start < 0) return f_status_set_error(f_invalid_parameter); + if (location->stop < location->start) return f_status_set_error(f_invalid_parameter); + if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter); + if (location->start >= buffer.used) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_1_parameter_checking_ + + while (buffer.string[location->start] != seek_to_this) { + if (f_macro_utf_character_width_is(buffer.string[location->start]) == 1) { + return f_status_set_error(f_invalid_utf); + } + + location->start++; + + if (location->start >= buffer.used) return f_none_on_eos; + if (location->start > location->stop) return f_none_on_stop; + } 
// while + + return f_none; + } +#endif // _di_fl_utf_string_seek_to_ + +#ifndef _di_fl_utf_string_seek_to_char_ + f_return_status fl_utf_string_seek_to_char(const f_utf_string_dynamic buffer, f_utf_string_location *location, const char seek_to_this) { + #ifndef _di_level_1_parameter_checking_ + if (location == 0) return f_status_set_error(f_invalid_parameter); + if (location->start < 0) return f_status_set_error(f_invalid_parameter); + if (location->stop < location->start) return f_status_set_error(f_invalid_parameter); + if (buffer.used <= 0) return f_status_set_error(f_invalid_parameter); + if (location->start >= buffer.used) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_1_parameter_checking_ + + f_utf_character seek_to_character = seek_to_this << 24; // place the byte in the most significant octet of the 4-byte character. NOTE(review): shifting a negative (or >= 0x80) char left by 24 overflows int, which is undefined; confirm only 7-bit ASCII is passed. + + while (buffer.string[location->start] != seek_to_character) { + if (f_macro_utf_character_width_is(buffer.string[location->start]) == 1) { + return f_status_set_error(f_invalid_utf); + } + + location->start++; + + if (location->start >= buffer.used) return f_none_on_eos; + if (location->start > location->stop) return f_none_on_stop; + } // while + + return f_none; + } +#endif // _di_fl_utf_string_seek_to_char_ + +#ifndef _di_fl_utf_string_compare_ + f_return_status fl_utf_string_compare(const f_utf_string string1, const f_utf_string string2, const f_utf_string_length length1, const f_utf_string_length length2) { + #ifndef _di_level_1_parameter_checking_ + if (length1 <= 0) return f_status_set_error(f_invalid_parameter); + if (length2 <= 0) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_1_parameter_checking_ + + f_utf_string_length i1 = 0; + f_utf_string_length i2 = 0; + + for (; i1 < length1 && i2 < length2; i1++, i2++) { + // skip past EOS (NULL) characters in string1. + while (i1 < length1 && string1[i1] == f_utf_character_eos) i1++; + if (i1 == length1) break; + + // skip past EOS (NULL) characters in string2.
+ while (i2 < length2 && string2[i2] == f_utf_character_eos) i2++; + if (i2 == length2) break; + + if (string1[i1] != string2[i2]) return f_not_equal_to; + } // for + + // only return f_equal_to if all remaining characters are NULL. + while (i1 < length1) { + if (string1[i1] != f_utf_character_eos) return f_not_equal_to; + i1++; + } // while + + while (i2 < length2) { + if (string2[i2] != f_utf_character_eos) return f_not_equal_to; + i2++; + } // while + + return f_equal_to; + } +#endif // _di_fl_utf_string_compare_ + +#ifndef _di_fl_utf_string_dynamic_compare_ + f_return_status fl_utf_string_dynamic_compare(const f_utf_string_dynamic string1, const f_utf_string_dynamic string2) { + #ifndef _di_level_1_parameter_checking_ + if (string1.used <= 0) return f_status_set_error(f_invalid_parameter); + if (string2.used <= 0) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_1_parameter_checking_ + + f_utf_string_length i1 = 0; + f_utf_string_length i2 = 0; + + for (; i1 < string1.used && i2 < string2.used; i1++, i2++) { + // skip past EOS (NULL) characters in string1. + while (i1 < string1.used && string1.string[i1] == f_utf_character_eos) i1++; + if (i1 == string1.used) break; + + // skip past EOS (NULL) characters in string2. + while (i2 < string2.used && string2.string[i2] == f_utf_character_eos) i2++; + if (i2 == string2.used) break; + + if (string1.string[i1] != string2.string[i2]) return f_not_equal_to; + } // for + + // only return f_equal_to if all remaining characters are NULL.
+ while (i1 < string1.used) { + if (string1.string[i1] != f_utf_character_eos) return f_not_equal_to; + i1++; + } // while + + while (i2 < string2.used) { + if (string2.string[i2] != f_utf_character_eos) return f_not_equal_to; + i2++; + } // while + + return f_equal_to; + } +#endif // _di_fl_utf_string_dynamic_compare_ + +#ifndef _di_fl_compare_partial_dynamic_strings_ // NOTE(review): utf.h guards the matching prototype with _di_fl_utf_string_compare_dynamic_partial_; the two macro names should agree. + f_return_status fl_utf_string_compare_dynamic_partial(const f_utf_string_dynamic string1, const f_utf_string_dynamic string2, const f_utf_string_location offset1, const f_utf_string_location offset2) { + #ifndef _di_level_1_parameter_checking_ + if (string1.used <= 0) return f_status_set_error(f_invalid_parameter); + if (string2.used <= 0) return f_status_set_error(f_invalid_parameter); + + if (offset1.start > offset1.stop) return f_status_set_error(f_invalid_parameter); + if (offset2.start > offset2.stop) return f_status_set_error(f_invalid_parameter); + + if (string1.used <= offset1.stop) return f_status_set_error(f_invalid_parameter); + if (string2.used <= offset2.stop) return f_status_set_error(f_invalid_parameter); + #endif // _di_level_1_parameter_checking_ + + f_utf_string_length i1 = offset1.start; + f_utf_string_length i2 = offset2.start; + + // the stop positions are inclusive, so convert them into exclusive loop bounds. + const f_utf_string_length stop1 = offset1.stop + 1; + const f_utf_string_length stop2 = offset2.stop + 1; + + for (; i1 < stop1 && i2 < stop2; i1++, i2++) { + // skip past EOS (NULL) characters in string1. + while (i1 < stop1 && string1.string[i1] == f_utf_character_eos) i1++; + if (i1 == stop1) break; + + // skip past EOS (NULL) characters in string2. + while (i2 < stop2 && string2.string[i2] == f_utf_character_eos) i2++; + if (i2 == stop2) break; + + if (string1.string[i1] != string2.string[i2]) return f_not_equal_to; + } // for + + // only return f_equal_to if all remaining characters are NULL.
+ while (i1 < stop1) { + if (string1.string[i1] != f_utf_character_eos) return f_not_equal_to; + i1++; + } // while + + while (i2 < stop2) { + if (string2.string[i2] != f_utf_character_eos) return f_not_equal_to; + i2++; + } // while + + return f_equal_to; + } +#endif // _di_fl_compare_partial_dynamic_strings_ + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/level_1/fl_utf/c/utf.h b/level_1/fl_utf/c/utf.h new file mode 100644 index 0000000..9776e0a --- /dev/null +++ b/level_1/fl_utf/c/utf.h @@ -0,0 +1,274 @@ +/** + * FLL - Level 1 + * + * Project: Utf + * API Version: 0.5 + * Licenses: lgplv2.1 + * + * Provides UTF-8 character manipulation and processing capabilities. + */ +#ifndef _FL_strings_h // NOTE(review): guard name looks copied from strings.h; if that header uses the same macro, including both would drop one of them. confirm and consider a utf-specific name. +#define _FL_strings_h + +// libc includes +#include +#include + +// fll includes +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Allocate a new UTF-8 string from the provided range in the buffer. + * + * @param buffer + * The buffer to rip from. + * @param location + * A range within the buffer representing the string to rip. + * @param result + * The new string, which will be allocated or reallocated as necessary. + * + * @return + * f_none on success. + * f_no_data if nothing to rip, no allocations or reallocations are performed. + * f_invalid_parameter (with error bit) if a parameter is invalid. + * f_allocation_error (with error bit) on memory allocation error. + * f_reallocation_error (with error bit) on memory reallocation error. + */ +#ifndef _di_fl_rip_utf_string_ + extern f_return_status fl_rip_utf_string(const f_utf_string_dynamic buffer, const f_utf_string_location location, f_utf_string_dynamic *result); +#endif // _di_fl_rip_utf_string_ + +/** + * Increment buffer location until a graph character or an EOL is matched. + * + * This will ignore the UTF-8 BOM. + * + * @param buffer + * The buffer to traverse.
+ * @param location + * A range within the buffer representing the start and stop locations. + * @param placeholder + * A UTF-8 character representing a placeholder to ignore (may be NULL). + * + * @return + * f_none on success. + * f_none_on_eol on success, but stopped at EOL. + * f_none_on_eos on success, but stopped at end of buffer. + * f_invalid_utf (with error bit) if a character in the buffer is an invalid UTF-8 character. + * f_invalid_parameter (with error bit) if a parameter is invalid. + */ +#ifndef _di_fl_utf_seek_line_until_graph_ + extern f_return_status fl_utf_seek_line_until_graph(const f_utf_string_dynamic buffer, f_utf_string_location *location, const f_utf_character placeholder); +#endif // _di_fl_utf_seek_line_until_graph_ + +/** + * Increment buffer location until a non-graph character or an EOL is matched. + * + * This will ignore the UTF-8 BOM. + * + * @param buffer + * The buffer to traverse. + * @param location + * A range within the buffer representing the start and stop locations. + * @param placeholder + * A UTF-8 character representing a placeholder to ignore (may be NULL). + * + * @return + * f_none on success. + * f_none_on_eol on success, but stopped at EOL. + * f_none_on_eos on success, but stopped at end of buffer. + * f_none_on_stop on success, but stopped at the stop location. + * f_invalid_utf (with error bit) if a character in the buffer is an invalid UTF-8 character. + * f_invalid_parameter (with error bit) if a parameter is invalid. + */ +#ifndef _di_fl_utf_seek_line_until_non_graph_ + extern f_return_status fl_utf_seek_line_until_non_graph(const f_utf_string_dynamic buffer, f_utf_string_location *location, const f_utf_character placeholder); +#endif // _di_fl_utf_seek_line_until_non_graph_ + +/** + * Seek the buffer location forward until the UTF-8 character or EOL is reached. + * + * @param buffer + * The buffer to traverse. + * @param location + * A range within the buffer representing the start and stop locations.
+ * The start location will be incremented by seek. + * @param seek_to_this + * A UTF-8 character representing a character to seek to. + * + * @return + * f_none on success. + * f_none_on_eol on success, but stopped at EOL. + * f_none_on_eos on success, but stopped at end of buffer. + * f_none_on_stop on success, but stopped stop location. + * f_invalid_utf (with error bit) if a character in the buffer is an invalid UTF-8 character. + * f_invalid_parameter (with error bit) if a parameter is invalid. + * + * @see fl_utf_seek_line_to_char() + */ +#ifndef _di_fl_utf_seek_line_to_ + extern f_return_status fl_utf_seek_line_to(const f_utf_string_dynamic buffer, f_utf_string_location *location, const f_utf_character seek_to_this); +#endif // _di_fl_utf_seek_line_to_ + +/** + * Seek the buffer location forward until the 1-byte wide character or EOL is reached. + * + * @param buffer + * The buffer to traverse. + * @param location + * A range within the buffer representing the start and stop locations. + * The start location will be incremented by seek. + * @param seek_to_this + * A single-width non-UTF-8 character. + * + * @return + * f_none on success. + * f_none_on_eol on success, but stopped at EOL. + * f_none_on_eos on success, but stopped at end of buffer. + * f_invalid_utf (with error bit) if a character in the buffer is an invalid UTF-8 character. + * f_invalid_parameter (with error bit) if a parameter is invalid. + * + * @see fl_utf_seek_line_to() + */ +#ifndef _di_fl_utf_string_seek_line_to_char_ + extern f_return_status fl_utf_seek_line_to_char(const f_utf_string_dynamic buffer, f_utf_string_location *location, const char seek_to_this); +#endif // _di_fl_utf_string_seek_line_to_char_ + +/** + * Seek the buffer location forward until the character (1-byte wide) is reached. + * + * @param buffer + * The buffer to traverse. + * @param location + * A range within the buffer representing the start and stop locations. + * The start location will be incremented by seek. 
+ * @param seek_to_this + * A UTF-8 character representing a character to seek to. + * + * @return + * f_none on success. + * f_none_on_eos on success, but stopped at end of buffer. + * f_none_on_stop on success, but stopped stop location. + * f_invalid_utf (with error bit) if a character in the buffer is an invalid UTF-8 character. + * f_invalid_parameter (with error bit) if a parameter is invalid. + * + * @see fl_utf_string_seek_to_char() + */ +#ifndef _di_fl_utf_string_seek_to_ + extern f_return_status fl_utf_string_seek_to(const f_utf_string_dynamic buffer, f_utf_string_location *location, const f_utf_character seek_to_this); +#endif // _di_fl_utf_string_seek_to_ + +/** + * Seek the buffer location forward until the UTF-8 character (up to 4-byte wide) is reached. + * + * @param buffer + * The buffer to traverse. + * @param location + * A range within the buffer representing the start and stop locations. + * The start location will be incremented by seek. + * @param seek_to_this + * A single-width non-UTF-8 character. + * + * @return + * f_none on success. + * f_none_on_eos on success, but stopped at end of buffer. + * f_none_on_stop on success, but stopped stop location. + * f_invalid_utf (with error bit) if a character in the buffer is an invalid UTF-8 character. + * f_invalid_parameter (with error bit) if a parameter is invalid. + * + * @see fl_utf_string_seek_to() + */ +#ifndef _di_fl_utf_string_seek_to_character_ + extern f_return_status fl_utf_string_seek_to_char(const f_utf_string_dynamic buffer, f_utf_string_location *location, const char seek_to_this); +#endif // _di_fl_utf_string_seek_to_character_ + +/** + * Compare two strings, similar to strncmp(). + * + * This does not stop on NULL. + * NULL characters are ignored. + * + * @param string1 + * String to compare. + * @param string2 + * String to compare. + * @param length1 + * Length of string1. + * @param length2 + * Length of string2. + * + * @return + * f_equal_to when both strings equal. 
+ * f_not_equal_to when the strings are not equal. + * f_invalid_parameter (with error bit) if a parameter is invalid. + * + * @see fl_utf_string_dynamic_compare() + * @see fl_utf_string_compare_dynamic_partial() + */ +#ifndef _di_fl_utf_string_compare_ + extern f_return_status fl_utf_string_compare(const f_utf_string string1, const f_utf_string string2, const f_utf_string_length length1, const f_utf_string_length length2); +#endif // _di_fl_utf_string_compare_ + +/** + * Compare two strings, similar to strncmp(). + * + * This does not stop on NULL. + * NULL characters are ignored. + * + * @param string1 + * String to compare. + * @param string2 + * String to compare. + * + * @return + * f_equal_to when both strings are equal. + * f_not_equal_to when the strings are not equal. + * f_invalid_parameter (with error bit) if a parameter is invalid. + * + * @see fl_utf_string_compare() + * @see fl_utf_string_compare_dynamic_partial() + */ +#ifndef _di_fl_utf_string_dynamic_compare_ + extern f_return_status fl_utf_string_dynamic_compare(const f_utf_string_dynamic string1, const f_utf_string_dynamic string2); +#endif // _di_fl_utf_string_dynamic_compare_ + +/** + * Compare two strings, similar to strncmp(), but restricted to the given ranges. + * + * This does not stop on NULL. + * NULL characters are ignored. + * + * @param string1 + * String to compare. + * @param string2 + * String to compare. + * @param offset1 + * A range within string1 to restrict the comparison to. + * @param offset2 + * A range within string2 to restrict the comparison to. + * + * @return + * f_equal_to when both strings are equal. + * f_not_equal_to when the strings are not equal. + * f_invalid_parameter (with error bit) if a parameter is invalid.
+ * + * @see fl_utf_string_compare() + * @see fl_utf_string_dynamic_compare() + */ +#ifndef _di_fl_utf_string_compare_dynamic_partial_ + extern f_return_status fl_utf_string_compare_dynamic_partial(const f_utf_string_dynamic string1, const f_utf_string_dynamic string2, const f_utf_string_location offset1, const f_utf_string_location offset2); +#endif // _di_fl_utf_string_compare_dynamic_partial_ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // _FL_strings_h diff --git a/level_1/fl_utf/data/build/dependencies b/level_1/fl_utf/data/build/dependencies new file mode 100644 index 0000000..41ae6af --- /dev/null +++ b/level_1/fl_utf/data/build/dependencies @@ -0,0 +1,5 @@ +f_types +f_errors +f_memory +f_strings +f_utf diff --git a/level_1/fl_utf/data/build/settings b/level_1/fl_utf/data/build/settings new file mode 100644 index 0000000..b3c4e38 --- /dev/null +++ b/level_1/fl_utf/data/build/settings @@ -0,0 +1,30 @@ +# fss-0000 + +project_name fl_utf +project_level 1 + +version_major 0 +version_minor 5 +version_micro 0 + +build_compiler gcc +build_linker ar +build_libraries -lc +build_libraries_fll -lf_memory -lf_utf +build_sources_library utf.c +build_sources_program +build_sources_headers utf.h +build_sources_bash +build_sources_settings +build_shared yes +build_static yes + +defines_all +defines_static +defines_shared + +flags_all -z now +flags_shared +flags_static +flags_library -fPIC +flags_program -fPIE