From 9050db201e1b7f6f603703589dae015fb9c4f8d1 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sat, 20 Jun 2020 11:42:45 -0500 Subject: [PATCH] Update: relocate utf.h code into utf-common.h and improve zero-width support There is too much in utf.h. Move the defines and other common things into utf-common.h. There actually are zero-width ASCII characters (such as bell). Fix this inaccuracy and handle all of the zero-width ASCII characters. Newlines, tabs, and otherwise space that prints something, including vertical tabs however odd, are not to be considered zero-width. --- build/level_0/settings | 2 +- build/monolithic/settings | 2 +- level_0/f_utf/c/utf-common.h | 590 ++++++++++++++++++++++++++++++++++++++ level_0/f_utf/c/utf.c | 36 ++- level_0/f_utf/c/utf.h | 570 +----------------------------------- level_0/f_utf/data/build/settings | 2 +- 6 files changed, 630 insertions(+), 572 deletions(-) create mode 100644 level_0/f_utf/c/utf-common.h diff --git a/build/level_0/settings b/build/level_0/settings index f448fb3..37fa8e7 100644 --- a/build/level_0/settings +++ b/build/level_0/settings @@ -22,7 +22,7 @@ build_libraries -lc build_libraries-level build_sources_library console.c conversion.c directory.c private-directory.c environment.c private-environment.c file.c private-file.c fss.c memory.c path.c pipe.c print.c serialize.c private-serialize.c socket.c utf.c private-utf.c build_sources_program -build_sources_headers color.h console.h conversion.h directory.h directory_type.h environment.h file.h fss.h fss-common.h fss-named.h fss-nest.h fss-set.h memory.h path.h pipe.h print.h serialize.h socket.h status.h status_array.h string.h type.h type_array.h utf.h +build_sources_headers color.h console.h conversion.h directory.h directory_type.h environment.h file.h fss.h fss-common.h fss-named.h fss-nest.h fss-set.h memory.h path.h pipe.h print.h serialize.h socket.h status.h status_array.h string.h type.h type_array.h utf.h utf-common.h build_sources_script 
build_sources_setting build_script yes diff --git a/build/monolithic/settings b/build/monolithic/settings index deb6246..f119b30 100644 --- a/build/monolithic/settings +++ b/build/monolithic/settings @@ -22,7 +22,7 @@ build_libraries -lc build_libraries-monolithic build_sources_library level_0/console.c level_0/conversion.c level_0/directory.c level_0/private-directory.c level_0/environment.c level_0/private-environment.c level_0/file.c level_0/private-file.c level_0/fss.c level_0/memory.c level_0/path.c level_0/pipe.c level_0/print.c level_0/serialize.c level_0/private-serialize.c level_0/socket.c level_0/utf.c level_0/private-utf.c level_1/color.c level_1/console.c level_1/directory.c level_1/private-directory.c level_1/fss.c level_1/private-fss.c level_1/fss_basic.c level_1/fss_basic_list.c level_1/fss_extended.c level_1/fss_extended_list.c level_1/print.c level_1/status.c level_1/string.c level_1/private-string.c level_1/utf.c level_1/private-utf.c level_1/utf_file.c level_1/private-utf_file.c level_2/execute.c level_2/private-execute.c level_2/file.c level_2/fss.c level_2/fss_basic.c level_2/fss_basic_list.c level_2/fss_extended.c level_2/fss_extended_list.c level_2/fss_status.c level_2/program.c level_2/status.c build_sources_program -build_sources_headers level_0/color.h level_0/console.h level_0/conversion.h level_0/directory.h level_0/directory_type.h level_0/environment.h level_0/file.h level_0/fss.h level_0/fss-common.h level_0/fss-named.h level_0/fss-nest.h level_0/fss-set.h level_0/memory.h level_0/path.h level_0/pipe.h level_0/print.h level_0/serialize.h level_0/socket.h level_0/status.h level_0/status_array.h level_0/string.h level_0/type.h level_0/type_array.h level_0/utf.h level_1/color.h level_1/console.h level_1/directory.h level_1/fss.h level_1/fss_basic.h level_1/fss_basic_list.h level_1/fss_extended.h level_1/fss_extended_list.h level_1/fss_macro.h level_1/fss_status.h level_1/print.h level_1/status.h level_1/string.h level_1/utf.h 
level_1/utf_file.h level_2/execute.h level_2/file.h level_2/fss.h level_2/fss_basic.h level_2/fss_basic_list.h level_2/fss_extended.h level_2/fss_extended_list.h level_2/fss_status.h level_2/program.h level_2/status.h +build_sources_headers level_0/color.h level_0/console.h level_0/conversion.h level_0/directory.h level_0/directory_type.h level_0/environment.h level_0/file.h level_0/fss.h level_0/fss-common.h level_0/fss-named.h level_0/fss-nest.h level_0/fss-set.h level_0/memory.h level_0/path.h level_0/pipe.h level_0/print.h level_0/serialize.h level_0/socket.h level_0/status.h level_0/status_array.h level_0/string.h level_0/type.h level_0/type_array.h level_0/utf.h level_0/utf-common.h level_1/color.h level_1/console.h level_1/directory.h level_1/fss.h level_1/fss_basic.h level_1/fss_basic_list.h level_1/fss_extended.h level_1/fss_extended_list.h level_1/fss_macro.h level_1/fss_status.h level_1/print.h level_1/status.h level_1/string.h level_1/utf.h level_1/utf_file.h level_2/execute.h level_2/file.h level_2/fss.h level_2/fss_basic.h level_2/fss_basic_list.h level_2/fss_extended.h level_2/fss_extended_list.h level_2/fss_status.h level_2/program.h level_2/status.h build_sources_script build_sources_setting build_script yes diff --git a/level_0/f_utf/c/utf-common.h b/level_0/f_utf/c/utf-common.h new file mode 100644 index 0000000..bee9cfb --- /dev/null +++ b/level_0/f_utf/c/utf-common.h @@ -0,0 +1,590 @@ +/** + * FLL - Level 0 + * + * Project: UTF + * API Version: 0.5 + * Licenses: lgplv2.1 + * + * Defines common data to be used for/by project utf. + * + * This is auto-included by utf.h and should not need to be explicitly included. + */ +#ifndef _F_utf_common_h +#define _F_utf_common_h + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Define the UTF-8 bytes. + * + * The bytes are for checking a single 8-bit character value (specifically, checking the first bits). + * + * The byte offs are used for determining the character width of a UTF-8 character. 
+ * + * The f_macro_utf_byte_is macro will return non-zero if the character is a UTF-8 character of any width. + * + * The f_utf_byte_1 is specifically used only on UTF-8 fragments. + * For example, with the 2-byte-wide UTF-8 character '110x xxxx 10yy yyyy', the 8-bit block '10yy yyyy' would be a fragment. + * + * The f_macro_utf_byte_is_* macros are used to determine a width of the character (either 1, 2, 3, or 4, respectively). + * + * The f_macro_utf_byte_width macro determines a width of the character. + * The f_macro_utf_byte_width_is is identical to f_macro_utf_byte_width, except it returns 0 when character is not UTF-8. + */ +#ifndef _di_f_utf_byte_ + #define f_utf_byte_1 0x80 // 1000 0000 + #define f_utf_byte_2 0xc0 // 1100 0000 + #define f_utf_byte_3 0xe0 // 1110 0000 + #define f_utf_byte_4 0xf0 // 1111 0000 + + #define f_utf_byte_off_1 0xc0 // 1100 0000 + #define f_utf_byte_off_2 0xe0 // 1110 0000 + #define f_utf_byte_off_3 0xf0 // 1111 0000 + #define f_utf_byte_off_4 0xf8 // 1111 1000 + + #define f_macro_utf_byte_is(character) ((character) & f_utf_byte_1) + + #define f_macro_utf_byte_is_1(character) (((character) & f_utf_byte_off_1) == f_utf_byte_1) // (10xx xxxx & 1100 0000) == 1000 0000 + #define f_macro_utf_byte_is_2(character) (((character) & f_utf_byte_off_2) == f_utf_byte_2) // (110x xxxx & 1110 0000) == 1100 0000 + #define f_macro_utf_byte_is_3(character) (((character) & f_utf_byte_off_3) == f_utf_byte_3) // (1110 xxxx & 1111 0000) == 1110 0000 + #define f_macro_utf_byte_is_4(character) (((character) & f_utf_byte_off_4) == f_utf_byte_4) // (1111 0xxx & 1111 1000) == 1111 0000 + + #define f_macro_utf_byte_width(character) ((!f_macro_utf_byte_is(character) || f_macro_utf_byte_is_1(character)) ? 1 : (f_macro_utf_byte_is_2(character) ? 2 : (f_macro_utf_byte_is_3(character) ? 3 : 4))) + #define f_macro_utf_byte_width_is(character) (f_macro_utf_byte_is(character) ? (f_macro_utf_byte_is_1(character) ? 1 : (f_macro_utf_byte_is_2(character) ? 
2 : (f_macro_utf_byte_is_3(character) ? 3 : 4))) : 0) +#endif // _di_f_utf_byte_ + +/** + * Provide a basic UTF-8 character as a single 4-byte variable. + * + * This is intended to be used when a single variable is desired to represent a 1-byte, 2-byte, 3-byte, or even 4-byte character. + * + * This "character" type is stored as a big-endian 4-byte integer (32-bits). + * A helper function, f_utf_is_big_endian(), is provided to detect system endianness so that character arrays (int8_t []) can be correctly processed. + * + * The byte structure is intended to be read left to right. + * + * The f_utf_character_mask_byte_* are used to get the entire character set for a given width. + * + * The f_utf_character_mask_char_* are used to get a specific UTF-8 block as a single character range. + * + * The f_macro_utf_character_to_char_* are used to convert a f_utf_character into a int8_t, for a given 8-bit block. + * + * The f_macro_utf_character_from_char_* are used to convert a int8_t into part of a f_utf_character, for a given 8-bit block. + * + * The f_macro_utf_character_width is used to determine the width of the UTF-8 character based on f_macro_utf_byte_width. + * The f_macro_utf_character_width_is is used to determine the width of the UTF-8 character based on f_macro_utf_byte_width_is. 
+ * + * @see f_utf_is_big_endian() + */ +#ifndef _di_f_utf_character_ + typedef uint32_t f_utf_character; + + #define f_utf_character_mask_byte_1 0xff000000 // 1111 1111, 0000 0000, 0000 0000, 0000 0000 + #define f_utf_character_mask_byte_2 0xffff0000 // 1111 1111, 1111 1111, 0000 0000, 0000 0000 + #define f_utf_character_mask_byte_3 0xffffff00 // 1111 1111, 1111 1111, 1111 1111, 0000 0000 + #define f_utf_character_mask_byte_4 0xffffffff // 1111 1111, 1111 1111, 1111 1111, 1111 1111 + + #define f_utf_character_mask_char_1 0xff000000 // 1111 1111, 0000 0000, 0000 0000, 0000 0000 + #define f_utf_character_mask_char_2 0x00ff0000 // 0000 0000, 1111 1111, 0000 0000, 0000 0000 + #define f_utf_character_mask_char_3 0x0000ff00 // 0000 0000, 0000 0000, 1111 1111, 0000 0000 + #define f_utf_character_mask_char_4 0x000000ff // 0000 0000, 0000 0000, 0000 0000, 1111 1111 + + #define f_macro_utf_character_to_char_1(character) (((character) & f_utf_character_mask_char_1) >> 24) // grab first byte. + #define f_macro_utf_character_to_char_2(character) (((character) & f_utf_character_mask_char_2) >> 16) // grab second byte. + #define f_macro_utf_character_to_char_3(character) (((character) & f_utf_character_mask_char_3) >> 8) // grab third byte. + #define f_macro_utf_character_to_char_4(character) ((character) & f_utf_character_mask_char_4) // grab fourth byte. + + #define f_macro_utf_character_from_char_1(character) (((character) << 24) & f_utf_character_mask_char_1) // shift to first byte. + #define f_macro_utf_character_from_char_2(character) (((character) << 16) & f_utf_character_mask_char_2) // shift to second byte. + #define f_macro_utf_character_from_char_3(character) (((character) << 8) & f_utf_character_mask_char_3) // shift to third byte. + #define f_macro_utf_character_from_char_4(character) ((character) & f_utf_character_mask_char_4) // mask only, already the fourth byte. 
+ + #define f_macro_utf_character_width(character) (f_macro_utf_byte_width(f_macro_utf_character_to_char_1(character))) + #define f_macro_utf_character_width_is(character) (f_macro_utf_byte_width_is(f_macro_utf_character_to_char_1(character))) +#endif // _di_f_utf_character_ + +#ifndef _di_f_utf_character_have_eol_ + #define f_utf_character_eol 0x0a000000 // 0000 1010, 0000 0000, 0000 0000, 0000 0000 +#endif // _di_f_utf_character_have_eol_ + +#ifndef _di_f_utf_character_have_eos_ + #define f_utf_character_eos 0x00000000 // 0000 0000, 0000 0000, 0000 0000, 0000 0000 +#endif // _di_f_utf_character_have_eos_ + +#ifndef _di_f_utf_character_have_placeholder_ + #define f_utf_character_placeholder 0x00000000 // 0000 0000, 0000 0000, 0000 0000, 0000 0000 +#endif // _di_f_utf_character_have_placeholder_ + +/** + * Provide a string of UTF-8 characters, each stored 4-bytes wide (one f_utf_character per element). + */ +#ifndef _di_f_utf_string_ + typedef f_utf_character *f_utf_string; + + #define f_utf_string_max_size f_type_number_size_unsigned + #define f_utf_string_initialize f_utf_character_eos + + #define f_macro_utf_string_new(status, string, length) status = f_memory_new((void **) & string, sizeof(f_utf_string), length) + + #define f_macro_utf_string_delete(status, string, size) status = f_memory_delete((void **) & string, sizeof(f_utf_string), size) + #define f_macro_utf_string_destroy(status, string, size) status = f_memory_destroy((void **) & string, sizeof(f_utf_string), size) + + #define f_macro_utf_string_resize(status, string, old_length, new_length) status = f_memory_resize((void **) & string, sizeof(f_utf_string), old_length, new_length) + #define f_macro_utf_string_adjust(status, string, old_length, new_length) status = f_memory_adjust((void **) & string, sizeof(f_utf_string), old_length, new_length) +#endif // _di_f_utf_string_ + +/** + * Provide a length type specifically for UTF-8 strings. 
+ */ +#ifndef _di_f_utf_string_length_ + typedef f_number_unsigned f_utf_string_length; + + #define f_utf_string_length_size 0xfffffffffffffffe + #define f_utf_string_length_size_max f_type_number_size_max_unsigned + + #define f_macro_utf_string_length_new(status, string, length) status = f_memory_new((void **) & string, sizeof(f_utf_string_length), length) + + #define f_macro_utf_string_length_delete(status, string, length) status = f_memory_delete((void **) & string, sizeof(f_utf_string_length), length) + #define f_macro_utf_string_length_destroy(status, string, size) status = f_memory_destroy((void **) & string, sizeof(f_utf_string_length), size) // same (void **) cast as every sibling macro. + + #define f_macro_utf_string_length_resize(status, length, old_length, new_length) status = f_memory_resize((void **) & length, sizeof(f_utf_string_length), old_length, new_length) + #define f_macro_utf_string_length_adjust(status, length, old_length, new_length) status = f_memory_adjust((void **) & length, sizeof(f_utf_string_length), old_length, new_length) +#endif // _di_f_utf_string_length_ + +/** + * size: total amount of allocated space. + * used: total number of allocated spaces used. 
+ */ +#ifndef _di_f_utf_string_lengths_ + typedef struct { + f_utf_string_length *array; + + f_array_length size; + f_array_length used; + } f_utf_string_lengths; + + #define f_utf_string_lengths_initialize { 0, 0, 0 } + + #define f_macro_utf_string_lengths_clear(lengths) f_macro_memory_structure_clear(lengths) + + #define f_macro_utf_string_lengths_new(status, lengths, length) f_macro_memory_structure_new(status, lengths, f_utf_string_length, length) + + #define f_macro_utf_string_lengths_delete(status, lengths) f_macro_memory_structure_delete(status, lengths, f_utf_string_length) + #define f_macro_utf_string_lengths_destroy(status, lengths) f_macro_memory_structure_destroy(status, lengths, f_utf_string_length) + + #define f_macro_utf_string_lengths_resize(status, lengths, new_length) f_macro_memory_structure_resize(status, lengths, f_utf_string_length, new_length) + #define f_macro_utf_string_lengths_adjust(status, lengths, new_length) f_macro_memory_structure_adjust(status, lengths, f_utf_string_length, new_length) +#endif // _di_f_utf_string_lengths_ + +/** + * Designates a start and stop position that represents a sub-string inside of some parent string. + * Use this to avoid resizing, restructuring, and reallocating the parent string to separate the sub-string. + * + * A special f_macro_utf_string_range_initialize() is provided for the purpose of easily initializing a static string range. 
+ */ +#ifndef _di_f_utf_string_range_ + typedef struct { + f_utf_string_length start; + f_utf_string_length stop; + } f_utf_string_range; + + #define f_utf_string_range_initialize { 1, 0 } + + #define f_macro_utf_string_range_initialize(length) { 0, (length) - 1 } + + #define f_macro_utf_string_range_new(status, utf_string_range, length) status = f_memory_new((void **) & utf_string_range, sizeof(f_utf_string_range), length) + #define f_macro_utf_string_range_delete(status, utf_string_range, size) status = f_memory_delete((void **) & utf_string_range, sizeof(f_utf_string_range), size) + #define f_macro_utf_string_range_destroy(status, utf_string_range, size) status = f_memory_destroy((void **) & utf_string_range, sizeof(f_utf_string_range), size) + + #define f_macro_utf_string_range_resize(status, utf_string_range, old_length, new_length) \ + status = f_memory_resize((void **) & utf_string_range, sizeof(f_utf_string_range), old_length, new_length) + + #define f_macro_utf_string_range_adjust(status, utf_string_range, old_length, new_length) \ + status = f_memory_adjust((void **) & utf_string_range, sizeof(f_utf_string_range), old_length, new_length) +#endif // _di_f_utf_string_range_ + +/** + * An array of string ranges. + * + * size: total amount of allocated space. + * used: total number of allocated spaces used. 
+ */ +#ifndef _di_f_utf_string_ranges_ + typedef struct { + f_utf_string_range *array; + + f_array_length size; + f_array_length used; + } f_utf_string_ranges; + + #define f_utf_string_ranges_initialize {0, 0, 0} + + #define f_clear_utf_string_ranges(ranges) f_macro_memory_structure_clear(ranges) + + #define f_macro_utf_string_range_news(status, ranges, length) f_macro_memory_structure_new(status, ranges, f_utf_string_range, length) + + #define f_macro_utf_string_range_deletes(status, ranges) f_macro_memory_structure_delete(status, ranges, f_utf_string_range) + #define f_macro_utf_string_range_destroys(status, ranges) f_macro_memory_structure_destroy(status, ranges, f_utf_string_range) + + #define f_macro_utf_string_range_resizes(status, ranges, new_length) f_macro_memory_structure_resize(status, ranges, f_utf_string_range, new_length) + #define f_macro_utf_string_range_adjusts(status, ranges, new_length) f_macro_memory_structure_adjust(status, ranges, f_utf_string_range, new_length) +#endif // _di_f_utf_string_ranges_ + +/** + * Store string quantity. + * + * Similar to f_utf_string_range, except total is relative to start and is not an absolute stop position. + * + * Two common uses for when total is 0 are: + * 1) Exactly that, process a total of 0 string elements. + * 2) Process with no limit, aka infinite. + * + * start: The position where the string starts (based on some string/buffer). + * total: The total number of elements within that string/buffer the quantity represents. + */ +#ifndef _di_f_utf_string_quantity_ + typedef struct { + f_utf_string_length start; + f_utf_string_length total; + } f_utf_string_quantity; + + #define f_utf_string_quantity_initialize { 0, 0 } +#endif // _di_f_utf_string_quantity_ + +/** + * An array of string quantities. + * + * array: the array of string quantities. + * size: total amount of allocated space. + * used: total number of allocated spaces used. 
+ */ +#ifndef _di_f_utf_string_quantitys_ + typedef struct { + f_utf_string_quantity *array; + + f_array_length size; + f_array_length used; + } f_utf_string_quantitys; + + #define f_utf_string_quantitys_initialize {0, 0, 0} + + #define f_macro_utf_string_quantitys_clear(quantitys) f_macro_memory_structure_clear(quantitys) + + #define f_macro_utf_string_quantitys_new(status, quantitys, length) f_macro_memory_structure_new(status, quantitys, f_utf_string_quantity, length) + + #define f_macro_utf_string_quantitys_delete(status, quantitys) f_macro_memory_structure_delete(status, quantitys, f_utf_string_quantity) + #define f_macro_utf_string_quantitys_destroy(status, quantitys) f_macro_memory_structure_destroy(status, quantitys, f_utf_string_quantity) + + #define f_macro_utf_string_quantitys_delete_simple(quantitys) f_macro_memory_structure_delete_simple(quantitys, f_utf_string_quantity) + #define f_macro_utf_string_quantitys_destroy_simple(quantitys) f_macro_memory_structure_destroy_simple(quantitys, f_utf_string_quantity) + + #define f_macro_utf_string_quantitys_resize(status, quantitys, new_length) f_macro_memory_structure_resize(status, quantitys, f_utf_string_quantity, new_length) + #define f_macro_utf_string_quantitys_adjust(status, quantitys, new_length) f_macro_memory_structure_adjust(status, quantitys, f_utf_string_quantity, new_length) +#endif // _di_f_utf_string_quantitys_ + + +/** + * A string that is analogous to f_utf_string_dynamic but intended for static-only uses. + * + * The f_utf_string_static type should never be directly allocated or deallocated. + * + * A special f_macro_utf_string_static_initialize() is provided for the purpose of easily initializing a static string. + * + * string: the string. + * size: total amount of space available. + * used: total amount of space used. 
+ */ +#ifndef _di_f_utf_string_static_ + typedef struct { + f_utf_string string; + + f_utf_string_length size; + f_utf_string_length used; + } f_utf_string_static; + + #define f_utf_string_static_initialize { f_utf_string_initialize, 0, 0 } + + #define f_macro_utf_string_static_clear(string_static) \ + string_static.string = 0; \ + string_static.size = 0; \ + string_static.used = 0; + + #define f_macro_utf_string_static_initialize(string, length) { string, length, length } +#endif // _di_f_utf_string_static_ + +/** + * A string that contains a size attribute to handle dynamic allocations and deallocations. + * Save the string size along with the string, so that strlen(..) commands can be avoided as much as possible. + * + * size: total amount of allocated space. + * used: total number of allocated spaces used. + */ +#ifndef _di_f_utf_string_dynamic_ + typedef f_utf_string_static f_utf_string_dynamic; + + #define f_utf_string_dynamic_initialize f_utf_string_static_initialize + + #define f_clear_utf_string_dynamic(dynamic) f_macro_utf_string_static_clear(dynamic) + + #define f_macro_utf_string_dynamic_new(status, dynamic, new_length) \ + f_clear_utf_string_dynamic(dynamic) \ + status = f_memory_new((void **) & dynamic.string, sizeof(f_utf_string), new_length); \ + if (status == F_none) { \ + dynamic.size = new_length; \ + dynamic.used = 0; \ + } + + #define f_macro_utf_string_dynamic_delete(status, dynamic) \ + status = f_memory_delete((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \ + if (status == F_none) { \ + dynamic.size = 0; \ + dynamic.used = 0; \ + } + + #define f_macro_utf_string_dynamic_destroy(status, dynamic) \ + status = f_memory_destroy((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \ + if (status == F_none) { \ + dynamic.size = 0; \ + dynamic.used = 0; \ + } + + #define f_macro_utf_string_dynamic_delete_simple(dynamic) \ + f_memory_delete((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \ + 
dynamic.size = 0; \ + dynamic.used = 0; + + #define f_macro_utf_string_dynamic_destroy_simple(dynamic) \ + f_memory_destroy((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \ + dynamic.size = 0; \ + dynamic.used = 0; + + #define f_macro_utf_string_dynamic_resize(status, dynamic, new_length) \ + status = f_memory_resize((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size, new_length); \ + if (status == F_none) { \ + dynamic.size = new_length; \ + if (dynamic.used > dynamic.size) dynamic.used = new_length; \ + } + + #define f_macro_utf_string_dynamic_adjust(status, dynamic, new_length) \ + status = f_memory_adjust((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size, new_length); \ + if (status == F_none) { \ + dynamic.size = new_length; \ + if (dynamic.used > dynamic.size) dynamic.used = new_length; \ + } +#endif // _di_f_utf_string_dynamic_ + +/** + * An array of dynamic utf_strings. + * + * size: total amount of allocated space. + * used: total number of allocated spaces used. 
+ */ +#ifndef _di_f_utf_string_dynamics_ + typedef struct { + f_utf_string_dynamic *array; + + f_utf_string_length size; + f_utf_string_length used; + } f_utf_string_dynamics; + + #define f_utf_string_dynamics_initialize { 0, 0, 0 } + + #define f_clear_utf_string_dynamics(dynamics) \ + dynamics.array = 0; \ + dynamics.size = 0; \ + dynamics.used = 0; + + #define f_macro_utf_string_dynamics_new(status, dynamics, length) \ + dynamics.array = 0; \ + dynamics.size = 0; \ + dynamics.used = 0; \ + status = f_memory_new((void **) & dynamics.array, sizeof(f_utf_string_dynamic), length); \ + if (status == F_none) { \ + dynamics.size = length; \ + dynamics.used = 0; \ + } + + #define f_macro_utf_string_dynamics_delete(status, dynamics) \ + status = F_none; \ + while (dynamics.size > 0) { \ + --dynamics.size; \ + f_macro_utf_string_dynamic_destroy(status, dynamics.array[dynamics.size]); \ + if (status != F_none) break; \ + } \ + if (status == F_none) status = f_memory_delete((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size); \ + if (status == F_none) dynamics.used = 0; + + #define f_macro_utf_string_dynamics_destroy(status, dynamics) \ + status = F_none; \ + while (dynamics.size > 0) { \ + --dynamics.size; \ + f_macro_utf_string_dynamic_destroy(status, dynamics.array[dynamics.size]); \ + if (status != F_none) break; \ + } \ + if (status == F_none) status = f_memory_destroy((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size); \ + if (status == F_none) dynamics.used = 0; + + #define f_macro_utf_string_dynamics_delete_simple(dynamics) \ + dynamics.used = dynamics.size; \ + while (dynamics.used > 0) { \ + dynamics.used--; \ + f_macro_utf_string_dynamic_delete_simple(dynamics.array[dynamics.used]); /* use the UTF variant, elements are f_utf_string_dynamic. 
*/ \ + if (dynamics.used == 0) { \ + if (f_memory_delete((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size)) { \ + dynamics.size = 0; \ + } \ + } \ + } + + #define f_macro_utf_string_dynamics_destroy_simple(dynamics) \ + 
dynamics.used = dynamics.size; \ + while (dynamics.used > 0) { \ + dynamics.used--; \ + f_macro_utf_string_dynamic_destroy_simple(dynamics.array[dynamics.used]); /* use the UTF variant, elements are f_utf_string_dynamic. */ \ + if (dynamics.used == 0) { \ + if (f_memory_destroy((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size)) { \ + dynamics.size = 0; \ + } \ + } \ + } + + #define f_macro_utf_string_dynamics_resize(status, dynamics, new_length) \ + status = F_none; \ + if (new_length < dynamics.size) { \ + f_utf_string_length i = new_length; /* only elements [new_length, size) are dropped when shrinking. */ \ + for (; i < dynamics.size; i++) { \ + f_macro_utf_string_dynamic_destroy(status, dynamics.array[i]); \ + if (status != F_none) break; \ + } \ + } \ + if (status == F_none) status = f_memory_resize((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size, new_length); \ + if (status == F_none) { \ + if (new_length > dynamics.size) { \ + f_utf_string_length i = dynamics.size; \ + for (; i < new_length; i++) { \ + memset(&dynamics.array[i], 0, sizeof(f_utf_string_dynamic)); \ + } \ + } \ + dynamics.size = new_length; \ + if (dynamics.used > dynamics.size) dynamics.used = new_length; \ + } + + #define f_macro_utf_string_dynamics_adjust(status, dynamics, new_length) \ + status = F_none; \ + if (new_length < dynamics.size) { \ + f_utf_string_length i = new_length; /* only elements [new_length, size) are dropped when shrinking. */ \ + for (; i < dynamics.size; i++) { \ + f_macro_utf_string_dynamic_destroy(status, dynamics.array[i]); /* the destroy macro takes exactly two arguments. 
*/ \ + if (status != F_none) break; \ + } \ + } \ + if (status == F_none) status = f_memory_adjust((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size, new_length); \ + if (status == F_none) { \ + if (new_length > dynamics.size) { \ + f_utf_string_length i = dynamics.size; \ + for (; i < new_length; i++) { \ + memset(&dynamics.array[i], 0, sizeof(f_utf_string_dynamic)); \ + } \ + } \ + dynamics.size = new_length; \ + if (dynamics.used > dynamics.size) dynamics.used = new_length; \ + } +#endif // _di_f_utf_string_dynamics_ + 
+/** + * Define the UTF-8 general whitespace codes. + * + * These are byte sequences (UTF-8 encoded) representing character codes that represent types of spaces. + * + * This does not provide whitespace codes for standard ascii whitespaces, such as '\t' or '\r'. + */ +#ifndef _di_f_utf_space_ + #define f_utf_space_em_length 3 + #define f_utf_space_em_quad_length 3 + #define f_utf_space_em_per_three_length 3 + #define f_utf_space_em_per_four_length 3 + #define f_utf_space_em_per_six_length 3 + + #define f_utf_space_en_length 3 + #define f_utf_space_en_quad_length 3 + + #define f_utf_space_line_feed_reverse_length 2 + #define f_utf_space_line_next_length 2 + + #define f_utf_space_medium_mathematical_length 3 + + #define f_utf_space_no_break_length 2 + #define f_utf_space_no_break_narrow_length 3 + + #define f_utf_space_ogham_length 3 + #define f_utf_space_figure_length 3 + #define f_utf_space_punctuation_length 3 + #define f_utf_space_thin_length 3 + #define f_utf_space_hair_length 3 + #define f_utf_space_ideographic_length 3 + + #define f_utf_space_separator_line_length 3 + #define f_utf_space_separator_paragraph_length 3 + + const static int8_t f_utf_space_em[f_utf_space_em_length] = { 0xe2, 0x80, 0x83 }; + const static int8_t f_utf_space_em_quad[f_utf_space_em_quad_length] = { 0xe2, 0x80, 0x81 }; + const static int8_t f_utf_space_em_per_three[f_utf_space_em_per_three_length] = { 0xe2, 0x80, 0x84 }; + const static int8_t f_utf_space_em_per_four[f_utf_space_em_per_four_length] = { 0xe2, 0x80, 0x85 }; + const static int8_t f_utf_space_em_per_six[f_utf_space_em_per_six_length] = { 0xe2, 0x80, 0x86 }; + + const static int8_t f_utf_space_en[f_utf_space_en_length] = { 0xe2, 0x80, 0x82 }; + const static int8_t f_utf_space_en_quad[f_utf_space_en_quad_length] = { 0xe2, 0x80, 0x80 }; + + const static int8_t f_utf_space_line_feed_reverse[f_utf_space_line_feed_reverse_length] = { 0xc2, 0x8d }; + const static int8_t f_utf_space_line_next[f_utf_space_line_next_length] = { 0xc2, 0x85 }; + + const 
static int8_t f_utf_space_medium_mathematical[f_utf_space_medium_mathematical_length] = { 0xe2, 0x81, 0x9f }; + + const static int8_t f_utf_space_no_break[f_utf_space_no_break_length] = { 0xc2, 0xa0 }; + const static int8_t f_utf_space_no_break_narrow[f_utf_space_no_break_narrow_length] = { 0xe2, 0x80, 0xaf }; + + const static int8_t f_utf_space_ogham[f_utf_space_ogham_length] = { 0xe1, 0x9a, 0x80 }; + const static int8_t f_utf_space_figure[f_utf_space_figure_length] = { 0xe2, 0x80, 0x87 }; + const static int8_t f_utf_space_punctuation[f_utf_space_punctuation_length] = { 0xe2, 0x80, 0x88 }; + const static int8_t f_utf_space_thin[f_utf_space_thin_length] = { 0xe2, 0x80, 0x89 }; + const static int8_t f_utf_space_hair[f_utf_space_hair_length] = { 0xe2, 0x80, 0x8a }; + const static int8_t f_utf_space_ideographic[f_utf_space_ideographic_length] = { 0xe3, 0x80, 0x80 }; + + const static int8_t f_utf_space_separator_line[f_utf_space_separator_line_length] = { 0xe2, 0x80, 0xa8 }; // U+2028 + const static int8_t f_utf_space_separator_paragraph[f_utf_space_separator_paragraph_length] = { 0xe2, 0x80, 0xa9 }; // U+2029 +#endif // _di_f_utf_space_ + +/** + * Define the UTF-8 general substitute whitespace codes. + * + * These are integers representing character codes that represent types of substitute spaces. + * + * Substitute codes are not actual codes and are actually prints of the codes so they should not be treated as the actual codes. + * + * This does not provide substitute whitespace codes for standard ascii whitespaces, such as '\t' or '\r'. 
+ */ +#ifndef _di_f_utf_substitute_ + #define f_utf_substitute_symbol_blank_length 3 + #define f_utf_substitute_symbol_space_length 3 + + #define f_utf_substitute_middle_dot_length 2 + + #define f_utf_substitute_open_box_length 3 + #define f_utf_substitute_open_box_shouldered_length 3 + + const static int8_t f_utf_substitute_symbol_blank[f_utf_substitute_symbol_blank_length] = { 0xe2, 0x90, 0xa2 }; + const static int8_t f_utf_substitute_symbol_space[f_utf_substitute_symbol_space_length] = { 0xe2, 0x90, 0xa0 }; + + const static int8_t f_utf_substitute_middle_dot[f_utf_substitute_middle_dot_length] = { 0xc2, 0xb7 }; + + const static int8_t f_utf_substitute_open_box[f_utf_substitute_open_box_length] = { 0xe2, 0x90, 0xa3 }; + const static int8_t f_utf_substitute_open_box_shouldered[f_utf_substitute_open_box_shouldered_length] = { 0xe2, 0x8d, 0xbd }; +#endif // _di_f_utf_substitute_ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // _F_utf_common_h diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 8b63011..e384622 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -259,7 +259,29 @@ extern "C" { #ifndef _di_f_utf_character_is_zero_width_ f_return_status f_utf_character_is_zero_width(const f_utf_character character) { - if (f_macro_utf_character_width_is(character) == 1) { + unsigned short width = f_macro_utf_character_width_is(character); + + if (width == 0) { + uint8_t ascii = f_macro_utf_character_to_char_1(character); + + // These control characters are considered zero-width spaces. 
+ // Horizontal tab (0x09), line feed (0x0a), and vertical tab (0x0b) are + // excluded because they print visible space and so are not zero-width; + // only 0x00-0x08, 0x0c-0x1f, and DEL (0x7f) qualify. + if (ascii >= 0x00 && ascii <= 0x08) { + return F_true; + } + else if (ascii >= 0x0c && ascii <= 0x1f) { + return F_true; + } + else if (ascii == 0x7f) { + return F_true; + } + + return F_false; + } + + if (width == 1) { return F_status_is_error(F_utf); } @@ -741,7 +763,17 @@ extern "C" { uint8_t width = f_macro_utf_byte_width_is(*character); if (width == 0) { - // There are no zero-width spaces in ASCII. + // These control characters are considered zero-width spaces. + if (*character >= 0x00 && *character <= 0x08) { + return F_true; + } + else if (*character >= 0x0c && *character <= 0x1f) { + return F_true; + } + else if (*character == 0x7f) { + return F_true; + } + return F_false; } diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 6ce6cd0..827388c 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -43,578 +43,14 @@ #include #include +// fll-0 utf includes +#include + #ifdef __cplusplus extern "C" { #endif /** - * Define the UTF-8 bytes. - * - * The bytes are for checking a single 8-bit character value (specifically, checking the first bits). - * - * The byte offs are used for determining the character width of a UTF-8 character. - * - * The f_utf_byte_is method will return non-zero if the character is a UTF-8 character of any width. - * - * The f_utf_byte_1 is specifically used only on UTF-8 fragments. - * For example, with the 2-byte-wide UTF-8 character '1100x xxxx 10yy yyyy', the 8-byte block '10yy yyyy' would be a fragment. - * - * The f_macro_utf_byte_is_* macros are used to determine a width of the character (either 1, 2, 3, or 4, respectively). - * - * The f_macro_utf_byte_width macro determines a width of the character. - * The f_macro_utf_byte_width_is is identical to f_macro_utf_byte_width, except it returns 0 when character is not UTF-8.
- */ -#ifndef _di_f_utf_byte_ - #define f_utf_byte_1 0x80 // 1000 0000 - #define f_utf_byte_2 0xc0 // 1100 0000 - #define f_utf_byte_3 0xe0 // 1110 0000 - #define f_utf_byte_4 0xf0 // 1111 0000 - - #define f_utf_byte_off_1 0xc0 // 1100 0000 - #define f_utf_byte_off_2 0xe0 // 1110 0000 - #define f_utf_byte_off_3 0xf0 // 1111 0000 - #define f_utf_byte_off_4 0xf8 // 1111 1000 - - #define f_macro_utf_byte_is(character) ((character) & f_utf_byte_1) - - #define f_macro_utf_byte_is_1(character) (((character) & f_utf_byte_off_1) == f_utf_byte_1) // (10xx xxxx & 1100 0000) == 1000 0000 - #define f_macro_utf_byte_is_2(character) (((character) & f_utf_byte_off_2) == f_utf_byte_2) // (110x xxxx & 1110 0000) == 1100 0000 - #define f_macro_utf_byte_is_3(character) (((character) & f_utf_byte_off_3) == f_utf_byte_3) // (1110 xxxx & 1111 0000) == 1110 0000 - #define f_macro_utf_byte_is_4(character) (((character) & f_utf_byte_off_4) == f_utf_byte_4) // (1111 0xxx & 1111 1000) == 1111 0000 - - #define f_macro_utf_byte_width(character) ((!f_macro_utf_byte_is(character) || f_macro_utf_byte_is_1(character)) ? 1 : (f_macro_utf_byte_is_2(character) ? 2 : (f_macro_utf_byte_is_3(character) ? 3 : 4))) - #define f_macro_utf_byte_width_is(character) (f_macro_utf_byte_is(character) ? (f_macro_utf_byte_is_1(character) ? 1 : (f_macro_utf_byte_is_2(character) ? 2 : (f_macro_utf_byte_is_3(character) ? 3 : 4))) : 0) -#endif // _di_f_utf_byte_ - -/** - * Provide a basic UTF-8 character as a single 4-byte variable. - * - * This is intended to be used when a single variable is desired to represent a 1-byte, 2-byte, 3-byte, or even 4-byte character. - * - * This "character" type is stored as a big-endian 4-byte integer (32-bits). - * A helper function, f_utf_is_big_endian(), is provided to detect system endianness so that character arrays (int8_t []) can be correctly processed. - * - * The byte structure is intended to be read left to right. 
- * - * The f_macro_utf_character_mask_byte_* are used to get the entire character set fo a given width. - * - * The f_macro_utf_character_mask_char_* are used to get a specific UTF-8 block as a single character range. - * - * The f_macro_utf_character_to_char_* are used to convert a f_utf_character into a int8_t, for a given 8-bit block. - * - * The f_macro_utf_character_from_char_* are used to convert a int8_t into part of a f_utf_character, for a given 8-bit block. - * - * The f_macro_utf_character_width is used to determine the width of the UTF-8 character based on f_macro_utf_byte_width. - * The f_macro_utf_character_width_is is used to determine the width of the UTF-8 character based on f_macro_utf_byte_width_is. - * - * @see f_utf_is_big_endian() - */ -#ifndef _di_f_utf_character_ - typedef uint32_t f_utf_character; - - #define f_utf_character_mask_byte_1 0xff000000 // 1111 1111, 0000 0000, 0000 0000, 0000 0000 - #define f_utf_character_mask_byte_2 0xffff0000 // 1111 1111, 1111 1111, 0000 0000, 0000 0000 - #define f_utf_character_mask_byte_3 0xffffff00 // 1111 1111, 1111 1111, 1111 1111, 0000 0000 - #define f_utf_character_mask_byte_4 0xffffffff // 1111 1111, 1111 1111, 1111 1111, 1111 1111 - - #define f_utf_character_mask_char_1 0xff000000 // 1111 1111, 0000 0000, 0000 0000, 0000 0000 - #define f_utf_character_mask_char_2 0x00ff0000 // 0000 0000, 1111 1111, 0000 0000, 0000 0000 - #define f_utf_character_mask_char_3 0x0000ff00 // 0000 0000, 0000 0000, 1111 1111, 0000 0000 - #define f_utf_character_mask_char_4 0x000000ff // 0000 0000, 0000 0000, 0000 0000, 1111 1111 - - #define f_macro_utf_character_to_char_1(character) (((character) & f_utf_character_mask_char_1) >> 24) // grab first byte. - #define f_macro_utf_character_to_char_2(character) (((character) & f_utf_character_mask_char_2) >> 16) // grab second byte. - #define f_macro_utf_character_to_char_3(character) (((character) & f_utf_character_mask_char_3) >> 8) // grab third byte. 
- #define f_macro_utf_character_to_char_4(character) ((character) & f_utf_character_mask_char_4) // grab fourth byte. - - #define f_macro_utf_character_from_char_1(character) (((character) << 24) & f_utf_character_mask_char_1) // shift to first byte. - #define f_macro_utf_character_from_char_2(character) (((character) << 16) & f_utf_character_mask_char_2) // shift to second byte. - #define f_macro_utf_character_from_char_3(character) (((character) << 8) & f_utf_character_mask_char_3) // shift to third byte. - #define f_macro_utf_character_from_char_4(character) ((character) & f_utf_character_mask_char_4) // shift to fourth byte. - - #define f_macro_utf_character_width(character) (f_macro_utf_byte_width(f_macro_utf_character_to_char_1(character))) - #define f_macro_utf_character_width_is(character) (f_macro_utf_byte_width_is(f_macro_utf_character_to_char_1(character))) -#endif // _di_f_utf_character_ - -#ifndef _di_f_utf_character_have_eol_ - #define f_utf_character_eol 0x0a000000 // 0000 1010, 0000 0000, 0000 0000, 0000 0000 -#endif // _di_f_utf_character_have_eol_ - -#ifndef _di_f_utf_character_have_eos_ - #define f_utf_character_eos 0x00000000 // 0000 0000, 0000 0000, 0000 0000, 0000 0000 -#endif // _di_f_utf_character_have_eos_ - -#ifndef _di_f_utf_character_have_placeholder_ - #define f_utf_character_placeholder 0x00000000 // 0000 0000, 0000 0000, 0000 0000, 0000 0000 -#endif // _di_f_utf_character_have_placeholder_ - -/** - * Provide a UTF-8 characters set to 4-bits wide as a string. 
- */ -#ifndef _di_f_utf_string_ - typedef f_utf_character *f_utf_string; - - #define f_utf_string_max_size f_type_number_size_unsigned - #define f_utf_string_initialize f_utf_character_eos - - #define f_macro_utf_string_new(status, string, length) status = f_memory_new((void **) & string, sizeof(f_utf_string), length) - - #define f_macro_utf_string_delete(status, string, size) status = f_memory_delete((void **) & string, sizeof(f_utf_string), size) - #define f_macro_utf_string_destroy(status, string, size) status = f_memory_destroy((void **) & string, sizeof(f_utf_string), size) - - #define f_macro_utf_string_resize(status, string, old_length, new_length) status = f_memory_resize((void **) & string, sizeof(f_utf_string), old_length, new_length) - #define f_macro_utf_string_adjust(status, string, old_length, new_length) status = f_memory_adjust((void **) & string, sizeof(f_utf_string), old_length, new_length) -#endif // _di_f_utf_string_ - -/** - * Provide a type specifically for UTF-8 strings. 
- */ -#ifndef _di_f_utf_string_length_ - typedef f_number_unsigned f_utf_string_length; - - #define f_utf_string_length_size 0xfffffffffffffffe - #define f_utf_string_length_size_max f_type_number_size_max_unsigned - - #define f_macro_utf_string_length_new(status, string, length) status = f_memory_new((void **) & string, sizeof(f_utf_string_length), length) - - #define f_macro_utf_string_length_delete(status, string, length) status = f_memory_delete((void **) & string, sizeof(f_utf_string_length), length) - #define f_macro_utf_string_length_destroy(status, string, size) status = f_memory_destroy((f_void_P *) & string, sizeof(f_utf_string_length), size) - - #define f_macro_utf_string_length_resize(status, length, old_length, new_length) status = f_memory_resize((void **) & length, sizeof(f_utf_string_length), old_length, new_length) - #define f_macro_utf_string_length_adjust(status, length, old_length, new_length) status = f_memory_adjust((void **) & length, sizeof(f_utf_string_length), old_length, new_length) -#endif // _di_f_utf_string_length_ - -/** - * size: total amount of allocated space. - * used: total number of allocated spaces used. 
- */ -#ifndef _di_f_utf_string_lengths_ - typedef struct { - f_utf_string_length *array; - - f_array_length size; - f_array_length used; - } f_utf_string_lengths; - - #define f_utf_string_lengths_initialize { 0, 0, 0 } - - #define f_macro_utf_string_lengths_clear(lengths) f_macro_memory_structure_clear(lengths) - - #define f_macro_utf_string_lengths_new(status, lengths, length) f_macro_memory_structure_new(status, lengths, f_utf_string_length, length) - - #define f_macro_utf_string_lengths_delete(status, lengths) f_macro_memory_structure_delete(status, lengths, f_utf_string_length) - #define f_macro_utf_string_lengths_destroy(status, lengths) f_macro_memory_structure_destroy(status, lengths, f_utf_string_length) - - #define f_macro_utf_string_lengths_resize(status, lengths, new_length) f_macro_memory_structure_resize(status, lengths, f_utf_string_length, new_length) - #define f_macro_utf_string_lengths_adjust(status, lengths, new_length) f_macro_memory_structure_adjust(status, lengths, f_utf_string_length, new_length) -#endif // _di_f_utf_string_lengths_ - -/** - * Designates a start and stop position that represents a sub-string inside of some parent string. - * use this to avoid resizing, restructuring, and reallocating the parent string to separate the sub-string. - * - * A special f_macro_utf_string_range_initialize() is provided for the special purpose of easily initialize a static string range. 
- */ -#ifndef _di_f_utf_string_range_ - typedef struct { - f_utf_string_length start; - f_utf_string_length stop; - } f_utf_string_range; - - #define f_utf_string_range_initialize { 1, 0 } - - #define f_macro_utf_string_range_initialize(length) { 0, length - 1 } - - #define f_macro_utf_string_range_new(status, utf_string_range, length) status = f_memory_new((void **) & utf_string_range, sizeof(f_utf_string_range), length) - #define f_macro_utf_string_range_delete(status, utf_string_range, size) status = f_memory_delete((void **) & utf_string_range, sizeof(f_utf_string_range), size) - #define f_macro_utf_string_range_destroy(status, utf_string_range, size) status = f_memory_destroy((void **) & utf_string_range, sizeof(f_utf_string_range), size) - - #define f_macro_utf_string_range_resize(status, utf_string_range, old_length, new_length) \ - status = f_memory_resize((void **) & utf_string_range, sizeof(f_utf_string_range), old_length, new_length) - - #define f_macro_utf_string_range_adjust(status, utf_string_range, old_length, new_length) \ - status = f_memory_adjust((void **) & utf_string_range, sizeof(f_utf_string_range), old_length, new_length) -#endif // _di_f_utf_string_range_ - -/** - * An array of string ranges. - * - * size: total amount of allocated space. - * used: total number of allocated spaces used. 
- */ -#ifndef _di_f_utf_string_ranges_ - typedef struct { - f_utf_string_range *array; - - f_array_length size; - f_array_length used; - } f_utf_string_ranges; - - #define f_utf_string_ranges_initialize {0, 0, 0} - - #define f_clear_utf_string_ranges(ranges) f_macro_memory_structure_clear(ranges) - - #define f_macro_utf_string_range_news(status, ranges, length) f_macro_memory_structure_new(status, ranges, f_utf_string_range, length) - - #define f_macro_utf_string_range_deletes(status, ranges) f_macro_memory_structure_delete(status, ranges, f_utf_string_range) - #define f_macro_utf_string_range_destroys(status, ranges) f_macro_memory_structure_destroy(status, ranges, f_utf_string_range) - - #define f_macro_utf_string_range_resizes(status, ranges, new_length) f_macro_memory_structure_resize(status, ranges, f_utf_string_range, new_length) - #define f_macro_utf_string_range_adjusts(status, ranges, new_length) f_macro_memory_structure_adjust(status, ranges, f_utf_string_range, new_length) -#endif // _di_f_utf_string_ranges_ - -/** - * Store string quantity. - * - * Similar to f_utf_string_range, except total is relative to start and is not an absolute stop position. - * - * Two common uses for when total is 0 is: - * 1) Exactly that, process a total of 0 strings bytes. - * 2) Process with no limit, aka infinite. - * - * start: The position where the string starts (based on some string/buffer). - * total: The total number of elements within that string/buffer the quantity represents. - */ -#ifndef _di_f_utf_string_quantity_ - typedef struct { - f_utf_string_length start; - f_utf_string_length total; - } f_utf_string_quantity; - - #define f_utf_string_quantity_initialize { 0, 0 } -#endif // _di_f_utf_string_quantity_ - -/** - * An array of string quantities. - * - * array: the array of string quantities. - * size: total amount of allocated space. - * used: total number of allocated spaces used. 
- */ -#ifndef _di_f_utf_string_quantitys_ - typedef struct { - f_utf_string_quantity *array; - - f_array_length size; - f_array_length used; - } f_utf_string_quantitys; - - #define f_utf_string_quantitys_initialize {0, 0, 0} - - #define f_macro_utf_string_quantitys_clear(quantitys) f_macro_memory_structure_clear(quantitys) - - #define f_macro_utf_string_quantitys_new(status, quantitys, length) f_macro_memory_structure_new(status, quantitys, f_utf_string_quantity, length) - - #define f_macro_utf_string_quantitys_delete(status, quantitys) f_macro_memory_structure_delete(status, quantitys, f_utf_string_quantity) - #define f_macro_utf_string_quantitys_destroy(status, quantitys) f_macro_memory_structure_destroy(status, quantitys, f_utf_string_quantity) - - #define f_macro_utf_string_quantitys_delete_simple(quantitys) f_macro_memory_structure_delete_simple(quantitys, f_utf_string_quantity) - #define f_macro_utf_string_quantitys_destroy_simple(quantitys) f_macro_memory_structure_destroy_simple(quantitys, f_utf_string_quantity) - - #define f_macro_utf_string_quantitys_resize(status, quantitys, new_length) f_macro_memory_structure_resize(status, quantitys, f_utf_string_quantity, new_length) - #define f_macro_utf_string_quantitys_adjust(status, quantitys, new_length) f_macro_memory_structure_adjust(status, quantitys, f_utf_string_quantity, new_length) -#endif // _di_f_utf_string_quantitys_ - - -/** - * A string that is analogous to f_utf_string_dynamic but intended for static-only uses. - * - * The f_utf_string_static type should never be directly allocated or deallocated. - * - * A special f_macro_utf_string_static_initialize() is provided for the special purpose of easily initialize a static string. - * - * string: the string. - * size: total amount of space available. - * used: total number of space used. 
- */ -#ifndef _di_f_utf_string_static_ - typedef struct { - f_utf_string string; - - f_utf_string_length size; - f_utf_string_length used; - } f_utf_string_static; - - #define f_utf_string_static_initialize { f_utf_string_initialize, 0, 0 } - - #define f_macro_utf_string_static_clear(string_static) \ - string_static.string = 0; \ - string_static.size = 0; \ - string_static.used = 0; - - #define f_macro_utf_string_static_initialize(string, length) { string, length, length } -#endif // _di_f_string_static_ - -/** - * A string that supports contains a size attribute to handle dynamic allocations and deallocations. - * save the string size along with the string, so that strlen(..) commands can be avoided as much as possible. - * - * size: total amount of allocated space. - * used: total number of allocated spaces used. - */ -#ifndef _di_f_utf_string_dynamic_ - typedef f_utf_string_static f_utf_string_dynamic; - - #define f_utf_string_dynamic_initialize f_utf_string_static_initialize - - #define f_clear_utf_string_dynamic(dynamic) f_macro_utf_string_static_clear(dynamic) - - #define f_macro_utf_string_dynamic_new(status, dynamic, new_length) \ - f_clear_utf_string_dynamic(dynamic) \ - status = f_memory_new((void **) & dynamic.string, sizeof(f_utf_string), new_length); \ - if (status == F_none) { \ - dynamic.size = new_length; \ - dynamic.used = 0; \ - } - - #define f_macro_utf_string_dynamic_delete(status, dynamic) \ - status = f_memory_delete((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \ - if (status == F_none) { \ - dynamic.size = 0; \ - dynamic.used = 0; \ - } - - #define f_macro_utf_string_dynamic_destroy(status, dynamic) \ - status = f_memory_destroy((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \ - if (status == F_none) { \ - dynamic.size = 0; \ - dynamic.used = 0; \ - } - - #define f_macro_utf_string_dynamic_delete_simple(dynamic) \ - f_memory_delete((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \ - 
dynamic.size = 0; \ - dynamic.used = 0; - - #define f_macro_utf_string_dynamic_destroy_simple(dynamic) \ - f_memory_destroy((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size); \ - dynamic.size = 0; \ - dynamic.used = 0; - - #define f_macro_utf_string_dynamic_resize(status, dynamic, new_length) \ - status = f_memory_resize((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size, new_length); \ - if (status == F_none) { \ - dynamic.size = new_length; \ - if (dynamic.used > dynamic.size) dynamic.used = new_length; \ - } - - #define f_macro_utf_string_dynamic_adjust(status, dynamic, new_length) \ - status = f_memory_adjust((void **) & dynamic.string, sizeof(f_utf_string), dynamic.size, new_length); \ - if (status == F_none) { \ - dynamic.size = new_length; \ - if (dynamic.used > dynamic.size) dynamic.used = new_length; \ - } -#endif // _di_f_utf_string_dynamic_ - -/** - * An array of dynamic utf_strings. - * - * size: total amount of allocated space. - * used: total number of allocated spaces used. 
- */ -#ifndef _di_f_utf_string_dynamics_ - typedef struct { - f_utf_string_dynamic *array; - - f_utf_string_length size; - f_utf_string_length used; - } f_utf_string_dynamics; - - #define f_utf_string_dynamics_initialize { 0, 0, 0 } - - #define f_clear_utf_string_dynamics(dynamics) \ - dynamics.array = 0; \ - dynamics.size = 0; \ - dynamics.used = 0; - - #define f_macro_utf_string_dynamics_new(status, dynamics, length) \ - dynamics.array = 0; \ - dynamics.size = 0; \ - dynamics.used = 0; \ - status = f_memory_new((void **) & dynamics.array, sizeof(f_utf_string_dynamic), length); \ - if (status == F_none) { \ - dynamics.size = length; \ - dynamics.used = 0; \ - } - - #define f_macro_utf_string_dynamics_delete(status, dynamics) \ - status = F_none; \ - while (dynamics.size > 0) { \ - --dynamics.size; \ - f_macro_utf_string_dynamic_destroy(status, dynamics.array[dynamics.size]); \ - if (status != F_none) break; \ - } \ - if (status == F_none) status = f_memory_delete((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size); \ - if (status == F_none) dynamics.used = 0; - - #define f_macro_utf_string_dynamics_destroy(status, dynamics) \ - status = F_none; \ - while (dynamics.size > 0) { \ - --dynamics.size; \ - f_macro_utf_string_dynamic_destroy(status, dynamics.array[dynamics.size]); \ - if (status != F_none) break; \ - } \ - if (status == F_none) status = f_memory_destroy((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size); \ - if (status == F_none) dynamics.used = 0; - - #define f_macro_utf_string_dynamics_delete_simple(dynamics) \ - dynamics.used = dynamics.size; \ - while (dynamics.used > 0) { \ - dynamics.used--; \ - f_macro_string_dynamic_delete_simple(dynamics.array[dynamics.used]); \ - if (dynamics.used == 0) { \ - if (f_memory_delete((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size)) { \ - dynamics.size = 0; \ - } \ - } \ - } - - #define f_macro_utf_string_dynamics_destroy_simple(dynamics) \ - 
dynamics.used = dynamics.size; \ - while (dynamics.used > 0) { \ - dynamics.used--; \ - f_macro_string_dynamic_destroy_simple(dynamics.array[dynamics.used]); \ - if (dynamics.used == 0) { \ - if (f_memory_destroy((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size)) { \ - dynamics.size = 0; \ - } \ - } \ - } - - #define f_macro_utf_string_dynamics_resize(status, dynamics, new_length) \ - status = F_none; \ - if (new_length < dynamics.size) { \ - f_utf_string_length i = dynamics.size - new_length; \ - for (; i < dynamics.size; i++) { \ - f_macro_utf_string_dynamic_destroy(status, dynamics.array[i]); \ - if (status != F_none) break; \ - } \ - } \ - if (status == F_none) status = f_memory_resize((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size, new_length); \ - if (status == F_none) { \ - if (new_length > dynamics.size) { \ - f_utf_string_length i = dynamics.size; \ - for (; i < new_length; i++) { \ - memset(&dynamics.array[i], 0, sizeof(f_utf_string_dynamic)); \ - } \ - } \ - dynamics.size = new_length; \ - if (dynamics.used > dynamics.size) dynamics.used = new_length; \ - } - - #define f_macro_utf_string_dynamics_adjust(status, dynamics, new_length) \ - status = F_none; \ - if (new_length < dynamics.size) { \ - f_utf_string_length i = dynamics.size - new_length; \ - for (; i < dynamics.size; i++) { \ - f_macro_utf_string_dynamic_destroy(status, dynamics.array[i], f_utf_string_dynamic); \ - if (status != F_none) break; \ - } \ - } \ - if (status == F_none) status = f_memory_adjust((void **) & dynamics.array, sizeof(f_utf_string_dynamic), dynamics.size, new_length); \ - if (status == F_none) { \ - if (new_length > dynamics.size) { \ - f_utf_string_length i = dynamics.size; \ - for (; i < new_length; i++) { \ - memset(&dynamics.array[i], 0, sizeof(f_utf_string_dynamic)); \ - } \ - } \ - dynamics.size = new_length; \ - if (dynamics.used > dynamics.size) dynamics.used = new_length; \ - } -#endif // _di_f_utf_string_dynamic_ - 
-/** - * Define the UTF-8 general whitespace codes. - * - * These are integers representing character codes that represent types of spaces. - * - * This does not provide whitespace codes for standard ascii whitespaces, such as '\t' or '\r'. - */ -#ifndef _di_f_utf_space_ - #define f_utf_space_em_length 3 - #define f_utf_space_em_quad_length 3 - #define f_utf_space_em_per_three_length 3 - #define f_utf_space_em_per_four_length 3 - #define f_utf_space_em_per_six_length 3 - - #define f_utf_space_en_length 3 - #define f_utf_space_en_quad_length 3 - - #define f_utf_space_line_feed_reverse_length 2 - #define f_utf_space_line_next_length 2 - - #define f_utf_space_medium_mathematical_length 3 - - #define f_utf_space_no_break_length 2 - #define f_utf_space_no_break_narrow_length 3 - - #define f_utf_space_ogham_length 3 - #define f_utf_space_figure_length 3 - #define f_utf_space_punctuation_length 3 - #define f_utf_space_thin_length 3 - #define f_utf_space_hair_length 3 - #define f_utf_space_ideographic_length 3 - - #define f_utf_space_separator_line_length 3 - #define f_utf_space_separator_paragraph_length 3 - - const static int8_t f_utf_space_em[f_utf_space_em_length] = { 0xe2, 0x80, 0x83 }; - const static int8_t f_utf_space_em_quad[f_utf_space_em_quad_length] = { 0xe2, 0x80, 0x81 }; - const static int8_t f_utf_space_em_per_three[f_utf_space_em_per_three_length] = { 0xe2, 0x80, 0x84 }; - const static int8_t f_utf_space_em_per_four[f_utf_space_em_per_four_length] = { 0xe2, 0x80, 0x85 }; - const static int8_t f_utf_space_em_per_six[f_utf_space_em_per_six_length] = { 0xe2, 0x80, 0x86 }; - - const static int8_t f_utf_space_en[f_utf_space_en_length] = { 0xe2, 0x80, 0x82 }; - const static int8_t f_utf_space_en_quad[f_utf_space_en_quad_length] = { 0xe2, 0x80, 0x80 }; - - const static int8_t f_utf_space_line_feed_reverse[f_utf_space_line_feed_reverse_length] = { 0xc2, 0x8d }; - const static int8_t f_utf_space_line_next[f_utf_space_line_next_length] = { 0xc2, 0x85 }; - - const 
static int8_t f_utf_space_medium_mathematical[f_utf_space_medium_mathematical_length] = { 0xe2, 0x81, 0x9f }; - - const static int8_t f_utf_space_no_break[f_utf_space_no_break_length] = { 0xc2, 0xa0 }; - const static int8_t f_utf_space_no_break_narrow[f_utf_space_no_break_narrow_length] = { 0xe2, 0x80, 0xaf }; - - const static int8_t f_utf_space_ogham[f_utf_space_ogham_length] = { 0xe1, 0x9a, 0x80 }; - const static int8_t f_utf_space_figure[f_utf_space_figure_length] = { 0xe2, 0x80, 0x87 }; - const static int8_t f_utf_space_punctuation[f_utf_space_punctuation_length] = { 0xe2, 0x80, 0x88 }; - const static int8_t f_utf_space_thin[f_utf_space_thin_length] = { 0xe2, 0x80, 0x89 }; - const static int8_t f_utf_space_hair[f_utf_space_hair_length] = { 0xe2, 0x80, 0x8a }; - const static int8_t f_utf_space_ideographic[f_utf_space_ideographic_length] = { 0xe3, 0x80, 0x80 }; - - const static int8_t f_utf_space_separator_line[f_utf_space_separator_line_length] = { 0xe2, 0x80, 0xa8 }; - const static int8_t f_utf_space_separator_paragraph[f_utf_space_separator_paragraph_length] = { 0xe2, 0x80, 0xa8 }; -#endif // _di_f_utf_space_ - -/** - * Define the UTF-8 general substitute whitespace codes. - * - * These are integers representing character codes that represent types of substitute spaces. - * - * Substitute codes are not actual codes and are actually prints of the codes so they should not be treated as the actual codes. - * - * This does not provide substitute whitespace codes for standard ascii whitespaces, such as '\t' or '\r'. 
- */ -#ifndef _di_f_utf_substitute_ - #define f_utf_substitute_symbol_blank_length 3 - #define f_utf_substitute_symbol_space_length 3 - - #define f_utf_substitute_middle_dot_length 2 - - #define f_utf_substitute_open_box_length 3 - #define f_utf_substitute_open_box_shouldered_length 3 - - const static int8_t f_utf_substitute_symbol_blank[f_utf_substitute_symbol_blank_length] = { 0xe2, 0x90, 0xa2 }; - const static int8_t f_utf_substitute_symbol_space[f_utf_substitute_symbol_space_length] = { 0xe2, 0x90, 0xa0 }; - - const static int8_t f_utf_substitute_middle_dot[f_utf_substitute_middle_dot_length] = { 0xc2, 0xb7 }; - - const static int8_t f_utf_substitute_open_box[f_utf_substitute_open_box_length] = { 0xe2, 0x90, 0xa3 }; - const static int8_t f_utf_substitute_open_box_shouldered[f_utf_substitute_open_box_shouldered_length] = { 0xe2, 0x8d, 0xbd }; -#endif // _di_f_utf_substitute_ - -/** * Check to see if the entire byte block of the character is a UTF-8 character. * * This does not validate if the UTF-8 character is a valid UTF-8 character, for that use f_utf_character_is_valid(). diff --git a/level_0/f_utf/data/build/settings b/level_0/f_utf/data/build/settings index 1827624..6d506b8 100644 --- a/level_0/f_utf/data/build/settings +++ b/level_0/f_utf/data/build/settings @@ -22,7 +22,7 @@ build_libraries -lc build_libraries-individual -lf_memory build_sources_library utf.c private-utf.c build_sources_program -build_sources_headers utf.h +build_sources_headers utf.h utf-common.h build_sources_script build_sources_setting build_script yes -- 1.8.3.1