From 989b47aab5740c5585176926b203cb85570ba8a6 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Tue, 13 Aug 2019 22:46:55 -0500 Subject: [PATCH] Feature: add initial UTF-8 support via f_utf This provides the most basic functionality for processing UTF-8 characters. Additional modules will be updated to natively process strings as UTF-8. --- build/level_0/settings | 6 +- level_0/f_errors/c/errors.h | 11 +- level_0/f_utf/c/utf.c | 301 ++++++++++++++++++++++++++++++++++ level_0/f_utf/c/utf.h | 255 ++++++++++++++++++++++++++++ level_0/f_utf/data/build/defines | 2 + level_0/f_utf/data/build/dependencies | 3 + level_0/f_utf/data/build/settings | 30 ++++ 7 files changed, 603 insertions(+), 5 deletions(-) create mode 100644 level_0/f_utf/c/utf.c create mode 100644 level_0/f_utf/c/utf.h create mode 100644 level_0/f_utf/data/build/defines create mode 100644 level_0/f_utf/data/build/dependencies create mode 100644 level_0/f_utf/data/build/settings diff --git a/build/level_0/settings b/build/level_0/settings index 4d27b94..ddfcc50 100644 --- a/build/level_0/settings +++ b/build/level_0/settings @@ -10,9 +10,9 @@ version_micro 0 build_compiler gcc build_linker ar build_libraries -lc -build_sources_library console.c conversion.c file.c memory.c pipe.c print.c -build_sources_program -build_sources_headers colors.h console.h conversion.h errors.h file.h fss.h memory.h fll_paths.h filesystem_paths.h pipe.h print.h serialized.h strings.h types.h types_array.h +build_sources_library console.c conversion.c file.c memory.c pipe.c print.c utf.c +build_sources_program +build_sources_headers colors.h console.h conversion.h errors.h file.h fss.h memory.h fll_paths.h filesystem_paths.h pipe.h print.h serialized.h strings.h types.h types_array.h utf.h build_shared yes build_static yes diff --git a/level_0/f_errors/c/errors.h b/level_0/f_errors/c/errors.h index 6ba80b8..0eb0ef6 100644 --- a/level_0/f_errors/c/errors.h +++ b/level_0/f_errors/c/errors.h @@ -7,7 +7,7 @@ * * Provides error 
definitions. * - * Warning: changing error codes will break abi, so recompile every file that includes and uses these error codes when this gets changed. + * Warning: changing error codes will break ABI, so recompile every file that includes and uses these error codes when this gets changed. */ #ifndef _F_errors_h #define _F_errors_h @@ -167,13 +167,17 @@ extern "C" { #define f_error_set_fine(status) status & f_error_bit_fine #endif // _di_f_error_masks_ -// use of an enumerator makes more sense here than explicitly defining every error code +// use of an enumerator makes more sense here than explicitly defining every error code (or return code). enum { #ifndef _di_f_errors_booleans_ f_false = 0, f_true, #endif // _di_f_errors_booleans_ + #ifndef _di_f_errors_maybe_ + f_maybe = 2, + #endif // _di_f_errors_maybe_ + #ifndef _di_f_errors_signals_ f_signal_hangup = 1, f_signal_interrupt, @@ -278,6 +282,7 @@ enum { f_invalid_value, // similar to f_invalid_parameter, but the parameter value is invalid (example: an integer value representing aboolean and having a 3 as a value). 
f_invalid_buffer, f_invalid_process, + f_invalid_utf, #endif // _di_f_errors_invalid_ #ifndef _di_f_errors_busy_ @@ -368,6 +373,7 @@ enum { f_file_reallocation_error, f_file_stat_error, f_file_error, + f_file_not_utf, #endif // _di_f_errors_file_ // most of these are a guess until I get around to researching & implementing linux directory I/O @@ -386,6 +392,7 @@ enum { f_directory_allocation_error, f_directory_reallocation_error, f_directory_error, + f_directory_not_utf, #endif // _di_f_errors_directory_ #ifndef _di_f_errors_socket_ diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c new file mode 100644 index 0000000..b07f574 --- /dev/null +++ b/level_0/f_utf/c/utf.c @@ -0,0 +1,301 @@ +#include <level_0/utf.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _di_f_utf_is_space_ + f_return_status f_utf_is_space(const f_string character, const f_u_short maxWidth) { + #ifndef _di_level_0_parameter_checking_ + if (maxWidth < 1) return f_error_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + f_u_short width = f_macro_utf_byte_width(*character); + + if (width == 1) { + return f_false; + } + + if (width > maxWidth) { + return f_error_set_error(f_maybe); + } + + if (width == 2) { + if (!memcmp(character, f_utf_space_no_break, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_line_feed_reverse, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_line_next, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_substitute_middle_dot, width)) { + return f_true; + } + + return f_false; + } + + if (width == 3) { + if (!memcmp(character, f_utf_space_no_break_narrow, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_en, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_en_quad, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_en_quad, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_em, width)) { + return f_true; + } + + if
(!memcmp(character, f_utf_space_em_quad, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_em_per_three, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_em_per_four, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_em_per_six, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_figure, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_punctuation, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_thin, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_hair, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_separator_line, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_separator_paragraph, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_ogham, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_ideographic, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_medium_mathematical, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_substitute_symbol_blank, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_substitute_symbol_space, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_substitute_open_box, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_substitute_open_box_shouldered, width)) { + return f_true; + } + + return f_false; + } + + return f_false; + } +#endif // _di_f_utf_is_space_ + +#ifndef _di_f_utf_is_substitute_ + f_return_status f_utf_is_substitute(const f_string character, const f_u_short maxWidth) { + #ifndef _di_level_0_parameter_checking_ + if (maxWidth < 1) return f_error_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + f_u_short width = f_macro_utf_byte_width(*character); + + if (width == 1) { + return f_false; + } + + if (width > maxWidth) { + return f_error_set_error(f_maybe); + } + + if (width == 2) { + if (!memcmp(character, 
f_utf_substitute_middle_dot, width)) { + return f_true; + } + + return f_false; + } + + if (width == 3) { + if (!memcmp(character, f_utf_substitute_symbol_blank, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_substitute_symbol_space, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_substitute_open_box, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_substitute_open_box_shouldered, width)) { + return f_true; + } + + return f_false; + } + + return f_false; + } +#endif // _di_f_utf_is_substitute_ + +#ifndef _di_f_utf_is_whitespace_ + f_return_status f_utf_is_whitespace(const f_string character, const f_u_short maxWidth) { + #ifndef _di_level_0_parameter_checking_ + if (maxWidth < 1) return f_error_set_error(f_invalid_parameter); + #endif // _di_level_0_parameter_checking_ + + f_u_short width = f_macro_utf_byte_width(*character); + + if (width == 1) { + return f_false; + } + + if (width > maxWidth) { + return f_error_set_error(f_maybe); + } + + if (width == 2) { + if (!memcmp(character, f_utf_space_no_break, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_line_feed_reverse, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_line_next, width)) { + return f_true; + } + + return f_false; + } + + if (width == 3) { + if (!memcmp(character, f_utf_space_no_break_narrow, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_en, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_en_quad, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_en_quad, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_em, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_em_quad, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_em_per_three, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_em_per_four, width)) { + return f_true; + } + + if (!memcmp(character, 
f_utf_space_em_per_six, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_figure, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_punctuation, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_thin, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_hair, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_separator_line, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_separator_paragraph, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_ogham, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_ideographic, width)) { + return f_true; + } + + if (!memcmp(character, f_utf_space_medium_mathematical, width)) { + return f_true; + } + + return f_false; + } + + return f_false; + } +#endif // _di_f_utf_is_whitespace_ + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h new file mode 100644 index 0000000..2965bc0 --- /dev/null +++ b/level_0/f_utf/c/utf.h @@ -0,0 +1,255 @@ +/** + * FLL - Level 0 + * + * Project: UTF + * API Version: 0.5 + * Licenses: lgplv2.1 + * + * Provides UTF-8 capabilities. 
+ * + * Identifiers: + * - UTF_8-1: 1000 0000 + * - UTF_8-2: 1100 0000 + * - UTF_8-3: 1110 0000 + * - UTF_8-4: 1111 0000 + * + * Values: + * - UTF_8-1: 1011 1111 + * - UTF_8-2: 1101 1111 + * - UTF_8-3: 1110 1111 + * - UTF_8-4: 1111 0111 + * + * Identifier Structure: + * - UTF_8-1: 1000 0000 + * - UTF_8-2: 1100 0000 1000 0000 + * - UTF_8-3: 1110 0000 1000 0000 1000 0000 + * - UTF_8-4: 1111 0000 1000 0000 1000 0000 1000 0000 + * + * Value Structure: + * - UTF_8-1: 10xx xxxx + * - UTF_8-2: 110x xxxx, 10xx xxxx + * - UTF_8-3: 1110 xxxx, 10xx xxxx, 10xx xxxx + * - UTF_8-4: 1111 0xxx, 10xx xxxx, 10xx xxxx, 10xx xxxx + */ +#ifndef _F_utf_h +#define _F_utf_h + +// libc includes +#include <string.h> + +// fll includes +#include <level_0/errors.h> +#include <level_0/strings.h> +#include <level_0/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Define the UTF-8 BOM. + * + * The BOM designates that a string is in UTF-8. + * The BOM must be checked for when processing strings. + * + * In many cases, this should be removed such that only one exists in some string block. + */ +#ifndef _di_f_utf_bom_ + #define f_utf_bom_length 3 + + const static char f_utf_bom[f_utf_bom_length] = { 0xef, 0xbb, 0xbf }; // 1110 1111, 1011 1011, 1011 1111 +#endif // _di_f_utf_bom_ + +/** + * Define the UTF-8 bytes. + * + * The bytes are for checking a single 8-bit character value (specifically, checking the first bits). + * + * The byte offs (f_utf_byte_off_*) are the bit masks used for determining the character width of a UTF-8 character. + * + * The f_macro_utf_byte_is macro will be non-zero if the high bit of the byte is set (the byte belongs to a multi-byte UTF-8 sequence). + * + * The f_macro_utf_byte_is_* macros are used to determine a width of the character (either 1, 2, 3, or 4, respectively). + * + * The f_macro_utf_byte_width macro determines a width of the character.
+ */ +#ifndef _di_f_utf_byte_ + #define f_utf_byte_1 0x80 // 1000 0000 + #define f_utf_byte_2 0xc0 // 1100 0000 + #define f_utf_byte_3 0xe0 // 1110 0000 + #define f_utf_byte_4 0xf0 // 1111 0000 + + #define f_utf_byte_off_1 0xc0 // 1100 0000 + #define f_utf_byte_off_2 0xe0 // 1110 0000 + #define f_utf_byte_off_3 0xf0 // 1111 0000 + #define f_utf_byte_off_4 0xf8 // 1111 1000 + + #define f_macro_utf_byte_is(character) ((character) & f_utf_byte_1) + + #define f_macro_utf_byte_is_1(character) (((character) & f_utf_byte_off_1) == f_utf_byte_1) // (10xx xxxx & 1100 0000) == 1000 0000 + #define f_macro_utf_byte_is_2(character) (((character) & f_utf_byte_off_2) == f_utf_byte_2) // (110x xxxx & 1110 0000) == 1100 0000 + #define f_macro_utf_byte_is_3(character) (((character) & f_utf_byte_off_3) == f_utf_byte_3) // (1110 xxxx & 1111 0000) == 1110 0000 + #define f_macro_utf_byte_is_4(character) (((character) & f_utf_byte_off_4) == f_utf_byte_4) // (1111 0xxx & 1111 1000) == 1111 0000 + + #define f_macro_utf_byte_width(character) ((!f_macro_utf_byte_is(character) || f_macro_utf_byte_is_1(character)) ? 1 : (f_macro_utf_byte_is_2(character) ? 2 : (f_macro_utf_byte_is_3(character) ? 3 : 4))) +#endif // _di_f_utf_byte_ + +/** + * Define the UTF-8 general whitespace codes. + * + * These are the UTF-8 byte sequences of characters that represent types of spaces. + * + * This does not provide whitespace codes for standard ascii whitespaces, such as '\t' or '\r'.
+ */ +#ifndef _di_f_utf_space_ + #define f_utf_space_em_length 3 + #define f_utf_space_em_quad_length 3 + #define f_utf_space_em_per_three_length 3 + #define f_utf_space_em_per_four_length 3 + #define f_utf_space_em_per_six_length 3 + + #define f_utf_space_en_length 3 + #define f_utf_space_en_quad_length 3 + + #define f_utf_space_line_feed_reverse_length 2 + #define f_utf_space_line_next_length 2 + + #define f_utf_space_medium_mathematical_length 3 + + #define f_utf_space_no_break_length 2 + #define f_utf_space_no_break_narrow_length 3 + + #define f_utf_space_ogham_length 3 + #define f_utf_space_figure_length 3 + #define f_utf_space_punctuation_length 3 + #define f_utf_space_thin_length 3 + #define f_utf_space_hair_length 3 + #define f_utf_space_ideographic_length 3 + + #define f_utf_space_separator_line_length 3 + #define f_utf_space_separator_paragraph_length 3 + + const static char f_utf_space_em[f_utf_space_em_length] = { 0xe2, 0x80, 0x83 }; + const static char f_utf_space_em_quad[f_utf_space_em_quad_length] = { 0xe2, 0x80, 0x81 }; + const static char f_utf_space_em_per_three[f_utf_space_em_per_three_length] = { 0xe2, 0x80, 0x84 }; + const static char f_utf_space_em_per_four[f_utf_space_em_per_four_length] = { 0xe2, 0x80, 0x85 }; + const static char f_utf_space_em_per_six[f_utf_space_em_per_six_length] = { 0xe2, 0x80, 0x86 }; + + const static char f_utf_space_en[f_utf_space_en_length] = { 0xe2, 0x80, 0x82 }; + const static char f_utf_space_en_quad[f_utf_space_en_quad_length] = { 0xe2, 0x80, 0x80 }; + + const static char f_utf_space_line_feed_reverse[f_utf_space_line_feed_reverse_length] = { 0xc2, 0x8d }; + const static char f_utf_space_line_next[f_utf_space_line_next_length] = { 0xc2, 0x85 }; + + const static char f_utf_space_medium_mathematical[f_utf_space_medium_mathematical_length] = { 0xe2, 0x81, 0x9f }; + + const static char f_utf_space_no_break[f_utf_space_no_break_length] = { 0xc2, 0xa0 }; + const static char 
f_utf_space_no_break_narrow[f_utf_space_no_break_narrow_length] = { 0xe2, 0x80, 0xaf }; + + const static char f_utf_space_ogham[f_utf_space_ogham_length] = { 0xe1, 0x9a, 0x80 }; + const static char f_utf_space_figure[f_utf_space_figure_length] = { 0xe2, 0x80, 0x87 }; + const static char f_utf_space_punctuation[f_utf_space_punctuation_length] = { 0xe2, 0x80, 0x88 }; + const static char f_utf_space_thin[f_utf_space_thin_length] = { 0xe2, 0x80, 0x89 }; + const static char f_utf_space_hair[f_utf_space_hair_length] = { 0xe2, 0x80, 0x8a }; + const static char f_utf_space_ideographic[f_utf_space_ideographic_length] = { 0xe3, 0x80, 0x80 }; + + const static char f_utf_space_separator_line[f_utf_space_separator_line_length] = { 0xe2, 0x80, 0xa8 }; + const static char f_utf_space_separator_paragraph[f_utf_space_separator_paragraph_length] = { 0xe2, 0x80, 0xa9 }; +#endif // _di_f_utf_space_ + +/** + * Define the UTF-8 general substitute whitespace codes. + * + * These are the UTF-8 byte sequences of characters that represent types of substitute spaces. + * + * This does not provide substitute whitespace codes for standard ascii whitespaces, such as '\t' or '\r'.
+ */ +#ifndef _di_f_utf_substitute_ + #define f_utf_substitute_symbol_blank_length 3 + #define f_utf_substitute_symbol_space_length 3 + + #define f_utf_substitute_middle_dot_length 2 + + #define f_utf_substitute_open_box_length 3 + #define f_utf_substitute_open_box_shouldered_length 3 + + const static char f_utf_substitute_symbol_blank[f_utf_substitute_symbol_blank_length] = { 0xe2, 0x90, 0xa2 }; + const static char f_utf_substitute_symbol_space[f_utf_substitute_symbol_space_length] = { 0xe2, 0x90, 0xa0 }; + + const static char f_utf_substitute_middle_dot[f_utf_substitute_middle_dot_length] = { 0xc2, 0xb7 }; + + const static char f_utf_substitute_open_box[f_utf_substitute_open_box_length] = { 0xe2, 0x90, 0xa3 }; + const static char f_utf_substitute_open_box_shouldered[f_utf_substitute_open_box_shouldered_length] = { 0xe2, 0x8d, 0xbd }; +#endif // _di_f_utf_substitute_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 whitespace or substitute character. + * + * This does not check non-UTF-8 whitespace. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by maxWidth. + * @param maxWidth + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * f_true if a UTF-8 whitespace or substitute. + * f_false if not a UTF-8 whitespace or substitute. + * f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough. + * f_invalid_parameter (with error bit) if a parameter is invalid. + */ +#ifndef _di_f_utf_is_space_ + extern f_return_status f_utf_is_space(const f_string character, const f_u_short maxWidth); +#endif // _di_f_utf_is_space_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 whitespace substitute character. + * + * This does not check non-UTF-8 whitespace. + * + * @param character + * The character to validate. 
+ * There must be enough space allocated to compare against, as limited by maxWidth. + * @param maxWidth + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * f_true if a UTF-8 substitute. + * f_false if not a UTF-8 substitute. + * f_maybe (with error bit) if this could be a substitute but width is not long enough. + * f_invalid_parameter (with error bit) if a parameter is invalid. + */ +#ifndef _di_f_utf_is_substitute_ + extern f_return_status f_utf_is_substitute(const f_string character, const f_u_short maxWidth); +#endif // _di_f_utf_is_substitute_ + +/** + * Check to see if the entire byte block of the character is a UTF-8 general whitespace character. + * + * This does not check non-UTF-8 whitespace. + * + * @param character + * The character to validate. + * There must be enough space allocated to compare against, as limited by maxWidth. + * @param maxWidth + * The maximum width available for checking. + * Can be anything greater than 0. + * + * @return + * f_true if a UTF-8 whitespace. + * f_false if not a UTF-8 whitespace. + * f_maybe (with error bit) if this could be a whitespace but width is not long enough. + * f_invalid_parameter (with error bit) if a parameter is invalid. 
+ */ +#ifndef _di_f_utf_is_whitespace_ + extern f_return_status f_utf_is_whitespace(const f_string character, const f_u_short maxWidth); +#endif // _di_f_utf_is_whitespace_ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // _F_utf_h diff --git a/level_0/f_utf/data/build/defines b/level_0/f_utf/data/build/defines new file mode 100644 index 0000000..c665317 --- /dev/null +++ b/level_0/f_utf/data/build/defines @@ -0,0 +1,2 @@ +# fss-0000 + diff --git a/level_0/f_utf/data/build/dependencies b/level_0/f_utf/data/build/dependencies new file mode 100644 index 0000000..90e12bc --- /dev/null +++ b/level_0/f_utf/data/build/dependencies @@ -0,0 +1,3 @@ +f_types +f_errors +f_strings diff --git a/level_0/f_utf/data/build/settings b/level_0/f_utf/data/build/settings new file mode 100644 index 0000000..5e0a396 --- /dev/null +++ b/level_0/f_utf/data/build/settings @@ -0,0 +1,30 @@ +# fss-0000 + +project_name f_utf +project_level 0 + +version_major 0 +version_minor 5 +version_micro 0 + +build_compiler gcc +build_linker ar +build_libraries -lc +build_libraries_fll +build_sources_library utf.c +build_sources_program +build_sources_headers utf.h +build_sources_bash +build_sources_settings +build_shared yes +build_static yes + +defines_all +defines_static +defines_shared + +flags_all -z now +flags_shared +flags_static +flags_library -fPIC +flags_program -fPIE -- 1.8.3.1