From 52d120317e71bd1e6af51dc607704dd9c1aff1a9 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sun, 5 Jul 2020 18:59:09 -0500 Subject: [PATCH] Update: Unicode related specification adjustments, also perform minor fixes and cleanups. The Unicode includes some characters that violate some of the expectations of the FSS and IKI standards. To that end, provide non-strict modes to allow for the FLL based adjustments of these standards. There are some vertical line like characters that act as punctuation connectors. Because they connect vertically instead of horizontally, they violate part of the intended design of FSS and IKI where everything is on a per-line basis. These Unicode vertical line like punctuation connectors are not to be considered punctuation connectors for the purposes of FSS and IKI processing. There are some non-printing characters, such as invisible plus. This is a punctuation character that is also a zero-width character. For the purposes of FSS and IKI, these characters must not be considered punctuation characters. Any other zero-width characters are to be considered zero-width and are not to be used for syntax and similar. (Combining characters are a different beast to be addressed at a later time.) Further clarify how certain Unicode characters are to be handled, in general. Unicode dash characters are now being limited in being interpreted as a dash for the purposes of connecting words. To this end, only the Unicode hyphens are the allowed dash-like characters (with the ASCII dash being allowed as well for that matter). Use const where possible. Fix some wording in some documentation comments. 
--- level_0/f_iki/c/iki-common.h | 2 +- level_0/f_iki/c/iki.c | 2 +- level_0/f_utf/c/private-utf.c | 66 ++++++++++++++++++++++++++------------- level_0/f_utf/c/private-utf.h | 22 ++++++++++--- level_0/f_utf/c/utf.c | 72 +++++++++++++++++++++---------------------- level_0/f_utf/c/utf.h | 72 +++++++++++++++++++++++++++++++++++++------ specifications/fss-000D.txt | 4 +++ specifications/fss.txt | 19 +++++++++--- 8 files changed, 179 insertions(+), 80 deletions(-) diff --git a/level_0/f_iki/c/iki-common.h b/level_0/f_iki/c/iki-common.h index a07a1cf..6b6ed7e 100644 --- a/level_0/f_iki/c/iki-common.h +++ b/level_0/f_iki/c/iki-common.h @@ -303,7 +303,7 @@ extern "C" { continue; \ } \ f_macro_iki_determine_width_max(buffer, range, width_max); \ - status = f_utf_is_word_dash_plus(buffer->string + range->start, width_max); \ + status = f_utf_is_word_dash_plus(buffer->string + range->start, width_max, F_false); \ if (status == condition) break; \ else if (F_status_is_error(status)) break; \ status = f_utf_buffer_increment(*buffer, range, 1); \ diff --git a/level_0/f_iki/c/iki.c b/level_0/f_iki/c/iki.c index 698e559..453c2f0 100644 --- a/level_0/f_iki/c/iki.c +++ b/level_0/f_iki/c/iki.c @@ -146,7 +146,7 @@ extern "C" { else { f_macro_iki_determine_width_max(buffer, range, width_max); - status = f_utf_is_word_dash_plus(buffer->string + range->start, width_max); + status = f_utf_is_word_dash_plus(buffer->string + range->start, width_max, F_false); if (F_status_is_error(status)) { f_macro_string_lengths_delete(status, delimits); return status; diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c index c1961bf..cb5aa04 100644 --- a/level_0/f_utf/c/private-utf.c +++ b/level_0/f_utf/c/private-utf.c @@ -266,7 +266,7 @@ extern "C" { f_return_status private_f_utf_character_is_emoji(const f_utf_character character, const uint8_t width) { // reduce the number of checks by grouping checks by first byte. 
- uint8_t byte_first = f_macro_utf_character_to_char_1(character); + const uint8_t byte_first = f_macro_utf_character_to_char_1(character); if (width == 2) { // Latin-1 Supplement: U+00A9, U+00AE. @@ -800,7 +800,7 @@ extern "C" { f_return_status private_f_utf_character_is_punctuation(const f_utf_character character, const uint8_t width) { // reduce the number of checks by grouping checks by first byte. - uint8_t byte_first = f_macro_utf_character_to_char_1(character); + const uint8_t byte_first = f_macro_utf_character_to_char_1(character); if (width == 2) { @@ -1475,7 +1475,7 @@ extern "C" { #if !defined(_di_f_utf_character_is_valid_) || !defined(_di_f_utf_is_valid_) f_return_status private_f_utf_character_is_valid(const f_utf_character character, const uint8_t width) { // reduce the number of checks by grouping checks by first byte. - uint8_t byte_first = f_macro_utf_character_to_char_1(character); + const uint8_t byte_first = f_macro_utf_character_to_char_1(character); if (width == 2) { uint8_t byte = f_macro_utf_character_to_char_2(character); @@ -3850,7 +3850,7 @@ extern "C" { } else if (width == 3) { // reduce the number of checks by grouping checks by first byte. - uint8_t byte_first = f_macro_utf_character_to_char_1(character); + const uint8_t byte_first = f_macro_utf_character_to_char_1(character); if (byte_first == 0xe2) { @@ -3905,27 +3905,41 @@ extern "C" { #endif // !defined(_di_f_utf_character_is_whitespace_other_) || !defined(_di_f_utf_is_whitespace_other_) #if !defined(_di_f_utf_character_is_word_) || !defined(_di_f_utf_is_word_) - f_return_status private_f_utf_character_is_word(const f_utf_character character, const uint8_t width) { + f_return_status private_f_utf_character_is_word(const f_utf_character character, const uint8_t width, const bool strict) { if (private_f_utf_character_is_alpha_digit(character, width)) { return F_true; } if (width == 3) { + // reduce the number of checks by grouping checks by first byte. 
+ const uint8_t byte_first = f_macro_utf_character_to_char_1(character); - // General Punctuation: U+203F (‿), U+2017 (‗), U+203E (‾), U+2040 (⁀). - if (character == 0xe280bf00 || character == 0xe2809700 || character == 0xe280be00 || character == 0xe2818000) { - return F_true; - } + if (byte_first == 0xe2) { - // General Punctuation: U+2054 (⁔), U+FE4D (﹍), U+FE4E (﹎), U+FE4F (﹏). - if (character == 0xe2819400 || character == 0xefb98d00 || character == 0xefb98e00 || character == 0xefb98f00) { - return F_true; + // General Punctuation: U+203F (‿), U+203E (‾), U+2040 (⁀), U+2054 (⁔). + if (character == 0xe280bf00 || character == 0xe280be00 || character == 0xe2818000 || character == 0xe2819400) { + return F_true; + } } + else if (byte_first == 0xef) { - // General Punctuation: U+FF3F (_). - if (character == 0xefbcbf00) { - return F_true; + // General Punctuation: U+FE4D (﹍), U+FE4E (﹎), U+FE4F (﹏). + if (character == 0xefb98d00 || character == 0xefb98e00 || character == 0xefb98f00) { + return F_true; + } + + // General Punctuation: U+FF3F (_). + if (character == 0xefbcbf00) { + return F_true; + } + + if (strict) { + // CJK Compatibility Forms (vertical connectors, accepted in strict mode only): U+FE33 (︳), U+FE34 (︴). + if (character == 0xefb8b300 || character == 0xefb8b400) { + return F_true; + } + } } } @@ -3934,16 +3948,16 @@ extern "C" { #endif // !defined(_di_f_utf_character_is_word_) || !defined(_di_f_utf_is_word_) #if !defined(_di_f_utf_character_is_word_dash_) || !defined(_di_f_utf_is_word_dash_) - f_return_status private_f_utf_character_is_word_dash(const f_utf_character character, const uint8_t width) { + f_return_status private_f_utf_character_is_word_dash(const f_utf_character character, const uint8_t width, const bool strict) { - if (private_f_utf_character_is_word(character, width)) { + if (private_f_utf_character_is_word(character, width, strict)) { return F_true; } if (width == 3) { - // General Punctuation: U+2010 to U+2015. 
- if (character >= 0xe2809000 && character <= 0xe2809500) { + // General Punctuation: U+2010, U+2011. + if (character == 0xe2809000 || character == 0xe2809100) { return F_true; } } @@ -3953,12 +3967,20 @@ extern "C" { #endif // !defined(_di_f_utf_character_is_word_dash_) || !defined(_di_f_utf_is_word_dash_) #if !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_) - f_return_status private_f_utf_character_is_word_dash_plus(const f_utf_character character, const uint8_t width) { + f_return_status private_f_utf_character_is_word_dash_plus(const f_utf_character character, const uint8_t width, const bool strict) { - if (private_f_utf_character_is_word_dash(character, width)) { + if (private_f_utf_character_is_word_dash(character, width, strict)) { return F_true; } + if (strict) { + + // General Punctuation: U+2064. + if (character == 0xe281a400) { + return F_true; + } + } + return F_false; } #endif // !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_) @@ -3966,7 +3988,7 @@ extern "C" { #if !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_) f_return_status private_f_utf_character_is_zero_width(const f_utf_character character, const uint8_t width) { // reduce the number of checks by grouping checks by first byte. - uint8_t byte_first = f_macro_utf_character_to_char_1(character); + const uint8_t byte_first = f_macro_utf_character_to_char_1(character); if (byte_first == 0xe1) { diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h index 367c274..a507f1c 100644 --- a/level_0/f_utf/c/private-utf.h +++ b/level_0/f_utf/c/private-utf.h @@ -378,6 +378,10 @@ extern "C" { * The character to validate. * @param width * The number of bytes repesenting the character width. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. 
+ * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. * * @return * F_true if a UTF-8 control character. @@ -388,7 +392,7 @@ extern "C" { * @see f_utf_is_word() */ #if !defined(_di_f_utf_character_is_word_) || !defined(_di_f_utf_is_word_) - extern f_return_status private_f_utf_character_is_word(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal; + extern f_return_status private_f_utf_character_is_word(const f_utf_character character, const uint8_t width, const bool strict) f_gcc_attribute_visibility_internal; #endif // !defined(_di_f_utf_character_is_word_) || !defined(_di_f_utf_is_word_) /** @@ -400,6 +404,10 @@ extern "C" { * The character to validate. * @param width * The number of bytes repesenting the character width. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. * * @return * F_true if a UTF-8 control character. @@ -410,7 +418,7 @@ extern "C" { * @see f_utf_is_word_dash() */ #if !defined(_di_f_utf_character_is_word_dash_) || !defined(_di_f_utf_is_word_dash_) - extern f_return_status private_f_utf_character_is_word_dash(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal; + extern f_return_status private_f_utf_character_is_word_dash(const f_utf_character character, const uint8_t width, const bool strict) f_gcc_attribute_visibility_internal; #endif // !defined(_di_f_utf_character_is_word_dash_) || !defined(_di_f_utf_is_word_dash_) /** @@ -422,17 +430,21 @@ extern "C" { * The character to validate. * @param width * The number of bytes repesenting the character width. 
+ * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. * * @return * F_true if a UTF-8 control character. * F_false if not a UTF-8 control character. * F_utf (with error bit) if character is an invalid UTF-8 character. * - * @see f_utf_character_is_word_dash() - * @see f_utf_is_word_dash() + * @see f_utf_character_is_word_dash_plus() + * @see f_utf_is_word_dash_plus() */ #if !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_) - extern f_return_status private_f_utf_character_is_word_dash_plus(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal; + extern f_return_status private_f_utf_character_is_word_dash_plus(const f_utf_character character, const uint8_t width, const bool strict) f_gcc_attribute_visibility_internal; #endif // !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_) /** diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index a3d955c..31412f0 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -80,7 +80,7 @@ extern "C" { #ifndef _di_f_utf_character_is_ f_return_status f_utf_character_is(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { return F_false; @@ -96,7 +96,7 @@ extern "C" { #ifndef _di_f_utf_character_is_alpha_ f_return_status f_utf_character_is_alpha(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isalpha(f_macro_utf_character_to_char_1(character))) { @@ -116,7 +116,7 @@ extern "C" { #ifndef 
_di_f_utf_character_is_alpha_digit_ f_return_status f_utf_character_is_alpha_digit(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isalnum(f_macro_utf_character_to_char_1(character))) { @@ -136,7 +136,7 @@ extern "C" { #ifndef _di_f_utf_character_is_alpha_numeric_ f_return_status f_utf_character_is_alpha_numeric(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isalnum(f_macro_utf_character_to_char_1(character))) { @@ -156,7 +156,7 @@ extern "C" { #ifndef _di_f_utf_character_is_combining_ f_return_status f_utf_character_is_combining(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { // There are no combining characters in ASCII. @@ -177,7 +177,7 @@ extern "C" { #ifndef _di_f_utf_character_is_control_ f_return_status f_utf_character_is_control(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (iscntrl(f_macro_utf_character_to_char_1(character))) { @@ -197,7 +197,7 @@ extern "C" { #ifndef _di_f_utf_character_is_control_picture_ f_return_status f_utf_character_is_control_picture(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { // There are no control picture characters in ASCII. 
@@ -218,7 +218,7 @@ extern "C" { #ifndef _di_f_utf_character_is_digit_ f_return_status f_utf_character_is_digit(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isdigit(f_macro_utf_character_to_char_1(character))) { @@ -238,7 +238,7 @@ extern "C" { #ifndef _di_f_utf_character_is_emoji_ f_return_status f_utf_character_is_emoji(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isdigit(f_macro_utf_character_to_char_1(character))) { @@ -258,7 +258,7 @@ extern "C" { #ifndef _di_f_utf_character_is_fragment_ f_return_status f_utf_character_is_fragment(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 1) return F_true; @@ -268,7 +268,7 @@ extern "C" { #ifndef _di_f_utf_character_is_graph_ f_return_status f_utf_character_is_graph(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isgraph(f_macro_utf_character_to_char_1(character))) { @@ -300,7 +300,7 @@ extern "C" { #ifndef _di_f_utf_character_is_numeric_ f_return_status f_utf_character_is_numeric(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isdigit(f_macro_utf_character_to_char_1(character))) { @@ -320,7 +320,7 @@ extern "C" { #ifndef _di_f_utf_character_is_phonetic_ f_return_status f_utf_character_is_phonetic(const f_utf_character character) { - unsigned short width = 
f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { // There are no ASCII phonetic characters. @@ -337,7 +337,7 @@ extern "C" { #ifndef _di_f_utf_character_is_punctuation_ f_return_status f_utf_character_is_punctuation(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { // ASCII: '!' to '#'. @@ -383,7 +383,7 @@ extern "C" { #ifndef _di_f_utf_character_is_symbol_ f_return_status f_utf_character_is_symbol(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { // ASCII: '$' or '+'. @@ -414,7 +414,7 @@ extern "C" { #ifndef _di_f_utf_character_is_valid_ f_return_status f_utf_character_is_valid(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0 || width == 1) { return F_false; @@ -426,7 +426,7 @@ extern "C" { #ifndef _di_f_utf_character_is_whitespace_ f_return_status f_utf_character_is_whitespace(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isspace(f_macro_utf_character_to_char_1(character))) { @@ -446,7 +446,7 @@ extern "C" { #ifndef _di_f_utf_character_is_whitespace_modifier_ f_return_status f_utf_character_is_whitespace_modifier(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { // There are no ASCII whitespace modifiers. 
@@ -463,7 +463,7 @@ extern "C" { #ifndef _di_f_utf_character_is_whitespace_other_ f_return_status f_utf_character_is_whitespace_other(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { // There are no ASCII whitespace other. @@ -479,8 +479,8 @@ extern "C" { #endif // _di_f_utf_character_is_whitespace_other_ #ifndef _di_f_utf_character_is_word_ - f_return_status f_utf_character_is_word(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + f_return_status f_utf_character_is_word(const f_utf_character character, const bool strict) { + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isalnum(f_macro_utf_character_to_char_1(character)) || character == '_') { @@ -494,13 +494,13 @@ extern "C" { return F_status_is_error(F_utf); } - return private_f_utf_character_is_word(character, width); + return private_f_utf_character_is_word(character, width, strict); } #endif // _di_f_utf_character_is_word_ #ifndef _di_f_utf_character_is_word_dash_ - f_return_status f_utf_character_is_word_dash(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + f_return_status f_utf_character_is_word_dash(const f_utf_character character, const bool strict) { + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isalnum(f_macro_utf_character_to_char_1(character)) || character == '_' || character == '-') { @@ -514,13 +514,13 @@ extern "C" { return F_status_is_error(F_utf); } - return private_f_utf_character_is_word_dash(character, width); + return private_f_utf_character_is_word_dash(character, width, strict); } #endif // _di_f_utf_character_is_word_dash_ #ifndef _di_f_utf_character_is_word_dash_plus_ - f_return_status f_utf_character_is_word_dash_plus(const f_utf_character 
character) { - unsigned short width = f_macro_utf_character_width_is(character); + f_return_status f_utf_character_is_word_dash_plus(const f_utf_character character, const bool strict) { + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { if (isalnum(f_macro_utf_character_to_char_1(character)) || character == '_' || character == '-' || character == '+') { @@ -534,16 +534,16 @@ extern "C" { return F_status_is_error(F_utf); } - return private_f_utf_character_is_word_dash_plus(character, width); + return private_f_utf_character_is_word_dash_plus(character, width, strict); } #endif // _di_f_utf_character_is_word_dash_plus_ #ifndef _di_f_utf_character_is_zero_width_ f_return_status f_utf_character_is_zero_width(const f_utf_character character) { - unsigned short width = f_macro_utf_character_width_is(character); + const unsigned short width = f_macro_utf_character_width_is(character); if (width == 0) { - uint8_t ascii = f_macro_utf_character_to_char_1(character); + const uint8_t ascii = f_macro_utf_character_to_char_1(character); // These control characters are considered zero-width spaces. 
if (ascii >= 0x00 && ascii <= 0x08) { @@ -1281,7 +1281,7 @@ extern "C" { #endif // _di_f_utf_is_whitespace_other_ #ifndef _di_f_utf_is_word_ - f_return_status f_utf_is_word(const f_string character, const f_string_length width_max) { + f_return_status f_utf_is_word(const f_string character, const f_string_length width_max, const bool strict) { #ifndef _di_level_0_parameter_checking_ if (width_max < 1) return F_status_set_error(F_parameter); #endif // _di_level_0_parameter_checking_ @@ -1310,12 +1310,12 @@ extern "C" { if (status != F_none) return status; } - return private_f_utf_character_is_word(character_utf, width); + return private_f_utf_character_is_word(character_utf, width, strict); } #endif // _di_f_utf_is_word_ #ifndef _di_f_utf_is_word_dash_ - f_return_status f_utf_is_word_dash(const f_string character, const f_string_length width_max) { + f_return_status f_utf_is_word_dash(const f_string character, const f_string_length width_max, const bool strict) { #ifndef _di_level_0_parameter_checking_ if (width_max < 1) return F_status_set_error(F_parameter); #endif // _di_level_0_parameter_checking_ @@ -1344,12 +1344,12 @@ extern "C" { if (status != F_none) return status; } - return private_f_utf_character_is_word_dash(character_utf, width); + return private_f_utf_character_is_word_dash(character_utf, width, strict); } #endif // _di_f_utf_is_word_dash_ #ifndef _di_f_utf_is_word_dash_plus_ - f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max) { + f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max, const bool strict) { #ifndef _di_level_0_parameter_checking_ if (width_max < 1) return F_status_set_error(F_parameter); #endif // _di_level_0_parameter_checking_ @@ -1378,7 +1378,7 @@ extern "C" { if (status != F_none) return status; } - return private_f_utf_character_is_word_dash_plus(character_utf, width); + return private_f_utf_character_is_word_dash_plus(character_utf, width, 
strict); } #endif // _di_f_utf_is_word_dash_plus_ diff --git a/level_0/f_utf/c/utf.h b/level_0/f_utf/c/utf.h index 8715191..dcc4757 100644 --- a/level_0/f_utf/c/utf.h +++ b/level_0/f_utf/c/utf.h @@ -490,8 +490,14 @@ extern "C" { * * A word character is alpha-numeric or an underscore '_'. * + * This does not include zero-width punctuation. + * * @param character * The character to validate. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. * * @return * F_true if a UTF-8 word character. @@ -501,7 +507,7 @@ extern "C" { * @see isalnum() */ #ifndef _di_f_utf_character_is_word_ - extern f_return_status f_utf_character_is_word(const f_utf_character character); + extern f_return_status f_utf_character_is_word(const f_utf_character character, const bool strict); #endif // _di_f_utf_character_is_word_ /** @@ -509,8 +515,19 @@ extern "C" { * * A word dash character is alpha-numeric, an underscore '_' or a dash '-'. * + * Unicode appears to refer to dashes that connect words as a hyphen. + * Therefore, only these hyphens are considered dashes for the purposes of this function. + * All other dash-like Unicode characters are not considered a dash here. + * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". + * + * This does not include zero-width punctuation. + * * @param character * The character to validate. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. * * @return * F_true if a UTF-8 word or dash character. 
@@ -520,18 +537,27 @@ extern "C" { * @see isalnum() */ #ifndef _di_f_utf_character_is_word_dash_ - extern f_return_status f_utf_character_is_word_dash(const f_utf_character character); + extern f_return_status f_utf_character_is_word_dash(const f_utf_character character, const bool strict); #endif // _di_f_utf_character_is_word_dash_ /** * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character. * - * A word dash character is alpha-numeric, an underscore '_', a dash '-', or a plus '+'. + * A word dash plus character is alpha-digit, an underscore '_', a dash '-', or a plus '+'. + * + * Unicode appears to refer to dashes that connect words as a hyphen. + * Therefore, only these hyphens are considered dashes for the purposes of this function. + * All other dash-like Unicode characters are not considered a dash here. + * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". * - * This does not include "invisible plus". + * This does not include zero-width punctuation, such as "invisible plus" (U+2064) (even in strict mode). * * @param character * The character to validate. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. * * @return * F_true if a UTF-8 word or dash character. @@ -541,7 +567,7 @@ extern "C" { * @see isalnum() */ #ifndef _di_f_utf_character_is_word_dash_plus_ - extern f_return_status f_utf_character_is_word_dash_plus(const f_utf_character character); + extern f_return_status f_utf_character_is_word_dash_plus(const f_utf_character character, const bool strict); #endif // _di_f_utf_character_is_word_dash_plus_ /** @@ -1048,12 +1074,18 @@ extern "C" { * * A word character is alpha-digit or an underscore '_'. 
* + * This does not include zero-width punctuation. + * * @param character * The character to validate. * There must be enough space allocated to compare against, as limited by width_max. * @param width_max * The maximum width available for checking. * Can be anything greater than 0. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. * * @return * F_true if a UTF-8 word character. @@ -1063,7 +1095,7 @@ extern "C" { * @see isalnum() */ #ifndef _di_f_utf_is_word_ - extern f_return_status f_utf_is_word(const f_string character, const f_string_length width_max); + extern f_return_status f_utf_is_word(const f_string character, const f_string_length width_max, const bool strict); #endif // _di_f_utf_is_word_ /** @@ -1071,12 +1103,23 @@ extern "C" { * * A word dash character is alpha-digit, an underscore '_' or a dash '-'. * + * Unicode appears to refer to dashes that connect words as a hyphen. + * Therefore, only these hyphens are considered dashes for the purposes of this function. + * All other dash-like Unicode characters are not considered a dash here. + * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". + * + * This does not include zero-width punctuation. + * * @param character * The character to validate. * There must be enough space allocated to compare against, as limited by width_max. * @param width_max * The maximum width available for checking. * Can be anything greater than 0. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. 
* * @return * F_true if a UTF-8 word or dash character. @@ -1086,15 +1129,20 @@ extern "C" { * @see isalnum() */ #ifndef _di_f_utf_is_word_dash_ - extern f_return_status f_utf_is_word_dash(const f_string character, const f_string_length width_max); + extern f_return_status f_utf_is_word_dash(const f_string character, const f_string_length width_max, const bool strict); #endif // _di_f_utf_is_word_dash_ /** * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character. * - * A word dash character is alpha-digit, an underscore '_', a dash '-', or a plus '+'. + * A word dash plus character is alpha-digit, an underscore '_', a dash '-', or a plus '+'. + * + * Unicode appears to refer to dashes that connect words as a hyphen. + * Therefore, only these hyphens are considered dashes for the purposes of this function. + * All other dash-like Unicode characters are not considered a dash here. + * The dash here is intended for combining words, which matches the context of the Unicode "hyphen". * - * This does not include "invisible plus". + * This does not include zero-width punctuation, such as "invisible plus" (U+2064) (even in strict mode). * * @param character * The character to validate. @@ -1102,6 +1150,10 @@ extern "C" { * @param width_max * The maximum width available for checking. * Can be anything greater than 0. + * @param strict + * When TRUE, include all appropriate characters by type as per Unicode. + * When FALSE, non-inline punctuation connectors are not considered a character (such as U+FE33 '︳'). + * When FALSE, zero-width punctuation characters are not considered a character. * * @return * F_true if a UTF-8 word or dash character. 
@@ -1111,7 +1163,7 @@ extern "C" { * @see isalnum() */ #ifndef _di_f_utf_is_word_dash_plus_ - extern f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max); + extern f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max, const bool strict); #endif // _di_f_utf_is_word_dash_plus_ /** diff --git a/specifications/fss-000D.txt b/specifications/fss-000D.txt index 1e09b65..3d1ccac 100644 --- a/specifications/fss-000D.txt +++ b/specifications/fss-000D.txt @@ -18,10 +18,14 @@ Featureless Settings Specification: 000D - Iki Text: Whitespace, non-word (and non "_", "-", "+") character punctuations, or the start of file must exist before any valid variable name. Whitespace and non-word (and non "_", "-", "+") character punctuations may not exist as part of the variable name. + The only Unicode dash-like characters allowed as a "dash" are those intended to connect, such as the Unicode hyphens (U+2010 and U+2011). The IKI format will use IKI-0000 to represent an IKI with no explicitly defined vocabulary. Whereas IKI-0001 and beyond represent a specific IKI vocabulary. + Unicode punctuation connector characters are supported just like "_", except when they connect outside the current line (such as U+FE33 "︳"). + Unicode invisible punctuations (such as invisible plus: U+2064) are not considered punctuation in this standard (because they are zero-width characters); therefore, they are not to be considered valid '_', '-', or '+' Unicode equivalents. + Key\: \o = any printable word character, including "_", "-", "+" (and Unicode equivalents). \c = any character, including whitespace and non-printing, and any delimited quote (used as the opening quote) or a any quote (undelimited) not used as the opening quote.
diff --git a/specifications/fss.txt b/specifications/fss.txt index b03de73..8c8beac 100644 --- a/specifications/fss.txt +++ b/specifications/fss.txt @@ -26,7 +26,7 @@ Featureless Settings Specifications: Unless explicitly defined by the specification, all specifications are newline sensitive ('\n' only). Newline characters are only '\n' and are never anything else ('\r' is not considered newline in any manner). Whitespaces characters that are printable, such as tabs and spaces must be considered the same type. - Non-printing whitespaces characters (zero-width characters) are ignored or are treated as placeholders for processing. + Non-printing whitespace characters (zero-width characters) are ignored or are treated as placeholders for processing (this includes zero-width punctuation characters and similar). In terms of processing, it is recommended that the NULL character is not considered the end of a string, but this is only a suggestion. Unless explicitly defined, newlines designate the start of a potential new Object or the potential end of some Content. @@ -97,16 +97,25 @@ Featureless Settings Specifications: Unless explicitly defined, all designation characters must represent ASCII codes. With designation characters being any character code used to designate how to identify an Object or Content (such as a colon ':' at the end of a basic list). This keeps the processing and logic simple, for both UTF-8 and ASCII. - Whitespace used for designation characters must include support for UTF-8 whitespace characters, unless explicitly designated not to by a standard. - Control characters used for designation characters must include support UTF-8 control character support, unless explicitly designated not to by a standard. + Whitespace used for designation characters must include support for UTF-8 whitespace characters, unless explicitly stated otherwise.
+ Control characters used for designation characters must include UTF-8 control character support, unless explicitly stated otherwise. The UTF-8 BOM is not allowed as a Byte Order Mark; instead, it must always be treated as the character represented by its code (unless explicitly allowed to represent a BOM by a standard). Unless explicitly defined, whitespace is to be considered all not visible characters that take up space (including newlines '\n'). - Any visible/graph character that is considered a whitespace (such as U+1680 ' ') is not to be considered a whitespace by the FSS. + Any visible/graph character that is considered a whitespace (such as U+1680 ' ') is not to be considered a whitespace. + + When used for syntax matching purposes, zero-width Unicode characters are only to be considered zero-width unless explicitly stated otherwise. + For example, the "invisible plus" character (U+2064) is not to be considered as a plus (unless explicitly stated otherwise). + + The only Unicode dash-like characters allowed as a "dash" are those intended to connect, such as the Unicode hyphens (U+2010 and U+2011) (unless explicitly stated otherwise). + + In any specification where security is intended, if there exists a Unicode character that matches an ASCII character, that Unicode character may potentially be prohibited by that standard in favor of the ASCII equivalent. + One such example is in the case of a URL, where the name could be used to trick a person (http://this-site.com/ vs http://this‐site.com/). + This (potentially insecure behavior) is allowed in general because a well written program would be able to detect and communicate the possible misunderstanding and thereby avoid mistakes without imposing any character restrictions. @todo document that certain special UTF-8 characters are not allowed in certain terminology groups, such as "word characters", "number characters", etc.. so that potential confusion can be avoided.
- This is a common behavior for character security reasons, each character used for any special purposes must be visibly distinct, with whitespace and non-printing characters as the exception. + This is a common behavior for security reasons: each character used for any special purpose must be visibly distinct, with whitespace and non-printing characters as the exception to the words "visibly distinct". The follow specifications are defined in this project. Each of these specifications has a common name associated with the specification number. -- 1.8.3.1