From 8cec5a4a91fce96c238b89de308c127fcac1a146 Mon Sep 17 00:00:00 2001
From: Kevin Day
Date: Fri, 4 Nov 2022 20:24:38 -0500
Subject: [PATCH] Bugfix: Incomplete UTF-8 zero-width detection, particularly for combining characters.

The zero-width calculations should be, but are not, including combining characters and most control characters.
The combining characters and most of the control characters are now considered zero-width.
---
 level_0/f_utf/c/private-utf_combining.c  | 10 +++++++++
 level_0/f_utf/c/private-utf_zero_width.c | 38 +++++++++++++++++---------------
 level_0/f_utf/c/utf/is.c                 |  8 +++++--
 level_0/f_utf/c/utf/is_character.c       | 15 ++++++-------
 4 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/level_0/f_utf/c/private-utf_combining.c b/level_0/f_utf/c/private-utf_combining.c
index 30485cd..f851bc5 100644
--- a/level_0/f_utf/c/private-utf_combining.c
+++ b/level_0/f_utf/c/private-utf_combining.c
@@ -930,6 +930,16 @@ extern "C" {
         return F_true;
       }
 
+      // Hanifi Rohingya: U+10D24 to U+10D27.
+      if (sequence >= 0xf090b4a4 && sequence <= 0xf090b4a7) {
+        return F_true;
+      }
+
+      // Yezidi: U+10EAB to U+10EAD.
+      if (sequence >= 0xf090baab && sequence <= 0xf090baad) {
+        return F_true;
+      }
+
       // Arabic Extended-C: U+10EFD to U+10EFF.
       if (sequence >= 0xf090bbbd && sequence <= 0xf090bbbf) {
         return F_true;
diff --git a/level_0/f_utf/c/private-utf_zero_width.c b/level_0/f_utf/c/private-utf_zero_width.c
index 47787af..2aa2e35 100644
--- a/level_0/f_utf/c/private-utf_zero_width.c
+++ b/level_0/f_utf/c/private-utf_zero_width.c
@@ -9,25 +9,27 @@ extern "C" {
 #if !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_)
   f_status_t private_f_utf_character_is_zero_width(const f_utf_char_t sequence) {
 
-    if (macro_f_utf_char_t_to_char_1(sequence) == 0xe1) {
-
-      // Mongolian: U+180E.
-      if (sequence == 0xe1a08e00) {
-        return F_true;
-      }
-    }
-    else if (macro_f_utf_char_t_to_char_1(sequence) == 0xe2) {
-
-      // General Punctuation: U+200B, U+200C, U+200D, U+2060.
-      if (sequence == 0xe2808b00 || sequence == 0xe2808c00 || sequence == 0xe2808d00 || sequence == 0xe281a000) {
-        return F_true;
+    if (macro_f_utf_char_t_width_is(sequence) == 3) {
+      if (macro_f_utf_char_t_to_char_1(sequence) == 0xe2) {
+
+        // General Punctuation: U+2028 to U+2029.
+        if (sequence >= 0xe280a800 && sequence <= 0xe280a900) {
+          return F_true;
+        }
+
+        // General Punctuation: U+2066 to U+2069.
+        if (sequence >= 0xe281a600 && sequence <= 0xe281a900) {
+          return F_true;
+        }
       }
-    }
-    else if (macro_f_utf_char_t_to_char_1(sequence) == 0xef) {
-
-      // Arabic Presentation Forms-B: U+FEFF.
-      if (sequence == 0xefbbbf00) {
-        return F_true;
+      else if (macro_f_utf_char_t_to_char_1(sequence) == 0xef) {
+        if (macro_f_utf_char_t_to_char_2(sequence) == 0xbf) {
+
+          // Unassigned and reserved for future use: U+FFF0 to U+FFF8 (these are assumed to be zero-width).
+          if (sequence >= 0xefbfb000 && sequence <= 0xefbfb800) {
+            return F_true;
+          }
+        }
       }
     }
 
diff --git a/level_0/f_utf/c/utf/is.c b/level_0/f_utf/c/utf/is.c
index 1864668..3a48206 100644
--- a/level_0/f_utf/c/utf/is.c
+++ b/level_0/f_utf/c/utf/is.c
@@ -994,14 +994,18 @@ extern "C" {
         if (F_status_is_error(status)) return status;
       }
 
+      // Control and combining characters are zero-width.
+      if (private_f_utf_character_is_control(utf)) return F_true;
+      if (private_f_utf_character_is_combining(utf)) return F_true;
+
       return private_f_utf_character_is_zero_width(utf);
     }
 
     // These control characters are considered zero-width spaces.
-    if (*sequence >= 0x00 && *sequence <= 0x08) {
+    if (*sequence >= 0x00 && *sequence < 0x09) {
       return F_true;
     }
-    else if (*sequence >= 0x0c && *sequence <= 0x1f) {
+    else if (*sequence > 0x0b && *sequence < 0x20) {
       return F_true;
     }
     else if (*sequence == 0x7f) {
diff --git a/level_0/f_utf/c/utf/is_character.c b/level_0/f_utf/c/utf/is_character.c
index 60a9fc9..4050abc 100644
--- a/level_0/f_utf/c/utf/is_character.c
+++ b/level_0/f_utf/c/utf/is_character.c
@@ -585,22 +585,21 @@ extern "C" {
         return F_status_set_error(F_utf_fragment);
       }
 
+      // Control and combining characters are zero-width.
+      if (private_f_utf_character_is_control(sequence)) return F_true;
+      if (private_f_utf_character_is_combining(sequence)) return F_true;
+
       return private_f_utf_character_is_zero_width(sequence);
     }
 
-    const uint8_t ascii = macro_f_utf_char_t_to_char_1(sequence);
-
     // These control characters are considered zero-width spaces.
-    if (ascii >= 0x00 && ascii <= 0x08) {
-      return F_true;
-    }
-    else if (ascii == 0x0a) {
+    if (macro_f_utf_char_t_to_char_1(sequence) >= 0x00 && macro_f_utf_char_t_to_char_1(sequence) < 0x09) {
       return F_true;
     }
-    else if (ascii >= 0x0c && ascii <= 0x1f) {
+    else if (macro_f_utf_char_t_to_char_1(sequence) > 0x0b && macro_f_utf_char_t_to_char_1(sequence) < 0x20) {
       return F_true;
     }
-    else if (ascii == 0x7f) {
+    else if (macro_f_utf_char_t_to_char_1(sequence) == 0x7f) {
       return F_true;
     }
 
-- 
1.8.3.1