From 3b6d88de300df33200f5d96b1d5fc268db8ec7f6 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Thu, 4 Nov 2021 20:53:17 -0500 Subject: [PATCH] Bugfix: UTF-8 functions fail to properly handle ASCII. This seems to be a problem where there are two ways of processing ASCII detection of UTF-8 code. The macro_f_utf_byte_width() will return the width of 1 for ASCII. The macro_f_utf_byte_width_is() will return a width of 0 for ASCII. The affected code is assuming a width of 0, but some functions send a width of 1 for ASCII. These are private functions, so it is relatively safe to just allow both. Change the behavior to accept both 0 and 1 and treat them as ASCII. Update comment about Unicode 12.1, setting it to 14.0. --- level_0/f_utf/c/private-utf.c | 4 ++-- level_0/f_utf/c/private-utf.h | 2 ++ level_3/byte_dump/c/byte_dump.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/level_0/f_utf/c/private-utf.c b/level_0/f_utf/c/private-utf.c index 98d76fd..50b0908 100644 --- a/level_0/f_utf/c/private-utf.c +++ b/level_0/f_utf/c/private-utf.c @@ -187,7 +187,7 @@ extern "C" { #if !defined(_di_f_utf_character_is_ascii_) || !defined(_di_f_utf_is_ascii_) f_status_t private_f_utf_character_is_ascii(const f_utf_character_t character, const uint8_t width) { - if (!width) { + if (width < 2) { const uint8_t byte_first = macro_f_utf_character_t_to_char_1(character); if (byte_first >= 0x00 && byte_first <= 0x7f) { @@ -1975,7 +1975,7 @@ extern "C" { // reduce the number of checks by grouping checks by byte. const uint8_t byte_first = macro_f_utf_character_t_to_char_1(character); - if (!width) { + if (width < 2) { if (byte_first >= 0x00 && byte_first <= 0x7f) { return F_true; } diff --git a/level_0/f_utf/c/private-utf.h b/level_0/f_utf/c/private-utf.h index 3687553..f2e0c3e 100644 --- a/level_0/f_utf/c/private-utf.h +++ b/level_0/f_utf/c/private-utf.h @@ -149,6 +149,7 @@ extern "C" { * The character to validate. 
* @param width * The number of bytes repesenting the character width. + * A width of 0 or 1 is treated as ASCII (width 1). * * @return * F_true if a UTF-8 control picture character. @@ -425,6 +426,7 @@ extern "C" { * The character to validate. * @param width * The number of bytes repesenting the character width. + * A width of 0 or 1 is treated as ASCII (width 1). * * @return * F_true if a UTF-8 character. diff --git a/level_3/byte_dump/c/byte_dump.h b/level_3/byte_dump/c/byte_dump.h index 2002fa4..5c2f738 100644 --- a/level_3/byte_dump/c/byte_dump.h +++ b/level_3/byte_dump/c/byte_dump.h @@ -5,7 +5,7 @@ * API Version: 0.5 * Licenses: lgplv2.1 * - * This is intendend to support Unicode 12.1. + * This is intended to support Unicode 14.0. * * When using "text" mode, this program attempts to translate UTF-8 sequences such that certain codes don't cause printing problems. * There may be cases where there are unknown codes that get printed and the invalid UTF-8 marker may be displayed not by this program but instead by the shell or some other program. -- 1.8.3.1