From a4e63c1483a32356f32174c9b396fcbd5e554f7d Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sat, 11 Jun 2022 14:09:39 -0500 Subject: [PATCH] Bugfix: Last character of file after conversion from code point is not printed by utf8 program. The algorithm doesn't print the character until it knows when the character is complete. There are no checks for when end of file is reached. This results in the last character not being printed, even if the code is complete. Be sure to return the status rather than always returning F_none under certain circumstances in utf8_detect_codepoint(). Update documentation about return value in utf8_detect_codepoint(). Initialize the character.used to 0 rather than 4 (because it has no data!). For better practice, compare using >= rather than ==. Remove unnecessary i = 0 assignment. --- level_3/utf8/c/private-utf8_codepoint.c | 37 ++++++++++++++++++++++++++++----- level_3/utf8/c/private-utf8_codepoint.h | 1 + 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/level_3/utf8/c/private-utf8_codepoint.c b/level_3/utf8/c/private-utf8_codepoint.c index 30cbe37..fcf8427 100644 --- a/level_3/utf8/c/private-utf8_codepoint.c +++ b/level_3/utf8/c/private-utf8_codepoint.c @@ -237,6 +237,9 @@ extern "C" { if (status == F_true) { status = F_space; } + else { + status = F_none; + } } } @@ -300,7 +303,7 @@ extern "C" { } } - return F_none; + return status; } #endif // _di_utf8_detect_codepoint_ @@ -316,12 +319,37 @@ extern "C" { f_array_length_t j = 0; f_char_t block[5] = { 0, 0, 0, 0, 0 }; - f_string_static_t character = macro_f_string_static_t_initialize(block, 0, 4); + f_string_static_t character = macro_f_string_static_t_initialize(block, 0, 0); do { status = f_file_read_block(file, &data->buffer); - if (status == F_none_eof && !data->buffer.used) break; + if (status == F_none_eof && !data->buffer.used) { + + // Handle complete character, which must be explicitly set to end in this situation. + if (mode_codepoint == utf8_codepoint_mode_number_e || mode_codepoint == utf8_codepoint_mode_raw_number_e) { + if (mode_codepoint == utf8_codepoint_mode_number_e) { + mode_codepoint = utf8_codepoint_mode_end_e; + + status = utf8_convert_codepoint(data, character, &mode_codepoint); + } + else if (mode_codepoint == utf8_codepoint_mode_raw_number_e) { + mode_codepoint = utf8_codepoint_mode_raw_end_e; + + status = utf8_convert_raw(data, character, &mode_codepoint); + + // Raw mode represents an invalid Unicode sequence. + valid = F_false; + } + + j = 0; + next = F_true; + status = F_none_eof; + mode_codepoint = utf8_codepoint_mode_ready_e; + } + + break; + } for (i = 0; F_status_is_fine(status) && i < data->buffer.used; ) { @@ -349,7 +377,7 @@ extern "C" { character.string[j] = data->buffer.string[i]; } // for - if (j == character.used) { + if (j >= character.used) { if (data->mode & utf8_mode_from_bytesequence_d) { status = utf8_convert_bytesequence(data, character); } @@ -378,7 +406,6 @@ extern "C" { } } // for - i = 0; data->buffer.used = 0; } while (F_status_is_fine(status) && status != F_interrupt); diff --git a/level_3/utf8/c/private-utf8_codepoint.h b/level_3/utf8/c/private-utf8_codepoint.h index 69317a5..2954022 100644 --- a/level_3/utf8/c/private-utf8_codepoint.h +++ b/level_3/utf8/c/private-utf8_codepoint.h @@ -77,6 +77,7 @@ extern "C" { * @return * F_none on success. * F_next on success, but should not be processed (it is white space or NULL). + * F_space on success, but the character is whitespace. * * Errors (with error bit) from: f_utf_is_whitespace() */ -- 1.8.3.1