From: Kevin Day Date: Wed, 11 May 2022 03:19:54 +0000 (-0500) Subject: Update: Utilize the state.flag to allow for iki read to not fail out on invalid UTF... X-Git-Tag: 0.5.10~153 X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=f6aa54cd5e5b576c7493689cec4e4a5e10550a8b;p=fll Update: Utilize the state.flag to allow for iki read to not fail out on invalid UTF-8 code sequence. One of the original goals of the FLL project is to achieve fail-through functionality. Knowing that this is a lot of work, I have passed over a lot of situations where I could have implemented fail-through and instead simply performed fail-out or fail-over. With the upcoming stable release, I believe that this must handle bad data files. This adds the option to conditionally change the behavior between fail-through and fail-out for f_iki_read() and related functions when they encounter invalid UTF-8 code sequences. The default behavior is now changed from fail-out to fail-through. --- diff --git a/level_0/f_iki/c/iki.c b/level_0/f_iki/c/iki.c index 49c68b6..5f58e6f 100644 --- a/level_0/f_iki/c/iki.c +++ b/level_0/f_iki/c/iki.c @@ -112,7 +112,7 @@ extern "C" { do { // Find the start of the vocabulary name. 
- while (F_status_is_error_not(status) && range->start <= range->stop && range->start < buffer->used) { + while (range->start <= range->stop && range->start < buffer->used) { if (state.interrupt) { status = state.interrupt((void *) &state, 0); @@ -127,7 +127,19 @@ extern "C" { width_max = buffer->used - range->start; status = f_utf_is_word_dash_plus(buffer->string + range->start, width_max, F_false); - if (F_status_is_error(status)) break; + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_utf_fragment || F_status_set_fine(status) == F_complete_not_utf) { + if (state.flag & f_iki_state_flag_utf_fail_on_valid_not_e) { + break; + } + + status = F_false; + } + else { + break; + } + } if (status == F_true) { found_vocabulary.start = range->start++; @@ -136,6 +148,7 @@ extern "C" { } status = f_utf_buffer_increment(*buffer, range, 1); + if (F_status_is_error(status)) break; } // while // Find the end of the vocabulary name. @@ -232,7 +245,19 @@ extern "C" { width_max = buffer->used - range->start; status = f_utf_is_word_dash_plus(buffer->string + range->start, width_max, F_false); - if (F_status_is_error(status)) break; + + if (F_status_is_error(status)) { + if (F_status_set_fine(status) == F_utf_fragment || F_status_set_fine(status) == F_complete_not_utf) { + if (state.flag & f_iki_state_flag_utf_fail_on_valid_not_e) { + break; + } + + status = F_false; + } + else { + break; + } + } // Not a valid IKI vocabulary name. 
if (status != F_true) break; @@ -431,6 +456,12 @@ extern "C" { } while (range->start <= range->stop && range->start < buffer->used); + if (F_status_set_fine(status) == F_complete_not_utf_eos || F_status_set_fine(status) == F_complete_not_utf_stop) { + if (!(state.flag & f_iki_state_flag_utf_fail_on_valid_not_e)) { + status = F_status_set_fine(status); + } + } + if (F_status_is_error(status)) { data->delimits.used = delimits_used; diff --git a/level_0/f_iki/c/iki.h b/level_0/f_iki/c/iki.h index ded0ee5..a463044 100644 --- a/level_0/f_iki/c/iki.h +++ b/level_0/f_iki/c/iki.h @@ -135,10 +135,10 @@ extern "C" { * Calling this more than once on the same buffer range could result in multiple escaping. * * @param state - * A state for handling interrupts during long running operations. - * There is no print_error() usage at this time (@todo this should be implemented and supported). + * A state for providing flags and handling interrupts during long running operations. + * There is no print_error(). * There is no functions structure. - * There is no data structure passed to these functions (@todo the additional parameters could be moved to a custom structure). + * There is no data structure passed to these functions. * * When interrupt() returns, only F_interrupt and F_interrupt_not are processed. * Error bit designates an error but must be passed along with F_interrupt. @@ -157,10 +157,14 @@ extern "C" { * F_none on success and an IKI vocabulary name was found. * F_none_eos on success and an IKI vocabulary name was found and end of string was reached. * F_none_stop on success and an IKI vocabulary name was found and stop point was reached. + * F_complete_not_utf_eos on success but string ended on incomplete UTF-8 and f_iki_state_flag_utf_fail_on_valid_not_e is not set. + * F_complete_not_utf_stop on success but stop point reached on incomplete UTF-8 and f_iki_state_flag_utf_fail_on_valid_not_e is not set. 
* F_data_not on success, but there were no IKI vocabulary names found. * F_data_not_eos on success and EOS was reached, but there were no IKI vocabulary names found. * F_data_not_stop on success and stop point was reached, but there were no IKI vocabulary names found. * + * F_complete_not_utf_eos (with error bit) on success but string ended on incomplete UTF-8 and f_iki_state_flag_utf_fail_on_valid_not_e is set. + * F_complete_not_utf_stop (with error bit) on success but stop point reached on incomplete UTF-8 and f_iki_state_flag_utf_fail_on_valid_not_e is set. * F_interrupt (with error bit) if stopping due to an interrupt. * F_memory_not (with error bit) on out of memory. * F_parameter (with error bit) if a parameter is invalid. diff --git a/level_0/f_iki/c/iki/common.h b/level_0/f_iki/c/iki/common.h index 38a56ac..704cc89 100644 --- a/level_0/f_iki/c/iki/common.h +++ b/level_0/f_iki/c/iki/common.h @@ -17,6 +17,24 @@ extern "C" { #endif /** + * State flags associated with iki functions. + * + * These flags are meant to be bitwise for the 32-bit f_state_t flag property. + * + * The f_iki_state_flag_none_e is expected to be 0, therefore it must be safe to use 0 directly. + * + * f_iki_state_flag_*: + * - none: No flags are set. + * - utf_fail_on_valid_not: Immediately fail on invalid UTF-8 character (including incomplete). + */ +#ifndef _di_f_iki_state_flags_ + enum { + f_iki_state_flag_none_e = 0, + f_iki_state_flag_utf_fail_on_valid_not_e = 0x1, + }; // enum +#endif // _di_f_iki_state_flags_ + +/** * IKI-specific syntax. 
*/ #ifndef _di_f_iki_syntax_ diff --git a/level_1/fl_iki/c/iki.c b/level_1/fl_iki/c/iki.c index 08ffd6f..37fef36 100644 --- a/level_1/fl_iki/c/iki.c +++ b/level_1/fl_iki/c/iki.c @@ -17,17 +17,9 @@ extern "C" { status = f_iki_read(state, buffer, range, data); if (F_status_is_error(status)) return status; - if (status == F_data_not_eos || status == F_data_not_stop) { - return status; - } - - if (status == F_none_eos || status == F_none_stop) { - return status; - } - } while (range->start <= range->stop && range->start < buffer->used); - return F_none; + return status; } #endif // _di_fl_iki_read_ diff --git a/level_1/fl_iki/c/iki.h b/level_1/fl_iki/c/iki.h index 385f2f2..4cabc88 100644 --- a/level_1/fl_iki/c/iki.h +++ b/level_1/fl_iki/c/iki.h @@ -36,14 +36,7 @@ extern "C" { * This only finds complete vocabulary names and their respective content. * * @param state - * A state for handling interrupts during long running operations. - * There is no print_error() usage at this time (@todo this should be implemented and supported). - * There is no functions structure. - * There is no data structure passed to these functions (@todo the additional parameters could be moved to a custom structure). - * - * When interrupt() returns, only F_interrupt and F_interrupt_not are processed. - * Error bit designates an error but must be passed along with F_interrupt. - * All other statuses are ignored. + * A state for providing flags and handling interrupts during long running operations. * @param buffer * The string to process. * @param range @@ -58,6 +51,8 @@ extern "C" { * F_none on success and an IKI vocabulary name was found. * F_none_stop on success and an IKI vocabulary name was found and stop point was reached. * F_none_eos on success and an IKI vocabulary name was found and end of string was reached. + * F_complete_not_utf_eos on success and EOS was reached, but at an incomplete UTF-8 sequence. 
+ * F_complete_not_utf_stop on success and stop point was reached, but at an incomplete UTF-8 sequence. * F_data_not_eos on success and EOS was reached, but there were no IKI vocabularie names found. * F_data_not_stop on success and stop point was reached, but there were no IKI vocabularie names found. * diff --git a/level_3/iki_read/c/iki_read.c b/level_3/iki_read/c/iki_read.c index 0ebb649..8d5f755 100644 --- a/level_3/iki_read/c/iki_read.c +++ b/level_3/iki_read/c/iki_read.c @@ -417,18 +417,15 @@ extern "C" { if (size_file > iki_read_block_max) { file.size_read = iki_read_block_read_large; size_block = iki_read_block_max; - - // Pre-allocate entire file buffer plus space for the terminating NULL. - f_string_dynamic_increase_by(size_file + (size_block - (size_file % size_block)) + 1, &data.buffer); } else { file.size_read = iki_read_block_read_small; size_block = size_file; - - // Pre-allocate entire file buffer plus space for the terminating NULL. - f_string_dynamic_increase_by(size_file + 1, &data.buffer); } + // Pre-allocate entire file buffer plus space for the terminating NULL. + f_string_dynamic_increase_by(size_file + 1, &data.buffer); + if (F_status_is_error(status)) { fll_error_file_print(main->error, F_status_set_fine(status), "f_string_dynamic_resize", F_true, data.argv[main->parameters.remaining.array[i]], f_file_operation_process_s, fll_error_file_type_file_e); diff --git a/level_3/iki_read/c/private-read.c b/level_3/iki_read/c/private-read.c index 0592b51..494c286 100644 --- a/level_3/iki_read/c/private-read.c +++ b/level_3/iki_read/c/private-read.c @@ -73,7 +73,7 @@ extern "C" { status = iki_read_process_at(data, &buffer_range); - if (status == F_true && buffer_range.start > data->buffer.used || status == F_data_not) { + if ((status == F_true && buffer_range.start > data->buffer.used) || status == F_data_not) { f_iki_data_delete(&iki_data); return F_data_not;