From: Kevin Day <thekevinday@gmail.com>
Date: Wed, 11 May 2022 03:19:54 +0000 (-0500)
Subject: Update: Utilize the state.flag to allow for iki read to not fail out on invalid UTF-8 code sequence.
X-Git-Tag: 0.5.10~153
X-Git-Url: https://git.kevux.org/?a=commitdiff_plain;h=f6aa54cd5e5b576c7493689cec4e4a5e10550a8b;p=fll

Update: Utilize the state.flag to allow for iki read to not fail out on invalid UTF-8 code sequence.

One of the original goals of the FLL project is to achieve fail-through functionality.
Knowing that this is a lot of work, I have so far passed over many situations where fail-through could be implemented and have simply performed fail-out or fail-over instead.

With the stable release upcoming, I believe that bad data files must be handled.
This adds a state flag (f_iki_state_flag_utf_fail_on_valid_not_e) to conditionally switch between fail-through and fail-out in f_iki_read() and related functions when an invalid UTF-8 code sequence is encountered.
The default behavior is now changed from fail-out to fail-through.
---

diff --git a/level_0/f_iki/c/iki.c b/level_0/f_iki/c/iki.c
index 49c68b6..5f58e6f 100644
--- a/level_0/f_iki/c/iki.c
+++ b/level_0/f_iki/c/iki.c
@@ -112,7 +112,7 @@ extern "C" {
     do {
 
       // Find the start of the vocabulary name.
-      while (F_status_is_error_not(status) && range->start <= range->stop && range->start < buffer->used) {
+      while (range->start <= range->stop && range->start < buffer->used) {
 
         if (state.interrupt) {
           status = state.interrupt((void *) &state, 0);
@@ -127,7 +127,19 @@ extern "C" {
         width_max = buffer->used - range->start;
 
         status = f_utf_is_word_dash_plus(buffer->string + range->start, width_max, F_false);
-        if (F_status_is_error(status)) break;
+
+        if (F_status_is_error(status)) {
+          if (F_status_set_fine(status) == F_utf_fragment || F_status_set_fine(status) == F_complete_not_utf) {
+            if (state.flag & f_iki_state_flag_utf_fail_on_valid_not_e) {
+              break;
+            }
+
+            status = F_false;
+          }
+          else {
+            break;
+          }
+        }
 
         if (status == F_true) {
           found_vocabulary.start = range->start++;
@@ -136,6 +148,7 @@ extern "C" {
         }
 
         status = f_utf_buffer_increment(*buffer, range, 1);
+        if (F_status_is_error(status)) break;
       } // while
 
       // Find the end of the vocabulary name.
@@ -232,7 +245,19 @@ extern "C" {
           width_max = buffer->used - range->start;
 
           status = f_utf_is_word_dash_plus(buffer->string + range->start, width_max, F_false);
-          if (F_status_is_error(status)) break;
+
+          if (F_status_is_error(status)) {
+            if (F_status_set_fine(status) == F_utf_fragment || F_status_set_fine(status) == F_complete_not_utf) {
+              if (state.flag & f_iki_state_flag_utf_fail_on_valid_not_e) {
+                break;
+              }
+
+              status = F_false;
+            }
+            else {
+              break;
+            }
+          }
 
           // Not a valid IKI vocabulary name.
           if (status != F_true) break;
@@ -431,6 +456,12 @@ extern "C" {
 
     } while (range->start <= range->stop && range->start < buffer->used);
 
+    if (F_status_set_fine(status) == F_complete_not_utf_eos || F_status_set_fine(status) == F_complete_not_utf_stop) {
+      if (!(state.flag & f_iki_state_flag_utf_fail_on_valid_not_e)) {
+        status = F_status_set_fine(status);
+      }
+    }
+
     if (F_status_is_error(status)) {
       data->delimits.used = delimits_used;
 
diff --git a/level_0/f_iki/c/iki.h b/level_0/f_iki/c/iki.h
index ded0ee5..a463044 100644
--- a/level_0/f_iki/c/iki.h
+++ b/level_0/f_iki/c/iki.h
@@ -135,10 +135,10 @@ extern "C" {
  * Calling this more than once on the same buffer range could result in multiple escaping.
  *
  * @param state
- *   A state for handling interrupts during long running operations.
- *   There is no print_error() usage at this time (@todo this should be implemented and supported).
+ *   A state for providing flags and handling interrupts during long running operations.
+ *   There is no print_error().
  *   There is no functions structure.
- *   There is no data structure passed to these functions (@todo the additional parameters could be moved to a custom structure).
+ *   There is no data structure passed to these functions.
  *
  *   When interrupt() returns, only F_interrupt and F_interrupt_not are processed.
  *   Error bit designates an error but must be passed along with F_interrupt.
@@ -157,10 +157,14 @@ extern "C" {
  *   F_none on success and an IKI vocabulary name was found.
  *   F_none_eos on success and an IKI vocabulary name was found and end of string was reached.
  *   F_none_stop on success and an IKI vocabulary name was found and stop point was reached.
+ *   F_complete_not_utf_eos on success but string ended on incomplete UTF-8 and f_iki_state_flag_utf_fail_on_valid_not_e is not set.
+ *   F_complete_not_utf_stop on success but stop point reached on incomplete UTF-8 and f_iki_state_flag_utf_fail_on_valid_not_e is not set.
  *   F_data_not on success, but there were no IKI vocabulary names found.
  *   F_data_not_eos on success and EOS was reached, but there were no IKI vocabulary names found.
  *   F_data_not_stop on success and stop point was reached, but there were no IKI vocabulary names found.
  *
+ *   F_complete_not_utf_eos (with error bit) if the string ended on incomplete UTF-8 and f_iki_state_flag_utf_fail_on_valid_not_e is set.
+ *   F_complete_not_utf_stop (with error bit) if the stop point was reached on incomplete UTF-8 and f_iki_state_flag_utf_fail_on_valid_not_e is set.
  *   F_interrupt (with error bit) if stopping due to an interrupt.
  *   F_memory_not (with error bit) on out of memory.
  *   F_parameter (with error bit) if a parameter is invalid.
diff --git a/level_0/f_iki/c/iki/common.h b/level_0/f_iki/c/iki/common.h
index 38a56ac..704cc89 100644
--- a/level_0/f_iki/c/iki/common.h
+++ b/level_0/f_iki/c/iki/common.h
@@ -17,6 +17,24 @@ extern "C" {
 #endif
 
 /**
+ * State flags associated with iki functions.
+ *
+ * These flags are meant to be bitwise for the 32-bit f_state_t flag property.
+ *
+ * The f_iki_state_flag_none_e is expected to be 0, therefore it must be safe to use 0 directly.
+ *
+ * f_iki_state_flag_*:
+ *   - none:                  No flags are set.
+ *   - utf_fail_on_valid_not: Immediately fail on invalid UTF-8 character (including incomplete).
+ */
+#ifndef _di_f_iki_state_flags_
+  enum {
+    f_iki_state_flag_none_e                  = 0, // No flags are set; must remain 0 so that a literal 0 can be used in its place.
+    f_iki_state_flag_utf_fail_on_valid_not_e = 0x1, // Bit flag: fail immediately on an invalid (including incomplete) UTF-8 sequence.
+  }; // enum
+#endif // _di_f_iki_state_flags_
+
+/**
  * IKI-specific syntax.
  */
 #ifndef _di_f_iki_syntax_
diff --git a/level_1/fl_iki/c/iki.c b/level_1/fl_iki/c/iki.c
index 08ffd6f..37fef36 100644
--- a/level_1/fl_iki/c/iki.c
+++ b/level_1/fl_iki/c/iki.c
@@ -17,17 +17,9 @@ extern "C" {
       status = f_iki_read(state, buffer, range, data);
       if (F_status_is_error(status)) return status;
 
-      if (status == F_data_not_eos || status == F_data_not_stop) {
-        return status;
-      }
-
-      if (status == F_none_eos || status == F_none_stop) {
-        return status;
-      }
-
     } while (range->start <= range->stop && range->start < buffer->used);
 
-    return F_none;
+    return status;
   }
 #endif // _di_fl_iki_read_
 
diff --git a/level_1/fl_iki/c/iki.h b/level_1/fl_iki/c/iki.h
index 385f2f2..4cabc88 100644
--- a/level_1/fl_iki/c/iki.h
+++ b/level_1/fl_iki/c/iki.h
@@ -36,14 +36,7 @@ extern "C" {
  * This only finds complete vocabulary names and their respective content.
  *
  * @param state
- *   A state for handling interrupts during long running operations.
- *   There is no print_error() usage at this time (@todo this should be implemented and supported).
- *   There is no functions structure.
- *   There is no data structure passed to these functions (@todo the additional parameters could be moved to a custom structure).
- *
- *   When interrupt() returns, only F_interrupt and F_interrupt_not are processed.
- *   Error bit designates an error but must be passed along with F_interrupt.
- *   All other statuses are ignored.
+ *   A state for providing flags and handling interrupts during long running operations.
  * @param buffer
  *   The string to process.
  * @param range
@@ -58,6 +51,8 @@ extern "C" {
  *   F_none on success and an IKI vocabulary name was found.
  *   F_none_stop on success and an IKI vocabulary name was found and stop point was reached.
  *   F_none_eos on success and an IKI vocabulary name was found and end of string was reached.
+ *   F_complete_not_utf_eos on success and EOS was reached, but at an incomplete UTF-8 sequence.
+ *   F_complete_not_utf_stop on success and stop point was reached, but at an incomplete UTF-8 sequence.
  *   F_data_not_eos on success and EOS was reached, but there were no IKI vocabularie names found.
  *   F_data_not_stop on success and stop point was reached, but there were no IKI vocabularie names found.
  *
diff --git a/level_3/iki_read/c/iki_read.c b/level_3/iki_read/c/iki_read.c
index 0ebb649..8d5f755 100644
--- a/level_3/iki_read/c/iki_read.c
+++ b/level_3/iki_read/c/iki_read.c
@@ -417,18 +417,15 @@ extern "C" {
           if (size_file > iki_read_block_max) {
             file.size_read = iki_read_block_read_large;
             size_block = iki_read_block_max;
-
-            // Pre-allocate entire file buffer plus space for the terminating NULL.
-            f_string_dynamic_increase_by(size_file + (size_block - (size_file % size_block)) + 1, &data.buffer);
           }
           else {
             file.size_read = iki_read_block_read_small;
             size_block = size_file;
-
-            // Pre-allocate entire file buffer plus space for the terminating NULL.
-            f_string_dynamic_increase_by(size_file + 1, &data.buffer);
           }
 
+          // Pre-allocate entire file buffer plus space for the terminating NULL.
+          f_string_dynamic_increase_by(size_file + 1, &data.buffer);
+
           if (F_status_is_error(status)) {
             fll_error_file_print(main->error, F_status_set_fine(status), "f_string_dynamic_resize", F_true, data.argv[main->parameters.remaining.array[i]], f_file_operation_process_s, fll_error_file_type_file_e);
 
diff --git a/level_3/iki_read/c/private-read.c b/level_3/iki_read/c/private-read.c
index 0592b51..494c286 100644
--- a/level_3/iki_read/c/private-read.c
+++ b/level_3/iki_read/c/private-read.c
@@ -73,7 +73,7 @@ extern "C" {
 
       status = iki_read_process_at(data, &buffer_range);
 
-      if (status == F_true && buffer_range.start > data->buffer.used || status == F_data_not) {
+      if ((status == F_true && buffer_range.start > data->buffer.used) || status == F_data_not) {
         f_iki_data_delete(&iki_data);
 
         return F_data_not;