A 1-width UTF-8 character (one that is not a valid ASCII character) designates part of a complete multi-byte UTF-8 character; that is, 1-width UTF-8 characters are fragments.
Because such a fragment cannot exist in isolation, it must be handled as either an invalid or an incomplete UTF-8 fragment.
Provide new status codes for handling incomplete UTF-8 fragments.
Update appropriate functions to detect and handle these invalid or incomplete fragments.
f_unterminated_group_on_eol,
f_unterminated_group_on_eos,
f_unterminated_group_on_stop,
+ f_incomplete_utf,
+ f_incomplete_utf_on_eof,
+ f_incomplete_utf_on_eol,
f_incomplete_utf_on_eos,
f_incomplete_utf_on_stop,
#endif // _di_f_status_buffers_
if (max_width < 1) return f_status_set_error(f_invalid_parameter);
#endif // _di_level_0_parameter_checking_
- f_u_short width = f_macro_utf_byte_width(*character);
+ f_u_short width = f_macro_utf_byte_width_is(*character);
- if (width == 1) {
+ if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
if (width > max_width) {
return f_status_set_error(f_maybe);
if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
+
+ // Do not operate on UTF-8 fragments that are not the first byte of the character.
+ if (width == 1) {
+ return f_status_set_error(f_incomplete_utf);
+ }
if (width > max_width) {
return f_status_set_error(f_maybe);
if (max_width < 1) return f_status_set_error(f_invalid_parameter);
#endif // _di_level_0_parameter_checking_
- f_u_short width = f_macro_utf_byte_width(*character);
+ f_u_short width = f_macro_utf_byte_width_is(*character);
- if (width == 1) {
+ if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
if (width > max_width) {
return f_status_set_error(f_maybe);
if (max_width < 1) return f_status_set_error(f_invalid_parameter);
#endif // _di_level_0_parameter_checking_
- f_u_short width = f_macro_utf_byte_width(*character);
+ f_u_short width = f_macro_utf_byte_width_is(*character);
- if (width == 1) {
+ if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
if (width > max_width) {
return f_status_set_error(f_maybe);
if (max_width < 1) return f_status_set_error(f_invalid_parameter);
#endif // _di_level_0_parameter_checking_
- f_u_short width = f_macro_utf_byte_width(*character);
+ f_u_short width = f_macro_utf_byte_width_is(*character);
- if (width == 1) {
+ if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
if (width > max_width) {
return f_status_set_error(f_maybe);
if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
// for now, just assume that any non-whitespace, non-substitute utf-8 character is a graph.
#ifndef _di_f_utf_is_space_character_
f_return_status f_utf_is_space_character(const f_utf_character character) {
- f_u_short width = f_macro_utf_character_width(character);
+ f_u_short width = f_macro_utf_character_width_is(character);
- if (width == 1) {
+ if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
if (width == 2) {
char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) };
#ifndef _di_f_utf_is_substitute_character_
f_return_status f_utf_is_substitute_character(const f_utf_character character) {
- f_u_short width = f_macro_utf_character_width(character);
+ f_u_short width = f_macro_utf_character_width_is(character);
- if (width == 1) {
+ if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
if (width == 2) {
char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) };
#ifndef _di_f_utf_is_whitespace_character_
f_return_status f_utf_is_whitespace_character(const f_utf_character character) {
- f_u_short width = f_macro_utf_character_width(character);
+ f_u_short width = f_macro_utf_character_width_is(character);
- if (width == 1) {
+ if (width == 0) {
return f_false;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
+
+ // Do not operate on UTF-8 fragments that are not the first byte of the character.
+ if (width == 1) {
+ return f_status_set_error(f_incomplete_utf);
+ }
if (width == 2) {
char utf[2] = { f_macro_utf_character_to_char_1(character), f_macro_utf_character_to_char_2(character) };
if (utf_character == 0) return f_status_set_error(f_invalid_parameter);
#endif // _di_level_0_parameter_checking_
- f_u_short width = f_macro_utf_byte_width(*character);
+ f_u_short width = f_macro_utf_byte_width_is(*character);
- if (width == 1) {
+ if (width == 0) {
*utf_character = f_macro_utf_character_from_char_1(character[0]);
return f_none;
}
+ else if (width == 1) {
+ return f_status_is_error(f_incomplete_utf);
+ }
if (width > max_width) {
return f_status_set_error(f_failure);
*
* The f_utf_byte_is method will return non-zero if the character is a UTF-8 character of any width.
*
+ * The f_utf_byte_1 is specifically used only on UTF-8 fragments.
+ * For example, with the 2-byte-wide UTF-8 character '110x xxxx 10yy yyyy', the 8-bit block '10yy yyyy' would be a fragment.
+ *
* The f_macro_utf_byte_is_* macros are used to determine a width of the character (either 1, 2, 3, or 4, respectively).
*
* The f_macro_utf_byte_width macro determines a width of the character.
* f_true if a UTF-8 whitespace or substitute.
* f_false if not a UTF-8 whitespace or substitute.
* f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_bom_
* f_true if a UTF-8 graph.
* f_false if not a UTF-8 graph.
* f_maybe (with error bit) if this could be a graph but width is not long enough.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_graph_
* f_true if a UTF-8 whitespace or substitute.
* f_false if not a UTF-8 whitespace or substitute.
* f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_space_
* f_true if a UTF-8 substitute.
* f_false if not a UTF-8 substitute.
* f_maybe (with error bit) if this could be a substitute but width is not long enough.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_substitute_
* f_true if a UTF-8 whitespace.
* f_false if not a UTF-8 whitespace.
* f_maybe (with error bit) if this could be a whitespace but width is not long enough.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_whitespace_
* f_true if a UTF-8 whitespace or substitute.
* f_false if not a UTF-8 whitespace or substitute.
* f_maybe (with error bit) if this could be a whitespace or substitute but width is not long enough.
+ * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_bom_character_
* @return
* f_true if a UTF-8 graph.
* f_false if not a UTF-8 graph.
+ * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_graph_character_
* @return
* f_true if a UTF-8 whitespace or substitute.
* f_false if not a UTF-8 whitespace or substitute.
+ * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_space_character_
* @return
* f_true if a UTF-8 substitute.
* f_false if not a UTF-8 substitute.
+ * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_substitute_character_
* @return
* f_true if a UTF-8 whitespace.
* f_false if not a UTF-8 whitespace.
+ * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_is_whitespace_character_
* @return
* f_none if conversion was successful.
* f_failure (with error bit) if width is not long enough to convert.
+ * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
* f_invalid_parameter (with error bit) if a parameter is invalid.
*/
#ifndef _di_f_utf_char_to_character_
case f_unterminated_group_on_stop:
*string = fl_status_string_unterminated_group_on_stop;
break;
+ case f_incomplete_utf:
+ *string = fl_status_string_incomplete_utf;
+ break;
+ case f_incomplete_utf_on_eof:
+ *string = fl_status_string_incomplete_utf_on_eof;
+ break;
+ case f_incomplete_utf_on_eol:
+ *string = fl_status_string_incomplete_utf_on_eol;
+ break;
case f_incomplete_utf_on_eos:
*string = fl_status_string_incomplete_utf_on_eos;
break;
#define fl_status_string_unterminated_group_on_stop "f_unterminated_group_on_stop"
#define fl_status_string_unterminated_group_on_stop_length 29
+ #define fl_status_string_incomplete_utf "f_incomplete_utf"
+ #define fl_status_string_incomplete_utf_length 17
+
+ #define fl_status_string_incomplete_utf_on_eof "f_incomplete_utf_on_eof"
+ #define fl_status_string_incomplete_utf_on_eof_length 24
+
+ #define fl_status_string_incomplete_utf_on_eol "f_incomplete_utf_on_eol"
+ #define fl_status_string_incomplete_utf_on_eol_length 24
+
#define fl_status_string_incomplete_utf_on_eos "f_incomplete_utf_on_eos"
#define fl_status_string_incomplete_utf_on_eos_length 24
while (buffer.string[location->start] == placeholder || (!isgraph(buffer.string[location->start]) && (status = f_utf_is_graph(buffer.string + location->start, max_width)) == f_false)) {
if (buffer.string[location->start] == f_eol) return f_none_on_eol;
- width = f_macro_utf_byte_width(buffer.string[location->start]);
+ width = f_macro_utf_byte_width_is(buffer.string[location->start]);
- if (width > 1) {
+ if (width == 0) {
+ width = 1;
+ }
+ // Do not operate on UTF-8 fragments that are not the first byte of the character.
+ else if (width == 1) {
+ return f_status_set_error(f_incomplete_utf);
+ }
+ else {
if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos);
if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop);
}
while (buffer.string[location->start] == placeholder || (isgraph(buffer.string[location->start]) && (status = f_utf_is_space(buffer.string + location->start, max_width)) == f_false)) {
if (buffer.string[location->start] == f_eol) return f_none_on_eol;
- width = f_macro_utf_byte_width(buffer.string[location->start]);
+ width = f_macro_utf_byte_width_is(buffer.string[location->start]);
- if (width > 1) {
+ if (width == 0) {
+ width = 1;
+ }
+ // Do not operate on UTF-8 fragments that are not the first byte of the character.
+ else if (width == 1) {
+ return f_status_set_error(f_incomplete_utf);
+ }
+ else {
if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos);
if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop);
}
max_width = buffer.used - location->start;
}
- width = f_macro_utf_byte_width(buffer.string[location->start]);
+ width = f_macro_utf_byte_width_is(buffer.string[location->start]);
+
+ if (width == 0) {
+ width = 1;
- if (width == 1) {
if (buffer.string[location->start] == f_eol) return f_none_on_eol;
if (seek_width == width) {
if (buffer.string[location->start] == seek_to_this) return f_none;
}
}
+ // Do not operate on UTF-8 fragments that are not the first byte of the character.
+ else if (width == 1) {
+ return f_status_set_error(f_incomplete_utf);
+ }
else {
if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos);
if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop);
max_width = buffer.used - location->start;
}
- width = f_macro_utf_byte_width(buffer.string[location->start]);
+ width = f_macro_utf_byte_width_is(buffer.string[location->start]);
+
+ if (width == 0) {
+ width = 1;
- if (width == 1) {
if (seek_width == width) {
if (buffer.string[location->start] == seek_to_this) return f_none;
}
}
+ // Do not operate on UTF-8 fragments that are not the first byte of the character.
+ else if (width == 1) {
+ return f_status_set_error(f_incomplete_utf);
+ }
else {
if (location->start + width >= buffer.used) return f_status_set_error(f_incomplete_utf_on_eos);
if (location->start + width > location->stop) return f_status_set_error(f_incomplete_utf_on_stop);
* @return
* f_none on success.
* f_no_data if nothing to rip, no allocations or reallocations are performed.
- * f_incomplete_utf_on_eos if end of sting is reached before a complete UTF-8 character can be processed.
* f_invalid_parameter (with error bit) if a parameter is invalid.
* f_allocation_error (with error bit) on memory allocation error.
* f_reallocation_error (with error bit) on memory reallocation error.
* f_none on success.
* f_none_on_eol on success, but stopped at EOL.
* f_none_on_eos on success, but stopped at end of buffer.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
* f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
* f_invalid_parameter (with error bit) if a parameter is invalid.
* f_none_on_eol on success, but stopped at EOL.
* f_none_on_eos on success, but stopped at end of buffer.
* f_none_on_stop on success, but stopped stop location.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
* f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
* f_invalid_parameter (with error bit) if a parameter is invalid.
* f_none on success.
* f_none_on_eol on success, but stopped at EOL.
* f_none_on_eos on success, but stopped at end of buffer.
+ * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
* f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
* f_invalid_parameter (with error bit) if a parameter is invalid.
* f_none on success.
* f_none_on_eos on success, but stopped at end of buffer.
* f_none_on_stop on success, but stopped stop location.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
* f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
* f_invalid_parameter (with error bit) if a parameter is invalid.
* @return
* f_none on success.
* f_none_on_eos on success, but stopped at end of buffer.
+ * f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ * f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
* f_incomplete_utf_on_stop (with error bit) if the stop location is reached before the complete UTF-8 character can be processed.
* f_incomplete_utf_on_eos (with error bit) if end of string is reached before a complete UTF-8 character can be processed.
* f_invalid_parameter (with error bit) if a parameter is invalid.
return f_none;
}
+ if (fl_compare_strings(string, fl_status_string_incomplete_utf, length, fl_status_string_incomplete_utf_length) == f_equal_to) {
+ *code = f_incomplete_utf;
+ return f_none;
+ }
+
+ if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eof, length, fl_status_string_incomplete_utf_on_eof_length) == f_equal_to) {
+ *code = f_incomplete_utf_on_eof;
+ return f_none;
+ }
+
+ if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eol, length, fl_status_string_incomplete_utf_on_eol_length) == f_equal_to) {
+ *code = f_incomplete_utf_on_eol;
+ return f_none;
+ }
+
if (fl_compare_strings(string, fl_status_string_incomplete_utf_on_eos, length, fl_status_string_incomplete_utf_on_eos_length) == f_equal_to) {
*code = f_incomplete_utf_on_eos;
return f_none;