Kevux Git Server - fll/commitdiff
Feature: add word, dash, and plus UTF-8 character checker
author Kevin Day <thekevinday@gmail.com>
Sat, 2 May 2020 05:10:23 +0000 (00:10 -0500)
committer Kevin Day <thekevinday@gmail.com>
Sat, 2 May 2020 05:10:23 +0000 (00:10 -0500)
The use of the '+' operator is more common than in the past.
Add support for detecting if a character is a word character, a dash character, or a plus character.

level_0/f_utf/c/private-utf.c
level_0/f_utf/c/private-utf.h
level_0/f_utf/c/utf.c
level_0/f_utf/c/utf.h

index 752a77bccb652b53cbec09db4a60144d480f9d93..0529a99c558748130648acce1bd257fc75148680 100644 (file)
@@ -2470,6 +2470,15 @@ extern "C" {
   }
 #endif // !defined(_di_f_utf_character_is_word_dash_) || !defined(_di_f_utf_is_word_dash_)
 
+#if !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_)
+  f_return_status private_f_utf_character_is_word_dash_plus(const f_utf_character character, const uint8_t width) {
+    // Placeholder: neither character nor width is examined yet, so every value passed in is reported as f_false.
+    // @todo: handle all Unicode "word_dash_plus".
+
+    return f_false;
+  }
+#endif // !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_)
+
 #if !defined(_di_f_utf_character_is_zero_width_) || !defined(_di_f_utf_is_zero_width_)
   f_return_status private_f_utf_character_is_zero_width(const f_utf_character character) {
     // reduce the number of checks by grouping checks by first byte.
index e89c277b4d22a2392b37298c4e11282275426a86..8a2afb46c467269acf66f43578834d59da6a22b9 100644 (file)
@@ -218,6 +218,29 @@ extern "C" {
 #endif // !defined(_di_f_utf_character_is_word_dash_) || !defined(_di_f_utf_is_word_dash_)
 
 /**
+ * Private implementation of f_utf_character_is_word_dash_plus().
+ *
+ * Intended to be shared to each of the different implementation variations.
+ *
+ * @param character
+ *   The character to validate.
+ * @param width
+ *   The number of bytes representing the character width.
+ *
+ * @return
+ *   f_true if a UTF-8 word, dash, or plus character.
+ *   f_false if not a UTF-8 word, dash, or plus character.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see isalnum()
+ * @see f_utf_character_is_word_dash_plus()
+ * @see f_utf_is_word_dash_plus()
+ */
+#if !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_)
+  extern f_return_status private_f_utf_character_is_word_dash_plus(const f_utf_character character, const uint8_t width) f_gcc_attribute_visibility_internal;
+#endif // !defined(_di_f_utf_character_is_word_dash_plus_) || !defined(_di_f_utf_is_word_dash_plus_)
+
+/**
  * Private implementation of f_utf_character_is_zero_width().
  *
  * Intended to be shared to each of the different implementation variations.
index 5573aea4ed2e403a80153f63dadc1a64a6cc68a9..3e82b6c0a48bd86be58595ae087a515cd49e346f 100644 (file)
@@ -237,6 +237,26 @@ extern "C" {
   }
 #endif // _di_f_utf_character_is_word_dash_
 
+#ifndef _di_f_utf_character_is_word_dash_plus_
+  f_return_status f_utf_character_is_word_dash_plus(const f_utf_character character) {
+    unsigned short width = f_macro_utf_character_width_is(character);
+
+    if (width == 0) {
+      // Single-byte case: compare the extracted first byte (the same value isalnum() is given), not the whole f_utf_character.
+      const uint8_t byte = f_macro_utf_character_to_char_1(character);
+
+      if (isalnum(byte) || byte == '_' || byte == '-' || byte == '+') return f_true;
+
+      return f_false;
+    }
+
+    // A width of 1 is rejected: hand back f_invalid_utf with the error bit set (f_status_set_error), not a test of that bit (f_status_is_error).
+    if (width == 1) return f_status_set_error(f_invalid_utf);
+
+    return private_f_utf_character_is_word_dash_plus(character, width);
+  }
+#endif // _di_f_utf_character_is_word_dash_plus_
+
 #ifndef _di_f_utf_character_is_zero_width_
   f_return_status f_utf_character_is_zero_width(const f_utf_character character) {
     if (f_macro_utf_character_width_is(character) == 1) {
@@ -678,6 +698,40 @@ extern "C" {
   }
 #endif // _di_f_utf_is_word_dash_
 
+#ifndef _di_f_utf_is_word_dash_plus_
+  f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return f_status_set_error(f_invalid_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    const uint8_t width = f_macro_utf_byte_width_is(*character);
+
+    if (width == 0) {
+      if (isalnum(*character) || *character == '_' || *character == '-' || *character == '+') {
+        return f_true;
+      }
+
+      return f_false;
+    }
+
+    // A width of 1 is reported as an incomplete fragment: return f_incomplete_utf with the error bit set
+    // (f_status_set_error), not the result of testing for that bit (f_status_is_error).
+    if (width == 1) return f_status_set_error(f_incomplete_utf);
+
+    f_utf_character character_utf = 0;
+
+    {
+      // Build a single f_utf_character from the bytes at character, reading no more than width_max of them.
+      const f_status status = f_utf_char_to_character(character, width_max, &character_utf);
+
+      // Any status other than f_none is passed straight back to the caller.
+      if (status != f_none) return status;
+    }
+
+    return private_f_utf_character_is_word_dash_plus(character_utf, width);
+  }
+#endif // _di_f_utf_is_word_dash_plus_
+
 #ifndef _di_f_utf_is_zero_width_
   f_return_status f_utf_is_zero_width(const f_string character, const f_string_length width_max) {
     #ifndef _di_level_0_parameter_checking_
index 1acf35466f4808f23de9e0af1a9711ef9256e663..a4f9ad0e8dc121fd3c1fa4fa46c639a07145ab64 100644 (file)
@@ -749,6 +749,28 @@ extern "C" {
 #endif // _di_f_utf_character_is_word_dash_
 
 /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character.
+ *
+ * A word dash plus character is alpha-numeric, an underscore '_', a dash '-', or a plus '+'.
+ *
+ * @todo Incomplete, UTF-8 codes not yet checked!
+ *
+ * @param character
+ *   The character to validate.
+ *
+ * @return
+ *   f_true if a UTF-8 word, dash, or plus character.
+ *   f_false if not a UTF-8 word, dash, or plus character.
+ *   f_invalid_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see isalnum()
+ * @see f_utf_is_word_dash_plus()
+ */
+#ifndef _di_f_utf_character_is_word_dash_plus_
+  extern f_return_status f_utf_character_is_word_dash_plus(const f_utf_character character);
+#endif // _di_f_utf_character_is_word_dash_plus_
+
+/**
  * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character.
  *
  * Only characters that do not print, which are generally called zero-width.
@@ -1143,6 +1165,32 @@ extern "C" {
 #endif // _di_f_utf_is_word_dash_
 
 /**
+ * Check to see if the entire byte block of the character is an ASCII or UTF-8 word, dash, or plus character.
+ *
+ * A word dash plus character is alpha-numeric, an underscore '_', a dash '-', or a plus '+'.
+ *
+ * @todo Incomplete, UTF-8 codes not yet checked!
+ *
+ * @param character
+ *   The character to validate.
+ *   There must be enough space allocated to compare against, as limited by width_max.
+ * @param width_max
+ *   The maximum width available for checking.
+ *   Can be anything greater than 0.
+ *
+ * @return
+ *   f_true if a UTF-8 word, dash, or plus character.
+ *   f_false if not a UTF-8 word, dash, or plus character.
+ *   f_incomplete_utf (with error bit) if character is an incomplete UTF-8 fragment.
+ *
+ * @see isalnum()
+ * @see f_utf_character_is_word_dash_plus()
+ */
+#ifndef _di_f_utf_is_word_dash_plus_
+  extern f_return_status f_utf_is_word_dash_plus(const f_string character, const f_string_length width_max);
+#endif // _di_f_utf_is_word_dash_plus_
+
+/**
  * Check to see if the entire byte block of the character is an ASCII or UTF-8 general non-printing character.
  *
  * Only characters that do not print, which are generally called zero-width.