if (!character_utf) return F_status_set_error(F_parameter);
#endif // _di_level_0_parameter_checking_
- const uint8_t width = macro_f_utf_byte_width_is(*character);
-
- if (!width) {
- *character_utf = macro_f_utf_character_t_from_char_1(character[0]);
-
- return F_none;
- }
- else if (width == 1) {
- return F_status_is_error(F_utf);
- }
-
- if (width > width_max) {
- return F_status_set_error(F_failure);
- }
-
- *character_utf = macro_f_utf_character_t_from_char_1(character[0]);
-
- if (width < 2) {
- return F_none;
- }
-
- *character_utf |= macro_f_utf_character_t_from_char_2(character[1]);
-
- if (width == 2) {
- return F_none;
- }
-
- *character_utf |= macro_f_utf_character_t_from_char_3(character[2]);
-
- if (width == 3) {
- return F_none;
- }
-
- *character_utf |= macro_f_utf_character_t_from_char_4(character[3]);
-
- return F_none;
+ return private_f_utf_char_to_character(character, width_max, character_utf);
}
#endif // _di_f_utf_char_to_character_
}
#endif // _di_f_utf_character_to_char_
+#ifndef _di_f_utf_character_unicode_to_
+  // Decode the UTF-8 sequence held in character into its Unicode codepoint, which is written to unicode.
+  //
+  // Returns F_none on success.
+  // Returns F_parameter (with error bit) if unicode is NULL (only when parameter checking is compiled in).
+  // Returns F_utf (with error bit) if private_f_utf_character_is_valid() rejects the character.
+  f_status_t f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode) {
+    #ifndef _di_level_0_parameter_checking_
+      if (!unicode) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    const uint8_t width = macro_f_utf_character_t_width(character);
+
+    if (private_f_utf_character_is_valid(character, width) == F_false) {
+      return F_status_set_error(F_utf);
+    }
+
+    // Each branch strips the UTF-8 marker bits from every byte and packs the remaining payload bits together.
+    if (width < 2) {
+
+      // U+0000 -> U+007F
+      *unicode = macro_f_utf_character_t_to_char_1(character) & 0x7f;
+    }
+    else if (width == 2) {
+
+      // U+0080 -> U+07FF
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x1f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_2(character) & 0x3f;
+    }
+    else if (width == 3) {
+
+      // U+0800 -> U+FFFF
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0xf) << 12;
+      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_3(character) & 0x3f;
+    }
+    else if (width == 4) {
+
+      // U+10000 -> U+10FFFF
+      *unicode = (macro_f_utf_character_t_to_char_1(character) & 0x7) << 18;
+      *unicode |= (macro_f_utf_character_t_to_char_2(character) & 0x3f) << 12;
+
+      // Bits 6..11 come from the third byte (the second byte was already consumed above).
+      *unicode |= (macro_f_utf_character_t_to_char_3(character) & 0x3f) << 6;
+      *unicode |= macro_f_utf_character_t_to_char_4(character) & 0x3f;
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_character_unicode_to_
+
+#ifndef _di_f_utf_character_unicode_from_
+  // Encode the Unicode codepoint unicode as a UTF-8 sequence packed into the integer pointed to by character.
+  //
+  // Returns F_none on success.
+  // Returns F_parameter (with error bit) if character is NULL (only when parameter checking is compiled in).
+  // Returns F_utf (with error bit) if unicode is greater than U+10FFFF.
+  //
+  // NOTE(review): sequences shorter than 4 bytes are stored in the low-order bytes of *character, while the
+  // 4-byte form fills all 32 bits; confirm this matches how macro_f_utf_character_t_to_char_N() locate bytes.
+  f_status_t f_utf_character_unicode_from(const uint32_t unicode, f_utf_character_t *character) {
+    #ifndef _di_level_0_parameter_checking_
+      if (!character) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    if (unicode > 0x10ffff) {
+      return F_status_set_error(F_utf);
+    }
+
+    if (unicode < 0x80) {
+
+      // U+0000 -> U+007F
+      *character = unicode;
+    }
+    else if (unicode < 0x800) {
+
+      // U+0080 -> U+07FF (110xxxxx 10xxxxxx)
+      *character = (unicode & 0x7c0) << 2;
+      *character |= unicode & 0x3f;
+      *character |= 0xc080;
+    }
+    else if (unicode < 0x10000) {
+
+      // U+0800 -> U+FFFF (1110xxxx 10xxxxxx 10xxxxxx)
+      *character = (unicode & 0xf000) << 4;
+      *character |= (unicode & 0xfc0) << 2;
+      *character |= unicode & 0x3f;
+      *character |= 0xe08080;
+    }
+    else {
+
+      // U+10000 -> U+10FFFF (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+      *character = (unicode & 0x1c0000) << 6;
+      *character |= (unicode & 0x3f000) << 4;
+      *character |= (unicode & 0xfc0) << 2;
+      *character |= unicode & 0x3f;
+
+      // The lead byte of a 4-byte sequence is marked 0xf0; 0xe0 would mark it as a 3-byte lead.
+      *character |= 0xf0808080;
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_character_unicode_from_
+
#ifndef _di_f_utf_is_big_endian_
f_status_t f_utf_is_big_endian() {
uint16_t test_int = (0x01 << 8) | 0x02;
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
f_utf_character_t character_utf = 0;
{
- const f_status_t status = f_utf_char_to_character(character, width_max, &character_utf);
+ const f_status_t status = private_f_utf_char_to_character(character, width_max, &character_utf);
if (F_status_is_error(status)) return status;
}
}
#endif // _di_f_utf_string_seek_to_
+#ifndef _di_f_utf_unicode_to_
+  // Decode the UTF-8 byte sequence starting at character[0] into its Unicode codepoint, written to unicode.
+  //
+  // Returns F_none on success.
+  // Returns F_parameter (with error bit) if width_max is below 1 or unicode is NULL (only when parameter checking is compiled in).
+  // Returns the error from private_f_utf_char_to_character() if that conversion fails.
+  // Returns F_utf (with error bit) if private_f_utf_character_is_valid() rejects the sequence.
+  f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+      if (!unicode) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    const uint8_t width = macro_f_utf_byte_width(*character);
+
+    // Validate the sequence before decoding; the packed form is only needed for this check.
+    {
+      f_utf_character_t sequence = 0;
+
+      const f_status_t converted = private_f_utf_char_to_character(character, width_max, &sequence);
+      if (F_status_is_error(converted)) return converted;
+
+      if (private_f_utf_character_is_valid(sequence, width) == F_false) {
+        return F_status_set_error(F_utf);
+      }
+    }
+
+    // @fixme the code here needs to be reviewed for endianess accuracy for both big and little endian.
+    switch (width) {
+      case 1:
+
+        // U+0000 -> U+007F
+        *unicode = character[0] & 0x7f;
+        break;
+
+      case 2:
+
+        // U+0080 -> U+07FF
+        *unicode = ((character[0] & 0x1f) << 6) | (character[1] & 0x3f);
+        break;
+
+      case 3:
+
+        // U+0800 -> U+FFFF
+        *unicode = ((character[0] & 0xf) << 12) | ((character[1] & 0x3f) << 6) | (character[2] & 0x3f);
+        break;
+
+      case 4:
+
+        // U+10000 -> U+10FFFF
+        *unicode = ((character[0] & 0x7) << 18) | ((character[1] & 0x3f) << 12) | ((character[2] & 0x3f) << 6) | (character[3] & 0x3f);
+        break;
+
+      default:
+
+        // Any other width leaves unicode untouched.
+        break;
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_unicode_to_
+
+#ifndef _di_f_utf_unicode_from_
+  // Encode the Unicode codepoint unicode as UTF-8 bytes written into the buffer referenced by *character.
+  //
+  // At most width_max bytes (and never more than 4) are written; bytes after the encoded sequence, up to
+  // index 3 and within width_max, are set to 0.
+  //
+  // Returns F_none on success.
+  // Returns F_parameter (with error bit) if width_max is below 1 or character is NULL (only when parameter checking is compiled in).
+  // Returns F_utf (with error bit) if unicode is greater than U+10FFFF or width_max is smaller than the encoded width.
+  f_status_t f_utf_unicode_from(const uint32_t unicode, const f_array_length_t width_max, f_string_t *character) {
+    #ifndef _di_level_0_parameter_checking_
+      if (width_max < 1) return F_status_set_error(F_parameter);
+
+      // The output pointer is what must be non-NULL; a unicode value of 0 (U+0000) is a valid codepoint.
+      if (!character) return F_status_set_error(F_parameter);
+    #endif // _di_level_0_parameter_checking_
+
+    // @fixme the code here needs to be reviewed for endianess accuracy for both big and little endian.
+    if (unicode > 0x10ffff) {
+      return F_status_set_error(F_utf);
+    }
+
+    if (unicode < 0x80) {
+
+      // U+0000 -> U+007F
+      (*character)[0] = (char) unicode;
+
+      if (width_max > 1) {
+        (*character)[1] = 0;
+
+        if (width_max > 2) {
+          (*character)[2] = 0;
+
+          if (width_max > 3) {
+            (*character)[3] = 0;
+          }
+        }
+      }
+    }
+    else if (unicode < 0x800) {
+      if (width_max < 2) {
+        return F_status_set_error(F_utf);
+      }
+
+      // U+0080 -> U+07FF
+      (*character)[0] = f_utf_byte_2 | ((char) ((unicode & 0x7c0) >> 6));
+      (*character)[1] = f_utf_byte_1 | ((char) (unicode & 0x3f));
+
+      if (width_max > 2) {
+        (*character)[2] = 0;
+
+        if (width_max > 3) {
+          (*character)[3] = 0;
+        }
+      }
+    }
+    else if (unicode < 0x10000) {
+      if (width_max < 3) {
+        return F_status_set_error(F_utf);
+      }
+
+      // U+0800 -> U+FFFF
+      (*character)[0] = f_utf_byte_3 | ((char) ((unicode & 0xf000) >> 12));
+      (*character)[1] = f_utf_byte_1 | ((char) ((unicode & 0xfc0) >> 6));
+      (*character)[2] = f_utf_byte_1 | ((char) (unicode & 0x3f));
+
+      if (width_max > 3) {
+
+        // Index the buffer itself, not the pointer-to-buffer parameter.
+        (*character)[3] = 0;
+      }
+    }
+    else {
+      if (width_max < 4) {
+        return F_status_set_error(F_utf);
+      }
+
+      // U+10000 -> U+10FFFF
+      (*character)[0] = f_utf_byte_4 | ((char) ((unicode & 0x1c0000) >> 18));
+      (*character)[1] = f_utf_byte_1 | ((char) ((unicode & 0x3f000) >> 12));
+      (*character)[2] = f_utf_byte_1 | ((char) ((unicode & 0xfc0) >> 6));
+      (*character)[3] = f_utf_byte_1 | ((char) (unicode & 0x3f));
+    }
+
+    return F_none;
+  }
+#endif // _di_f_utf_unicode_from_
+
#ifdef __cplusplus
} // extern "C"
#endif
#endif // _di_f_utf_character_to_char_
/**
+ * Convert a given (UTF-8) character into Unicode.
+ *
+ * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged.
+ * The Unicode is a 32-bit integer representing the Unicode (such as U+0001).
+ * The Unicode does not need to be interpreted like UTF-8; it is simply a number ranging from 0 up to the maximum supported Unicode value (U+10FFFF).
+ *
+ * @param character
+ * The (UTF-8) character.
+ * @param unicode
+ * The Unicode number.
+ *
+ * @return
+ * F_none on success.
+ *
+ * F_parameter (with error bit) if a parameter is invalid.
+ * F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is_valid()
+ */
+#ifndef _di_f_utf_character_unicode_to_
+ extern f_status_t f_utf_character_unicode_to(const f_utf_character_t character, uint32_t *unicode);
+#endif // _di_f_utf_character_unicode_to_
+
+/**
+ * Convert a given Unicode into (UTF-8) character.
+ *
+ * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged.
+ * The Unicode is a 32-bit integer representing the Unicode (such as U+0001).
+ * The Unicode does not need to be interpreted like UTF-8; it is simply a number ranging from 0 up to the maximum supported Unicode value (U+10FFFF).
+ *
+ * @param unicode
+ * The Unicode number.
+ * @param character
+ * The (UTF-8) character.
+ *
+ * @return
+ * F_none on success.
+ *
+ * F_parameter (with error bit) if a parameter is invalid.
+ * F_utf (with error bit) if unicode is an invalid Unicode character.
+ */
+#ifndef _di_f_utf_character_unicode_from_
+ extern f_status_t f_utf_character_unicode_from(const uint32_t unicode, f_utf_character_t *character);
+#endif // _di_f_utf_character_unicode_from_
+
+/**
* Helper function for UTF-8 processing code to determine endianess of the system.
*
* @todo relocate this outside of f_utf into a more general path, perhaps f_memory (f_memory_is_big_endian).
extern f_status_t f_utf_string_seek_to(const f_utf_string_t string, const uint8_t seek_to, f_utf_string_range_t *range);
#endif // _di_f_utf_string_seek_to_
+/**
+ * Convert a given string block representing a single character into Unicode.
+ *
+ * The f_utf_character_t is a 32-bit integer containing UTF-8 sequences, unchanged.
+ * The Unicode is a 32-bit integer representing the Unicode (such as U+0001).
+ * The Unicode does not need to be interpreted like UTF-8; it is simply a number ranging from 0 up to the maximum supported Unicode value (U+10FFFF).
+ *
+ * @param character
+ * The (UTF-8) character to convert to the Unicode representation.
+ * @param width_max
+ * The max width available for representing the UTF-8 character.
+ * There must be enough space in the character buffer to handle the Unicode width.
+ * It is recommended to always have 4 characters (4 uint8_t) of space available in character.
+ * @param unicode
+ * The Unicode number.
+ *
+ * @return
+ * F_none on success.
+ *
+ * F_failure (with error bit) if width is not long enough to convert.
+ * F_parameter (with error bit) if a parameter is invalid.
+ * F_utf (with error bit) if character is an invalid UTF-8 character.
+ *
+ * @see f_utf_character_is_valid()
+ */
+#ifndef _di_f_utf_unicode_to_
+ extern f_status_t f_utf_unicode_to(const f_string_t character, const f_array_length_t width_max, uint32_t *unicode);
+#endif // _di_f_utf_unicode_to_
+
+/**
+ * Convert a given Unicode into a string block representing a single character.
+ *
+ * The string block is a sequence of bytes holding the UTF-8 encoding of a single character.
+ * The Unicode is a 32-bit integer representing the Unicode (such as U+0001).
+ * The Unicode does not need to be interpreted like UTF-8; it is simply a number ranging from 0 up to the maximum supported Unicode value (U+10FFFF).
+ *
+ * @param character
+ * The (UTF-8) character.
+ * @param width_max
+ * The max width available for representing the UTF-8 character.
+ * There must be enough space in the character buffer to handle the Unicode width.
+ * It is recommended to always have 4 characters (4 uint8_t) of space available in character.
+ * @param unicode
+ * The Unicode number.
+ *
+ * @return
+ * F_none on success.
+ *
+ * F_failure (with error bit) if width is not long enough to convert.
+ * F_parameter (with error bit) if a parameter is invalid.
+ * F_utf (with error bit) if unicode is an invalid Unicode character.
+ */
+#ifndef _di_f_utf_unicode_from_
+ extern f_status_t f_utf_unicode_from(const uint32_t unicode, const f_array_length_t width_max, f_string_t *character);
+#endif // _di_f_utf_unicode_from_
+
#ifdef __cplusplus
} // extern "C"
#endif