From 8a78b401da8275492c69096bff53a3fcb6470587 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Mon, 8 Nov 2021 20:36:22 -0600 Subject: [PATCH] Update: Experimentally cast characters to uint8_t in UTF-8 processing code. I've noticed with calls like printf("%d", string[0]) the printed number might be a huge negative number. Explicitly casting it to uint8_t (rather than char) seems to be a way to avoid this and allow for the number to be better printed. I suspect that this should be done in general rather than just to the print functions. This is an experimental commit designed to make it so. This also has an effect on bitwise operations because bitwise shifts respond differently between signed and unsigned integers. --- level_0/f_utf/c/utf.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/level_0/f_utf/c/utf.c b/level_0/f_utf/c/utf.c index 984246f..6081392 100644 --- a/level_0/f_utf/c/utf.c +++ b/level_0/f_utf/c/utf.c @@ -2005,28 +2005,28 @@ extern "C" { if (width == 1) { // U+0000 -> U+007F - *unicode = character[0] & 0x7f; + *unicode = ((uint8_t) character[0]) & 0x7f; } else if (width == 2) { // U+0080 -> U+07FF - *unicode = (character[0] & 0x1f) << 6; - *unicode |= character[1] & 0x3f; + *unicode = (((uint8_t) character[0]) & 0x1f) << 6; + *unicode |= ((uint8_t) character[1]) & 0x3f; } else if (width == 3) { // U+0800 -> U+FFFF - *unicode = (character[0] & 0xf) << 12; - *unicode |= (character[1] & 0x3f) << 6; - *unicode |= character[2] & 0x3f; + *unicode = (((uint8_t) character[0]) & 0xf) << 12; + *unicode |= (((uint8_t) character[1]) & 0x3f) << 6; + *unicode |= ((uint8_t) character[2]) & 0x3f; } else if (width == 4) { // U+10000 -> U+10FFFF - *unicode = (character[0] & 0x7) << 18; - *unicode |= (character[1] & 0x3f) << 12; - *unicode |= (character[2] & 0x3f) << 6; - *unicode |= character[3] & 0x3f; + *unicode = (((uint8_t) character[0]) & 0x7) << 18; + *unicode |= (((uint8_t) character[1]) & 0x3f) << 
12; + *unicode |= (((uint8_t) character[2]) & 0x3f) << 6; + *unicode |= ((uint8_t) character[3]) & 0x3f; } return F_none; @@ -2048,7 +2048,7 @@ extern "C" { if (unicode < 0x80) { // U+0000 -> U+007F - (*character)[0] = (char) unicode; + (*character)[0] = (uint8_t) unicode; if (width_max > 1) { (*character)[1] = 0; @@ -2068,8 +2068,8 @@ extern "C" { } // U+0080 -> U+07FF - (*character)[0] = F_utf_byte_2_d | ((char) ((unicode & 0x7c0) >> 6)); - (*character)[1] = F_utf_byte_1_d | ((char) (unicode & 0x3f)); + (*character)[0] = F_utf_byte_2_d | ((uint8_t) ((unicode & 0x7c0) >> 6)); + (*character)[1] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); if (width_max > 2) { (*character)[2] = 0; @@ -2085,9 +2085,9 @@ extern "C" { } // U+0800 -> U+FFFF - (*character)[0] = F_utf_byte_3_d | ((char) ((unicode & 0xf000) >> 12)); - (*character)[1] = F_utf_byte_1_d | ((char) ((unicode & 0xfc0) >> 6)); - (*character)[2] = F_utf_byte_1_d | ((char) (unicode & 0x3f)); + (*character)[0] = F_utf_byte_3_d | ((uint8_t) ((unicode & 0xf000) >> 12)); + (*character)[1] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0xfc0) >> 6)); + (*character)[2] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); if (width_max > 3) { character[3] = 0; @@ -2099,10 +2099,10 @@ extern "C" { } // U+10000 -> U+10FFFF - (*character)[0] = F_utf_byte_4_d | ((char) ((unicode & 0x1c0000) >> 18)); - (*character)[1] = F_utf_byte_1_d | ((char) ((unicode & 0x3f000) >> 12)); - (*character)[2] = F_utf_byte_1_d | ((char) ((unicode & 0xfc0) >> 6)); - (*character)[3] = F_utf_byte_1_d | ((char) (unicode & 0x3f)); + (*character)[0] = F_utf_byte_4_d | ((uint8_t) ((unicode & 0x1c0000) >> 18)); + (*character)[1] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0x3f000) >> 12)); + (*character)[2] = F_utf_byte_1_d | ((uint8_t) ((unicode & 0xfc0) >> 6)); + (*character)[3] = F_utf_byte_1_d | ((uint8_t) (unicode & 0x3f)); } return F_none; -- 1.8.3.1