From f1000a0d576666664bbd54cddb8eb5662be92cc7 Mon Sep 17 00:00:00 2001 From: Kevin Day Date: Sun, 8 Sep 2019 22:57:47 -0500 Subject: [PATCH] Update: add a space after "combining" characters and catch a few more invalid UTF-8 sequences Previously, I just printed a space instead of printing the "combining" characters. It occurred to me that I could print a space following a known "combining" character to cause it to combine into a space. This makes things easier to view and still displays the combining character instead of hiding it behind a blank space. The downside is that this might cause problems if someone tried to copy and paste these combined characters. Catch a few more invalid UTF-8 sequences that I came across while making these changes. Fix an existing invalid UTF-8 sequence detection that seems to have been incomplete and incorrect. --- level_3/byte_dump/c/byte_dump.c | 4 ++ level_3/byte_dump/c/private-byte_dump.c | 105 ++++++++++++++++++++++++++------ 2 files changed, 90 insertions(+), 19 deletions(-) diff --git a/level_3/byte_dump/c/byte_dump.c b/level_3/byte_dump/c/byte_dump.c index ff6487f..8c5c723 100644 --- a/level_3/byte_dump/c/byte_dump.c +++ b/level_3/byte_dump/c/byte_dump.c @@ -36,6 +36,10 @@ extern "C" { printf("%c%c", f_string_eol, f_string_eol); + printf("UTF-8 \"Combining\" characters might have a space appended to allow a proper display but this may cause copy and paste issues."); + + printf("%c%c", f_string_eol, f_string_eol); + return f_none; } #endif // _di_byte_dump_print_help_ diff --git a/level_3/byte_dump/c/private-byte_dump.c b/level_3/byte_dump/c/private-byte_dump.c index 27bb179..8dfb181 100644 --- a/level_3/byte_dump/c/private-byte_dump.c +++ b/level_3/byte_dump/c/private-byte_dump.c @@ -137,7 +137,57 @@ invalid[character_current] = width_utf; } // The unicode codes U+D800 to U+DFFF are for "UTF-16 surrogate halves" which are not supported in UTF-8. - else if (width_utf == 3 && characters.string[character_current] > 0xefbfb000 && characters.string[character_current] <= 0xc0ff0000) { + else if (width_utf == 3 && characters.string[character_current] >= 0xeda08000 && characters.string[character_current] <= 0xeda3bf00) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // Common Indic Number Forms, some codes of which are invalid in UTF-8. + else if (width_utf == 3 && characters.string[character_current] >= 0xeaa0ba00 && characters.string[character_current] <= 0xeaa0bf00) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // U+061D, unsupported in UTF-8. + else if (width_utf == 2 && characters.string[character_current] == 0xd89d0000) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // U+0E00, unsupported in UTF-8. + else if (width_utf == 3 && characters.string[character_current] == 0xe0b88000) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // U+0E3B to U+0E3E, unsupported in UTF-8. + else if (width_utf == 3 && characters.string[character_current] >= 0xe0b8bb00 && characters.string[character_current] <= 0xe0b8be00) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // U+0E5C to U+0E7F, unsupported in UTF-8. + else if (width_utf == 3 && characters.string[character_current] >= 0xe0b99c00 && characters.string[character_current] <= 0xe0b9bf00) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // (Thana) U+07B2 to U+07BF, unsupported in UTF-8. + else if (width_utf == 2 && characters.string[character_current] >= 0xdeb20000 && characters.string[character_current] <= 0xdebf0000) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // (Hebrew) U+0590, unsupported in UTF-8. + else if (width_utf == 2 && characters.string[character_current] == 0xd6900000) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // (Hebrew) U+05C8 to U+05CF, unsupported in UTF-8. + else if (width_utf == 2 && characters.string[character_current] >= 0xd7880000 && characters.string[character_current] <= 0xd78f0000) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // (Hebrew) U+05EB to U+05FF, unsupported in UTF-8. + else if (width_utf == 2 && characters.string[character_current] >= 0xd7ab0000 && characters.string[character_current] <= 0xd7bf0000) { + found_invalid_utf = f_true; + invalid[character_current] = width_utf; + } + // Unicode supports nothing above this (U+10FFFF). + else if (width_utf == 4 && characters.string[character_current] > 0xf48fbfbf) { found_invalid_utf = f_true; invalid[character_current] = width_utf; } @@ -517,24 +567,8 @@ // This is an "Overlong Null" and is a valid NULL character. printf("%s", byte_dump_sequence_null); } - else if (width_utf == 2 && characters.string[i] >= 0xcc800000 && characters.string[i] <= 0xcdaf0000) { - // Combining characters should not be combined here, instead display a space. - printf(" "); - } - else if (width_utf == 3 && characters.string[i] >= 0xe1aab000 && characters.string[i] <= 0xe1abbf00) { - // Combining characters should not be combined here, instead display a space. - printf(" "); - } - else if (width_utf == 3 && characters.string[i] >= 0xe1b78000 && characters.string[i] <= 0xe1b7bf00) { - // Combining characters should not be combined here, instead display a space. - printf(" "); - } - else if (width_utf == 3 && characters.string[i] >= 0xe2839000 && characters.string[i] <= 0xe283bf00) { - // Combining characters should not be combined here, instead display a space. - printf(" "); - } - else if (width_utf == 2 && characters.string[i] >= 0xd8900000 && characters.string[i] <= 0xd89a0000) { - // Combining characters should not be combined here, instead display a space. + else if (width_utf == 2 && characters.string[i] == 0xd89d0000) { + // U+061C printf(" "); } else if (width_utf == 2 && characters.string[i] >= 0xc2800000 && characters.string[i] <= 0xc29f0000) { @@ -608,6 +642,39 @@ } } } + + // print a space for combining characters to combine into, thereby allowing it to be safely and readably displayed. + if (width_utf == 2 && characters.string[i] >= 0xdea60000 && characters.string[i] <= 0xdeb00000) { + // Thana combining codes: U+07A6 to U+07B0. + printf(" "); + } + else if (width_utf == 2 && characters.string[i] >= 0xcc800000 && characters.string[i] <= 0xcdaf0000) { + printf(" "); + } + else if (width_utf == 3 && characters.string[i] >= 0xe1aab000 && characters.string[i] <= 0xe1abbf00) { + printf(" "); + } + else if (width_utf == 3 && characters.string[i] >= 0xe1b78000 && characters.string[i] <= 0xe1b7bf00) { + printf(" "); + } + else if (width_utf == 3 && characters.string[i] >= 0xe2839000 && characters.string[i] <= 0xe283bf00) { + printf(" "); + } + else if (width_utf == 2 && characters.string[i] >= 0xd8900000 && characters.string[i] <= 0xd89a0000) { + printf(" "); + } + else if (width_utf == 2 && characters.string[i] >= 0xd98b0000 && characters.string[i] <= 0xd99f0000) { + // Arabic, U+064B to U+065F. + printf(" "); + } + else if (width_utf == 2 && characters.string[i] >= 0xdb960000 && characters.string[i] <= 0xdb9c0000) { + // Arabic, U+06D6 to U+06DC. + printf(" "); + } + else if (width_utf == 2 && characters.string[i] >= 0xd6910000 && characters.string[i] <= 0xd6bd0000) { + // Hebrew, U+0591 to U+05BD. + printf(" "); + } } else { printf("%c", output); -- 1.8.3.1