From f1000a0d576666664bbd54cddb8eb5662be92cc7 Mon Sep 17 00:00:00 2001
From: Kevin Day <thekevinday@gmail.com>
Date: Sun, 8 Sep 2019 22:57:47 -0500
Subject: [PATCH] Update: add a space after "combining" characters and catch a
 few more invalid UTF-8 sequences

Previously, I just printed a space instead of printing the "combining" characters.
It occurred to me that I could print a space following a known "combining" character so that it combines with that space.
This makes things easier to view and still displays the combining character instead of hiding it behind a blank space.
The downside is that this might cause problems if someone tries to copy and paste these combined characters.

Catch a few more invalid UTF-8 sequences that I came across while making these changes.
Fix an existing invalid UTF-8 sequence detection that seems to have been incomplete and incorrect.
---
 level_3/byte_dump/c/byte_dump.c         |   4 +
 level_3/byte_dump/c/private-byte_dump.c | 105 +++++++++++++++++++-----
 2 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/level_3/byte_dump/c/byte_dump.c b/level_3/byte_dump/c/byte_dump.c
index ff6487fea..8c5c7237a 100644
--- a/level_3/byte_dump/c/byte_dump.c
+++ b/level_3/byte_dump/c/byte_dump.c
@@ -36,6 +36,10 @@ extern "C" {
 
     printf("%c%c", f_string_eol, f_string_eol);
 
+    printf("UTF-8 \"Combining\" characters might have a space appended to allow a proper display but this may cause copy and paste issues.");
+
+    printf("%c%c", f_string_eol, f_string_eol);
+
     return f_none;
   }
 #endif // _di_byte_dump_print_help_
diff --git a/level_3/byte_dump/c/private-byte_dump.c b/level_3/byte_dump/c/private-byte_dump.c
index 27bb179f5..8dfb18164 100644
--- a/level_3/byte_dump/c/private-byte_dump.c
+++ b/level_3/byte_dump/c/private-byte_dump.c
@@ -137,7 +137,57 @@
         invalid[character_current] = width_utf;
       }
       // The unicode codes U+D800 to U+DFFF are for "UTF-16 surrogate halves" which are not supported in UTF-8.
-      else if (width_utf == 3 && characters.string[character_current] > 0xefbfb000 && characters.string[character_current] <= 0xc0ff0000) {
+      else if (width_utf == 3 && characters.string[character_current] >= 0xeda08000 && characters.string[character_current] <= 0xedbfbf00) {
+        found_invalid_utf = f_true; // 0xeda080 encodes U+D800 and 0xedbfbf encodes U+DFFF (0xeda3bf would stop at U+D8FF).
+        invalid[character_current] = width_utf;
+      }
+      // Common Indic Number Forms, U+A83A to U+A83F, unsupported in UTF-8.
+      else if (width_utf == 3 && characters.string[character_current] >= 0xeaa0ba00 && characters.string[character_current] <= 0xeaa0bf00) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // U+061D, unsupported in UTF-8.
+      else if (width_utf == 2 && characters.string[character_current] == 0xd89d0000) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // U+0E00, unsupported in UTF-8.
+      else if (width_utf == 3 && characters.string[character_current] == 0xe0b88000) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // U+0E3B to U+0E3E, unsupported in UTF-8.
+      else if (width_utf == 3 && characters.string[character_current] >= 0xe0b8bb00 && characters.string[character_current] <= 0xe0b8be00) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // U+0E5C to U+0E7F, unsupported in UTF-8.
+      else if (width_utf == 3 && characters.string[character_current] >= 0xe0b99c00 && characters.string[character_current] <= 0xe0b9bf00) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // (Thaana) U+07B2 to U+07BF, unsupported in UTF-8.
+      else if (width_utf == 2 && characters.string[character_current] >= 0xdeb20000 && characters.string[character_current] <= 0xdebf0000) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // (Hebrew) U+0590, unsupported in UTF-8.
+      else if (width_utf == 2 && characters.string[character_current] == 0xd6900000) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // (Hebrew) U+05C8 to U+05CF, unsupported in UTF-8.
+      else if (width_utf == 2 && characters.string[character_current] >= 0xd7880000 && characters.string[character_current] <= 0xd78f0000) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // (Hebrew) U+05EB to U+05EE and U+05F5 to U+05FF, unsupported in UTF-8 (U+05EF to U+05F4 are assigned and must not be flagged).
+      else if (width_utf == 2 && ((characters.string[character_current] >= 0xd7ab0000 && characters.string[character_current] <= 0xd7ae0000) || (characters.string[character_current] >= 0xd7b50000 && characters.string[character_current] <= 0xd7bf0000))) {
+        found_invalid_utf = f_true;
+        invalid[character_current] = width_utf;
+      }
+      // Unicode supports nothing above this (U+10FFFF).
+      else if (width_utf == 4 && characters.string[character_current] > 0xf48fbfbf) {
         found_invalid_utf = f_true;
         invalid[character_current] = width_utf;
       }
@@ -517,24 +567,8 @@
         // This is an "Overlong Null" and is a valid NULL character.
         printf("%s", byte_dump_sequence_null);
       }
-      else if (width_utf == 2 && characters.string[i] >= 0xcc800000 && characters.string[i] <= 0xcdaf0000) {
-        // Combining characters should not be combined here, instead display a space.
-        printf(" ");
-      }
-      else if (width_utf == 3 && characters.string[i] >= 0xe1aab000 && characters.string[i] <= 0xe1abbf00) {
-        // Combining characters should not be combined here, instead display a space.
-        printf(" ");
-      }
-      else if (width_utf == 3 && characters.string[i] >= 0xe1b78000 && characters.string[i] <= 0xe1b7bf00) {
-        // Combining characters should not be combined here, instead display a space.
-        printf(" ");
-      }
-      else if (width_utf == 3 && characters.string[i] >= 0xe2839000 && characters.string[i] <= 0xe283bf00) {
-        // Combining characters should not be combined here, instead display a space.
-        printf(" ");
-      }
-      else if (width_utf == 2 && characters.string[i] >= 0xd8900000 && characters.string[i] <= 0xd89a0000) {
-        // Combining characters should not be combined here, instead display a space.
+      else if (width_utf == 2 && characters.string[i] == 0xd89c0000) {
+        // U+061C (Arabic Letter Mark) is encoded as 0xd89c; 0xd89d would be U+061D.
         printf(" ");
       }
       else if (width_utf == 2 && characters.string[i] >= 0xc2800000 && characters.string[i] <= 0xc29f0000) {
@@ -608,6 +642,39 @@
             }
           }
         }
+
+        // Print a space after each combining character for it to combine into, thereby allowing it to be safely and readably displayed.
+        if (width_utf == 2 && characters.string[i] >= 0xdea60000 && characters.string[i] <= 0xdeb00000) {
+          // Thaana combining codes: U+07A6 to U+07B0.
+          printf(" ");
+        }
+        else if (width_utf == 2 && characters.string[i] >= 0xcc800000 && characters.string[i] <= 0xcdaf0000) {
+          printf(" ");
+        }
+        else if (width_utf == 3 && characters.string[i] >= 0xe1aab000 && characters.string[i] <= 0xe1abbf00) {
+          printf(" ");
+        }
+        else if (width_utf == 3 && characters.string[i] >= 0xe1b78000 && characters.string[i] <= 0xe1b7bf00) {
+          printf(" ");
+        }
+        else if (width_utf == 3 && characters.string[i] >= 0xe2839000 && characters.string[i] <= 0xe283bf00) {
+          printf(" ");
+        }
+        else if (width_utf == 2 && characters.string[i] >= 0xd8900000 && characters.string[i] <= 0xd89a0000) {
+          printf(" ");
+        }
+        else if (width_utf == 2 && characters.string[i] >= 0xd98b0000 && characters.string[i] <= 0xd99f0000) {
+          // Arabic, U+064B to U+065F.
+          printf(" ");
+        }
+        else if (width_utf == 2 && characters.string[i] >= 0xdb960000 && characters.string[i] <= 0xdb9c0000) {
+          // Arabic, U+06D6 to U+06DC.
+          printf(" ");
+        }
+        else if (width_utf == 2 && characters.string[i] >= 0xd6910000 && characters.string[i] <= 0xd6bd0000) {
+          // Hebrew, U+0591 to U+05BD.
+          printf(" ");
+        }
       }
       else {
         printf("%c", output);
-- 
2.52.0