Initial UTF-8 helpers

2025-07-13 15:57:36 +02:00 · 2017-10-19 13:29:57 +00:00 · 2017-10-19 13:29:57 +00:00 · ae3aa1941d
commit ae3aa1941d
parent 4b2b866ba7
1 changed files with 102 additions and 0 deletions
--- a/runtime.c
+++ b/runtime.c
@ -6364,3 +6364,105 @@ void Cyc_io_read_token(void *data, object cont, object port)
  }
 }

+////////////// UTF-8 Section //////////////
+
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 1
+/**
+ * @brief Lookup table driving the UTF-8 decoder state machine.
+ *
+ * The first 256 entries map every possible byte value to a character class.
+ * The entries after that form the transition table: it is indexed with
+ * 256 + (state x 16) + class and yields the next state.
+ * UTF8_ACCEPT (0) and UTF8_REJECT (1) are two of the state values it holds;
+ * every transition out of the UTF8_REJECT row leads back to UTF8_REJECT.
+ */
+static const uint8_t utf8d[] = {
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+/**
+ * @brief Feed a single byte to the UTF-8 decoder state machine.
+ *
+ * Declared `static inline`: a plain `inline` definition has external linkage
+ * but supplies no external definition in C99/C11, so any call the compiler
+ * chooses not to inline would fail at link time.
+ *
+ * @param state  decoder state, updated in place; start with UTF8_ACCEPT
+ * @param codep  code point being assembled; holds a complete code point
+ *               whenever the returned state is UTF8_ACCEPT
+ * @param byte   next input byte; must be in the range 0..255 because it is
+ *               used directly as an index into the byte-class table
+ * @return the new value of *state (UTF8_ACCEPT, UTF8_REJECT, or an
+ *         intermediate state meaning more bytes are needed)
+ */
+static inline uint32_t
+decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
+  uint32_t type = utf8d[byte];
+
+  // Mid-sequence: append the low six payload bits of a continuation byte.
+  // Start of a sequence: the class number tells how many lead bits to mask.
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :
+    (0xff >> type) & (byte);
+
+  *state = utf8d[256 + *state*16 + type];
+  return *state;
+}
+// END Bjoern Hoehrmann
+
+/**
+ * @brief
+ * Walk a NUL-terminated byte string and report how many code points it holds.
+ * Derived from sample code by Bjoern Hoehrmann.
+ *
+ * @param s      NUL-terminated bytes to scan
+ * @param count  receives the number of code points that decoded completely
+ * @return 0 if the decoder finished in UTF8_ACCEPT, 1 otherwise
+ */
+int countCodePoints(uint8_t* s, size_t* count) {
+  uint32_t dfa = UTF8_ACCEPT;
+  uint32_t cp;
+  uint8_t *cursor = s;
+
+  *count = 0;
+  while (*cursor != 0) {
+    // A return of UTF8_ACCEPT means one whole code point was just consumed.
+    if (decode(&dfa, &cp, *cursor) == UTF8_ACCEPT) {
+      *count += 1;
+    }
+    ++cursor;
+  }
+
+  return (dfa == UTF8_ACCEPT) ? 0 : 1;
+}
+
+// TODO: add a way to index to the Xth code point in a string
+
+/**
+ * @brief
+ * Incremental UTF-8 validator intended for stream input. A chunk may end
+ * part-way through a code point, so the decoder state is owned by the caller:
+ * it can be checked after each chunk and again once the stream is exhausted.
+ *
+ * From https://stackoverflow.com/a/22135005/101258
+ *
+ * @param state  decoder state carried between calls, updated in place
+ * @param str    bytes of the current chunk
+ * @param len    number of bytes in str
+ * @return the value of *state after the chunk; scanning stops at the first
+ *         byte that drives the state to UTF8_REJECT
+ */
+uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
+  size_t pos = 0;
+
+  while (pos < len) {
+    // Only validity matters here, so this is the state-transition half of
+    // decode() without the code point assembly.
+    uint32_t cls = utf8d[(uint8_t)str[pos]];
+
+    *state = utf8d[256 + (*state) * 16 + cls];
+    if (*state == UTF8_REJECT) {
+      return *state;
+    }
+    pos++;
+  }
+
+  return *state;
+}
+
+/**
+ * @brief One-shot UTF-8 check for a buffer that is already complete.
+ *
+ * @param str  bytes to check
+ * @param len  number of bytes in str
+ * @return the final decoder state: UTF8_ACCEPT when every byte was consumed
+ *         and the input ended on a code point boundary, UTF8_REJECT as soon
+ *         as an invalid byte is met, or another state value when the buffer
+ *         ends in the middle of a sequence
+ */
+uint32_t valid_utf8(char *str, size_t len) {
+  uint32_t dfa = UTF8_ACCEPT;
+  size_t pos;
+
+  // The state starts at UTF8_ACCEPT, so testing for rejection in the loop
+  // condition stops the scan right after the first offending byte.
+  for (pos = 0; pos < len && dfa != UTF8_REJECT; pos++) {
+    uint32_t cls = utf8d[(uint8_t)str[pos]];
+    dfa = utf8d[256 + dfa * 16 + cls];
+  }
+
+  return dfa;
+}
+
+////////////// END UTF-8 Section //////////////