Added UTF8 encoder, final version of string->utf8

2025-07-09 14:07:34 +02:00 · 2017-10-25 18:35:11 -04:00 · 2017-10-25 18:35:11 -04:00 · aa0b0a7567
commit aa0b0a7567
parent ccfde220ff
2 changed files with 98 additions and 11 deletions
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@ -715,6 +715,13 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
 /**@{*/
 #define CYC_UTF8_ACCEPT 0
 #define CYC_UTF8_REJECT 1
+
+/**
+ * Simple macro to make it more convenient to convert a single char
+ */
+#define Cyc_utf8_encode_char(dest, dest_size, char_value) \
+  Cyc_utf8_encode(dest, dest_size, &char_value, 1)
+
 uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
 int Cyc_utf8_count_code_points(uint8_t* s);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
--- a/runtime.c
+++ b/runtime.c
@ -2595,23 +2595,41 @@ object Cyc_string2utf8(void *data, object cont, object str, object start,
  e = unbox_number(end);
  len = e - s;

-  if (s < 0 || (s >= string_len(str) && len > 0)) {
+  if (s < 0 || (s >= string_num_cp(str) && len > 0)) {
    Cyc_rt_raise2(data, "string->utf8 - invalid start", start);
  }

-  if (e < 0 || e < s || e > string_len(str)) {
+  if (e < 0 || e < s || e > string_num_cp(str)) {
    Cyc_rt_raise2(data, "string->utf8 - invalid end", end);
  }

-  // TODO: we have code point positions s, e, and length. We need to take those
-  // and walk the string to figure out the starting and ending BYTE positions
-
-  // TODO: fast path, can keep below if string_num_cp(str) == string_len(str)
-
-  result.len = len;
-  result.data = alloca(sizeof(char) * len);
-  memcpy(&result.data[0], &(string_str(str))[s], len);
-  _return_closcall1(data, cont, &result);
+  // Fast path
+  if (string_num_cp(str) == string_len(str)) { // TODO: disable for testing purposes
+    result.len = len;
+    result.data = alloca(sizeof(char) * len);
+    memcpy(&result.data[0], &(string_str(str))[s], len);
+    _return_closcall1(data, cont, &result);
+  } else {
+    int i, start_i = 0, end_i = 0;
+    const char *tmp = string_str(str);
+    char_type codepoint;
+    uint32_t state = 0;
+    for (i = 0; *tmp; ++tmp) {
+      if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
+        if (i == s) {
+          start_i = i;
+        } else if (i == e) {
+          break;
+        }
+      }
+      i++;
+    }
+    end_i = i;
+    result.len = end_i - start_i;
+    result.data = alloca(sizeof(char) * result.len);
+    memcpy(&result.data[0], &(string_str(str))[start_i], result.len);
+    _return_closcall1(data, cont, &result);
+  }
 }

 object Cyc_bytevector_u8_ref(void *data, object bv, object k)
@ -6563,4 +6581,66 @@ int uint32_num_bytes(uint32_t x) {
  return 4;
 }

+/**
+ * This function takes one or more 32-bit chars and encodes them 
+ * as an array of UTF-8 bytes.
+ * FROM: https://www.cprogramming.com/tutorial/utf8.c
+ *
+ * @param dest    Destination byte buffer
+ * @param sz      size of dest buffer in bytes
+ * @param src     Buffer of source data, in 32-bit characters
+ * @param srcsz   number of source characters, or -1 if 0-terminated
+ *
+ * @return Number of characters converted
+ *
+ * dest will only be '\0'-terminated if there is enough space. this is
+ * for consistency; imagine there are 2 bytes of space left, but the next
+ * character requires 3 bytes. in this case we could NUL-terminate, but in
+ * general we can't when there's insufficient space. therefore this function
+ * only NUL-terminates if all the characters fit, and there's space for
+ * the NUL as well.
+ * the destination string will never be bigger than the source string.
+ */
+int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz)
+{
+    u_int32_t ch;
+    int i = 0;
+    char *dest_end = dest + sz;
+
+    while (srcsz<0 ? src[i]!=0 : i < srcsz) {
+        ch = src[i];
+        if (ch < 0x80) {
+            if (dest >= dest_end)
+                return i;
+            *dest++ = (char)ch;
+        }
+        else if (ch < 0x800) {
+            if (dest >= dest_end-1)
+                return i;
+            *dest++ = (ch>>6) | 0xC0;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        else if (ch < 0x10000) {
+            if (dest >= dest_end-2)
+                return i;
+            *dest++ = (ch>>12) | 0xE0;
+            *dest++ = ((ch>>6) & 0x3F) | 0x80;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        else if (ch < 0x110000) {
+            if (dest >= dest_end-3)
+                return i;
+            *dest++ = (ch>>18) | 0xF0;
+            *dest++ = ((ch>>12) & 0x3F) | 0x80;
+            *dest++ = ((ch>>6) & 0x3F) | 0x80;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        i++;
+    }
+    if (dest < dest_end)
+        *dest = '\0';
+    return i;
+}
+
+
 ////////////// END UTF-8 Section //////////////