From ae3aa1941d4efc705f1249973d035c718cc78af3 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Thu, 19 Oct 2017 13:29:57 +0000
Subject: [PATCH 01/61] Initial UTF-8 helpers

---
 runtime.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/runtime.c b/runtime.c
index 1525f2d0..dd799d88 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6364,3 +6364,105 @@ void Cyc_io_read_token(void *data, object cont, object port)
   }
 }
 
+////////////// UTF-8 Section //////////////
+
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 1
+
+static const uint8_t utf8d[] = {
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+uint32_t inline
+decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+  uint32_t type = utf8d[byte];
+
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :
+    (0xff >> type) & (byte);
+
+  *state = utf8d[256 + *state*16 + type];
+  return *state;
+}
+// END Bjoern Hoehrmann
+
+/**
+ * @brief
+ * Count the number of code points in a string.
+ * Based on example code from Bjoern Hoehrmann.
+ */
+int countCodePoints(uint8_t* s, size_t* count) {
+  uint32_t codepoint;
+  uint32_t state = 0;
+
+  for (*count = 0; *s; ++s)
+    if (!decode(&state, &codepoint, *s))
+      *count += 1;
+
+  return state != UTF8_ACCEPT;
+}
+
+// TODO: index into X codepoint in a string 
+
+/**
+ * @brief
+ * Use this when validating from a stream, as it may be that the stream stopped
+ * in the middle of a codepoint, hence state passed in as an arg, so it can be
+ * tested in a loop and also after the loop has finished.
+ *
+ * From https://stackoverflow.com/a/22135005/101258
+ */
+uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
+   size_t i;
+   uint32_t type;
+
+    for (i = 0; i < len; i++) {
+        // We don't care about the codepoint, so this is
+        // a simplified version of the decode function.
+        type = utf8d[(uint8_t)str[i]];
+        *state = utf8d[256 + (*state) * 16 + type];
+
+        if (*state == UTF8_REJECT)
+            break;
+    }
+
+    return *state;
+}
+
+/**
+ * @brief Simplified version of above, always called with a complete string buffer
+ */
+uint32_t valid_utf8(char *str, size_t len) {
+   size_t i;
+   uint32_t state = UTF8_ACCEPT, type;
+
+    for (i = 0; i < len; i++) {
+        // We don't care about the codepoint, so this is
+        // a simplified version of the decode function.
+        type = utf8d[(uint8_t)str[i]];
+        state = utf8d[256 + (state) * 16 + type];
+
+        if (state == UTF8_REJECT)
+            break;
+    }
+
+    return state;
+}
+
+////////////// END UTF-8 Section //////////////

From 71c7ed3e7f8f0e83af363be2a90cc7e4e835104a Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 20 Oct 2017 12:54:13 +0000
Subject: [PATCH 02/61] Cleanup and added UTF 8 definitions to header file

---
 include/cyclone/runtime.h | 14 ++++++++++++++
 runtime.c                 | 28 ++++++++++++----------------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 6a8027dd..dbbfb8d9 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -707,4 +707,18 @@ void add_global(object * glo);
 void Cyc_set_globals_changed(gc_thread_data *thd);
 /**@}*/
 
+/**
+ * \defgroup prim_utf8 UTF-8
+ *
+ * @brief Unicode processing using UTF-8
+ */
+/**@{*/
+#define CYC_UTF8_ACCEPT 0
+#define CYC_UTF8_REJECT 1
+uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
+int Cyc_utf8_count_code_points(uint8_t* s, size_t* count);
+uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
+uint32_t Cyc_utf8_validate(char *str, size_t len);
+/**@}*/
+
 #endif                          /* CYCLONE_RUNTIME_H */
diff --git a/runtime.c b/runtime.c
index dd799d88..a0662e81 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6368,10 +6368,6 @@ void Cyc_io_read_token(void *data, object cont, object port)
 
 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
-
-#define UTF8_ACCEPT 0
-#define UTF8_REJECT 1
-
 static const uint8_t utf8d[] = {
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
@@ -6389,11 +6385,11 @@ static const uint8_t utf8d[] = {
   1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
 };
 
-uint32_t inline
-decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+//uint32_t inline
+uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
   uint32_t type = utf8d[byte];
 
-  *codep = (*state != UTF8_ACCEPT) ?
+  *codep = (*state != CYC_UTF8_ACCEPT) ?
     (byte & 0x3fu) | (*codep << 6) :
     (0xff >> type) & (byte);
 
@@ -6407,15 +6403,15 @@ decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  * Count the number of code points in a string.
  * Based on example code from Bjoern Hoehrmann.
  */
-int countCodePoints(uint8_t* s, size_t* count) {
+int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) {
   uint32_t codepoint;
   uint32_t state = 0;
 
   for (*count = 0; *s; ++s)
-    if (!decode(&state, &codepoint, *s))
+    if (!Cyc_utf8_decode(&state, &codepoint, *s))
       *count += 1;
 
-  return state != UTF8_ACCEPT;
+  return state != CYC_UTF8_ACCEPT;
 }
 
 // TODO: index into X codepoint in a string 
@@ -6428,7 +6424,7 @@ int countCodePoints(uint8_t* s, size_t* count) {
  *
  * From https://stackoverflow.com/a/22135005/101258
  */
-uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
+uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) {
    size_t i;
    uint32_t type;
 
@@ -6438,7 +6434,7 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
         type = utf8d[(uint8_t)str[i]];
         *state = utf8d[256 + (*state) * 16 + type];
 
-        if (*state == UTF8_REJECT)
+        if (*state == CYC_UTF8_REJECT)
             break;
     }
 
@@ -6446,11 +6442,11 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
 }
 
 /**
- * @brief Simplified version of above, always called with a complete string buffer
+ * @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer.
  */
-uint32_t valid_utf8(char *str, size_t len) {
+uint32_t Cyc_utf8_validate(char *str, size_t len) {
    size_t i;
-   uint32_t state = UTF8_ACCEPT, type;
+   uint32_t state = CYC_UTF8_ACCEPT, type;
 
     for (i = 0; i < len; i++) {
         // We don't care about the codepoint, so this is
@@ -6458,7 +6454,7 @@ uint32_t valid_utf8(char *str, size_t len) {
         type = utf8d[(uint8_t)str[i]];
         state = utf8d[256 + (state) * 16 + type];
 
-        if (state == UTF8_REJECT)
+        if (state == CYC_UTF8_REJECT)
             break;
     }
 

From ccad99062681226530f56cc656bdc32380822d4a Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 20 Oct 2017 13:28:16 +0000
Subject: [PATCH 03/61] Beginning to change string type

---
 include/cyclone/types.h | 47 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/include/cyclone/types.h b/include/cyclone/types.h
index 647f4ba3..f6fb2b1e 100644
--- a/include/cyclone/types.h
+++ b/include/cyclone/types.h
@@ -721,16 +721,21 @@ typedef enum {
 typedef struct {
   gc_header_type hdr;
   tag_type tag;
+  int num_cp;
   int len;
   char *str;
 } string_type;
 
+// TODO: below macros are obsolete, need new ones that populate num_cp and
+// raise an error if an invalid UTF-8 char is detected
+
 /** Create a new string in the nursery */
 #define make_string(cs, s) string_type cs; \
 { int len = strlen(s); \
   cs.hdr.mark = gc_color_red; \
   cs.hdr.grayed = 0; \
   cs.tag = string_tag; \
+  cs.num_cp = len; \
   cs.len = len; \
   cs.str = alloca(sizeof(char) * (len + 1)); \
   memcpy(cs.str, s, len + 1);}
@@ -744,6 +749,7 @@ typedef struct {
   cs.hdr.mark = gc_color_red; \
   cs.hdr.grayed = 0; \
   cs.tag = string_tag; cs.len = len; \
+  cs.num_cp = len; \
   cs.str = alloca(sizeof(char) * (len + 1)); \
   memcpy(cs.str, s, len); \
   cs.str[len] = '\0';}
@@ -755,9 +761,48 @@ typedef struct {
 #define make_string_noalloc(cs, s, length) string_type cs; \
 { cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \
   cs.tag = string_tag; cs.len = length; \
+  cs.num_cp = length; \
   cs.str = s; }
 
-/** Get the length of a string */
+///** Create a new string in the nursery */
+//#define make_string(cs, s) string_type cs; \
+//{ int len = strlen(s); \
+//  cs.hdr.mark = gc_color_red; \
+//  cs.hdr.grayed = 0; \
+//  cs.tag = string_tag; \
+//  cs.num_cp = len; \
+//  cs.len = len; \
+//  cs.str = alloca(sizeof(char) * (len + 1)); \
+//  memcpy(cs.str, s, len + 1);}
+//
+///** 
+// * Create a new string with the given length 
+// * (so it does not need to be computed) 
+// */
+//#define make_string_with_len(cs, s, length) string_type cs;  \
+//{ int len = length; \
+//  cs.hdr.mark = gc_color_red; \
+//  cs.hdr.grayed = 0; \
+//  cs.tag = string_tag; cs.len = len; \
+//  cs.num_cp = len; \
+//  cs.str = alloca(sizeof(char) * (len + 1)); \
+//  memcpy(cs.str, s, len); \
+//  cs.str[len] = '\0';}
+//
+///**
+// * Create a string object using the given C string and length.
+// * No allocation is done for the given C string.
+// */
+//#define make_string_noalloc(cs, s, length) string_type cs; \
+//{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \
+//  cs.tag = string_tag; cs.len = length; \
+//  cs.num_cp = length; \
+//  cs.str = s; }
+
+/** Get the length of a string, in characters (code points) */
+#define string_num_cp(x) (((string_type *) x)->num_cp)
+
+/** Get the length of a string, in bytes */
 #define string_len(x) (((string_type *) x)->len)
 
 /** Get a string object's C string */

From 0ca396f8fa7fe8f35c7c08b458b8bc67194bf532 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 20 Oct 2017 16:29:47 +0000
Subject: [PATCH 04/61] Add new string_type field

---
 gc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gc.c b/gc.c
index 6fdf7f82..0c0e57f4 100644
--- a/gc.c
+++ b/gc.c
@@ -451,6 +451,7 @@ char *gc_copy_obj(object dest, char *obj, gc_thread_data * thd)
       memcpy(s, string_str(obj), string_len(obj) + 1);
       mark(hp) = thd->gc_alloc_color;
       type_of(hp) = string_tag;
+      string_num_cp(hp) = string_num_cp(obj);
       string_len(hp) = string_len(obj);
       string_str(hp) = s;
       return (char *)hp;

From ac8b280578d1b34ff1632d5f44dd7fdd5e48a125 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 20 Oct 2017 16:29:56 +0000
Subject: [PATCH 05/61] Refactoring, added make_utf8_string

---
 include/cyclone/runtime.h |  2 +-
 include/cyclone/types.h   | 25 ++++++++++++++-----------
 runtime.c                 | 11 +++++++----
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index dbbfb8d9..21b204c5 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -716,7 +716,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
 #define CYC_UTF8_ACCEPT 0
 #define CYC_UTF8_REJECT 1
 uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
-int Cyc_utf8_count_code_points(uint8_t* s, size_t* count);
+int Cyc_utf8_count_code_points(uint8_t* s);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
 uint32_t Cyc_utf8_validate(char *str, size_t len);
 /**@}*/
diff --git a/include/cyclone/types.h b/include/cyclone/types.h
index f6fb2b1e..776aada6 100644
--- a/include/cyclone/types.h
+++ b/include/cyclone/types.h
@@ -764,17 +764,20 @@ typedef struct {
   cs.num_cp = length; \
   cs.str = s; }
 
-///** Create a new string in the nursery */
-//#define make_string(cs, s) string_type cs; \
-//{ int len = strlen(s); \
-//  cs.hdr.mark = gc_color_red; \
-//  cs.hdr.grayed = 0; \
-//  cs.tag = string_tag; \
-//  cs.num_cp = len; \
-//  cs.len = len; \
-//  cs.str = alloca(sizeof(char) * (len + 1)); \
-//  memcpy(cs.str, s, len + 1);}
-//
+/** Create a new string in the nursery */
+#define make_utf8_string(data, cs, s) string_type cs; \
+{ int len = strlen(s); \
+  cs.hdr.mark = gc_color_red; \
+  cs.hdr.grayed = 0; \
+  cs.tag = string_tag; \
+  cs.num_cp = Cyc_utf8_count_code_points(s); \
+  if (cs.num_cp < 0) { \
+    Cyc_rt_raise_msg(data, "Invalid UTF-8 characters in string"); \
+  } \
+  cs.len = len; \
+  cs.str = alloca(sizeof(char) * (len + 1)); \
+  memcpy(cs.str, s, len + 1);}
+
 ///** 
 // * Create a new string with the given length 
 // * (so it does not need to be computed) 
diff --git a/runtime.c b/runtime.c
index a0662e81..ef2bafb0 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6403,15 +6403,18 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  * Count the number of code points in a string.
  * Based on example code from Bjoern Hoehrmann.
  */
-int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) {
+int Cyc_utf8_count_code_points(uint8_t* s) {
   uint32_t codepoint;
   uint32_t state = 0;
+  int count;
 
-  for (*count = 0; *s; ++s)
+  for (count = 0; *s; ++s)
     if (!Cyc_utf8_decode(&state, &codepoint, *s))
-      *count += 1;
+      count += 1;
 
-  return state != CYC_UTF8_ACCEPT;
+  if (state != CYC_UTF8_ACCEPT)
+    return -1;
+  return count;
 }
 
 // TODO: index into X codepoint in a string 

From 14626f15c449254f19735b5da33f8a92ab4e7a84 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Sun, 22 Oct 2017 18:59:35 -0400
Subject: [PATCH 06/61] Unicode changes, take code points into account

---
 runtime.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/runtime.c b/runtime.c
index ef2bafb0..c75c8ae1 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2032,7 +2032,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
 }
 
 #define Cyc_string_append_va_list(data, argc) { \
-    int i = 0, total_len = 1; \
+    int i = 0, total_cp = 0, total_len = 1; \
     int *len = alloca(sizeof(int) * argc); \
     char *buffer, *bufferp, **str = alloca(sizeof(char *) * argc); \
     object tmp; \
@@ -2041,6 +2041,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
       str[i] = ((string_type *)str1)->str; \
       len[i] = string_len((str1)); \
       total_len += len[i]; \
+      total_cp += string_num_cp(str1); \
     } \
     for (i = 1; i < argc; i++) { \
       tmp = va_arg(ap, object); \
@@ -2048,6 +2049,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
       str[i] = ((string_type *)tmp)->str; \
       len[i] = string_len((tmp)); \
       total_len += len[i]; \
+      total_cp += string_num_cp(tmp); \
     } \
     buffer = bufferp = alloca(sizeof(char) * total_len); \
     for (i = 0; i < argc; i++) { \
@@ -2056,6 +2058,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
     } \
     *bufferp = '\0'; \
     make_string(result, buffer); \
+    string_num_cp(result) = total_cp; \
     va_end(ap); \
     _return_closcall1(data, cont, &result); \
 }
@@ -2078,7 +2081,7 @@ object Cyc_string_append(void *data, object cont, int _argc, object str1, ...)
 object Cyc_string_length(void *data, object str)
 {
   Cyc_check_str(data, str);
-  return obj_int2obj(string_len(str));
+  return obj_int2obj(string_num_cp(str));
 }
 
 object Cyc_string_set(void *data, object str, object k, object chr)

From 8b817966e82aaae8b8ed0987a14c718f34b89227 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 23 Oct 2017 13:26:29 +0000
Subject: [PATCH 07/61] WIP - add char_type, UTF-8 string macros, code-point-aware string-ref

---
 include/cyclone/types.h | 52 +++++++++++++++++++++++------------------
 runtime.c               | 26 ++++++++++++++++-----
 2 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/include/cyclone/types.h b/include/cyclone/types.h
index 776aada6..3667d661 100644
--- a/include/cyclone/types.h
+++ b/include/cyclone/types.h
@@ -465,6 +465,12 @@ void clear_mutations(void *data);
 /** Minimum allowed value of a fixnum */
 #define CYC_FIXNUM_MIN -1073741824
 
+/**
+ * Explicit character type now that we are using UTF-8.
+ * Chars are still value types though
+ */
+typedef uint32_t char_type;
+
 /**
  * Determine if an object is an integer.
  */
@@ -778,29 +784,29 @@ typedef struct {
   cs.str = alloca(sizeof(char) * (len + 1)); \
   memcpy(cs.str, s, len + 1);}
 
-///** 
-// * Create a new string with the given length 
-// * (so it does not need to be computed) 
-// */
-//#define make_string_with_len(cs, s, length) string_type cs;  \
-//{ int len = length; \
-//  cs.hdr.mark = gc_color_red; \
-//  cs.hdr.grayed = 0; \
-//  cs.tag = string_tag; cs.len = len; \
-//  cs.num_cp = len; \
-//  cs.str = alloca(sizeof(char) * (len + 1)); \
-//  memcpy(cs.str, s, len); \
-//  cs.str[len] = '\0';}
-//
-///**
-// * Create a string object using the given C string and length.
-// * No allocation is done for the given C string.
-// */
-//#define make_string_noalloc(cs, s, length) string_type cs; \
-//{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \
-//  cs.tag = string_tag; cs.len = length; \
-//  cs.num_cp = length; \
-//  cs.str = s; }
+/** 
+ * Create a new string with the given length 
+ * (so it does not need to be computed) 
+ */
+#define make_utf8_string_with_len(cs, s, length, num_cp) string_type cs;  \
+{ int len = length; \
+  cs.hdr.mark = gc_color_red; \
+  cs.hdr.grayed = 0; \
+  cs.tag = string_tag; cs.len = len; \
+  cs.num_cp = num_cp; \
+  cs.str = alloca(sizeof(char) * (len + 1)); \
+  memcpy(cs.str, s, len); \
+  cs.str[len] = '\0';}
+
+/**
+ * Create a string object using the given C string and length.
+ * No allocation is done for the given C string.
+ */
+#define make_utf8_string_noalloc(cs, s, length) string_type cs; \
+{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \
+  cs.tag = string_tag; cs.len = length; \
+  cs.num_cp = length; \
+  cs.str = s; }
 
 /** Get the length of a string, in characters (code points) */
 #define string_num_cp(x) (((string_type *) x)->num_cp)
diff --git a/runtime.c b/runtime.c
index c75c8ae1..4b228cd6 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2041,7 +2041,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
       str[i] = ((string_type *)str1)->str; \
       len[i] = string_len((str1)); \
       total_len += len[i]; \
-      total_cp += string_num_cp(str1); \
+      total_cp += string_num_cp((str[i])); \
     } \
     for (i = 1; i < argc; i++) { \
       tmp = va_arg(ap, object); \
@@ -2049,7 +2049,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
       str[i] = ((string_type *)tmp)->str; \
       len[i] = string_len((tmp)); \
       total_len += len[i]; \
-      total_cp += string_num_cp(tmp); \
+      total_cp += string_num_cp((str[i])); \
     } \
     buffer = bufferp = alloca(sizeof(char) * total_len); \
     for (i = 0; i < argc; i++) { \
@@ -2058,7 +2058,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
     } \
     *bufferp = '\0'; \
     make_string(result, buffer); \
-    string_num_cp(result) = total_cp; \
+    string_num_cp((&result)) = total_cp; \
     va_end(ap); \
     _return_closcall1(data, cont, &result); \
 }
@@ -2081,7 +2081,7 @@ object Cyc_string_append(void *data, object cont, int _argc, object str1, ...)
 object Cyc_string_length(void *data, object str)
 {
   Cyc_check_str(data, str);
-  return obj_int2obj(string_num_cp(str));
+  return obj_int2obj(string_len(str));
 }
 
 object Cyc_string_set(void *data, object str, object k, object chr)
@@ -2115,13 +2115,27 @@ object Cyc_string_ref(void *data, object str, object k)
 
   raw = string_str(str);
   idx = unbox_number(k);
-  len = string_len(str);
+  len = string_num_cp(str);
 
   if (idx < 0 || idx >= len) {
     Cyc_rt_raise2(data, "string-ref - invalid index", k);
   }
 
-  return obj_char2obj(raw[idx]);
+  {
+    char_type codepoint;
+    uint32_t state = 0;
+    int count;
+
+    for (count = 0; *raw; ++raw){
+      if (!Cyc_utf8_decode(&state, &codepoint, *raw)){
+        if (count == idx) break; // Reached requested index
+        count += 1;
+      }
+    }
+    if (state != CYC_UTF8_ACCEPT)
+       Cyc_rt_raise2(data, "string-ref - invalid character at index", k);
+    return obj_char2obj(codepoint);
+  }
 }
 
 object Cyc_substring(void *data, object cont, object str, object start,

From 96e5692cb9fcddce5cbce4cb1ceb9b4be5b0f3db Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 23 Oct 2017 13:38:02 +0000
Subject: [PATCH 08/61] bugfix - string-append: read num_cp from string objects, not raw char pointers

---
 runtime.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runtime.c b/runtime.c
index 4b228cd6..dda093fa 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2041,7 +2041,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
       str[i] = ((string_type *)str1)->str; \
       len[i] = string_len((str1)); \
       total_len += len[i]; \
-      total_cp += string_num_cp((str[i])); \
+      total_cp += string_num_cp((str1)); \
     } \
     for (i = 1; i < argc; i++) { \
       tmp = va_arg(ap, object); \
@@ -2049,7 +2049,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
       str[i] = ((string_type *)tmp)->str; \
       len[i] = string_len((tmp)); \
       total_len += len[i]; \
-      total_cp += string_num_cp((str[i])); \
+      total_cp += string_num_cp((tmp)); \
     } \
     buffer = bufferp = alloca(sizeof(char) * total_len); \
     for (i = 0; i < argc; i++) { \

From 114e284566c69066c712c87671684e9839deb8a8 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 23 Oct 2017 13:39:04 +0000
Subject: [PATCH 09/61] string-length: return number of codepoints

---
 runtime.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime.c b/runtime.c
index dda093fa..863abab0 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2081,7 +2081,7 @@ object Cyc_string_append(void *data, object cont, int _argc, object str1, ...)
 object Cyc_string_length(void *data, object str)
 {
   Cyc_check_str(data, str);
-  return obj_int2obj(string_len(str));
+  return obj_int2obj(string_num_cp(str));
 }
 
 object Cyc_string_set(void *data, object str, object k, object chr)

From 424592ad8be8bebb6045d7a7dfd60ebe2fc19cb9 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 23 Oct 2017 17:10:43 +0000
Subject: [PATCH 10/61] Added TODO

---
 runtime.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/runtime.c b/runtime.c
index 863abab0..67f51bf3 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2121,6 +2121,8 @@ object Cyc_string_ref(void *data, object str, object k)
     Cyc_rt_raise2(data, "string-ref - invalid index", k);
   }
 
+TODO: we can take the fast path if num_cp == len, since that implies all chars are just 1 byte. 
+      would be the case for all string functions that need to be updated to be (possibly) O(n)
   {
     char_type codepoint;
     uint32_t state = 0;

From 3e64420101ab63cfda6f5486af9b55aca1526cc1 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 23 Oct 2017 17:43:37 -0400
Subject: [PATCH 11/61] Added UTF8 support to Cyc_substring

---
 include/cyclone/types.h |  4 ++--
 runtime.c               | 32 +++++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/include/cyclone/types.h b/include/cyclone/types.h
index 3667d661..d91a1aa0 100644
--- a/include/cyclone/types.h
+++ b/include/cyclone/types.h
@@ -788,12 +788,12 @@ typedef struct {
  * Create a new string with the given length 
  * (so it does not need to be computed) 
  */
-#define make_utf8_string_with_len(cs, s, length, num_cp) string_type cs;  \
+#define make_utf8_string_with_len(cs, s, length, num_code_points) string_type cs;  \
 { int len = length; \
   cs.hdr.mark = gc_color_red; \
   cs.hdr.grayed = 0; \
   cs.tag = string_tag; cs.len = len; \
-  cs.num_cp = num_cp; \
+  cs.num_cp = num_code_points; \
   cs.str = alloca(sizeof(char) * (len + 1)); \
   memcpy(cs.str, s, len); \
   cs.str[len] = '\0';}
diff --git a/runtime.c b/runtime.c
index 67f51bf3..607f935d 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2121,9 +2121,10 @@ object Cyc_string_ref(void *data, object str, object k)
     Cyc_rt_raise2(data, "string-ref - invalid index", k);
   }
 
-TODO: we can take the fast path if num_cp == len, since that implies all chars are just 1 byte. 
-      would be the case for all string functions that need to be updated to be (possibly) O(n)
-  {
+  // Take fast path if all chars are just 1 byte
+  if (string_num_cp(str) == string_len(str)) {
+    return obj_char2obj(raw[idx]);
+  } else {
     char_type codepoint;
     uint32_t state = 0;
     int count;
@@ -2153,7 +2154,7 @@ object Cyc_substring(void *data, object cont, object str, object start,
   raw = string_str(str);
   s = unbox_number(start);
   e = unbox_number(end);
-  len = string_len(str);
+  len = string_num_cp(str);
 
   if (s > e) {
     Cyc_rt_raise2(data, "substring - start cannot be greater than end", start);
@@ -2167,9 +2168,30 @@ object Cyc_substring(void *data, object cont, object str, object start,
     e = len;
   }
 
-  {
+  if (string_num_cp(str) == string_len(str)){ // Fast path for ASCII
     make_string_with_len(sub, raw + s, e - s);
     _return_closcall1(data, cont, &sub);
+  } else {
+    const char *tmp = raw;
+    char_type codepoint;
+    uint32_t state = 0;
+    int count, start_i = 0, end_i = 0;
+
+    for (count = 0; *tmp; ++tmp){
+      if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
+        if (count == s) {
+          start_i = end_i;
+        } else if (count == e) {
+          break;
+        }
+        count += 1;
+      }
+      end_i++;
+    }
+    if (state != CYC_UTF8_ACCEPT)
+       Cyc_rt_raise2(data, "substring - invalid character in string", str);
+    make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s);
+    _return_closcall1(data, cont, &sub);
   }
 }
 

From cb1bfef031e9768f6a27550e6cd65de1605f74a7 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 23 Oct 2017 18:47:01 -0400
Subject: [PATCH 12/61] WIP - string-set!

---
 runtime.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/runtime.c b/runtime.c
index 607f935d..5387dbf9 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2101,7 +2101,45 @@ object Cyc_string_set(void *data, object str, object k, object chr)
   len = string_len(str);
 
   Cyc_check_bounds(data, "string-set!", len, idx);
-  raw[idx] = obj_obj2char(chr);
+
+  // Take fast path if all chars are just 1 byte
+  if (string_num_cp(str) == string_len(str)) {
+    raw[idx] = obj_obj2char(chr);
+  } else {
+    // TODO: utf8 support
+    // find codepoint at k, figure out how many bytes it is,
+    // allocate a new string (start) + chr + (end)
+    // or don't allocate if chr uses as many or fewer bytes 
+    // than the codepoint it is replacing
+
+    char *tmp = raw;
+    char_type codepoint;
+    uint32_t state = 0;
+    int i = 0, count, start_len = 0, start_cp = 0;
+
+    for (count = 0; *tmp; ++tmp){
+      if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
+        if (count < idx) {
+          start_len = i;
+          start_cp = count;
+        } else if (count == idx) {
+          break;
+        }
+        count += 1;
+      }
+      i++;
+    }
+    if (state != CYC_UTF8_ACCEPT)
+       Cyc_rt_raise2(data, "string-set! - invalid character at index", k);
+
+    // TODO: perform actual mutation
+    //
+    // Now we know length of start (both in codepoints and bytes),
+    // and we know the codepoint to be replaced. by calculating its length
+    // we can compute where the end portion starts, and by using str we can
+    // figure out how many remaining bytes/codepoints are in end
+
+  }
   return str;
 }
 

From 13254d06f01e910e08f3170a815ed3761e021392 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 24 Oct 2017 13:23:48 +0000
Subject: [PATCH 13/61] WIP - utf8 / string conversion functions

---
 runtime.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/runtime.c b/runtime.c
index 5387dbf9..da3f255c 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2106,6 +2106,7 @@ object Cyc_string_set(void *data, object str, object k, object chr)
   if (string_num_cp(str) == string_len(str)) {
     raw[idx] = obj_obj2char(chr);
   } else {
+fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), len);
     // TODO: utf8 support
     // find codepoint at k, figure out how many bytes it is,
     // allocate a new string (start) + chr + (end)
@@ -2569,6 +2570,10 @@ object Cyc_utf82string(void *data, object cont, object bv, object start,
     st.str = alloca(sizeof(char) * (len + 1));
     memcpy(st.str, &buf[s], len);
     st.str[len] = '\0';
+    st.num_cp = Cyc_utf8_count_code_points((uint8_t *)(st.str));
+    if (st.num_cp < 0) {
+       Cyc_rt_raise2(data, "utf8->string - error decoding UTF 8", bv);
+    }
     _return_closcall1(data, cont, &st);
   }
 }
@@ -2596,6 +2601,11 @@ object Cyc_string2utf8(void *data, object cont, object str, object start,
     Cyc_rt_raise2(data, "string->utf8 - invalid end", end);
   }
 
+  // TODO: we have code point positions s, e, and length. We need to take those
+  // and walk the string to figure out the starting and ending BYTE positions
+
+  // TODO: fast path, can keep below if string_num_cp(str) == string_len(str)
+
   result.len = len;
   result.data = alloca(sizeof(char) * len);
   memcpy(&result.data[0], &(string_str(str))[s], len);

From 6c4dd4b740179932c81caa7949a927b5d7067a1c Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 24 Oct 2017 17:53:09 -0400
Subject: [PATCH 14/61] Compute number of code points and byte len

---
 scheme/base.sld | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scheme/base.sld b/scheme/base.sld
index 75ecc2f1..1f864709 100644
--- a/scheme/base.sld
+++ b/scheme/base.sld
@@ -952,7 +952,8 @@
       " object s = NULL;
         Cyc_check_int(data, count);
         char c = obj_obj2char(fill);
-        int len = obj_obj2int(count);
+        int num_cp = obj_obj2int(count);
+        int len = num_cp * uint32_num_bytes(c);
         if (len >= MAX_STACK_OBJ) {
           int heap_grown;
           s = gc_alloc(((gc_thread_data *)data)->heap, 
@@ -964,6 +965,7 @@
           ((string_type *) s)->hdr.grayed = 0;
           ((string_type *) s)->tag = string_tag; 
           ((string_type *) s)->len = len;
+          ((string_type *) s)->num_cp = num_cp;
           ((string_type *) s)->str = (((char *)s) + sizeof(string_type));
         } else {
           s = alloca(sizeof(string_type));
@@ -971,6 +973,7 @@
           ((string_type *)s)->hdr.grayed = 0;
           ((string_type *)s)->tag = string_tag; 
           ((string_type *)s)->len = len;
+          ((string_type *)s)->num_cp = num_cp;
           ((string_type *)s)->str = alloca(sizeof(char) * (len + 1));
         }
         memset(((string_type *)s)->str, c, len);

From 13e260300ffb3163e4040e1e012b0822615de16f Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 24 Oct 2017 17:53:43 -0400
Subject: [PATCH 15/61] Added utility function and stubs

---
 include/cyclone/runtime.h |  1 +
 runtime.c                 | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 21b204c5..119338e9 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -719,6 +719,7 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
 int Cyc_utf8_count_code_points(uint8_t* s);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
 uint32_t Cyc_utf8_validate(char *str, size_t len);
+int uint32_num_bytes(uint32_t val);
 /**@}*/
 
 #endif                          /* CYCLONE_RUNTIME_H */
diff --git a/runtime.c b/runtime.c
index da3f255c..79badbf9 100644
--- a/runtime.c
+++ b/runtime.c
@@ -178,6 +178,7 @@ void pack_env_variables(void *data, object k)
     svar->hdr.grayed = 0;
     svar->tag = string_tag; 
     svar->len = eqpos - e;
+    svar->num_cp = svar->len; // TODO: proper UTF-8 support!
     svar->str = alloca(sizeof(char) * (svar->len));
     strncpy(svar->str, e, svar->len);
     (svar->str)[svar->len] = '\0';
@@ -189,6 +190,7 @@ void pack_env_variables(void *data, object k)
     sval->hdr.grayed = 0;
     sval->tag = string_tag; 
     sval->len = strlen(eqpos);
+    sval->num_cp = sval->len; // TODO: proper UTF-8 support!
     sval->str = eqpos;
     set_pair(tmp, svar, sval);
     set_pair(p, tmp, NULL);
@@ -6553,4 +6555,12 @@ uint32_t Cyc_utf8_validate(char *str, size_t len) {
     return state;
 }
 
+int uint32_num_bytes(uint32_t x) {
+  // TODO: could compute log(val) / log(256)
+  if (x < 0x100) return 1;
+  if (x < 0x10000) return 2;
+  if (x < 0x1000000) return 3;
+  return 4;
+}
+
 ////////////// END UTF-8 Section //////////////

From 325112e50b56151b5e6daaa22c64a692d239697d Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 24 Oct 2017 19:00:45 -0400
Subject: [PATCH 16/61] Temporary file

---
 test.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 test.c

diff --git a/test.c b/test.c
new file mode 100644
index 00000000..4c97178b
--- /dev/null
+++ b/test.c
@@ -0,0 +1,20 @@
+// A temporary test file
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+void main(){
+  char c[128];
+  uint32_t val = 0x32363435;
+  uint8_t *ptr = (uint8_t *)&val;
+  int i, j = 0;
+  //memset(c, 0x34, 128);
+  for (i = 0; i < 127; i++) {
+    c[i] = ptr[j++];    
+    if (j == 4) j = 0;
+  }
+  c[127] = '\0';
+  printf("%s\n", c);
+  return;
+}

From 722d077367cb2ec4afc015c3e5760f04bff4903f Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 24 Oct 2017 19:01:20 -0400
Subject: [PATCH 17/61] WIP

---
 scheme/base.sld | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/scheme/base.sld b/scheme/base.sld
index 1f864709..d3a8ee78 100644
--- a/scheme/base.sld
+++ b/scheme/base.sld
@@ -976,7 +976,14 @@
           ((string_type *)s)->num_cp = num_cp;
           ((string_type *)s)->str = alloca(sizeof(char) * (len + 1));
         }
-        memset(((string_type *)s)->str, c, len);
+        //if (num_cp == 1) { /* Fast path */
+          memset(((string_type *)s)->str, c, len);
+        //} else {
+        //  int i;
+        //  uint32_t*
+        //  for (i = 0; i < len; i++) {
+        //  }
+        //}
         ((string_type *)s)->str[len] = '\\0';
         return_closcall1(data, k, s);
       ")

From 556f97dd5fc5a6d50699d64f9404b3c176931060 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Wed, 25 Oct 2017 13:54:36 +0000
Subject: [PATCH 18/61] WIP, testing encoders/decoders

---
 test.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 121 insertions(+), 12 deletions(-)

diff --git a/test.c b/test.c
index 4c97178b..306f1d85 100644
--- a/test.c
+++ b/test.c
@@ -4,17 +4,126 @@
 #include <stdint.h>
 #include <string.h>
 
-void main(){
-  char c[128];
-  uint32_t val = 0x32363435;
-  uint8_t *ptr = (uint8_t *)&val;
-  int i, j = 0;
-  //memset(c, 0x34, 128);
-  for (i = 0; i < 127; i++) {
-    c[i] = ptr[j++];    
-    if (j == 4) j = 0;
-  }
-  c[127] = '\0';
-  printf("%s\n", c);
+#define CYC_UTF8_ACCEPT 0
+
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+static const uint8_t utf8d[] = {
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+//uint32_t inline
+uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+  uint32_t type = utf8d[byte];
+
+  *codep = (*state != CYC_UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :
+    (0xff >> type) & (byte);
+
+  *state = utf8d[256 + *state*16 + type];
+  return *state;
+}
+
+// FROM: https://www.cprogramming.com/tutorial/utf8.c
+/* srcsz = number of source characters, or -1 if 0-terminated
+   sz = size of dest buffer in bytes
+
+   returns # characters converted
+   dest will only be '\0'-terminated if there is enough space. this is
+   for consistency; imagine there are 2 bytes of space left, but the next
+   character requires 3 bytes. in this case we could NUL-terminate, but in
+   general we can't when there's insufficient space. therefore this function
+   only NUL-terminates if all the characters fit, and there's space for
+   the NUL as well.
+   the destination string will never be bigger than the source string.
+*/
+int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz)
+{
+    u_int32_t ch;
+    int i = 0;
+    char *dest_end = dest + sz;
+
+    while (srcsz<0 ? src[i]!=0 : i < srcsz) {
+        ch = src[i];
+        if (ch < 0x80) {
+            if (dest >= dest_end)
+                return i;
+            *dest++ = (char)ch;
+        }
+        else if (ch < 0x800) {
+            if (dest >= dest_end-1)
+                return i;
+            *dest++ = (ch>>6) | 0xC0;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        else if (ch < 0x10000) {
+            if (dest >= dest_end-2)
+                return i;
+            *dest++ = (ch>>12) | 0xE0;
+            *dest++ = ((ch>>6) & 0x3F) | 0x80;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        else if (ch < 0x110000) {
+            if (dest >= dest_end-3)
+                return i;
+            *dest++ = (ch>>18) | 0xF0;
+            *dest++ = ((ch>>12) & 0x3F) | 0x80;
+            *dest++ = ((ch>>6) & 0x3F) | 0x80;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        i++;
+    }
+    if (dest < dest_end)
+        *dest = '\0';
+    return i;
+}
+
+void encoding() {
+  char dest[5];
+  int rv;
+  uint32_t val = 0x03bb;
+
+  rv = u8_toutf8(dest, 5, &val, 1);
+  printf("%d %x\n", rv, dest);
+TODO: above seems broken, should encode to 0xCEBB (see below)
+  return;
+}
+
+void main(){
+  char c[128];
+  uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE
+//  uint8_t cv[] = {0xCE, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE
+  char *cptr;
+  uint32_t state = CYC_UTF8_ACCEPT, codepoint, val = 0x32363435;
+  uint8_t *ptr = (uint8_t *)&val;
+  int i, j = 0;
+//  //memset(c, 0x34, 128);
+//  for (i = 0; i < 127; i++) {
+//    c[i] = ptr[j++];    
+//    if (j == 4) j = 0;
+//  }
+//  c[127] = '\0';
+//  printf("%s\n", c);
+
+  ptr = cv;
+  for (i = 0; i < 3; i++) {
+    Cyc_utf8_decode(&state, &codepoint, ptr[i]);
+  }
+  printf("state = %d, cp = %d\n", state, codepoint);
+
+  encoding();
   return;
 }

From 96c3846b433fe470af4ff966ed4cee99d7a6a0ee Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Wed, 25 Oct 2017 17:14:10 +0000
Subject: [PATCH 19/61] Cleanup

---
 test.c | 53 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/test.c b/test.c
index 306f1d85..f3693d80 100644
--- a/test.c
+++ b/test.c
@@ -37,20 +37,27 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
   return *state;
 }
 
-// FROM: https://www.cprogramming.com/tutorial/utf8.c
-/* srcsz = number of source characters, or -1 if 0-terminated
-   sz = size of dest buffer in bytes
-
-   returns # characters converted
-   dest will only be '\0'-terminated if there is enough space. this is
-   for consistency; imagine there are 2 bytes of space left, but the next
-   character requires 3 bytes. in this case we could NUL-terminate, but in
-   general we can't when there's insufficient space. therefore this function
-   only NUL-terminates if all the characters fit, and there's space for
-   the NUL as well.
-   the destination string will never be bigger than the source string.
-*/
-int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz)
+/**
+ * This function takes one or more 32-bit chars and encodes them 
+ * as an array of UTF-8 bytes.
+ * FROM: https://www.cprogramming.com/tutorial/utf8.c
+ *
+ * @param dest    Destination byte buffer
+ * @param sz      size of dest buffer in bytes
+ * @param src     Buffer of source data, in 32-bit characters
+ * @param srcsz   number of source characters, or -1 if 0-terminated
+ *
+ * @return Number of characters converted
+ *
+ * dest will only be '\0'-terminated if there is enough space. this is
+ * for consistency; imagine there are 2 bytes of space left, but the next
+ * character requires 3 bytes. in this case we could NUL-terminate, but in
+ * general we can't when there's insufficient space. therefore this function
+ * only NUL-terminates if all the characters fit, and there's space for
+ * the NUL as well.
+ * the destination string will never be bigger than the source string.
+ */
+int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz)
 {
     u_int32_t ch;
     int i = 0;
@@ -91,14 +98,16 @@ int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz)
     return i;
 }
 
-void encoding() {
+void encode(uint32_t val) {
   char dest[5];
-  int rv;
-  uint32_t val = 0x03bb;
+  int rv, i;
 
-  rv = u8_toutf8(dest, 5, &val, 1);
-  printf("%d %x\n", rv, dest);
-TODO: above seems broken, should encode to 0xCEBB (see below)
+  rv = Cyc_utf8_encode(dest, 5, &val, 1);
+  printf("%x %d \n", val, rv);
+  for(i = 0; i < 5; i++) {
+    printf("[%x] ", (uint8_t)dest[i]);
+  }
+  printf("\n");
   return;
 }
 
@@ -124,6 +133,8 @@ void main(){
   }
   printf("state = %d, cp = %d\n", state, codepoint);
 
-  encoding();
+  encode(0x3bb);
+  encode(65);
+  encode(0xcebb);
   return;
 }

From ccfde220ffc762a975cfdeb2c81c593db0a28144 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Wed, 25 Oct 2017 17:21:53 +0000
Subject: [PATCH 20/61] WIP

---
 test.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/test.c b/test.c
index f3693d80..2d86b1a2 100644
--- a/test.c
+++ b/test.c
@@ -37,6 +37,12 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
   return *state;
 }
 
+/**
+ * Simple macro to make it more convenient to convert a single char
+ */
+#define Cyc_utf8_encode_char(dest, dest_size, char_value) \
+  Cyc_utf8_encode(dest, dest_size, &char_value, 1)
+
 /**
  * This function takes one or more 32-bit chars and encodes them 
  * as an array of UTF-8 bytes.
@@ -102,7 +108,7 @@ void encode(uint32_t val) {
   char dest[5];
   int rv, i;
 
-  rv = Cyc_utf8_encode(dest, 5, &val, 1);
+  rv = Cyc_utf8_encode_char(dest, 5, val);
   printf("%x %d \n", val, rv);
   for(i = 0; i < 5; i++) {
     printf("[%x] ", (uint8_t)dest[i]);

From aa0b0a75678b2b6134c02ba24c2e10d2ee12fd7d Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Wed, 25 Oct 2017 18:35:11 -0400
Subject: [PATCH 21/61] Added UTF8 encoder, final version of string->utf8

---
 include/cyclone/runtime.h |   7 +++
 runtime.c                 | 102 ++++++++++++++++++++++++++++++++++----
 2 files changed, 98 insertions(+), 11 deletions(-)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 119338e9..6720002b 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -715,6 +715,13 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
 /**@{*/
 #define CYC_UTF8_ACCEPT 0
 #define CYC_UTF8_REJECT 1
+
+/**
+ * Simple macro to make it more convenient to convert a single char
+ */
+#define Cyc_utf8_encode_char(dest, dest_size, char_value) \
+  Cyc_utf8_encode(dest, dest_size, &char_value, 1)
+
 uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
 int Cyc_utf8_count_code_points(uint8_t* s);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
diff --git a/runtime.c b/runtime.c
index 79badbf9..d0e804e9 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2595,23 +2595,41 @@ object Cyc_string2utf8(void *data, object cont, object str, object start,
   e = unbox_number(end);
   len = e - s;
 
-  if (s < 0 || (s >= string_len(str) && len > 0)) {
+  if (s < 0 || (s >= string_num_cp(str) && len > 0)) {
     Cyc_rt_raise2(data, "string->utf8 - invalid start", start);
   }
 
-  if (e < 0 || e < s || e > string_len(str)) {
+  if (e < 0 || e < s || e > string_num_cp(str)) {
     Cyc_rt_raise2(data, "string->utf8 - invalid end", end);
   }
 
-  // TODO: we have code point positions s, e, and length. We need to take those
-  // and walk the string to figure out the starting and ending BYTE positions
-
-  // TODO: fast path, can keep below if string_num_cp(str) == string_len(str)
-
-  result.len = len;
-  result.data = alloca(sizeof(char) * len);
-  memcpy(&result.data[0], &(string_str(str))[s], len);
-  _return_closcall1(data, cont, &result);
+  // Fast path
+  if (string_num_cp(str) == string_len(str)) { // TODO: disable for testing purposes
+    result.len = len;
+    result.data = alloca(sizeof(char) * len);
+    memcpy(&result.data[0], &(string_str(str))[s], len);
+    _return_closcall1(data, cont, &result);
+  } else {
+    int i, start_i = 0, end_i = 0;
+    const char *tmp = string_str(str);
+    char_type codepoint;
+    uint32_t state = 0;
+    for (i = 0; *tmp; ++tmp) {
+      if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
+        if (i == s) {
+          start_i = i;
+        } else if (i == e) {
+          break;
+        }
+      }
+      i++;
+    }
+    end_i = i;
+    result.len = end_i - start_i;
+    result.data = alloca(sizeof(char) * result.len);
+    memcpy(&result.data[0], &(string_str(str))[start_i], result.len);
+    _return_closcall1(data, cont, &result);
+  }
 }
 
 object Cyc_bytevector_u8_ref(void *data, object bv, object k)
@@ -6563,4 +6581,66 @@ int uint32_num_bytes(uint32_t x) {
   return 4;
 }
 
+/**
+ * This function takes one or more 32-bit chars and encodes them 
+ * as an array of UTF-8 bytes.
+ * FROM: https://www.cprogramming.com/tutorial/utf8.c
+ *
+ * @param dest    Destination byte buffer
+ * @param sz      size of dest buffer in bytes
+ * @param src     Buffer of source data, in 32-bit characters
+ * @param srcsz   number of source characters, or -1 if 0-terminated
+ *
+ * @return Number of characters converted
+ *
+ * dest will only be '\0'-terminated if there is enough space. this is
+ * for consistency; imagine there are 2 bytes of space left, but the next
+ * character requires 3 bytes. in this case we could NUL-terminate, but in
+ * general we can't when there's insufficient space. therefore this function
+ * only NUL-terminates if all the characters fit, and there's space for
+ * the NUL as well.
+ * the destination string will never be bigger than the source string.
+ */
+int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz)
+{
+    u_int32_t ch;
+    int i = 0;
+    char *dest_end = dest + sz;
+
+    while (srcsz<0 ? src[i]!=0 : i < srcsz) {
+        ch = src[i];
+        if (ch < 0x80) {
+            if (dest >= dest_end)
+                return i;
+            *dest++ = (char)ch;
+        }
+        else if (ch < 0x800) {
+            if (dest >= dest_end-1)
+                return i;
+            *dest++ = (ch>>6) | 0xC0;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        else if (ch < 0x10000) {
+            if (dest >= dest_end-2)
+                return i;
+            *dest++ = (ch>>12) | 0xE0;
+            *dest++ = ((ch>>6) & 0x3F) | 0x80;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        else if (ch < 0x110000) {
+            if (dest >= dest_end-3)
+                return i;
+            *dest++ = (ch>>18) | 0xF0;
+            *dest++ = ((ch>>12) & 0x3F) | 0x80;
+            *dest++ = ((ch>>6) & 0x3F) | 0x80;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        i++;
+    }
+    if (dest < dest_end)
+        *dest = '\0';
+    return i;
+}
+
+
 ////////////// END UTF-8 Section //////////////

From 596f225179dfa3f925d55e0e3e0bced598228acd Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Thu, 26 Oct 2017 13:02:55 +0000
Subject: [PATCH 22/61] Added memset test code

---
 test.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/test.c b/test.c
index 2d86b1a2..5f7996e8 100644
--- a/test.c
+++ b/test.c
@@ -117,9 +117,19 @@ void encode(uint32_t val) {
   return;
 }
 
+void multi_byte_memset(char *buf, int blen, char *src, int slen)
+{
+  int bi, si;
+  for (bi = 0, si = 0; bi < blen; bi++, si++) {
+    buf[bi] = src[si % slen];
+  }
+}
+
 void main(){
   char c[128];
   uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE
+  uint8_t cv2[] = {0xCE, 0xBB}; // Lambda (0x03bb) is encoded with leading 0xCE
+  //uint8_t cv2[] = {0xEC, 0xBA, 0xBB}; // Lambda (0x03bb) is encoded with leading 0xCE
 //  uint8_t cv[] = {0xCE, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE
   char *cptr;
   uint32_t state = CYC_UTF8_ACCEPT, codepoint, val = 0x32363435;
@@ -132,6 +142,9 @@ void main(){
 //  }
 //  c[127] = '\0';
 //  printf("%s\n", c);
+  multi_byte_memset(c, 126, cv2, 2);
+  c[127] = '\0';
+  printf("TEST: %s\n", c);
 
   ptr = cv;
   for (i = 0; i < 3; i++) {

From 0bd0eeb7a6162b3cd202c9b14cc889c99bfe1a7b Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Thu, 26 Oct 2017 17:04:52 +0000
Subject: [PATCH 23/61] WIP

---
 scheme/base.sld | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scheme/base.sld b/scheme/base.sld
index d3a8ee78..2fdd8ece 100644
--- a/scheme/base.sld
+++ b/scheme/base.sld
@@ -950,10 +950,12 @@
     (define-c Cyc-make-string
       "(void *data, int argc, closure _, object k, object count, object fill)"
       " object s = NULL;
+        char ch_buf[5];
         Cyc_check_int(data, count);
-        char c = obj_obj2char(fill);
+        char_type c = obj_obj2char(fill);
+        Cyc_utf8_encode_char(ch_buf, 5, &c);
         int num_cp = obj_obj2int(count);
-        int len = num_cp * uint32_num_bytes(c);
+TODO: read encoded ch_buf        int len = num_cp * uint32_num_bytes(c);
         if (len >= MAX_STACK_OBJ) {
           int heap_grown;
           s = gc_alloc(((gc_thread_data *)data)->heap, 

From 703f863e4885c950796d5a6e9fdbd5f95f9bd65f Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Thu, 26 Oct 2017 21:56:35 +0000
Subject: [PATCH 24/61] Fixes for make-string

---
 include/cyclone/runtime.h |  1 +
 scheme/base.sld           | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 6720002b..3557ee6f 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -722,6 +722,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
 #define Cyc_utf8_encode_char(dest, dest_size, char_value) \
   Cyc_utf8_encode(dest, dest_size, &char_value, 1)
 
+int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz);
 uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
 int Cyc_utf8_count_code_points(uint8_t* s);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
diff --git a/scheme/base.sld b/scheme/base.sld
index 2fdd8ece..e2a54188 100644
--- a/scheme/base.sld
+++ b/scheme/base.sld
@@ -953,9 +953,9 @@
         char ch_buf[5];
         Cyc_check_int(data, count);
         char_type c = obj_obj2char(fill);
-        Cyc_utf8_encode_char(ch_buf, 5, &c);
+        Cyc_utf8_encode_char(ch_buf, 5, c);
         int num_cp = obj_obj2int(count);
-TODO: read encoded ch_buf        int len = num_cp * uint32_num_bytes(c);
+        int len = num_cp * strlen(ch_buf);
         if (len >= MAX_STACK_OBJ) {
           int heap_grown;
           s = gc_alloc(((gc_thread_data *)data)->heap, 
@@ -978,14 +978,15 @@ TODO: read encoded ch_buf        int len = num_cp * uint32_num_bytes(c);
           ((string_type *)s)->num_cp = num_cp;
           ((string_type *)s)->str = alloca(sizeof(char) * (len + 1));
         }
-        //if (num_cp == 1) { /* Fast path */
-          memset(((string_type *)s)->str, c, len);
-        //} else {
-        //  int i;
-        //  uint32_t*
-        //  for (i = 0; i < len; i++) {
-        //  }
-        //}
+        if (0 && num_cp == 1) { /* Fast path */
+          memset(((string_type *)s)->str, ch_buf[0], len);
+        } else {
+          char *buf = ((string_type *)s)->str;
+          int bi, si, slen = strlen(ch_buf);
+          for (bi = 0, si = 0; bi < len; bi++, si++) {
+            buf[bi] = ch_buf[si % slen];
+          }
+        }
         ((string_type *)s)->str[len] = '\\0';
         return_closcall1(data, k, s);
       ")

From 77e391cabcc83c209c31a6739f907000915f3416 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Thu, 26 Oct 2017 22:35:11 +0000
Subject: [PATCH 25/61] Uncomment fast path

---
 scheme/base.sld | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scheme/base.sld b/scheme/base.sld
index e2a54188..0a63fb55 100644
--- a/scheme/base.sld
+++ b/scheme/base.sld
@@ -978,7 +978,7 @@
           ((string_type *)s)->num_cp = num_cp;
           ((string_type *)s)->str = alloca(sizeof(char) * (len + 1));
         }
-        if (0 && num_cp == 1) { /* Fast path */
+        if (num_cp == 1) { /* Fast path */
           memset(((string_type *)s)->str, ch_buf[0], len);
         } else {
           char *buf = ((string_type *)s)->str;

From 4a77296ddf9124ac1eb33a5ea8aa65cc1b06acd9 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 27 Oct 2017 12:44:06 +0000
Subject: [PATCH 26/61] Added UTF-8 support to list->string

---
 runtime.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/runtime.c b/runtime.c
index d0e804e9..323ba363 100644
--- a/runtime.c
+++ b/runtime.c
@@ -1835,21 +1835,34 @@ object Cyc_string2symbol(void *data, object str)
 
 object Cyc_list2string(void *data, object cont, object lst)
 {
-  char *buf;
-  int i = 0;
-  object len;
+  char *buf, cbuf[5];
+  int i = 0, len = 0;
+  object cbox, tmp = lst;
+  char_type ch;
 
   Cyc_check_pair_or_null(data, lst);
-  len = Cyc_length(data, lst);  // Inefficient, walks whole list
+
+  // Need to walk the list of chars to compute multibyte length
+  while (tmp) {
+    if (is_value_type(tmp) || ((list) tmp)->tag != pair_tag) {
+      Cyc_rt_raise2(data, "length - invalid parameter, expected list", tmp);
+    }
+    cbox = car(tmp);
+    ch = obj_obj2char(cbox);
+    if (!obj_is_char(cbox)) {
+      Cyc_rt_raise2(data, "Expected character but received", cbox);
+    }
+    len += Cyc_utf8_encode_char(cbuf, 5, ch);
+    tmp = cdr(tmp);
+  }
 
   {
-    make_string_noalloc(str, NULL, (obj_obj2int(len)));
-    str.str = buf = alloca(sizeof(char) * (obj_obj2int(len) + 1));
+    make_string_noalloc(str, NULL, len);
+    str.str = buf = alloca(sizeof(char) * (len + 1));
     while ((lst != NULL)) {
-      if (!obj_is_char(car(lst))) {
-        Cyc_rt_raise2(data, "Expected character but received", car(lst));
-      }
-      buf[i++] = obj_obj2char(car(lst));
+      cbox = car(lst);
+      ch = obj_obj2char(cbox); // Already validated, can assume chars now
+      i += Cyc_utf8_encode_char(&(buf[i]), 5, ch);
       lst = cdr(lst);
     }
     buf[i] = '\0';

From 6aaa600ebca7bd31c4233d5af580691d89075bdf Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 27 Oct 2017 13:01:04 +0000
Subject: [PATCH 27/61] Bugfixes:

- Avoid unnecessary calls to `strlen`
- Type check the `fill` parameter to `make-string`
---
 scheme/base.sld | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/scheme/base.sld b/scheme/base.sld
index 0a63fb55..f5451340 100644
--- a/scheme/base.sld
+++ b/scheme/base.sld
@@ -951,11 +951,16 @@
       "(void *data, int argc, closure _, object k, object count, object fill)"
       " object s = NULL;
         char ch_buf[5];
+        char_type c;
+        int buflen, num_cp, len;
         Cyc_check_int(data, count);
-        char_type c = obj_obj2char(fill);
-        Cyc_utf8_encode_char(ch_buf, 5, c);
-        int num_cp = obj_obj2int(count);
-        int len = num_cp * strlen(ch_buf);
+        if (!obj_is_char(fill)) {
+          Cyc_rt_raise2(data, \"Expected character buf received\", fill);
+        }
+        c = obj_obj2char(fill);
+        buflen = Cyc_utf8_encode_char(ch_buf, 5, c);
+        num_cp = obj_obj2int(count);
+        len = num_cp * buflen;
         if (len >= MAX_STACK_OBJ) {
           int heap_grown;
           s = gc_alloc(((gc_thread_data *)data)->heap, 
@@ -982,7 +987,7 @@
           memset(((string_type *)s)->str, ch_buf[0], len);
         } else {
           char *buf = ((string_type *)s)->str;
-          int bi, si, slen = strlen(ch_buf);
+          int bi, si, slen = buflen;
           for (bi = 0, si = 0; bi < len; bi++, si++) {
             buf[bi] = ch_buf[si % slen];
           }

From 8289eca02a832663513c7810c7de74632cedd1bc Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 27 Oct 2017 13:02:51 +0000
Subject: [PATCH 28/61] Remove obsolete function

---
 include/cyclone/runtime.h |  1 -
 runtime.c                 | 14 +++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 3557ee6f..49648706 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -727,7 +727,6 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
 int Cyc_utf8_count_code_points(uint8_t* s);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
 uint32_t Cyc_utf8_validate(char *str, size_t len);
-int uint32_num_bytes(uint32_t val);
 /**@}*/
 
 #endif                          /* CYCLONE_RUNTIME_H */
diff --git a/runtime.c b/runtime.c
index 323ba363..6178ca72 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6586,13 +6586,13 @@ uint32_t Cyc_utf8_validate(char *str, size_t len) {
     return state;
 }
 
-int uint32_num_bytes(uint32_t x) {
-  // TODO: could compute log(val) / log(256)
-  if (x < 0x100) return 1;
-  if (x < 0x10000) return 2;
-  if (x < 0x1000000) return 3;
-  return 4;
-}
+//int uint32_num_bytes(uint32_t x) {
+//  // TODO: could compute log(val) / log(256)
+//  if (x < 0x100) return 1;
+//  if (x < 0x10000) return 2;
+//  if (x < 0x1000000) return 3;
+//  return 4;
+//}
 
 /**
  * This function takes one or more 32-bit chars and encodes them 

From a5d768a8a40c0a5c50b91b7a7123d2d843028e85 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 27 Oct 2017 13:17:34 +0000
Subject: [PATCH 29/61] Cyc_io_get_output_string - populate num_cp correctly

---
 mstreams.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mstreams.c b/mstreams.c
index f1b1c50e..939d4047 100644
--- a/mstreams.c
+++ b/mstreams.c
@@ -102,6 +102,7 @@ void Cyc_io_get_output_string(void *data, object cont, object port)
   }
   {
     make_string_with_len(s, p->str_bv_in_mem_buf, p->str_bv_in_mem_buf_len);
+    s.num_cp = Cyc_utf8_count_code_points((uint8_t *)string_str(&s));
     return_closcall1(data, cont, &s);
   }
 }

From 0bcce5038ef561ce9e72964e35650075c3dc1155 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 27 Oct 2017 17:18:29 +0000
Subject: [PATCH 30/61] WIP

---
 include/cyclone/runtime.h |  1 +
 runtime.c                 | 22 ++++++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 49648706..5c31471d 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -725,6 +725,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
 int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz);
 uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
 int Cyc_utf8_count_code_points(uint8_t* s);
+int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
 uint32_t Cyc_utf8_validate(char *str, size_t len);
 /**@}*/
diff --git a/runtime.c b/runtime.c
index 6178ca72..a744d288 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6313,7 +6313,7 @@ object Cyc_io_read_line(void *data, object cont, object port)
 {
   FILE *stream = ((port_type *) port)->fp;
   char buf[1024];
-  int len;
+  int len, num_cp;
 
   Cyc_check_port(data, port);
   if (stream == NULL) {
@@ -6322,7 +6322,8 @@ object Cyc_io_read_line(void *data, object cont, object port)
   set_thread_blocked(data, cont);
   errno = 0;
   if (fgets(buf, 1023, stream) != NULL) {
-    len = strlen(buf);
+    // TODO: not good enough for UTF-8, what if we stopped reading in the middle of a code point?
+    Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &num_cp, &len);
     {
       // Remove any trailing CR / newline chars
       while (len > 0 && (buf[len - 1] == '\n' ||
@@ -6331,6 +6332,7 @@ object Cyc_io_read_line(void *data, object cont, object port)
       }
       buf[len] = '\0';
       make_string_noalloc(s, buf, len);
+      s.num_cp = num_cp;
       return_thread_runnable(data, &s);
     }
   } else {
@@ -6539,6 +6541,22 @@ int Cyc_utf8_count_code_points(uint8_t* s) {
   return count;
 }
 
+int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes) {
+  uint32_t codepoint;
+  uint32_t state = 0;
+  *cpts = 0;
+  *bytes = 0;
+  for (; *s; ++s){
+    *bytes += 1;
+    if (!Cyc_utf8_decode(&state, &codepoint, *s))
+      *cpts += 1;
+  }
+
+  if (state != CYC_UTF8_ACCEPT)
+    return -1;
+  return 0;
+}
+
 // TODO: index into X codepoint in a string 
 
 /**

From 3783da2674e4725d3e0adfb7987538250b2a8af1 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 30 Oct 2017 13:17:37 +0000
Subject: [PATCH 31/61] WIP - obj_obj2char fixes

---
 include/cyclone/types.h |  4 ++--
 runtime.c               | 23 +++++++++++++++++------
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/include/cyclone/types.h b/include/cyclone/types.h
index d91a1aa0..903b4ab7 100644
--- a/include/cyclone/types.h
+++ b/include/cyclone/types.h
@@ -494,12 +494,12 @@ typedef uint32_t char_type;
 /**
  * Convert from an object to a char.
  */
-#define obj_obj2char(x) (char)((long)(x)>>2)
+#define obj_obj2char(x) (char_type)((uintmax_t)(x)>>2)
 
 /**
  * Convert from a char to an object.
  */
-#define obj_char2obj(c) ((void *)((((unsigned long)c)<<2) | 2))
+#define obj_char2obj(c) ((void *)((((uintmax_t)c)<<2) | 2))
 
 /**
  * Is the given object a value type?
diff --git a/runtime.c b/runtime.c
index a744d288..dc5d9aa8 100644
--- a/runtime.c
+++ b/runtime.c
@@ -804,7 +804,10 @@ object Cyc_display(void *data, object x, FILE * port)
     return quote_void;
   }
   if (obj_is_char(x)) {
-    fprintf(port, "%c", obj_obj2char(x));
+    char cbuf[5];
+    char_type unbox = obj_obj2char(x);
+    Cyc_utf8_encode_char(cbuf, 5, unbox);
+    fprintf(port, "%s", cbuf);
     return quote_void;
   }
   if (obj_is_int(x)) {
@@ -984,7 +987,7 @@ static object _Cyc_write(void *data, object x, FILE * port)
     return quote_void;
   }
   if (obj_is_char(x)) {
-    char c = obj_obj2char(x);
+    char_type c = obj_obj2char(x);
     switch (c) {
     case 0:   fprintf(port, "#\\null"); break;
     case 7:   fprintf(port, "#\\alarm"); break;
@@ -995,11 +998,13 @@ static object _Cyc_write(void *data, object x, FILE * port)
     case 27:  fprintf(port, "#\\escape"); break;
     case 32:  fprintf(port, "#\\space"); break;
     case 127: fprintf(port, "#\\delete"); break;
-    default:
-      fprintf(port, "#\\%c", obj_obj2char(x));
+    default: {
+      char cbuf[5];
+      Cyc_utf8_encode_char(cbuf, 5, c);
+      fprintf(port, "#\\%s", cbuf);
       break;
+      }
     }
-    //fprintf(port, "#\\%c", obj_obj2char(x));
     return quote_void;
   }
   if (obj_is_int(x)) {
@@ -1097,7 +1102,10 @@ object Cyc_write_char(void *data, object c, object port)
   if (obj_is_char(c)) {
     FILE *fp = ((port_type *) port)->fp;
     if (fp){
-      fprintf(fp, "%c", obj_obj2char(c));
+      char cbuf[5];
+      char_type unbox = obj_obj2char(c);
+      Cyc_utf8_encode_char(cbuf, 5, unbox);
+      fprintf(fp, "%s", cbuf);
     }
   } else {
     Cyc_rt_raise2(data, "Argument is not a character", c);
@@ -2119,6 +2127,7 @@ object Cyc_string_set(void *data, object str, object k, object chr)
 
   // Take fast path if all chars are just 1 byte
   if (string_num_cp(str) == string_len(str)) {
+    // TODO: not good enough, chr could be multi-byte
     raw[idx] = obj_obj2char(chr);
   } else {
 fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), len);
@@ -6323,6 +6332,8 @@ object Cyc_io_read_line(void *data, object cont, object port)
   errno = 0;
   if (fgets(buf, 1023, stream) != NULL) {
     // TODO: not good enough for UTF-8, what if we stopped reading in the middle of a code point?
+    // should reserve 3 extra bytes and, if last code point is not complete, read one byte at a
+    // time until it has been read
     Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &num_cp, &len);
     {
       // Remove any trailing CR / newline chars

From 7f8cc02c5047f2c457e11b78ec5f5566ddd3ab34 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 30 Oct 2017 13:26:57 +0000
Subject: [PATCH 32/61] WIP - obj_char2obj

---
 runtime.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runtime.c b/runtime.c
index dc5d9aa8..5be2037e 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2755,7 +2755,7 @@ object Cyc_char2integer(object chr)
 
 object Cyc_integer2char(void *data, object n)
 {
-  int val = 0;
+  char_type val = 0;
 
   Cyc_check_num(data, n);
   val = unbox_number(n);
@@ -6095,7 +6095,7 @@ void _read_return_character(void *data, port_type *p)
     return_thread_runnable(data, obj_char2obj('\t'));
   } else if(strlen(p->tok_buf) > 1 && p->tok_buf[0] == 'x') {
     const char *buf = p->tok_buf + 1;
-    int result = strtol(buf, NULL, 16);
+    char_type result = strtol(buf, NULL, 16);
     return_thread_runnable(data, obj_char2obj(result));
   } else {
     char buf[31];

From 118822f353444a43a9a7615b956e6e5268e7a931 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 30 Oct 2017 16:57:39 +0000
Subject: [PATCH 33/61] WIP - encode \x hex escapes as UTF-8 in _read_string

---
 runtime.c | 14 ++++++++++++--
 test.c    |  2 ++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/runtime.c b/runtime.c
index 5be2037e..e57009a0 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6001,8 +6001,15 @@ void _read_string(void *data, object cont, port_type *p)
         }
         buf[i] = '\0';
         {
-          int result = (int)strtol(buf, NULL, 16);
-          p->tok_buf[p->tok_end++] = (char)result;
+          char_type result = strtol(buf, NULL, 16);
+          char cbuf[5];
+          int i;
+          Cyc_utf8_encode_char(cbuf, 5, result);
+// TODO: infinite loop here or above if ; is not provided???
+          for (i = 0; cbuf[i] != 0; i++) {
+            _read_add_to_tok_buf(p, cbuf[i]);
+          }
+          //p->tok_buf[p->tok_end++] = (char)result;
         }
         break;
       }
@@ -6014,7 +6021,10 @@ void _read_string(void *data, object cont, port_type *p)
       p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full?
       p->tok_end = 0; // Reset for next atom
       {
+// TODO: need to change this below, but run into trouble in icyc, eg:
+//       (string-ref "ab\x3bb;" 2) crashes
         make_string(str, p->tok_buf);
+        //make_utf8_string(data, str, p->tok_buf);
         return_thread_runnable(data, &str);
       }
     } else if (c == '\\') {
diff --git a/test.c b/test.c
index 5f7996e8..ac48fef5 100644
--- a/test.c
+++ b/test.c
@@ -155,5 +155,7 @@ void main(){
   encode(0x3bb);
   encode(65);
   encode(0xcebb);
+
+  printf("%06X\n", 0x0fff);
   return;
 }

From a38295b22b05b8eddcf67806c479cec9a5c8e938 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 30 Oct 2017 17:52:16 +0000
Subject: [PATCH 34/61] WIP - cast bytes to uint8_t for Cyc_utf8_decode, use make_utf8_string in _read_string

---
 include/cyclone/types.h |  2 +-
 runtime.c               | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/cyclone/types.h b/include/cyclone/types.h
index 903b4ab7..65dc2d99 100644
--- a/include/cyclone/types.h
+++ b/include/cyclone/types.h
@@ -776,7 +776,7 @@ typedef struct {
   cs.hdr.mark = gc_color_red; \
   cs.hdr.grayed = 0; \
   cs.tag = string_tag; \
-  cs.num_cp = Cyc_utf8_count_code_points(s); \
+  cs.num_cp = Cyc_utf8_count_code_points((uint8_t *)s); \
   if (cs.num_cp < 0) { \
     Cyc_rt_raise_msg(data, "Invalid UTF-8 characters in string"); \
   } \
diff --git a/runtime.c b/runtime.c
index e57009a0..a0910378 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2143,7 +2143,7 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le
     int i = 0, count, start_len = 0, start_cp = 0;
 
     for (count = 0; *tmp; ++tmp){
-      if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
+      if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
         if (count < idx) {
           start_len = i;
           start_cp = count;
@@ -2193,7 +2193,7 @@ object Cyc_string_ref(void *data, object str, object k)
     int count;
 
     for (count = 0; *raw; ++raw){
-      if (!Cyc_utf8_decode(&state, &codepoint, *raw)){
+      if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*raw)){
         if (count == idx) break; // Reached requested index
         count += 1;
       }
@@ -2241,7 +2241,7 @@ object Cyc_substring(void *data, object cont, object str, object start,
     int count, start_i = 0, end_i = 0;
 
     for (count = 0; *tmp; ++tmp){
-      if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
+      if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
         if (count == s) {
           start_i = end_i;
         } else if (count == e) {
@@ -2637,7 +2637,7 @@ object Cyc_string2utf8(void *data, object cont, object str, object start,
     char_type codepoint;
     uint32_t state = 0;
     for (i = 0; *tmp; ++tmp) {
-      if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
+      if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
         if (i == s) {
           start_i = i;
         } else if (i == e) {
@@ -6023,8 +6023,8 @@ void _read_string(void *data, object cont, port_type *p)
       {
 // TODO: need to change this below, but run into trouble in icyc, eg:
 //       (string-ref "ab\x3bb;" 2) crashes
-        make_string(str, p->tok_buf);
-        //make_utf8_string(data, str, p->tok_buf);
+        //make_string(str, p->tok_buf);
+        make_utf8_string(data, str, p->tok_buf);
         return_thread_runnable(data, &str);
       }
     } else if (c == '\\') {

From 8585a9f3ccf92b7135426cf4b5c6a8dd7a8d27ad Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 30 Oct 2017 18:58:47 -0400
Subject: [PATCH 35/61] Test scaffold for Cyc_substring

---
 test.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/test.c b/test.c
index ac48fef5..1f2ef29c 100644
--- a/test.c
+++ b/test.c
@@ -125,6 +125,29 @@ void multi_byte_memset(char *buf, int blen, char *src, int slen)
   }
 }
 
+void substring(int s, int e) {
+  uint8_t raw[] = {65, 66, 0xCE, 0xBB};
+
+    const char *tmp = raw;
+    uint32_t codepoint;
+    uint32_t state = 0;
+    int count, start_i = 0, end_i = 0;
+
+    for (count = 0; *tmp; ++tmp){
+      if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
+        if (count == s) {
+          start_i = end_i;
+        } else if (count == e) {
+          break;
+        }
+        count += 1;
+      }
+      end_i++;
+    }
+    raw[end_i] = '\0';
+    printf("raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", raw, s, e, start_i, end_i);
+}
+
 void main(){
   char c[128];
   uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE
@@ -157,5 +180,9 @@ void main(){
   encode(0xcebb);
 
   printf("%06X\n", 0x0fff);
+  substring(0, 1);
+  substring(0, 2);
+  substring(1, 3);
+  substring(1, 4);
   return;
 }

From 950d92615b2ff4e1bc735c8b3466e35116d9c6f9 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 31 Oct 2017 16:46:14 +0000
Subject: [PATCH 36/61] WIP - rework substring test scaffold to track bytes per code point

---
 test.c | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/test.c b/test.c
index 1f2ef29c..c4d3e693 100644
--- a/test.c
+++ b/test.c
@@ -125,27 +125,38 @@ void multi_byte_memset(char *buf, int blen, char *src, int slen)
   }
 }
 
-void substring(int s, int e) {
-  uint8_t raw[] = {65, 66, 0xCE, 0xBB};
-
+void substring(int s, int e, const char *expected) {
+  uint8_t raw[] = {65, 66, 0xCE, 0xBB, 67};
     const char *tmp = raw;
     uint32_t codepoint;
     uint32_t state = 0;
-    int count, start_i = 0, end_i = 0;
-
-    for (count = 0; *tmp; ++tmp){
+    int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0;
+    for (num_ch = 0; *tmp; ++tmp){
+      //printf("char = %d\n", (int)*tmp);
       if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
-        if (count == s) {
+        end_i += cur_ch_bytes;
+        num_ch += 1;
+        cur_ch_bytes = 0;
+
+        if (num_ch == s) {
           start_i = end_i;
-        } else if (count == e) {
+        }
+        if (num_ch == e) {
           break;
         }
-        count += 1;
+
+        //if (num_ch == s) {
+        //  start_i = end_i;
+        //} else if (num_ch == (e - 1)) {
+        //  end_i += cur_ch_bytes;
+        //  if (s == e) start_i = end_i;
+        //  break;
+        //}
       }
-      end_i++;
+      cur_ch_bytes++;
     }
-    raw[end_i] = '\0';
-    printf("raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", raw, s, e, start_i, end_i);
+    raw[end_i + 1] = '\0';
+    printf("expected=%s, raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", expected, raw + start_i, s, e, start_i, end_i);
 }
 
 void main(){
@@ -180,9 +191,12 @@ void main(){
   encode(0xcebb);
 
   printf("%06X\n", 0x0fff);
-  substring(0, 1);
-  substring(0, 2);
-  substring(1, 3);
-  substring(1, 4);
+  substring(0, 1, "A   ");
+  substring(0, 2, "AB  ");
+  substring(1, 3, "Bx  ");
+  substring(1, 4, "BxC ");
+  substring(2, 2, "    ");
+  substring(2, 3, "x   ");
+  substring(2, 4, "xC  ");
   return;
 }

From 509fd430224113efb0d804bd292fc43d4fbe1ee9 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 31 Oct 2017 17:58:17 -0400
Subject: [PATCH 37/61] Fixed substring

---
 runtime.c | 29 ++++++++++++++++++++++-------
 test.c    | 13 ++-----------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/runtime.c b/runtime.c
index a0910378..c830e294 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2238,19 +2238,34 @@ object Cyc_substring(void *data, object cont, object str, object start,
     const char *tmp = raw;
     char_type codepoint;
     uint32_t state = 0;
-    int count, start_i = 0, end_i = 0;
-
-    for (count = 0; *tmp; ++tmp){
+    int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0;
+    for (num_ch = 0; *tmp; ++tmp){
+      cur_ch_bytes++;
       if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
-        if (count == s) {
+        end_i += cur_ch_bytes;
+        num_ch += 1;
+        cur_ch_bytes = 0;
+
+        if (num_ch == s) {
           start_i = end_i;
-        } else if (count == e) {
+        }
+        if (num_ch == e) {
           break;
         }
-        count += 1;
       }
-      end_i++;
     }
+    //int count, start_i = 0, end_i = 0;
+    //for (count = 0; *tmp; ++tmp){
+    //  if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
+    //    if (count == s) {
+    //      start_i = end_i;
+    //    } else if (count == e) {
+    //      break;
+    //    }
+    //    count += 1;
+    //  }
+    //  end_i++;
+    //}
     if (state != CYC_UTF8_ACCEPT)
        Cyc_rt_raise2(data, "substring - invalid character in string", str);
     make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s);
diff --git a/test.c b/test.c
index c4d3e693..434c33cd 100644
--- a/test.c
+++ b/test.c
@@ -132,7 +132,7 @@ void substring(int s, int e, const char *expected) {
     uint32_t state = 0;
     int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0;
     for (num_ch = 0; *tmp; ++tmp){
-      //printf("char = %d\n", (int)*tmp);
+      cur_ch_bytes++;
       if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
         end_i += cur_ch_bytes;
         num_ch += 1;
@@ -144,18 +144,9 @@ void substring(int s, int e, const char *expected) {
         if (num_ch == e) {
           break;
         }
-
-        //if (num_ch == s) {
-        //  start_i = end_i;
-        //} else if (num_ch == (e - 1)) {
-        //  end_i += cur_ch_bytes;
-        //  if (s == e) start_i = end_i;
-        //  break;
-        //}
       }
-      cur_ch_bytes++;
     }
-    raw[end_i + 1] = '\0';
+    raw[end_i] = '\0';
     printf("expected=%s, raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", expected, raw + start_i, s, e, start_i, end_i);
 }
 

From b1ea22c940f6d4325a9456e533d8d4734436e0d2 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 31 Oct 2017 18:41:52 -0400
Subject: [PATCH 38/61] Fixed (string->utf8)

---
 runtime.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/runtime.c b/runtime.c
index c830e294..f5da4c46 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2254,18 +2254,6 @@ object Cyc_substring(void *data, object cont, object str, object start,
         }
       }
     }
-    //int count, start_i = 0, end_i = 0;
-    //for (count = 0; *tmp; ++tmp){
-    //  if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
-    //    if (count == s) {
-    //      start_i = end_i;
-    //    } else if (count == e) {
-    //      break;
-    //    }
-    //    count += 1;
-    //  }
-    //  end_i++;
-    //}
     if (state != CYC_UTF8_ACCEPT)
        Cyc_rt_raise2(data, "substring - invalid character in string", str);
     make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s);
@@ -2647,21 +2635,25 @@ object Cyc_string2utf8(void *data, object cont, object str, object start,
     memcpy(&result.data[0], &(string_str(str))[s], len);
     _return_closcall1(data, cont, &result);
   } else {
-    int i, start_i = 0, end_i = 0;
     const char *tmp = string_str(str);
     char_type codepoint;
     uint32_t state = 0;
-    for (i = 0; *tmp; ++tmp) {
+    int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0;
+    for (num_ch = 0; *tmp; ++tmp){
+      cur_ch_bytes++;
       if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
-        if (i == s) {
-          start_i = i;
-        } else if (i == e) {
+        end_i += cur_ch_bytes;
+        num_ch += 1;
+        cur_ch_bytes = 0;
+
+        if (num_ch == s) {
+          start_i = end_i;
+        }
+        if (num_ch == e) {
           break;
         }
       }
-      i++;
     }
-    end_i = i;
     result.len = end_i - start_i;
     result.data = alloca(sizeof(char) * result.len);
     memcpy(&result.data[0], &(string_str(str))[start_i], result.len);

From 734a6e1911ce7d94319472ab9dc070756ad5a0e5 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 31 Oct 2017 20:54:21 +0000
Subject: [PATCH 39/61] Allow read-char to handle unicode characters

---
 runtime.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/runtime.c b/runtime.c
index f5da4c46..bb1fc51b 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6319,17 +6319,21 @@ object Cyc_io_peek_char(void *data, object cont, object port)
 object Cyc_io_read_char(void *data, object cont, object port)
 {
   port_type *p = (port_type *)port;
-  int c;
   Cyc_check_port(data, port);
   if (p->fp == NULL) {
     Cyc_rt_raise2(data, "Unable to read from closed port: ", port);
   }
   {
+    uint32_t state = CYC_UTF8_ACCEPT;
+    char_type codepoint;
+    int c;
     set_thread_blocked(data, cont);
-    _read_next_char(data, cont, p);
-    c = p->mem_buf[p->buf_idx++];
+    do {
+      _read_next_char(data, cont, p);
+      c = p->mem_buf[p->buf_idx++];
+    } while(Cyc_utf8_decode(&state, &codepoint, (uint8_t)c));
     p->col_num++;
-    return_thread_runnable(data, (c != EOF) ? obj_char2obj(c) : Cyc_EOF);
+    return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF);
   }
   return Cyc_EOF;
 }

From 3aa2a159b7eab99713a9dedb59b3882049d6a4d0 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Thu, 2 Nov 2017 17:41:26 -0400
Subject: [PATCH 40/61] Bugfix: Cyc_utf8_encode returns char count, not bytes

---
 runtime.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/runtime.c b/runtime.c
index bb1fc51b..690488ea 100644
--- a/runtime.c
+++ b/runtime.c
@@ -1860,7 +1860,8 @@ object Cyc_list2string(void *data, object cont, object lst)
     if (!obj_is_char(cbox)) {
       Cyc_rt_raise2(data, "Expected character but received", cbox);
     }
-    len += Cyc_utf8_encode_char(cbuf, 5, ch);
+    Cyc_utf8_encode_char(cbuf, 5, ch);
+    len += strlen(cbuf);
     tmp = cdr(tmp);
   }
 
@@ -1870,7 +1871,8 @@ object Cyc_list2string(void *data, object cont, object lst)
     while ((lst != NULL)) {
       cbox = car(lst);
       ch = obj_obj2char(cbox); // Already validated, can assume chars now
-      i += Cyc_utf8_encode_char(&(buf[i]), 5, ch);
+      Cyc_utf8_encode_char(&(buf[i]), 5, ch);
+      i += strlen(buf+i);
       lst = cdr(lst);
     }
     buf[i] = '\0';
@@ -6013,6 +6015,7 @@ void _read_string(void *data, object cont, port_type *p)
           int i;
           Cyc_utf8_encode_char(cbuf, 5, result);
 // TODO: infinite loop here or above if ; is not provided???
+// only because it is still waiting for the ; after it reads the closing quote
           for (i = 0; cbuf[i] != 0; i++) {
             _read_add_to_tok_buf(p, cbuf[i]);
           }

From bbe8fbb97070cd81eadb8b4d10756ae8cefbbbe3 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Thu, 2 Nov 2017 18:00:10 -0400
Subject: [PATCH 41/61] Allow read_return_character to parse UTF8 chars

---
 runtime.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/runtime.c b/runtime.c
index 690488ea..801857b7 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6118,9 +6118,23 @@ void _read_return_character(void *data, port_type *p)
     char_type result = strtol(buf, NULL, 16);
     return_thread_runnable(data, obj_char2obj(result));
   } else {
-    char buf[31];
-    snprintf(buf, 30, "Unable to parse character %s", p->tok_buf);
-    _read_error(data, p, buf);
+    uint32_t state = CYC_UTF8_ACCEPT;
+    char_type codepoint;
+    uint8_t *s = (uint8_t *)p->tok_buf;
+    while(s) {
+      if (!Cyc_utf8_decode(&state, &codepoint, *s)) {
+        s++;
+        break;
+      }
+      s++;
+    }
+    if (state == CYC_UTF8_ACCEPT && *s == '\0') {
+      return_thread_runnable(data, obj_char2obj(codepoint));
+    } else {
+      char buf[31];
+      snprintf(buf, 30, "Unable to parse character %s", p->tok_buf);
+      _read_error(data, p, buf);
+    }
   }
 }
 

From 67398186d0aa26eecd5ab873c9458d0d32939417 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 3 Nov 2017 14:41:58 +0000
Subject: [PATCH 42/61] Added comments

---
 runtime.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/runtime.c b/runtime.c
index 801857b7..bfd0f64b 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6094,6 +6094,7 @@ void _read_return_character(void *data, port_type *p)
   p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full?
   p->tok_end = 0; // Reset for next atom
   if (strlen(p->tok_buf) == 1) {
+    // ASCII char, consider merging with below?
     return_thread_runnable(data, obj_char2obj(p->tok_buf[0]));
   } else if(strncmp(p->tok_buf, "alarm", 5) == 0) {
     return_thread_runnable(data, obj_char2obj('\a'));
@@ -6118,6 +6119,7 @@ void _read_return_character(void *data, port_type *p)
     char_type result = strtol(buf, NULL, 16);
     return_thread_runnable(data, obj_char2obj(result));
   } else {
+    // Try to read a UTF-8 char and if so return it, otherwise throw an error
     uint32_t state = CYC_UTF8_ACCEPT;
     char_type codepoint;
     uint8_t *s = (uint8_t *)p->tok_buf;

From 6910e3e4cb31cfe56e21069f1827d615eb28b671 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Fri, 3 Nov 2017 14:51:34 +0000
Subject: [PATCH 43/61] Added TODO, moved _read_is_numeric/_read_is_hex_digit above _read_string

---
 runtime.c | 48 ++++++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/runtime.c b/runtime.c
index bfd0f64b..a0e140e4 100644
--- a/runtime.c
+++ b/runtime.c
@@ -5944,6 +5944,28 @@ static void _read_add_to_tok_buf(port_type *p, char c)
   p->tok_buf[p->tok_end++] = c;
 }
 
+/**
+ * @brief Determine if given string is numeric
+ */
+int _read_is_numeric(const char *tok)
+{
+  int len = strlen(tok);
+  return (len &&
+          ((isdigit(tok[0])) ||
+           ((len > 1) && tok[0] == '.' && isdigit(tok[1])) ||
+           ((len > 1) && (tok[1] == '.' || isdigit(tok[1])) && (tok[0] == '-' || tok[0] == '+'))));
+}
+
+/**
+ * @brief Helper function, determine if given number is a hex digit
+ * @param c Character to check
+ */
+int _read_is_hex_digit(char c)
+{
+  return (c >= 'a' && c <= 'f') ||
+         (c >= 'A' && c <= 'F');
+}
+
 /**
  * @brief Helper function to read a string
  * @param data Thread data object
@@ -6003,6 +6025,10 @@ void _read_string(void *data, object cont, port_type *p)
             p->buf_idx++;
             break;
           }
+          // TODO: verify if hex digit is valid
+          //if (!isdigit(p->buf_idx) && !_read_is_hex_digit(p->buf_idx)) {
+          //  _read_error(data, p, "invalid hex digit in string");
+          //}
           buf[i] = p->mem_buf[p->buf_idx];
           p->buf_idx++;
           p->col_num++;
@@ -6168,28 +6194,6 @@ void _read_character(void *data, port_type *p)
   }
 }
 
-/**
- * @brief Determine if given string is numeric
- */
-int _read_is_numeric(const char *tok)
-{
-  int len = strlen(tok);
-  return (len &&
-          ((isdigit(tok[0])) ||
-           ((len > 1) && tok[0] == '.' && isdigit(tok[1])) ||
-           ((len > 1) && (tok[1] == '.' || isdigit(tok[1])) && (tok[0] == '-' || tok[0] == '+'))));
-}
-
-/**
- * @brief Helper function, determine if given number is a hex digit
- * @param c Character to check
- */
-int _read_is_hex_digit(char c)
-{
-  return (c >= 'a' && c <= 'f') ||
-         (c >= 'A' && c <= 'F');
-}
-
 /**
  * @brief Helper function, return read number.
  * @param data Thread data object

From d431b2af1c4835b8c13703677dac2e0a46840865 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 6 Nov 2017 13:19:31 +0000
Subject: [PATCH 44/61] Updated Cyc_io_read_line to prevent truncation

Ensure last codepoint is fully-read before returning
---
 include/cyclone/runtime.h |  2 +-
 runtime.c                 | 32 ++++++++++++++++++++++----------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 5c31471d..4529fa85 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -725,7 +725,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
 int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz);
 uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
 int Cyc_utf8_count_code_points(uint8_t* s);
-int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes);
+int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
 uint32_t Cyc_utf8_validate(char *str, size_t len);
 /**@}*/
diff --git a/runtime.c b/runtime.c
index a0e140e4..a7de71fd 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6365,8 +6365,10 @@ object Cyc_io_read_char(void *data, object cont, object port)
 object Cyc_io_read_line(void *data, object cont, object port)
 {
   FILE *stream = ((port_type *) port)->fp;
-  char buf[1024];
-  int len, num_cp;
+  char buf[1027];
+  int len, num_cp, i = 0;
+  char_type codepoint;
+  uint32_t state;
 
   Cyc_check_port(data, port);
   if (stream == NULL) {
@@ -6375,10 +6377,21 @@ object Cyc_io_read_line(void *data, object cont, object port)
   set_thread_blocked(data, cont);
   errno = 0;
   if (fgets(buf, 1023, stream) != NULL) {
-    // TODO: not good enough for UTF-8, what if we stopped reading in the middle of a code point?
-    // should reserve 3 extra bytes and, if last code point is not complete, read one byte at a
-    // time until it has been read
-    Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &num_cp, &len);
+    state = Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &codepoint, &num_cp, &len);
+    // Check if we stopped reading in the middle of a code point and
+    // if so, read one byte at a time until that code point is finished.
+    while (state != CYC_UTF8_ACCEPT && i < 3) {
+      int c = fgetc(stream);
+      buf[len] = c;
+      len++;
+      Cyc_utf8_decode(&state, &codepoint, (uint8_t)c);
+      if (state == CYC_UTF8_ACCEPT) {
+        num_cp++;
+        break;
+      }
+      i++;
+    }
+
     {
       // Remove any trailing CR / newline chars
       while (len > 0 && (buf[len - 1] == '\n' ||
@@ -6596,19 +6609,18 @@ int Cyc_utf8_count_code_points(uint8_t* s) {
   return count;
 }
 
-int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes) {
-  uint32_t codepoint;
+int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes) {
   uint32_t state = 0;
   *cpts = 0;
   *bytes = 0;
   for (; *s; ++s){
     *bytes += 1;
-    if (!Cyc_utf8_decode(&state, &codepoint, *s))
+    if (!Cyc_utf8_decode(&state, codepoint, *s))
       *cpts += 1;
   }
 
   if (state != CYC_UTF8_ACCEPT)
-    return -1;
+    return state;
   return 0;
 }
 

From 9962bca854e749ee6cf4309700333f4c752bf020 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 6 Nov 2017 14:12:21 +0000
Subject: [PATCH 45/61] Validate hex digits in string with the \x; syntax

---
 runtime.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/runtime.c b/runtime.c
index a7de71fd..dccdc380 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6025,10 +6025,12 @@ void _read_string(void *data, object cont, port_type *p)
             p->buf_idx++;
             break;
           }
-          // TODO: verify if hex digit is valid
-          //if (!isdigit(p->buf_idx) && !_read_is_hex_digit(p->buf_idx)) {
-          //  _read_error(data, p, "invalid hex digit in string");
-          //}
+          // Verify if hex digit is valid
+          if (!isdigit(p->mem_buf[p->buf_idx]) && 
+              !_read_is_hex_digit(p->mem_buf[p->buf_idx])) {
+            p->buf_idx++;
+            _read_error(data, p, "invalid hex digit in string");
+          }
           buf[i] = p->mem_buf[p->buf_idx];
           p->buf_idx++;
           p->col_num++;
@@ -6040,8 +6042,6 @@ void _read_string(void *data, object cont, port_type *p)
           char cbuf[5];
           int i;
           Cyc_utf8_encode_char(cbuf, 5, result);
-// TODO: infinite loop here or above if ; is not provided???
-// only because it is still waiting for the ; after it reads the closing quote
           for (i = 0; cbuf[i] != 0; i++) {
             _read_add_to_tok_buf(p, cbuf[i]);
           }

From 471f0d4b5042a0782faac44bc5d4ab0e88db3c77 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 6 Nov 2017 16:00:11 +0000
Subject: [PATCH 46/61] UTF8 support in (scheme process-context)

---
 scheme/process-context.sld | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scheme/process-context.sld b/scheme/process-context.sld
index f8ce64f2..e5cfa5c3 100644
--- a/scheme/process-context.sld
+++ b/scheme/process-context.sld
@@ -24,7 +24,7 @@
         for (i = _cyc_argc; i > 0; i--) {
           object ps = alloca(sizeof(string_type));
           object pl = alloca(sizeof(pair_type));
-          make_string(s, _cyc_argv[i - 1]);
+          make_utf8_string(data, s, _cyc_argv[i - 1]);
           memcpy(ps, &s, sizeof(string_type));
           ((list)pl)->hdr.mark = gc_color_red;
           ((list)pl)->hdr.grayed = 0;
@@ -44,7 +44,7 @@
         if (v == NULL) {
           return_closcall1(data, k, boolean_f);
         } else {
-          make_string(str, v);
+          make_utf8_string(data, str, v);
           return_closcall1(data, k, &str);
         }
       ")

From ec5ef86b6ae7ac2ce3fd16bf2809d49d0848fe50 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 6 Nov 2017 16:00:55 +0000
Subject: [PATCH 47/61] Do not use make_string for UTF8 strings

---
 runtime.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/runtime.c b/runtime.c
index dccdc380..e96f8a37 100644
--- a/runtime.c
+++ b/runtime.c
@@ -560,7 +560,7 @@ void Cyc_rt_raise(void *data, object err)
 
 void Cyc_rt_raise2(void *data, const char *msg, object err)
 {
-  make_string(s, msg);
+  make_utf8_string(data, s, msg);
   make_pair(c3, err, NULL);
   make_pair(c2, &s, &c3);
   make_pair(c1, boolean_f, &c2);
@@ -573,7 +573,7 @@ void Cyc_rt_raise2(void *data, const char *msg, object err)
 
 void Cyc_rt_raise_msg(void *data, const char *err)
 {
-  make_string(s, err);
+  make_utf8_string(data, s, err);
   Cyc_rt_raise(data, &s);
 }
 
@@ -1826,7 +1826,7 @@ object Cyc_symbol2string(void *data, object cont, object sym)
   Cyc_check_sym(data, sym);
   {
     const char *desc = symbol_desc(sym);
-    make_string(str, desc);
+    make_utf8_string(data, str, desc);
     _return_closcall1(data, cont, &str);
 }}
 
@@ -2273,22 +2273,22 @@ object Cyc_installation_dir(void *data, object cont, object type)
       strncmp(((symbol) type)->desc, "sld", 5) == 0) {
     char buf[1024];
     snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_SLD);
-    make_string(str, buf);
+    make_utf8_string(data, str, buf);
     _return_closcall1(data, cont, &str);
   } else if (Cyc_is_symbol(type) == boolean_t &&
              strncmp(((symbol) type)->desc, "lib", 5) == 0) {
     char buf[1024];
     snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_LIB);
-    make_string(str, buf);
+    make_utf8_string(data, str, buf);
     _return_closcall1(data, cont, &str);
   } else if (Cyc_is_symbol(type) == boolean_t &&
              strncmp(((symbol) type)->desc, "inc", 5) == 0) {
     char buf[1024];
     snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_INC);
-    make_string(str, buf);
+    make_utf8_string(data, str, buf);
     _return_closcall1(data, cont, &str);
   } else {
-    make_string(str, CYC_INSTALL_DIR);
+    make_utf8_string(data, str, CYC_INSTALL_DIR);
     _return_closcall1(data, cont, &str);
   }
 }
@@ -2302,22 +2302,22 @@ object Cyc_compilation_environment(void *data, object cont, object var)
     if (strncmp(((symbol) var)->desc, "cc-prog", 8) == 0) {
       char buf[1024];
       snprintf(buf, sizeof(buf), "%s", CYC_CC_PROG);
-      make_string(str, buf);
+      make_utf8_string(data, str, buf);
       _return_closcall1(data, cont, &str);
     } else if (strncmp(((symbol) var)->desc, "cc-exec", 8) == 0) {
       char buf[1024];
       snprintf(buf, sizeof(buf), "%s", CYC_CC_EXEC);
-      make_string(str, buf);
+      make_utf8_string(data, str, buf);
       _return_closcall1(data, cont, &str);
     } else if (strncmp(((symbol) var)->desc, "cc-lib", 7) == 0) {
       char buf[1024];
       snprintf(buf, sizeof(buf), "%s", CYC_CC_LIB);
-      make_string(str, buf);
+      make_utf8_string(data, str, buf);
       _return_closcall1(data, cont, &str);
     } else if (strncmp(((symbol) var)->desc, "cc-so", 6) == 0) {
       char buf[1024];
       snprintf(buf, sizeof(buf), "%s", CYC_CC_SO);
-      make_string(str, buf);
+      make_utf8_string(data, str, buf);
       _return_closcall1(data, cont, &str);
     }
   }
@@ -2343,7 +2343,7 @@ object Cyc_command_line_arguments(void *data, object cont)
   for (i = _cyc_argc; i > 1; i--) {     // skip program name
     object ps = alloca(sizeof(string_type));
     object pl = alloca(sizeof(pair_type));
-    make_string(s, _cyc_argv[i - 1]);
+    make_utf8_string(data, s, _cyc_argv[i - 1]);
     memcpy(ps, &s, sizeof(string_type));
     ((list) pl)->hdr.mark = gc_color_red;
     ((list) pl)->hdr.grayed = 0;
@@ -5775,7 +5775,7 @@ void Cyc_import_shared_object(void *data, object cont, object filename, object e
   handle = dlopen(string_str(filename), RTLD_GLOBAL | RTLD_LAZY);
   if (handle == NULL) {
     snprintf(buffer, 256, "%s", dlerror());
-    make_string(s, buffer);
+    make_utf8_string(data, s, buffer);
     Cyc_rt_raise2(data, "Unable to import library", &s);
   }
   dlerror();    /* Clear any existing error */
@@ -5783,7 +5783,7 @@ void Cyc_import_shared_object(void *data, object cont, object filename, object e
   entry_pt = (function_type) dlsym(handle, string_str(entry_pt_fnc));
   if (entry_pt == NULL) {
     snprintf(buffer, 256, "%s, %s, %s", string_str(filename), string_str(entry_pt_fnc), dlerror());
-    make_string(s, buffer);
+    make_utf8_string(data, s, buffer);
     Cyc_rt_raise2(data, "Unable to load symbol", &s);
   }
   mclosure1(clo, entry_pt, cont);
@@ -5832,6 +5832,7 @@ void _read_error(void *data, port_type *p, const char *msg)
   // the cont could receive an error and raise it though
   //Cyc_rt_raise_msg(data, buf);
   make_string(str, buf);
+  str.num_cp = Cyc_utf8_count_code_points((uint8_t *)buf);
   make_empty_vector(vec);
   vec.num_elements = 1;
   vec.elements = (object *) alloca(sizeof(object) * vec.num_elements);
@@ -6057,9 +6058,6 @@ void _read_string(void *data, object cont, port_type *p)
       p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full?
       p->tok_end = 0; // Reset for next atom
       {
-// TODO: need to change this below, but run into trouble in icyc, eg:
-//       (string-ref "ab\x3bb;" 2) crashes
-        //make_string(str, p->tok_buf);
         make_utf8_string(data, str, p->tok_buf);
         return_thread_runnable(data, &str);
       }
@@ -6273,6 +6271,7 @@ void _read_return_atom(void *data, object cont, port_type *p)
 
   if (_read_is_numeric(p->tok_buf)) {
     make_string(str, p->tok_buf);
+    str.num_cp = Cyc_utf8_count_code_points((uint8_t *)(p->tok_buf));
     make_c_opaque(opq, &str);
     return_thread_runnable(data, &opq);
   } else if (strncmp("+inf.0", p->tok_buf, 6) == 0 ||

From d43d019c2077fe7adf7e086f4afd5c648e9388d0 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 6 Nov 2017 17:06:12 +0000
Subject: [PATCH 48/61] Fix UTF8 support for pack_env_variables()

---
 runtime.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runtime.c b/runtime.c
index e96f8a37..81b48222 100644
--- a/runtime.c
+++ b/runtime.c
@@ -178,10 +178,10 @@ void pack_env_variables(void *data, object k)
     svar->hdr.grayed = 0;
     svar->tag = string_tag; 
     svar->len = eqpos - e;
-    svar->num_cp = svar->len; // TODO: proper UTF-8 support!
     svar->str = alloca(sizeof(char) * (svar->len));
     strncpy(svar->str, e, svar->len);
     (svar->str)[svar->len] = '\0';
+    svar->num_cp = Cyc_utf8_count_code_points((uint8_t *)svar->str);
 
     if (eqpos) {
       eqpos++;
@@ -190,7 +190,7 @@ void pack_env_variables(void *data, object k)
     sval->hdr.grayed = 0;
     sval->tag = string_tag; 
     sval->len = strlen(eqpos);
-    sval->num_cp = sval->len; // TODO: proper UTF-8 support!
+    sval->num_cp = Cyc_utf8_count_code_points((uint8_t *)eqpos);
     sval->str = eqpos;
     set_pair(tmp, svar, sval);
     set_pair(p, tmp, NULL);

From 348ed7205c49d81f1d0fd66cb9e813335d602f00 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 6 Nov 2017 17:46:56 +0000
Subject: [PATCH 49/61] Added a TODO for peek-char and UTF8

---
 runtime.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/runtime.c b/runtime.c
index 81b48222..8318ed93 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6304,6 +6304,8 @@ object Cyc_io_peek_char(void *data, object cont, object port)
 {
   FILE *stream;
   port_type *p;
+  uint32_t state = CYC_UTF8_ACCEPT;
+  char_type codepoint;
   int c;
 
   Cyc_check_port(data, port);
@@ -6316,7 +6318,13 @@ object Cyc_io_peek_char(void *data, object cont, object port)
     set_thread_blocked(data, cont);
     _read_next_char(data, cont, p);
     c = p->mem_buf[p->buf_idx];
-    return_thread_runnable(data, (c != EOF) ? obj_char2obj(c) : Cyc_EOF);
+    if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) {
+      // TODO: only have a partial UTF8 code point, read more chars.
+      // Problem is that there may not be enough space to store them
+      // and do need to set them aside since we are just peeking here
+      // and not actually supposed to be reading past chars.
+    }
+    return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF);
   }
   return Cyc_EOF;
 }

From 42507606a53374a330dcc4b162ef7944eb546c22 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 6 Nov 2017 17:54:00 +0000
Subject: [PATCH 50/61] Added Cyc_string_byte_length()

---
 include/cyclone/runtime.h | 1 +
 runtime.c                 | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 4529fa85..80ea3f87 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -194,6 +194,7 @@ int binstr2int(const char *str);
 int octstr2int(const char *str);
 object Cyc_string_append(void *data, object cont, int argc, object str1, ...);
 object Cyc_string_length(void *data, object str);
+object Cyc_string_byte_length(void *data, object str);
 object Cyc_substring(void *data, object cont, object str, object start,
                      object end);
 object Cyc_string_ref(void *data, object str, object k);
diff --git a/runtime.c b/runtime.c
index 8318ed93..e7792702 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2109,6 +2109,12 @@ object Cyc_string_length(void *data, object str)
   return obj_int2obj(string_num_cp(str));
 }
 
+object Cyc_string_byte_length(void *data, object str)
+{
+  Cyc_check_str(data, str);
+  return obj_int2obj(string_len(str));
+}
+
 object Cyc_string_set(void *data, object str, object k, object chr)
 {
   char *raw;

From cfdec73d78b88ecc2af7f63eaa99dad224ab5782 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Mon, 6 Nov 2017 18:57:56 +0000
Subject: [PATCH 51/61] Emit strings with char/byte lengths

---
 scheme/cyclone/cgen.sld | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/scheme/cyclone/cgen.sld b/scheme/cyclone/cgen.sld
index 7cdd74f0..d3456d11 100644
--- a/scheme/cyclone/cgen.sld
+++ b/scheme/cyclone/cgen.sld
@@ -521,7 +521,15 @@
             (string-append "&" cvar-name) ; Code is just the variable name
             (list     ; Allocate integer on the C stack
               (string-append 
-                "make_string(" cvar-name ", " (->cstr exp) ");")))))
+                "make_utf8_string_with_len(" 
+                cvar-name 
+                ", " 
+                (->cstr exp) 
+                ", " 
+                (number->string (string-byte-length exp))
+                ", " 
+                (number->string (string-length exp))
+                ");")))))
 ;TODO: not good enough, need to store new symbols in a table so they can
 ;be inserted into the C program
     ((symbol? exp)
@@ -536,6 +544,10 @@
 (define (->cstr str) 
   (string-append "\"" (cstr:escape-chars str) "\""))
 
+(define-c string-byte-length
+  "(void *data, int argc, closure _, object k, object s)"
+  " return_closcall1(data, k, Cyc_string_byte_length(data, s)); ")
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Primitives
 

From f5787184dae2e98a736f63225aafe90a68923514 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 7 Nov 2017 18:18:56 +0000
Subject: [PATCH 52/61] WIP - string-set!

---
 runtime.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/runtime.c b/runtime.c
index e7792702..f129899a 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2117,8 +2117,10 @@ object Cyc_string_byte_length(void *data, object str)
 
 object Cyc_string_set(void *data, object str, object k, object chr)
 {
+  char buf[5];
   char *raw;
-  int idx, len;
+  int idx, len, buf_len;
+  char_type input_char;
 
   Cyc_check_str(data, str);
   Cyc_check_num(data, k);
@@ -2127,6 +2129,10 @@ object Cyc_string_set(void *data, object str, object k, object chr)
     Cyc_rt_raise2(data, "Expected char but received", chr);
   }
 
+  input_char = obj_obj2char(chr);
+  Cyc_utf8_encode_char(buf, 5, input_char);
+  buf_len = strlen(buf);
+
   raw = string_str(str);
   idx = unbox_number(k);
   len = string_len(str);
@@ -2134,8 +2140,7 @@ object Cyc_string_set(void *data, object str, object k, object chr)
   Cyc_check_bounds(data, "string-set!", len, idx);
 
   // Take fast path if all chars are just 1 byte
-  if (string_num_cp(str) == string_len(str)) {
-    // TODO: not good enough, chr could be multi-byte
+  if (string_num_cp(str) == string_len(str) && buf_len == 1) {
     raw[idx] = obj_obj2char(chr);
   } else {
 fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), len);
@@ -2148,9 +2153,10 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le
     char *tmp = raw;
     char_type codepoint;
     uint32_t state = 0;
-    int i = 0, count, start_len = 0, start_cp = 0;
+    int i = 0, count, start_len = 0, start_cp = 0, bytes = 0;
 
     for (count = 0; *tmp; ++tmp){
+      bytes++;
       if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
         if (count < idx) {
           start_len = i;
@@ -2159,6 +2165,7 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le
           break;
         }
         count += 1;
+        bytes = 0;
       }
       i++;
     }
@@ -2171,6 +2178,13 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le
     // and we know the codepoint to be replaced. by calculating its length
     // we can compute where the end portion starts, and by using str we can
     // figure out how many remaining bytes/codepoints are in end
+    //
+    // 3 cases: 
+    // - buf_len = bytes, just straight replace
+    // - buf_len > bytes, will need to allocate more memory (!!)
+    // - buf_len < bytes, just replace, but pad with NULL chars.
+    //                    in this case need to ensure string_len is not 
+    //                    reduced because original value still matters for GC purposes
 
   }
   return str;

From 02014322b736dc451df61d835ac8172c80e93dbd Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 7 Nov 2017 17:47:08 -0500
Subject: [PATCH 53/61] Properly count bytes in make-string

---
 scheme/base.sld | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scheme/base.sld b/scheme/base.sld
index f5451340..5ab42fd0 100644
--- a/scheme/base.sld
+++ b/scheme/base.sld
@@ -958,7 +958,8 @@
           Cyc_rt_raise2(data, \"Expected character buf received\", fill);
         }
         c = obj_obj2char(fill);
-        buflen = Cyc_utf8_encode_char(ch_buf, 5, c);
+        Cyc_utf8_encode_char(ch_buf, 5, c);
+        buflen = strlen(ch_buf);
         num_cp = obj_obj2int(count);
         len = num_cp * buflen;
         if (len >= MAX_STACK_OBJ) {

From d584cf059ec09acb752c543c1ded6b49ed46ac67 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 7 Nov 2017 18:13:12 -0500
Subject: [PATCH 54/61] Partial fixes to string-set!

---
 runtime.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/runtime.c b/runtime.c
index f129899a..dbb1b448 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2150,27 +2150,26 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le
     // or don't allocate if chr uses as many or fewer bytes 
     // than the codepoint it is replacing
 
-    char *tmp = raw;
+    char *tmp = raw, *this_cp = raw;
     char_type codepoint;
     uint32_t state = 0;
-    int i = 0, count, start_len = 0, start_cp = 0, bytes = 0;
+    int i = 0, count, bytes = 0;
 
     for (count = 0; *tmp; ++tmp){
       bytes++;
       if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
-        if (count < idx) {
-          start_len = i;
-          start_cp = count;
-        } else if (count == idx) {
+        if (count == idx) {
           break;
         }
+        this_cp = tmp + 1;
         count += 1;
         bytes = 0;
       }
       i++;
     }
-    if (state != CYC_UTF8_ACCEPT)
+    if (state != CYC_UTF8_ACCEPT) {
        Cyc_rt_raise2(data, "string-set! - invalid character at index", k);
+    }
 
     // TODO: perform actual mutation
     //
@@ -2181,11 +2180,18 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le
     //
     // 3 cases: 
     // - buf_len = bytes, just straight replace
+    if (buf_len == bytes) {
+      for (i = 0; i < buf_len; i++) {
+        this_cp[i] = buf[i];
+      }
+    }
     // - buf_len > bytes, will need to allocate more memory (!!)
     // - buf_len < bytes, just replace, but pad with NULL chars.
     //                    in this case need to ensure string_len is not 
     //                    reduced because original value still matters for GC purposes
-
+    else {
+      Cyc_rt_raise2(data, "string-set! - unable to modify character", chr);
+    }
   }
   return str;
 }

From 61a18d8fb35ea1e59c8889119a7fd97c41c15c7d Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Tue, 7 Nov 2017 18:39:12 -0500
Subject: [PATCH 55/61] WIP

---
 runtime.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/runtime.c b/runtime.c
index dbb1b448..8e6a045a 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2189,6 +2189,12 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le
     // - buf_len < bytes, just replace, but pad with NULL chars.
     //                    in this case need to ensure string_len is not 
     //                    reduced because original value still matters for GC purposes
+    //else if (buf_len < bytes) {
+    //  for (i = 0; i < buf_len; i++) {
+    //    this_cp[i] = buf[i];
+    //  }
+    // TODO: memcpy remaining string, ensure trailing null is setup correctly, consolidate with above??
+    //}
     else {
       Cyc_rt_raise2(data, "string-set! - unable to modify character", chr);
     }

From 0f4a7b30c1d123c6e9fca1db917adbc6661da8f2 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Wed, 8 Nov 2017 14:55:57 +0000
Subject: [PATCH 56/61] Fixes for string-set!

Handle setting of a char to one that is represented using fewer bytes.
---
 runtime.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/runtime.c b/runtime.c
index 8e6a045a..f2a8ce9f 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2139,39 +2139,35 @@ object Cyc_string_set(void *data, object str, object k, object chr)
 
   Cyc_check_bounds(data, "string-set!", len, idx);
 
-  // Take fast path if all chars are just 1 byte
   if (string_num_cp(str) == string_len(str) && buf_len == 1) {
+    // Take fast path if all chars are just 1 byte
     raw[idx] = obj_obj2char(chr);
   } else {
-fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), len);
-    // TODO: utf8 support
-    // find codepoint at k, figure out how many bytes it is,
-    // allocate a new string (start) + chr + (end)
-    // or don't allocate if chr uses as many or fewer bytes 
-    // than the codepoint it is replacing
-
+    // Slower path for UTF-8, need to handle replacement differently 
+    // depending upon how the new char affects length of the string
     char *tmp = raw, *this_cp = raw;
     char_type codepoint;
     uint32_t state = 0;
-    int i = 0, count, bytes = 0;
+    int i = 0, count, prev_cp_bytes = 0, cp_idx;
 
     for (count = 0; *tmp; ++tmp){
-      bytes++;
+      prev_cp_bytes++;
       if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
         if (count == idx) {
           break;
         }
         this_cp = tmp + 1;
         count += 1;
-        bytes = 0;
+        prev_cp_bytes = 0;
       }
       i++;
     }
+    cp_idx = i;
     if (state != CYC_UTF8_ACCEPT) {
        Cyc_rt_raise2(data, "string-set! - invalid character at index", k);
     }
 
-    // TODO: perform actual mutation
+    // Perform actual mutation
     //
     // Now we know length of start (both in codepoints and bytes),
     // and we know the codepoint to be replaced. by calculating its length
@@ -2179,22 +2175,26 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le
     // figure out how many remaining bytes/codepoints are in end
     //
     // 3 cases: 
-    // - buf_len = bytes, just straight replace
-    if (buf_len == bytes) {
+    // - 1) buf_len = prev_cp_bytes, just straight replace
+    if (buf_len == prev_cp_bytes) {
       for (i = 0; i < buf_len; i++) {
         this_cp[i] = buf[i];
       }
     }
-    // - buf_len > bytes, will need to allocate more memory (!!)
-    // - buf_len < bytes, just replace, but pad with NULL chars.
-    //                    in this case need to ensure string_len is not 
-    //                    reduced because original value still matters for GC purposes
-    //else if (buf_len < bytes) {
-    //  for (i = 0; i < buf_len; i++) {
-    //    this_cp[i] = buf[i];
-    //  }
-    // TODO: memcpy remaining string, ensure trailing null is setup correctly, consolidate with above??
-    //}
+    // - 2) buf_len < prev_cp_bytes, replace and shift chars down
+    else if (buf_len < prev_cp_bytes) {
+      // Replace code point with shorter one
+      for (i = 0; i < buf_len; i++) {
+        this_cp[i] = buf[i];
+      }
+      // Move string down to eliminate unneeded chars
+      memmove(this_cp + buf_len, this_cp + prev_cp_bytes, len - cp_idx);
+      // Null terminate the shorter string.
+      // Ensure string_len is not reduced because original 
+      // value still matters for GC purposes
+      raw[len - (prev_cp_bytes - buf_len)] = '\0'; 
+    }
+    // - 3) TODO: buf_len > prev_cp_bytes, will need to allocate more memory (!!)
     else {
       Cyc_rt_raise2(data, "string-set! - unable to modify character", chr);
     }

From 40b729e11bfdce80db77e80f8b4a1efc7d21bc15 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Wed, 8 Nov 2017 18:38:31 +0000
Subject: [PATCH 57/61] WIP - peek-char UTF8 support

---
 runtime.c | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/runtime.c b/runtime.c
index f2a8ce9f..524e0b5e 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6338,7 +6338,8 @@ object Cyc_io_peek_char(void *data, object cont, object port)
   port_type *p;
   uint32_t state = CYC_UTF8_ACCEPT;
   char_type codepoint;
-  int c;
+  int c, i = 0, at_mem_buf_end = 0;
+  char buf[5];
 
   Cyc_check_port(data, port);
   {
@@ -6348,14 +6349,39 @@ object Cyc_io_peek_char(void *data, object cont, object port)
       Cyc_rt_raise2(data, "Unable to read from closed port: ", port);
     }
     set_thread_blocked(data, cont);
-    _read_next_char(data, cont, p);
+    if (p->mem_buf_len == 0 || p->mem_buf_len == p->buf_idx) {
+      _read_next_char(data, cont, p);
+    }
     c = p->mem_buf[p->buf_idx];
-    if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) {
-      // TODO: only have a partial UTF8 code point, read more chars.
+    if (Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) {
+      // Only have a partial UTF8 code point, read more chars.
       // Problem is that there may not be enough space to store them
       // and do need to set them aside since we are just peeking here
       // and not actually supposed to be reading past chars.
+
+      buf[0] = c;
+      i = 1;
+      while (1) { // TODO: limit to 4 chars??
+        if (p->mem_buf_len == p->buf_idx + i) {
+          // No more buffered chars
+          at_mem_buf_end = 1;
+          c = fgetc(stream);
+          if (c == EOF) break; // TODO: correct to do this here????
+        } else {
+          c = p->mem_buf[p->buf_idx + i];
+        }
+        buf[i++] = c;
+        if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) {
+          break;
+        }
+      }
     }
+    if (at_mem_buf_end && c != EOF) {
+      p->buf_idx = 0;
+      p->mem_buf_len = i;
+      memmove(p->mem_buf, buf, i);
+    }
+
     return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF);
   }
   return Cyc_EOF;
@@ -6393,7 +6419,9 @@ object Cyc_io_read_char(void *data, object cont, object port)
     do {
       _read_next_char(data, cont, p);
       c = p->mem_buf[p->buf_idx++];
+      if (c == EOF) break;
     } while(Cyc_utf8_decode(&state, &codepoint, (uint8_t)c));
+// TODO: limit above to 4 chars and then throw an error?
     p->col_num++;
     return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF);
   }

From a492ca379d2117f8a8cb3fb1fef43df65413ed3d Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Thu, 9 Nov 2017 19:00:21 -0500
Subject: [PATCH 58/61] Handle the null character

---
 runtime.c       | 24 ++++++++++++++++++------
 scheme/base.sld |  8 ++++++--
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/runtime.c b/runtime.c
index 524e0b5e..bf9d9e76 100644
--- a/runtime.c
+++ b/runtime.c
@@ -1860,8 +1860,12 @@ object Cyc_list2string(void *data, object cont, object lst)
     if (!obj_is_char(cbox)) {
       Cyc_rt_raise2(data, "Expected character but received", cbox);
     }
-    Cyc_utf8_encode_char(cbuf, 5, ch);
-    len += strlen(cbuf);
+    if (!ch) {
+      len++;
+    } else {
+      Cyc_utf8_encode_char(cbuf, 5, ch);
+      len += strlen(cbuf);
+    }
     tmp = cdr(tmp);
   }
 
@@ -1871,8 +1875,12 @@ object Cyc_list2string(void *data, object cont, object lst)
     while ((lst != NULL)) {
       cbox = car(lst);
       ch = obj_obj2char(cbox); // Already validated, can assume chars now
-      Cyc_utf8_encode_char(&(buf[i]), 5, ch);
-      i += strlen(buf+i);
+      if (!ch) {
+        i++;
+      } else {
+        Cyc_utf8_encode_char(&(buf[i]), 5, ch);
+        i += strlen(buf+i);
+      }
       lst = cdr(lst);
     }
     buf[i] = '\0';
@@ -2130,8 +2138,12 @@ object Cyc_string_set(void *data, object str, object k, object chr)
   }
 
   input_char = obj_obj2char(chr);
-  Cyc_utf8_encode_char(buf, 5, input_char);
-  buf_len = strlen(buf);
+  if (!input_char) {
+    buf[0] = '\0'; buf_len = 1; // NUL is one zero byte; the slow path copies from buf, so it must be set
+  } else {
+    Cyc_utf8_encode_char(buf, 5, input_char);
+    buf_len = strlen(buf);
+  }
 
   raw = string_str(str);
   idx = unbox_number(k);
diff --git a/scheme/base.sld b/scheme/base.sld
index 5ab42fd0..c13cde79 100644
--- a/scheme/base.sld
+++ b/scheme/base.sld
@@ -958,8 +958,12 @@
           Cyc_rt_raise2(data, \"Expected character buf received\", fill);
         }
         c = obj_obj2char(fill);
-        Cyc_utf8_encode_char(ch_buf, 5, c);
-        buflen = strlen(ch_buf);
+        if (!c) {
+          buflen = 1;
+        } else {
+          Cyc_utf8_encode_char(ch_buf, 5, c);
+          buflen = strlen(ch_buf);
+        }
         num_cp = obj_obj2int(count);
         len = num_cp * buflen;
         if (len >= MAX_STACK_OBJ) {

From 39d3be81419ba8752e491244c0ea722f60b5e77e Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Sat, 11 Nov 2017 18:16:14 -0500
Subject: [PATCH 59/61] Removing

---
 opt-test.data |  3 ---
 opt-test.scm  | 45 ---------------------------------------------
 2 files changed, 48 deletions(-)
 delete mode 100644 opt-test.data
 delete mode 100644 opt-test.scm

diff --git a/opt-test.data b/opt-test.data
deleted file mode 100644
index 01e79c32..00000000
--- a/opt-test.data
+++ /dev/null
@@ -1,3 +0,0 @@
-1
-2
-3
diff --git a/opt-test.scm b/opt-test.scm
deleted file mode 100644
index d98394a1..00000000
--- a/opt-test.scm
+++ /dev/null
@@ -1,45 +0,0 @@
-;; Testing the next set of optimizations
-;; To run: ./opt-test < opt-test.data
-;;
-;; Timings: T430
-;; Baseline - 2.511
-;; Dyadic - 1.409
-;;
-(import (scheme base)
-        (scheme write)
-        (scheme read))
-(let ((x (read))
-      (y (read))
-      (z (read))
-      (iterations 10000000)
-      (sum 0))
-  (do ((i iterations (- i 1)))
-      ((zero? i))
-      (set! sum (+ sum sum (* x y z)))
-      (set! sum (- sum sum (* x y z))))
-  (write sum))
-
-;;; Take an expression containing a single function call and break it up
-;;; into many calls of 2 arguments each.
-;(define (->dyadic expr)
-;  (cond
-;    ((< (length expr) 4)
-;     expr)
-;    (else
-;     (let ((fnc (car expr)))
-;       (foldl
-;         (lambda (x acc)
-;           (list fnc acc x))
-;         `(,fnc ,(cadr expr) ,(caddr expr))
-;         (cdddr expr))))))
-;    
-;(write (->dyadic '(+ 1)))
-;(write (->dyadic '(+ 1 2)))
-;(write (->dyadic '(+ 1 2 3)))
-;(write (->dyadic '(+ 1 2 3 4)))
-;;(write
-;;  (foldl
-;;    (lambda (x acc)
-;;     (list 'Cyc-fast-plus acc x))
-;;    '(Cyc-fast-plus 1 2)
-;;    '(3 4 5)))

From 9cfb80677a13a5306dae6abe4216ab478c795ec1 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Sun, 12 Nov 2017 18:29:32 -0500
Subject: [PATCH 60/61] Cleanup

---
 runtime.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/runtime.c b/runtime.c
index bf9d9e76..d60de898 100644
--- a/runtime.c
+++ b/runtime.c
@@ -2162,6 +2162,7 @@ object Cyc_string_set(void *data, object str, object k, object chr)
     uint32_t state = 0;
     int i = 0, count, prev_cp_bytes = 0, cp_idx;
 
+    // Find index to change, and how many bytes it is
     for (count = 0; *tmp; ++tmp){
       prev_cp_bytes++;
       if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
@@ -2208,7 +2209,8 @@ object Cyc_string_set(void *data, object str, object k, object chr)
     }
     // - 3) TODO: buf_len > prev_cp_bytes, will need to allocate more memory (!!)
     else {
-      Cyc_rt_raise2(data, "string-set! - unable to modify character", chr);
+      // TODO: maybe we can try a little harder here, at least in some cases
+      Cyc_rt_raise2(data, "string-set! - Unable to allocate memory to store multibyte character", chr);
     }
   }
   return str;

From 1e8819d57ecfed13426e34c17c6a9ea822c6ddc6 Mon Sep 17 00:00:00 2001
From: Justin Ethier <justin.ethier@gmail.com>
Date: Sun, 12 Nov 2017 18:45:04 -0500
Subject: [PATCH 61/61] Limit iteration in Cyc_io_peek_char

---
 runtime.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runtime.c b/runtime.c
index d60de898..9b9e09af 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6375,7 +6375,7 @@ object Cyc_io_peek_char(void *data, object cont, object port)
 
       buf[0] = c;
       i = 1;
-      while (1) { // TODO: limit to 4 chars??
+      while (i < 5) { // TODO: limit to 4 chars??
         if (p->mem_buf_len == p->buf_idx + i) {
           // No more buffered chars
           at_mem_buf_end = 1;