From ae3aa1941d4efc705f1249973d035c718cc78af3 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Thu, 19 Oct 2017 13:29:57 +0000 Subject: [PATCH 01/61] Initial UTF-8 helpers --- runtime.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/runtime.c b/runtime.c index 1525f2d0..dd799d88 100644 --- a/runtime.c +++ b/runtime.c @@ -6364,3 +6364,105 @@ void Cyc_io_read_token(void *data, object cont, object port) } } +////////////// UTF-8 Section ////////////// + +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +uint32_t inline +decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} +// END Bjoern Hoehrmann + +/** + * @brief + * Count the number of code points in a string. + * Based on example code from Bjoern Hoehrmann. + */ +int countCodePoints(uint8_t* s, size_t* count) { + uint32_t codepoint; + uint32_t state = 0; + + for (*count = 0; *s; ++s) + if (!decode(&state, &codepoint, *s)) + *count += 1; + + return state != UTF8_ACCEPT; +} + +// TODO: index into X codepoint in a string + +/** + * @brief + * Use this when validating from a stream, as it may be that the stream stopped + * in the middle of a codepoint, hence state passed in as an arg, so it can be + * tested in a loop and also after the loop has finished. + * + * From https://stackoverflow.com/a/22135005/101258 + */ +uint32_t validate_utf8(uint32_t *state, char *str, size_t len) { + size_t i; + uint32_t type; + + for (i = 0; i < len; i++) { + // We don't care about the codepoint, so this is + // a simplified version of the decode function. + type = utf8d[(uint8_t)str[i]]; + *state = utf8d[256 + (*state) * 16 + type]; + + if (*state == UTF8_REJECT) + break; + } + + return *state; +} + +/** + * @brief Simplified version of above, always called with a complete string buffer + */ +uint32_t valid_utf8(char *str, size_t len) { + size_t i; + uint32_t state = UTF8_ACCEPT, type; + + for (i = 0; i < len; i++) { + // We don't care about the codepoint, so this is + // a simplified version of the decode function. + type = utf8d[(uint8_t)str[i]]; + state = utf8d[256 + (state) * 16 + type]; + + if (state == UTF8_REJECT) + break; + } + + return state; +} + +////////////// END UTF-8 Section ////////////// From 71c7ed3e7f8f0e83af363be2a90cc7e4e835104a Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 20 Oct 2017 12:54:13 +0000 Subject: [PATCH 02/61] Cleanup and added UTF 8 definitions to header file --- include/cyclone/runtime.h | 14 ++++++++++++++ runtime.c | 28 ++++++++++++---------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 6a8027dd..dbbfb8d9 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -707,4 +707,18 @@ void add_global(object * glo); void Cyc_set_globals_changed(gc_thread_data *thd); /**@}*/ +/** + * \defgroup prim_utf8 UTF-8 + * + * @brief Unicode processing using UTF-8 + */ +/**@{*/ +#define CYC_UTF8_ACCEPT 0 +#define CYC_UTF8_REJECT 1 +uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); +int Cyc_utf8_count_code_points(uint8_t* s, size_t* count); +uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); +uint32_t Cyc_utf8_validate(char *str, size_t len); +/**@}*/ + #endif /* CYCLONE_RUNTIME_H */ diff --git a/runtime.c b/runtime.c index dd799d88..a0662e81 100644 --- a/runtime.c +++ b/runtime.c @@ -6368,10 +6368,6 @@ void Cyc_io_read_token(void *data, object cont, object port) // Copyright (c) 2008-2009 Bjoern Hoehrmann // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 1 - static const uint8_t utf8d[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f @@ -6389,11 +6385,11 @@ static const uint8_t utf8d[] = { 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 }; -uint32_t inline -decode(uint32_t* state, uint32_t* codep, uint32_t byte) { +//uint32_t inline +uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { uint32_t type = utf8d[byte]; - *codep = (*state != UTF8_ACCEPT) ? + *codep = (*state != CYC_UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) : (0xff >> type) & (byte); @@ -6407,15 +6403,15 @@ decode(uint32_t* state, uint32_t* codep, uint32_t byte) { * Count the number of code points in a string. * Based on example code from Bjoern Hoehrmann. */ -int countCodePoints(uint8_t* s, size_t* count) { +int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) { uint32_t codepoint; uint32_t state = 0; for (*count = 0; *s; ++s) - if (!decode(&state, &codepoint, *s)) + if (!Cyc_utf8_decode(&state, &codepoint, *s)) *count += 1; - return state != UTF8_ACCEPT; + return state != CYC_UTF8_ACCEPT; } // TODO: index into X codepoint in a string @@ -6428,7 +6424,7 @@ int countCodePoints(uint8_t* s, size_t* count) { * * From https://stackoverflow.com/a/22135005/101258 */ -uint32_t validate_utf8(uint32_t *state, char *str, size_t len) { +uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) { size_t i; uint32_t type; @@ -6438,7 +6434,7 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) { type = utf8d[(uint8_t)str[i]]; *state = utf8d[256 + (*state) * 16 + type]; - if (*state == UTF8_REJECT) + if (*state == CYC_UTF8_REJECT) break; } @@ -6446,11 +6442,11 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) { } /** - * @brief Simplified version of above, always called with a complete string buffer + * @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer. */ -uint32_t valid_utf8(char *str, size_t len) { +uint32_t Cyc_utf8_validate(char *str, size_t len) { size_t i; - uint32_t state = UTF8_ACCEPT, type; + uint32_t state = CYC_UTF8_ACCEPT, type; for (i = 0; i < len; i++) { // We don't care about the codepoint, so this is @@ -6458,7 +6454,7 @@ uint32_t valid_utf8(char *str, size_t len) { type = utf8d[(uint8_t)str[i]]; state = utf8d[256 + (state) * 16 + type]; - if (state == UTF8_REJECT) + if (state == CYC_UTF8_REJECT) break; } From ccad99062681226530f56cc656bdc32380822d4a Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 20 Oct 2017 13:28:16 +0000 Subject: [PATCH 03/61] Beginning to change string type --- include/cyclone/types.h | 47 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/include/cyclone/types.h b/include/cyclone/types.h index 647f4ba3..f6fb2b1e 100644 --- a/include/cyclone/types.h +++ b/include/cyclone/types.h @@ -721,16 +721,21 @@ typedef enum { typedef struct { gc_header_type hdr; tag_type tag; + int num_cp; int len; char *str; } string_type; +// TODO: below macros are obsolete, need new ones that populate num_cp and +// raise an error if an invalid UTF-8 char is detected + /** Create a new string in the nursery */ #define make_string(cs, s) string_type cs; \ { int len = strlen(s); \ cs.hdr.mark = gc_color_red; \ cs.hdr.grayed = 0; \ cs.tag = string_tag; \ + cs.num_cp = len; \ cs.len = len; \ cs.str = alloca(sizeof(char) * (len + 1)); \ memcpy(cs.str, s, len + 1);} @@ -744,6 +749,7 @@ typedef struct { cs.hdr.mark = gc_color_red; \ cs.hdr.grayed = 0; \ cs.tag = string_tag; cs.len = len; \ + cs.num_cp = len; \ cs.str = alloca(sizeof(char) * (len + 1)); \ memcpy(cs.str, s, len); \ cs.str[len] = '\0';} @@ -755,9 +761,48 @@ typedef struct { #define make_string_noalloc(cs, s, length) string_type cs; \ { cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \ cs.tag = string_tag; cs.len = length; \ + cs.num_cp = length; \ cs.str = s; } -/** Get the length of a string */ +///** Create a new string in the nursery */ +//#define make_string(cs, s) string_type cs; \ +//{ int len = strlen(s); \ +// cs.hdr.mark = gc_color_red; \ +// cs.hdr.grayed = 0; \ +// cs.tag = string_tag; \ +// cs.num_cp = len; \ +// cs.len = len; \ +// cs.str = alloca(sizeof(char) * (len + 1)); \ +// memcpy(cs.str, s, len + 1);} +// +///** +// * Create a new string with the given length +// * (so it does not need to be computed) +// */ +//#define make_string_with_len(cs, s, length) string_type cs; \ +//{ int len = length; \ +// cs.hdr.mark = gc_color_red; \ +// cs.hdr.grayed = 0; \ +// cs.tag = string_tag; cs.len = len; \ +// cs.num_cp = len; \ +// cs.str = alloca(sizeof(char) * (len + 1)); \ +// memcpy(cs.str, s, len); \ +// cs.str[len] = '\0';} +// +///** +// * Create a string object using the given C string and length. +// * No allocation is done for the given C string. +// */ +//#define make_string_noalloc(cs, s, length) string_type cs; \ +//{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \ +// cs.tag = string_tag; cs.len = length; \ +// cs.num_cp = length; \ +// cs.str = s; } + +/** Get the length of a string, in characters (code points) */ +#define string_num_cp(x) (((string_type *) x)->num_cp) + +/** Get the length of a string, in bytes */ #define string_len(x) (((string_type *) x)->len) /** Get a string object's C string */ From 0ca396f8fa7fe8f35c7c08b458b8bc67194bf532 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 20 Oct 2017 16:29:47 +0000 Subject: [PATCH 04/61] Add new string_type field --- gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gc.c b/gc.c index 6fdf7f82..0c0e57f4 100644 --- a/gc.c +++ b/gc.c @@ -451,6 +451,7 @@ char *gc_copy_obj(object dest, char *obj, gc_thread_data * thd) memcpy(s, string_str(obj), string_len(obj) + 1); mark(hp) = thd->gc_alloc_color; type_of(hp) = string_tag; + string_num_cp(hp) = string_num_cp(obj); string_len(hp) = string_len(obj); string_str(hp) = s; return (char *)hp; From ac8b280578d1b34ff1632d5f44dd7fdd5e48a125 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 20 Oct 2017 16:29:56 +0000 Subject: [PATCH 05/61] Refactoring, added make_utf8_string --- include/cyclone/runtime.h | 2 +- include/cyclone/types.h | 25 ++++++++++++++----------- runtime.c | 11 +++++++---- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index dbbfb8d9..21b204c5 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -716,7 +716,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd); #define CYC_UTF8_ACCEPT 0 #define CYC_UTF8_REJECT 1 uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); -int Cyc_utf8_count_code_points(uint8_t* s, size_t* count); +int Cyc_utf8_count_code_points(uint8_t* s); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); uint32_t Cyc_utf8_validate(char *str, size_t len); /**@}*/ diff --git a/include/cyclone/types.h b/include/cyclone/types.h index f6fb2b1e..776aada6 100644 --- a/include/cyclone/types.h +++ b/include/cyclone/types.h @@ -764,17 +764,20 @@ typedef struct { cs.num_cp = length; \ cs.str = s; } -///** Create a new string in the nursery */ -//#define make_string(cs, s) string_type cs; \ -//{ int len = strlen(s); \ -// cs.hdr.mark = gc_color_red; \ -// cs.hdr.grayed = 0; \ -// cs.tag = string_tag; \ -// cs.num_cp = len; \ -// cs.len = len; \ -// cs.str = alloca(sizeof(char) * (len + 1)); \ -// memcpy(cs.str, s, len + 1);} -// +/** Create a new string in the nursery */ +#define make_utf8_string(data, cs, s) string_type cs; \ +{ int len = strlen(s); \ + cs.hdr.mark = gc_color_red; \ + cs.hdr.grayed = 0; \ + cs.tag = string_tag; \ + cs.num_cp = Cyc_utf8_count_code_points(s); \ + if (cs.num_cp < 0) { \ + Cyc_rt_raise_msg(data, "Invalid UTF-8 characters in string"); \ + } \ + cs.len = len; \ + cs.str = alloca(sizeof(char) * (len + 1)); \ + memcpy(cs.str, s, len + 1);} + ///** // * Create a new string with the given length // * (so it does not need to be computed) diff --git a/runtime.c b/runtime.c index a0662e81..ef2bafb0 100644 --- a/runtime.c +++ b/runtime.c @@ -6403,15 +6403,18 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { * Count the number of code points in a string. * Based on example code from Bjoern Hoehrmann. */ -int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) { +int Cyc_utf8_count_code_points(uint8_t* s) { uint32_t codepoint; uint32_t state = 0; + int count; - for (*count = 0; *s; ++s) + for (count = 0; *s; ++s) if (!Cyc_utf8_decode(&state, &codepoint, *s)) - *count += 1; + count += 1; - return state != CYC_UTF8_ACCEPT; + if (state != CYC_UTF8_ACCEPT) + return -1; + return count; } // TODO: index into X codepoint in a string From 14626f15c449254f19735b5da33f8a92ab4e7a84 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Sun, 22 Oct 2017 18:59:35 -0400 Subject: [PATCH 06/61] Unicode changes, take code points into account --- runtime.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/runtime.c b/runtime.c index ef2bafb0..c75c8ae1 100644 --- a/runtime.c +++ b/runtime.c @@ -2032,7 +2032,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) } #define Cyc_string_append_va_list(data, argc) { \ - int i = 0, total_len = 1; \ + int i = 0, total_cp = 0, total_len = 1; \ int *len = alloca(sizeof(int) * argc); \ char *buffer, *bufferp, **str = alloca(sizeof(char *) * argc); \ object tmp; \ @@ -2041,6 +2041,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) str[i] = ((string_type *)str1)->str; \ len[i] = string_len((str1)); \ total_len += len[i]; \ + total_cp += string_num_cp(str1); \ } \ for (i = 1; i < argc; i++) { \ tmp = va_arg(ap, object); \ @@ -2048,6 +2049,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) str[i] = ((string_type *)tmp)->str; \ len[i] = string_len((tmp)); \ total_len += len[i]; \ + total_cp += string_num_cp(tmp); \ } \ buffer = bufferp = alloca(sizeof(char) * total_len); \ for (i = 0; i < argc; i++) { \ @@ -2056,6 +2058,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) } \ *bufferp = '\0'; \ make_string(result, buffer); \ + string_num_cp(result) = total_cp; \ va_end(ap); \ _return_closcall1(data, cont, &result); \ } @@ -2078,7 +2081,7 @@ object Cyc_string_append(void *data, object cont, int _argc, object str1, ...) object Cyc_string_length(void *data, object str) { Cyc_check_str(data, str); - return obj_int2obj(string_len(str)); + return obj_int2obj(string_num_cp(str)); } object Cyc_string_set(void *data, object str, object k, object chr) From 8b817966e82aaae8b8ed0987a14c718f34b89227 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 23 Oct 2017 13:26:29 +0000 Subject: [PATCH 07/61] WIP --- include/cyclone/types.h | 52 +++++++++++++++++++++++------------------ runtime.c | 26 ++++++++++++++++----- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/include/cyclone/types.h b/include/cyclone/types.h index 776aada6..3667d661 100644 --- a/include/cyclone/types.h +++ b/include/cyclone/types.h @@ -465,6 +465,12 @@ void clear_mutations(void *data); /** Minimum allowed value of a fixnum */ #define CYC_FIXNUM_MIN -1073741824 +/** + * Explicit character type now that we are using UTF-8. + * Chars are still value types though + */ +typedef uint32_t char_type; + /** * Determine if an object is an integer. */ @@ -778,29 +784,29 @@ typedef struct { cs.str = alloca(sizeof(char) * (len + 1)); \ memcpy(cs.str, s, len + 1);} -///** -// * Create a new string with the given length -// * (so it does not need to be computed) -// */ -//#define make_string_with_len(cs, s, length) string_type cs; \ -//{ int len = length; \ -// cs.hdr.mark = gc_color_red; \ -// cs.hdr.grayed = 0; \ -// cs.tag = string_tag; cs.len = len; \ -// cs.num_cp = len; \ -// cs.str = alloca(sizeof(char) * (len + 1)); \ -// memcpy(cs.str, s, len); \ -// cs.str[len] = '\0';} -// -///** -// * Create a string object using the given C string and length. -// * No allocation is done for the given C string. -// */ -//#define make_string_noalloc(cs, s, length) string_type cs; \ -//{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \ -// cs.tag = string_tag; cs.len = length; \ -// cs.num_cp = length; \ -// cs.str = s; } +/** + * Create a new string with the given length + * (so it does not need to be computed) + */ +#define make_utf8_string_with_len(cs, s, length, num_cp) string_type cs; \ +{ int len = length; \ + cs.hdr.mark = gc_color_red; \ + cs.hdr.grayed = 0; \ + cs.tag = string_tag; cs.len = len; \ + cs.num_cp = num_cp; \ + cs.str = alloca(sizeof(char) * (len + 1)); \ + memcpy(cs.str, s, len); \ + cs.str[len] = '\0';} + +/** + * Create a string object using the given C string and length. + * No allocation is done for the given C string. + */ +#define make_utf8_string_noalloc(cs, s, length) string_type cs; \ +{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \ + cs.tag = string_tag; cs.len = length; \ + cs.num_cp = length; \ + cs.str = s; } /** Get the length of a string, in characters (code points) */ #define string_num_cp(x) (((string_type *) x)->num_cp) diff --git a/runtime.c b/runtime.c index c75c8ae1..4b228cd6 100644 --- a/runtime.c +++ b/runtime.c @@ -2041,7 +2041,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) str[i] = ((string_type *)str1)->str; \ len[i] = string_len((str1)); \ total_len += len[i]; \ - total_cp += string_num_cp(str1); \ + total_cp += string_num_cp((str[i])); \ } \ for (i = 1; i < argc; i++) { \ tmp = va_arg(ap, object); \ @@ -2049,7 +2049,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) str[i] = ((string_type *)tmp)->str; \ len[i] = string_len((tmp)); \ total_len += len[i]; \ - total_cp += string_num_cp(tmp); \ + total_cp += string_num_cp((str[i])); \ } \ buffer = bufferp = alloca(sizeof(char) * total_len); \ for (i = 0; i < argc; i++) { \ @@ -2058,7 +2058,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) } \ *bufferp = '\0'; \ make_string(result, buffer); \ - string_num_cp(result) = total_cp; \ + string_num_cp((&result)) = total_cp; \ va_end(ap); \ _return_closcall1(data, cont, &result); \ } @@ -2081,7 +2081,7 @@ object Cyc_string_append(void *data, object cont, int _argc, object str1, ...) object Cyc_string_length(void *data, object str) { Cyc_check_str(data, str); - return obj_int2obj(string_num_cp(str)); + return obj_int2obj(string_len(str)); } object Cyc_string_set(void *data, object str, object k, object chr) @@ -2115,13 +2115,27 @@ object Cyc_string_ref(void *data, object str, object k) raw = string_str(str); idx = unbox_number(k); - len = string_len(str); + len = string_num_cp(str); if (idx < 0 || idx >= len) { Cyc_rt_raise2(data, "string-ref - invalid index", k); } - return obj_char2obj(raw[idx]); + { + char_type codepoint; + uint32_t state = 0; + int count; + + for (count = 0; *raw; ++raw){ + if (!Cyc_utf8_decode(&state, &codepoint, *raw)){ + if (count == idx) break; // Reached requested index + count += 1; + } + } + if (state != CYC_UTF8_ACCEPT) + Cyc_rt_raise2(data, "string-ref - invalid character at index", k); + return obj_char2obj(codepoint); + } } object Cyc_substring(void *data, object cont, object str, object start, From 96e5692cb9fcddce5cbce4cb1ceb9b4be5b0f3db Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 23 Oct 2017 13:38:02 +0000 Subject: [PATCH 08/61] bugfix --- runtime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime.c b/runtime.c index 4b228cd6..dda093fa 100644 --- a/runtime.c +++ b/runtime.c @@ -2041,7 +2041,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) str[i] = ((string_type *)str1)->str; \ len[i] = string_len((str1)); \ total_len += len[i]; \ - total_cp += string_num_cp((str[i])); \ + total_cp += string_num_cp((str1)); \ } \ for (i = 1; i < argc; i++) { \ tmp = va_arg(ap, object); \ @@ -2049,7 +2049,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) str[i] = ((string_type *)tmp)->str; \ len[i] = string_len((tmp)); \ total_len += len[i]; \ - total_cp += string_num_cp((str[i])); \ + total_cp += string_num_cp((tmp)); \ } \ buffer = bufferp = alloca(sizeof(char) * total_len); \ for (i = 0; i < argc; i++) { \ From 114e284566c69066c712c87671684e9839deb8a8 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 23 Oct 2017 13:39:04 +0000 Subject: [PATCH 09/61] string-length: return number of codepoints --- runtime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime.c b/runtime.c index dda093fa..863abab0 100644 --- a/runtime.c +++ b/runtime.c @@ -2081,7 +2081,7 @@ object Cyc_string_append(void *data, object cont, int _argc, object str1, ...) object Cyc_string_length(void *data, object str) { Cyc_check_str(data, str); - return obj_int2obj(string_len(str)); + return obj_int2obj(string_num_cp(str)); } object Cyc_string_set(void *data, object str, object k, object chr) From 424592ad8be8bebb6045d7a7dfd60ebe2fc19cb9 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 23 Oct 2017 17:10:43 +0000 Subject: [PATCH 10/61] Added TODO --- runtime.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime.c b/runtime.c index 863abab0..67f51bf3 100644 --- a/runtime.c +++ b/runtime.c @@ -2121,6 +2121,8 @@ object Cyc_string_ref(void *data, object str, object k) Cyc_rt_raise2(data, "string-ref - invalid index", k); } +TODO: we can take the fast path if num_cp == len, since that implies all chars are just 1 byte. + would be the case for all string functions that need to be updated to be (possibly) O(n) { char_type codepoint; uint32_t state = 0; From 3e64420101ab63cfda6f5486af9b55aca1526cc1 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 23 Oct 2017 17:43:37 -0400 Subject: [PATCH 11/61] Added UTF8 support to Cyc_substring --- include/cyclone/types.h | 4 ++-- runtime.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/include/cyclone/types.h b/include/cyclone/types.h index 3667d661..d91a1aa0 100644 --- a/include/cyclone/types.h +++ b/include/cyclone/types.h @@ -788,12 +788,12 @@ typedef struct { * Create a new string with the given length * (so it does not need to be computed) */ -#define make_utf8_string_with_len(cs, s, length, num_cp) string_type cs; \ +#define make_utf8_string_with_len(cs, s, length, num_code_points) string_type cs; \ { int len = length; \ cs.hdr.mark = gc_color_red; \ cs.hdr.grayed = 0; \ cs.tag = string_tag; cs.len = len; \ - cs.num_cp = num_cp; \ + cs.num_cp = num_code_points; \ cs.str = alloca(sizeof(char) * (len + 1)); \ memcpy(cs.str, s, len); \ cs.str[len] = '\0';} diff --git a/runtime.c b/runtime.c index 67f51bf3..607f935d 100644 --- a/runtime.c +++ b/runtime.c @@ -2121,9 +2121,10 @@ object Cyc_string_ref(void *data, object str, object k) Cyc_rt_raise2(data, "string-ref - invalid index", k); } -TODO: we can take the fast path if num_cp == len, since that implies all chars are just 1 byte. - would be the case for all string functions that need to be updated to be (possibly) O(n) - { + // Take fast path if all chars are just 1 byte + if (string_num_cp(str) == string_len(str)) { + return obj_char2obj(raw[idx]); + } else { char_type codepoint; uint32_t state = 0; int count; @@ -2153,7 +2154,7 @@ object Cyc_substring(void *data, object cont, object str, object start, raw = string_str(str); s = unbox_number(start); e = unbox_number(end); - len = string_len(str); + len = string_num_cp(str); if (s > e) { Cyc_rt_raise2(data, "substring - start cannot be greater than end", start); @@ -2167,9 +2168,30 @@ object Cyc_substring(void *data, object cont, object str, object start, e = len; } - { + if (string_num_cp(str) == string_len(str)){ // Fast path for ASCII make_string_with_len(sub, raw + s, e - s); _return_closcall1(data, cont, &sub); + } else { + const char *tmp = raw; + char_type codepoint; + uint32_t state = 0; + int count, start_i = 0, end_i = 0; + + for (count = 0; *tmp; ++tmp){ + if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){ + if (count == s) { + start_i = end_i; + } else if (count == e) { + break; + } + count += 1; + } + end_i++; + } + if (state != CYC_UTF8_ACCEPT) + Cyc_rt_raise2(data, "substring - invalid character in string", str); + make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s); + _return_closcall1(data, cont, &sub); } } From cb1bfef031e9768f6a27550e6cd65de1605f74a7 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 23 Oct 2017 18:47:01 -0400 Subject: [PATCH 12/61] WIP - string-set! --- runtime.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/runtime.c b/runtime.c index 607f935d..5387dbf9 100644 --- a/runtime.c +++ b/runtime.c @@ -2101,7 +2101,45 @@ object Cyc_string_set(void *data, object str, object k, object chr) len = string_len(str); Cyc_check_bounds(data, "string-set!", len, idx); - raw[idx] = obj_obj2char(chr); + + // Take fast path if all chars are just 1 byte + if (string_num_cp(str) == string_len(str)) { + raw[idx] = obj_obj2char(chr); + } else { + // TODO: utf8 support + // find codepoint at k, figure out how many bytes it is, + // allocate a new string (start) + chr + (end) + // or don't allocate if chr uses as many or fewer bytes + // than the codepoint it is replacing + + char *tmp = raw; + char_type codepoint; + uint32_t state = 0; + int i = 0, count, start_len = 0, start_cp = 0; + + for (count = 0; *tmp; ++tmp){ + if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){ + if (count < idx) { + start_len = i; + start_cp = count; + } else if (count == idx) { + break; + } + count += 1; + } + i++; + } + if (state != CYC_UTF8_ACCEPT) + Cyc_rt_raise2(data, "string-set! - invalid character at index", k); + + // TODO: perform actual mutation + // + // Now we know length of start (both in codepoints and bytes), + // and we know the codepoint to be replaced. by calculating its length + // we can compute where the end portion starts, and by using str we can + // figure out how many remaining bytes/codepoints are in end + + } return str; } From 13254d06f01e910e08f3170a815ed3761e021392 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 24 Oct 2017 13:23:48 +0000 Subject: [PATCH 13/61] WIP - utf8 / string conversion functions --- runtime.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/runtime.c b/runtime.c index 5387dbf9..da3f255c 100644 --- a/runtime.c +++ b/runtime.c @@ -2106,6 +2106,7 @@ object Cyc_string_set(void *data, object str, object k, object chr) if (string_num_cp(str) == string_len(str)) { raw[idx] = obj_obj2char(chr); } else { +fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), len); // TODO: utf8 support // find codepoint at k, figure out how many bytes it is, // allocate a new string (start) + chr + (end) @@ -2569,6 +2570,10 @@ object Cyc_utf82string(void *data, object cont, object bv, object start, st.str = alloca(sizeof(char) * (len + 1)); memcpy(st.str, &buf[s], len); st.str[len] = '\0'; + st.num_cp = Cyc_utf8_count_code_points((uint8_t *)(st.str)); + if (st.num_cp < 0) { + Cyc_rt_raise2(data, "utf8->string - error decoding UTF 8", bv); + } _return_closcall1(data, cont, &st); } } @@ -2596,6 +2601,11 @@ object Cyc_string2utf8(void *data, object cont, object str, object start, Cyc_rt_raise2(data, "string->utf8 - invalid end", end); } + // TODO: we have code point positions s, e, and length. We need to take those + // and walk the string to figure out the starting and ending BYTE positions + + // TODO: fast path, can keep below if string_num_cp(str) == string_len(str) + result.len = len; result.data = alloca(sizeof(char) * len); memcpy(&result.data[0], &(string_str(str))[s], len); From 6c4dd4b740179932c81caa7949a927b5d7067a1c Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 24 Oct 2017 17:53:09 -0400 Subject: [PATCH 14/61] Compute number of code points and byte len --- scheme/base.sld | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scheme/base.sld b/scheme/base.sld index 75ecc2f1..1f864709 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -952,7 +952,8 @@ " object s = NULL; Cyc_check_int(data, count); char c = obj_obj2char(fill); - int len = obj_obj2int(count); + int num_cp = obj_obj2int(count); + int len = num_cp * uint32_num_bytes(c); if (len >= MAX_STACK_OBJ) { int heap_grown; s = gc_alloc(((gc_thread_data *)data)->heap, @@ -964,6 +965,7 @@ ((string_type *) s)->hdr.grayed = 0; ((string_type *) s)->tag = string_tag; ((string_type *) s)->len = len; + ((string_type *) s)->num_cp = num_cp; ((string_type *) s)->str = (((char *)s) + sizeof(string_type)); } else { s = alloca(sizeof(string_type)); @@ -971,6 +973,7 @@ ((string_type *)s)->hdr.grayed = 0; ((string_type *)s)->tag = string_tag; ((string_type *)s)->len = len; + ((string_type *)s)->num_cp = num_cp; ((string_type *)s)->str = alloca(sizeof(char) * (len + 1)); } memset(((string_type *)s)->str, c, len); From 13e260300ffb3163e4040e1e012b0822615de16f Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 24 Oct 2017 17:53:43 -0400 Subject: [PATCH 15/61] Added utility function and stubs --- include/cyclone/runtime.h | 1 + runtime.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 21b204c5..119338e9 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -719,6 +719,7 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); int Cyc_utf8_count_code_points(uint8_t* s); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); uint32_t Cyc_utf8_validate(char *str, size_t len); +int uint32_num_bytes(uint32_t val); /**@}*/ #endif /* CYCLONE_RUNTIME_H */ diff --git a/runtime.c b/runtime.c index da3f255c..79badbf9 100644 --- a/runtime.c +++ b/runtime.c @@ -178,6 +178,7 @@ void pack_env_variables(void *data, object k) svar->hdr.grayed = 0; svar->tag = string_tag; svar->len = eqpos - e; + svar->num_cp = svar->len; // TODO: proper UTF-8 support! svar->str = alloca(sizeof(char) * (svar->len)); strncpy(svar->str, e, svar->len); (svar->str)[svar->len] = '\0'; @@ -189,6 +190,7 @@ void pack_env_variables(void *data, object k) sval->hdr.grayed = 0; sval->tag = string_tag; sval->len = strlen(eqpos); + sval->num_cp = sval->len; // TODO: proper UTF-8 support! sval->str = eqpos; set_pair(tmp, svar, sval); set_pair(p, tmp, NULL); @@ -6553,4 +6555,12 @@ uint32_t Cyc_utf8_validate(char *str, size_t len) { return state; } +int uint32_num_bytes(uint32_t x) { + // TODO: could compute log(val) / log(256) + if (x < 0x100) return 1; + if (x < 0x10000) return 2; + if (x < 0x1000000) return 3; + return 4; +} + ////////////// END UTF-8 Section ////////////// From 325112e50b56151b5e6daaa22c64a692d239697d Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 24 Oct 2017 19:00:45 -0400 Subject: [PATCH 16/61] Temporary file --- test.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 test.c diff --git a/test.c b/test.c new file mode 100644 index 00000000..4c97178b --- /dev/null +++ b/test.c @@ -0,0 +1,20 @@ +// A temporary test file +#include +#include +#include +#include + +void main(){ + char c[128]; + uint32_t val = 0x32363435; + uint8_t *ptr = (uint8_t *)&val; + int i, j = 0; + //memset(c, 0x34, 128); + for (i = 0; i < 127; i++) { + c[i] = ptr[j++]; + if (j == 4) j = 0; + } + c[127] = '\0'; + printf("%s\n", c); + return; +} From 722d077367cb2ec4afc015c3e5760f04bff4903f Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 24 Oct 2017 19:01:20 -0400 Subject: [PATCH 17/61] WIP --- scheme/base.sld | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scheme/base.sld b/scheme/base.sld index 1f864709..d3a8ee78 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -976,7 +976,14 @@ ((string_type *)s)->num_cp = num_cp; ((string_type *)s)->str = alloca(sizeof(char) * (len + 1)); } - memset(((string_type *)s)->str, c, len); + //if (num_cp == 1) { /* Fast path */ + memset(((string_type *)s)->str, c, len); + //} else { + // int i; + // uint32_t* + // for (i = 0; i < len; i++) { + // } + //} ((string_type *)s)->str[len] = '\\0'; return_closcall1(data, k, s); ") From 556f97dd5fc5a6d50699d64f9404b3c176931060 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Wed, 25 Oct 2017 13:54:36 +0000 Subject: [PATCH 18/61] WIP, testing encoders/decoders --- test.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 121 insertions(+), 12 deletions(-) diff --git a/test.c b/test.c index 4c97178b..306f1d85 100644 --- a/test.c +++ b/test.c @@ -4,17 +4,126 @@ #include #include -void main(){ - char c[128]; - uint32_t val = 0x32363435; - uint8_t *ptr = (uint8_t *)&val; - int i, j = 0; - //memset(c, 0x34, 128); - for (i = 0; i < 127; i++) { - c[i] = ptr[j++]; - if (j == 4) j = 0; - } - c[127] = '\0'; - printf("%s\n", c); +#define CYC_UTF8_ACCEPT 0 + +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +//uint32_t inline +uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != CYC_UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} + +// FROM: https://www.cprogramming.com/tutorial/utf8.c +/* srcsz = number of source characters, or -1 if 0-terminated + sz = size of dest buffer in bytes + + returns # characters converted + dest will only be '\0'-terminated if there is enough space. this is + for consistency; imagine there are 2 bytes of space left, but the next + character requires 3 bytes. in this case we could NUL-terminate, but in + general we can't when there's insufficient space. therefore this function + only NUL-terminates if all the characters fit, and there's space for + the NUL as well. + the destination string will never be bigger than the source string. +*/ +int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz) +{ + u_int32_t ch; + int i = 0; + char *dest_end = dest + sz; + + while (srcsz<0 ? src[i]!=0 : i < srcsz) { + ch = src[i]; + if (ch < 0x80) { + if (dest >= dest_end) + return i; + *dest++ = (char)ch; + } + else if (ch < 0x800) { + if (dest >= dest_end-1) + return i; + *dest++ = (ch>>6) | 0xC0; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x10000) { + if (dest >= dest_end-2) + return i; + *dest++ = (ch>>12) | 0xE0; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x110000) { + if (dest >= dest_end-3) + return i; + *dest++ = (ch>>18) | 0xF0; + *dest++ = ((ch>>12) & 0x3F) | 0x80; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + i++; + } + if (dest < dest_end) + *dest = '\0'; + return i; +} + +void encoding() { + char dest[5]; + int rv; + uint32_t val = 0x03bb; + + rv = u8_toutf8(dest, 5, &val, 1); + printf("%d %x\n", rv, dest); +TODO: above seems broken, should encode to 0xCEBB (see below) + return; +} + +void main(){ + char c[128]; + uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE +// uint8_t cv[] = {0xCE, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE + char *cptr; + uint32_t state = CYC_UTF8_ACCEPT, codepoint, val = 0x32363435; + uint8_t *ptr = (uint8_t *)&val; + int i, j = 0; +// //memset(c, 0x34, 128); +// for (i = 0; i < 127; i++) { +// c[i] = ptr[j++]; +// if (j == 4) j = 0; +// } +// c[127] = '\0'; +// printf("%s\n", c); + + ptr = cv; + for (i = 0; i < 3; i++) { + Cyc_utf8_decode(&state, &codepoint, ptr[i]); + } + printf("state = %d, cp = %d\n", state, codepoint); + + encoding(); return; } From 96c3846b433fe470af4ff966ed4cee99d7a6a0ee Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Wed, 25 Oct 2017 17:14:10 +0000 Subject: [PATCH 19/61] Cleanup --- test.c | 53 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/test.c b/test.c index 306f1d85..f3693d80 100644 --- a/test.c +++ b/test.c @@ -37,20 +37,27 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { return *state; } -// FROM: https://www.cprogramming.com/tutorial/utf8.c -/* srcsz = number of source characters, or -1 if 0-terminated - sz = size of dest buffer in bytes - - returns # characters converted - dest will only be '\0'-terminated if there is enough space. this is - for consistency; imagine there are 2 bytes of space left, but the next - character requires 3 bytes. in this case we could NUL-terminate, but in - general we can't when there's insufficient space. therefore this function - only NUL-terminates if all the characters fit, and there's space for - the NUL as well. - the destination string will never be bigger than the source string. -*/ -int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz) +/** + * This function takes one or more 32-bit chars and encodes them + * as an array of UTF-8 bytes. + * FROM: https://www.cprogramming.com/tutorial/utf8.c + * + * @param dest Destination byte buffer + * @param sz size of dest buffer in bytes + * @param src Buffer of source data, in 32-bit characters + * @param srcsz number of source characters, or -1 if 0-terminated + * + * @return Number of characters converted + * + * dest will only be '\0'-terminated if there is enough space. this is + * for consistency; imagine there are 2 bytes of space left, but the next + * character requires 3 bytes. in this case we could NUL-terminate, but in + * general we can't when there's insufficient space. therefore this function + * only NUL-terminates if all the characters fit, and there's space for + * the NUL as well. + * the destination string will never be bigger than the source string. + */ +int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz) { u_int32_t ch; int i = 0; @@ -91,14 +98,16 @@ int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz) return i; } -void encoding() { +void encode(uint32_t val) { char dest[5]; - int rv; - uint32_t val = 0x03bb; + int rv, i; - rv = u8_toutf8(dest, 5, &val, 1); - printf("%d %x\n", rv, dest); -TODO: above seems broken, should encode to 0xCEBB (see below) + rv = Cyc_utf8_encode(dest, 5, &val, 1); + printf("%x %d \n", val, rv); + for(i = 0; i < 5; i++) { + printf("[%x] ", (uint8_t)dest[i]); + } + printf("\n"); return; } @@ -124,6 +133,8 @@ void main(){ } printf("state = %d, cp = %d\n", state, codepoint); - encoding(); + encode(0x3bb); + encode(65); + encode(0xcebb); return; } From ccfde220ffc762a975cfdeb2c81c593db0a28144 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Wed, 25 Oct 2017 17:21:53 +0000 Subject: [PATCH 20/61] WIP --- test.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test.c b/test.c index f3693d80..2d86b1a2 100644 --- a/test.c +++ b/test.c @@ -37,6 +37,12 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { return *state; } +/** + * Simple macro to make it more convenient to convert a single char + */ +#define Cyc_utf8_encode_char(dest, dest_size, char_value) \ + Cyc_utf8_encode(dest, dest_size, &char_value, 1) + /** * This function takes one or more 32-bit chars and encodes them * as an array of UTF-8 bytes. @@ -102,7 +108,7 @@ void encode(uint32_t val) { char dest[5]; int rv, i; - rv = Cyc_utf8_encode(dest, 5, &val, 1); + rv = Cyc_utf8_encode_char(dest, 5, val); printf("%x %d \n", val, rv); for(i = 0; i < 5; i++) { printf("[%x] ", (uint8_t)dest[i]); From aa0b0a75678b2b6134c02ba24c2e10d2ee12fd7d Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Wed, 25 Oct 2017 18:35:11 -0400 Subject: [PATCH 21/61] Added UTF8 encoder, final version of string->utf8 --- include/cyclone/runtime.h | 7 +++ runtime.c | 102 ++++++++++++++++++++++++++++++++++---- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 119338e9..6720002b 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -715,6 +715,13 @@ void Cyc_set_globals_changed(gc_thread_data *thd); /**@{*/ #define CYC_UTF8_ACCEPT 0 #define CYC_UTF8_REJECT 1 + +/** + * Simple macro to make it more convenient to convert a single char + */ +#define Cyc_utf8_encode_char(dest, dest_size, char_value) \ + Cyc_utf8_encode(dest, dest_size, &char_value, 1) + uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); int Cyc_utf8_count_code_points(uint8_t* s); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); diff --git a/runtime.c b/runtime.c index 79badbf9..d0e804e9 100644 --- a/runtime.c +++ b/runtime.c @@ -2595,23 +2595,41 @@ object Cyc_string2utf8(void *data, object cont, object str, object start, e = unbox_number(end); len = e - s; - if (s < 0 || (s >= string_len(str) && len > 0)) { + if (s < 0 || (s >= string_num_cp(str) && len > 0)) { Cyc_rt_raise2(data, "string->utf8 - invalid start", start); } - if (e < 0 || e < s || e > string_len(str)) { + if (e < 0 || e < s || e > string_num_cp(str)) { Cyc_rt_raise2(data, "string->utf8 - invalid end", end); } - // TODO: we have code point positions s, e, and length. We need to take those - // and walk the string to figure out the starting and ending BYTE positions - - // TODO: fast path, can keep below if string_num_cp(str) == string_len(str) - - result.len = len; - result.data = alloca(sizeof(char) * len); - memcpy(&result.data[0], &(string_str(str))[s], len); - _return_closcall1(data, cont, &result); + // Fast path + if (string_num_cp(str) == string_len(str)) { // TODO: disable for testing purposes + result.len = len; + result.data = alloca(sizeof(char) * len); + memcpy(&result.data[0], &(string_str(str))[s], len); + _return_closcall1(data, cont, &result); + } else { + int i, start_i = 0, end_i = 0; + const char *tmp = string_str(str); + char_type codepoint; + uint32_t state = 0; + for (i = 0; *tmp; ++tmp) { + if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){ + if (i == s) { + start_i = i; + } else if (i == e) { + break; + } + } + i++; + } + end_i = i; + result.len = end_i - start_i; + result.data = alloca(sizeof(char) * result.len); + memcpy(&result.data[0], &(string_str(str))[start_i], result.len); + _return_closcall1(data, cont, &result); + } } object Cyc_bytevector_u8_ref(void *data, object bv, object k) @@ -6563,4 +6581,66 @@ int uint32_num_bytes(uint32_t x) { return 4; } +/** + * This function takes one or more 32-bit chars and encodes them + * as an array of UTF-8 bytes. + * FROM: https://www.cprogramming.com/tutorial/utf8.c + * + * @param dest Destination byte buffer + * @param sz size of dest buffer in bytes + * @param src Buffer of source data, in 32-bit characters + * @param srcsz number of source characters, or -1 if 0-terminated + * + * @return Number of characters converted + * + * dest will only be '\0'-terminated if there is enough space. this is + * for consistency; imagine there are 2 bytes of space left, but the next + * character requires 3 bytes. in this case we could NUL-terminate, but in + * general we can't when there's insufficient space. therefore this function + * only NUL-terminates if all the characters fit, and there's space for + * the NUL as well. + * the destination string will never be bigger than the source string. + */ +int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz) +{ + u_int32_t ch; + int i = 0; + char *dest_end = dest + sz; + + while (srcsz<0 ? src[i]!=0 : i < srcsz) { + ch = src[i]; + if (ch < 0x80) { + if (dest >= dest_end) + return i; + *dest++ = (char)ch; + } + else if (ch < 0x800) { + if (dest >= dest_end-1) + return i; + *dest++ = (ch>>6) | 0xC0; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x10000) { + if (dest >= dest_end-2) + return i; + *dest++ = (ch>>12) | 0xE0; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x110000) { + if (dest >= dest_end-3) + return i; + *dest++ = (ch>>18) | 0xF0; + *dest++ = ((ch>>12) & 0x3F) | 0x80; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + i++; + } + if (dest < dest_end) + *dest = '\0'; + return i; +} + + ////////////// END UTF-8 Section ////////////// From 596f225179dfa3f925d55e0e3e0bced598228acd Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Thu, 26 Oct 2017 13:02:55 +0000 Subject: [PATCH 22/61] Added memset test code --- test.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test.c b/test.c index 2d86b1a2..5f7996e8 100644 --- a/test.c +++ b/test.c @@ -117,9 +117,19 @@ void encode(uint32_t val) { return; } +void multi_byte_memset(char *buf, int blen, char *src, int slen) +{ + int bi, si; + for (bi = 0, si = 0; bi < blen; bi++, si++) { + buf[bi] = src[si % slen]; + } +} + void main(){ char c[128]; uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE + uint8_t cv2[] = {0xCE, 0xBB}; // Lambda (0x03bb) is encoded with leading 0xCE + //uint8_t cv2[] = {0xEC, 0xBA, 0xBB}; // Lambda (0x03bb) is encoded with leading 0xCE // uint8_t cv[] = {0xCE, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE char *cptr; uint32_t state = CYC_UTF8_ACCEPT, codepoint, val = 0x32363435; @@ -132,6 +142,9 @@ void main(){ // } // c[127] = '\0'; // printf("%s\n", c); + multi_byte_memset(c, 126, cv2, 2); + c[127] = '\0'; + printf("TEST: %s\n", c); ptr = cv; for (i = 0; i < 3; i++) { From 0bd0eeb7a6162b3cd202c9b14cc889c99bfe1a7b Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Thu, 26 Oct 2017 17:04:52 +0000 Subject: [PATCH 23/61] WIP --- scheme/base.sld | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scheme/base.sld b/scheme/base.sld index d3a8ee78..2fdd8ece 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -950,10 +950,12 @@ (define-c Cyc-make-string "(void *data, int argc, closure _, object k, object count, object fill)" " object s = NULL; + char ch_buf[5]; Cyc_check_int(data, count); - char c = obj_obj2char(fill); + char_type c = obj_obj2char(fill); + Cyc_utf8_encode_char(ch_buf, 5, &c); int num_cp = obj_obj2int(count); - int len = num_cp * uint32_num_bytes(c); +TODO: read encoded ch_buf int len = num_cp * uint32_num_bytes(c); if (len >= MAX_STACK_OBJ) { int heap_grown; s = gc_alloc(((gc_thread_data *)data)->heap, From 703f863e4885c950796d5a6e9fdbd5f95f9bd65f Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Thu, 26 Oct 2017 21:56:35 +0000 Subject: [PATCH 24/61] Fixes for make-string --- include/cyclone/runtime.h | 1 + scheme/base.sld | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 6720002b..3557ee6f 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -722,6 +722,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd); #define Cyc_utf8_encode_char(dest, dest_size, char_value) \ Cyc_utf8_encode(dest, dest_size, &char_value, 1) +int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz); uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); int Cyc_utf8_count_code_points(uint8_t* s); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); diff --git a/scheme/base.sld b/scheme/base.sld index 2fdd8ece..e2a54188 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -953,9 +953,9 @@ char ch_buf[5]; Cyc_check_int(data, count); char_type c = obj_obj2char(fill); - Cyc_utf8_encode_char(ch_buf, 5, &c); + Cyc_utf8_encode_char(ch_buf, 5, c); int num_cp = obj_obj2int(count); -TODO: read encoded ch_buf int len = num_cp * uint32_num_bytes(c); + int len = num_cp * strlen(ch_buf); if (len >= MAX_STACK_OBJ) { int heap_grown; s = gc_alloc(((gc_thread_data *)data)->heap, @@ -978,14 +978,15 @@ TODO: read encoded ch_buf int len = num_cp * uint32_num_bytes(c); ((string_type *)s)->num_cp = num_cp; ((string_type *)s)->str = alloca(sizeof(char) * (len + 1)); } - //if (num_cp == 1) { /* Fast path */ - memset(((string_type *)s)->str, c, len); - //} else { - // int i; - // uint32_t* - // for (i = 0; i < len; i++) { - // } - //} + if (0 && num_cp == 1) { /* Fast path */ + memset(((string_type *)s)->str, ch_buf[0], len); + } else { + char *buf = ((string_type *)s)->str; + int bi, si, slen = strlen(ch_buf); + for (bi = 0, si = 0; bi < len; bi++, si++) { + buf[bi] = ch_buf[si % slen]; + } + } ((string_type *)s)->str[len] = '\\0'; return_closcall1(data, k, s); ") From 77e391cabcc83c209c31a6739f907000915f3416 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Thu, 26 Oct 2017 22:35:11 +0000 Subject: [PATCH 25/61] Uncomment fast path --- scheme/base.sld | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scheme/base.sld b/scheme/base.sld index e2a54188..0a63fb55 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -978,7 +978,7 @@ ((string_type *)s)->num_cp = num_cp; ((string_type *)s)->str = alloca(sizeof(char) * (len + 1)); } - if (0 && num_cp == 1) { /* Fast path */ + if (num_cp == 1) { /* Fast path */ memset(((string_type *)s)->str, ch_buf[0], len); } else { char *buf = ((string_type *)s)->str; From 4a77296ddf9124ac1eb33a5ea8aa65cc1b06acd9 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 27 Oct 2017 12:44:06 +0000 Subject: [PATCH 26/61] Added UTF-8 support to list->string --- runtime.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/runtime.c b/runtime.c index d0e804e9..323ba363 100644 --- a/runtime.c +++ b/runtime.c @@ -1835,21 +1835,34 @@ object Cyc_string2symbol(void *data, object str) object Cyc_list2string(void *data, object cont, object lst) { - char *buf; - int i = 0; - object len; + char *buf, cbuf[5]; + int i = 0, len = 0; + object cbox, tmp = lst; + char_type ch; Cyc_check_pair_or_null(data, lst); - len = Cyc_length(data, lst); // Inefficient, walks whole list + + // Need to walk the list of chars to compute multibyte length + while (tmp) { + if (is_value_type(tmp) || ((list) tmp)->tag != pair_tag) { + Cyc_rt_raise2(data, "length - invalid parameter, expected list", tmp); + } + cbox = car(tmp); + ch = obj_obj2char(cbox); + if (!obj_is_char(cbox)) { + Cyc_rt_raise2(data, "Expected character but received", cbox); + } + len += Cyc_utf8_encode_char(cbuf, 5, ch); + tmp = cdr(tmp); + } { - make_string_noalloc(str, NULL, (obj_obj2int(len))); - str.str = buf = alloca(sizeof(char) * (obj_obj2int(len) + 1)); + make_string_noalloc(str, NULL, len); + str.str = buf = alloca(sizeof(char) * (len + 1)); while ((lst != NULL)) { - if (!obj_is_char(car(lst))) { - Cyc_rt_raise2(data, "Expected character but received", car(lst)); - } - buf[i++] = obj_obj2char(car(lst)); + cbox = car(lst); + ch = obj_obj2char(cbox); // Already validated, can assume chars now + i += Cyc_utf8_encode_char(&(buf[i]), 5, ch); lst = cdr(lst); } buf[i] = '\0'; From 6aaa600ebca7bd31c4233d5af580691d89075bdf Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 27 Oct 2017 13:01:04 +0000 Subject: [PATCH 27/61] Bugfixes: - Avoid unnecessary calls to `strlen` - Type check the `fill` parameter to `make-string` --- scheme/base.sld | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/scheme/base.sld b/scheme/base.sld index 0a63fb55..f5451340 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -951,11 +951,16 @@ "(void *data, int argc, closure _, object k, object count, object fill)" " object s = NULL; char ch_buf[5]; + char_type c; + int buflen, num_cp, len; Cyc_check_int(data, count); - char_type c = obj_obj2char(fill); - Cyc_utf8_encode_char(ch_buf, 5, c); - int num_cp = obj_obj2int(count); - int len = num_cp * strlen(ch_buf); + if (!obj_is_char(fill)) { + Cyc_rt_raise2(data, \"Expected character buf received\", fill); + } + c = obj_obj2char(fill); + buflen = Cyc_utf8_encode_char(ch_buf, 5, c); + num_cp = obj_obj2int(count); + len = num_cp * buflen; if (len >= MAX_STACK_OBJ) { int heap_grown; s = gc_alloc(((gc_thread_data *)data)->heap, @@ -982,7 +987,7 @@ memset(((string_type *)s)->str, ch_buf[0], len); } else { char *buf = ((string_type *)s)->str; - int bi, si, slen = strlen(ch_buf); + int bi, si, slen = buflen; for (bi = 0, si = 0; bi < len; bi++, si++) { buf[bi] = ch_buf[si % slen]; } From 8289eca02a832663513c7810c7de74632cedd1bc Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 27 Oct 2017 13:02:51 +0000 Subject: [PATCH 28/61] Remove obsolete function --- include/cyclone/runtime.h | 1 - runtime.c | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 3557ee6f..49648706 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -727,7 +727,6 @@ uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); int Cyc_utf8_count_code_points(uint8_t* s); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); uint32_t Cyc_utf8_validate(char *str, size_t len); -int uint32_num_bytes(uint32_t val); /**@}*/ #endif /* CYCLONE_RUNTIME_H */ diff --git a/runtime.c b/runtime.c index 323ba363..6178ca72 100644 --- a/runtime.c +++ b/runtime.c @@ -6586,13 +6586,13 @@ uint32_t Cyc_utf8_validate(char *str, size_t len) { return state; } -int uint32_num_bytes(uint32_t x) { - // TODO: could compute log(val) / log(256) - if (x < 0x100) return 1; - if (x < 0x10000) return 2; - if (x < 0x1000000) return 3; - return 4; -} +//int uint32_num_bytes(uint32_t x) { +// // TODO: could compute log(val) / log(256) +// if (x < 0x100) return 1; +// if (x < 0x10000) return 2; +// if (x < 0x1000000) return 3; +// return 4; +//} /** * This function takes one or more 32-bit chars and encodes them From a5d768a8a40c0a5c50b91b7a7123d2d843028e85 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 27 Oct 2017 13:17:34 +0000 Subject: [PATCH 29/61] Cyc_io_get_output_string - populate num_cp correctly --- mstreams.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mstreams.c b/mstreams.c index f1b1c50e..939d4047 100644 --- a/mstreams.c +++ b/mstreams.c @@ -102,6 +102,7 @@ void Cyc_io_get_output_string(void *data, object cont, object port) } { make_string_with_len(s, p->str_bv_in_mem_buf, p->str_bv_in_mem_buf_len); + s.num_cp = Cyc_utf8_count_code_points((uint8_t *)string_str(&s)); return_closcall1(data, cont, &s); } } From 0bcce5038ef561ce9e72964e35650075c3dc1155 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 27 Oct 2017 17:18:29 +0000 Subject: [PATCH 30/61] WIP --- include/cyclone/runtime.h | 1 + runtime.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 49648706..5c31471d 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -725,6 +725,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd); int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz); uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); int Cyc_utf8_count_code_points(uint8_t* s); +int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); uint32_t Cyc_utf8_validate(char *str, size_t len); /**@}*/ diff --git a/runtime.c b/runtime.c index 6178ca72..a744d288 100644 --- a/runtime.c +++ b/runtime.c @@ -6313,7 +6313,7 @@ object Cyc_io_read_line(void *data, object cont, object port) { FILE *stream = ((port_type *) port)->fp; char buf[1024]; - int len; + int len, num_cp; Cyc_check_port(data, port); if (stream == NULL) { @@ -6322,7 +6322,8 @@ object Cyc_io_read_line(void *data, object cont, object port) set_thread_blocked(data, cont); errno = 0; if (fgets(buf, 1023, stream) != NULL) { - len = strlen(buf); + // TODO: not good enough for UTF-8, what if we stopped reading in the middle of a code point? + Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &num_cp, &len); { // Remove any trailing CR / newline chars while (len > 0 && (buf[len - 1] == '\n' || @@ -6331,6 +6332,7 @@ object Cyc_io_read_line(void *data, object cont, object port) } buf[len] = '\0'; make_string_noalloc(s, buf, len); + s.num_cp = num_cp; return_thread_runnable(data, &s); } } else { @@ -6539,6 +6541,22 @@ int Cyc_utf8_count_code_points(uint8_t* s) { return count; } +int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes) { + uint32_t codepoint; + uint32_t state = 0; + *cpts = 0; + *bytes = 0; + for (; *s; ++s){ + *bytes += 1; + if (!Cyc_utf8_decode(&state, &codepoint, *s)) + *cpts += 1; + } + + if (state != CYC_UTF8_ACCEPT) + return -1; + return 0; +} + // TODO: index into X codepoint in a string /** From 3783da2674e4725d3e0adfb7987538250b2a8af1 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 30 Oct 2017 13:17:37 +0000 Subject: [PATCH 31/61] WIP - obj_obj2char fixes --- include/cyclone/types.h | 4 ++-- runtime.c | 23 +++++++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/cyclone/types.h b/include/cyclone/types.h index d91a1aa0..903b4ab7 100644 --- a/include/cyclone/types.h +++ b/include/cyclone/types.h @@ -494,12 +494,12 @@ typedef uint32_t char_type; /** * Convert from an object to a char. */ -#define obj_obj2char(x) (char)((long)(x)>>2) +#define obj_obj2char(x) (char_type)((uintmax_t)(x)>>2) /** * Convert from a char to an object. */ -#define obj_char2obj(c) ((void *)((((unsigned long)c)<<2) | 2)) +#define obj_char2obj(c) ((void *)((((uintmax_t)c)<<2) | 2)) /** * Is the given object a value type? diff --git a/runtime.c b/runtime.c index a744d288..dc5d9aa8 100644 --- a/runtime.c +++ b/runtime.c @@ -804,7 +804,10 @@ object Cyc_display(void *data, object x, FILE * port) return quote_void; } if (obj_is_char(x)) { - fprintf(port, "%c", obj_obj2char(x)); + char cbuf[5]; + char_type unbox = obj_obj2char(x); + Cyc_utf8_encode_char(cbuf, 5, unbox); + fprintf(port, "%s", cbuf); return quote_void; } if (obj_is_int(x)) { @@ -984,7 +987,7 @@ static object _Cyc_write(void *data, object x, FILE * port) return quote_void; } if (obj_is_char(x)) { - char c = obj_obj2char(x); + char_type c = obj_obj2char(x); switch (c) { case 0: fprintf(port, "#\\null"); break; case 7: fprintf(port, "#\\alarm"); break; @@ -995,11 +998,13 @@ static object _Cyc_write(void *data, object x, FILE * port) case 27: fprintf(port, "#\\escape"); break; case 32: fprintf(port, "#\\space"); break; case 127: fprintf(port, "#\\delete"); break; - default: - fprintf(port, "#\\%c", obj_obj2char(x)); + default: { + char cbuf[5]; + Cyc_utf8_encode_char(cbuf, 5, c); + fprintf(port, "#\\%s", cbuf); break; + } } - //fprintf(port, "#\\%c", obj_obj2char(x)); return quote_void; } if (obj_is_int(x)) { @@ -1097,7 +1102,10 @@ object Cyc_write_char(void *data, object c, object port) if (obj_is_char(c)) { FILE *fp = ((port_type *) port)->fp; if (fp){ - fprintf(fp, "%c", obj_obj2char(c)); + char cbuf[5]; + char_type unbox = obj_obj2char(c); + Cyc_utf8_encode_char(cbuf, 5, unbox); + fprintf(fp, "%s", cbuf); } } else { Cyc_rt_raise2(data, "Argument is not a character", c); @@ -2119,6 +2127,7 @@ object Cyc_string_set(void *data, object str, object k, object chr) // Take fast path if all chars are just 1 byte if (string_num_cp(str) == string_len(str)) { + // TODO: not good enough, chr could be multi-byte raw[idx] = obj_obj2char(chr); } else { fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), len); @@ -6323,6 +6332,8 @@ object Cyc_io_read_line(void *data, object cont, object port) errno = 0; if (fgets(buf, 1023, stream) != NULL) { // TODO: not good enough for UTF-8, what if we stopped reading in the middle of a code point? + // should reserve 3 extra bytes and, if last code point is not complete, read one byte at a + // time until it has been read Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &num_cp, &len); { // Remove any trailing CR / newline chars From 7f8cc02c5047f2c457e11b78ec5f5566ddd3ab34 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 30 Oct 2017 13:26:57 +0000 Subject: [PATCH 32/61] WIP - obj_char2obj --- runtime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime.c b/runtime.c index dc5d9aa8..5be2037e 100644 --- a/runtime.c +++ b/runtime.c @@ -2755,7 +2755,7 @@ object Cyc_char2integer(object chr) object Cyc_integer2char(void *data, object n) { - int val = 0; + char_type val = 0; Cyc_check_num(data, n); val = unbox_number(n); @@ -6095,7 +6095,7 @@ void _read_return_character(void *data, port_type *p) return_thread_runnable(data, obj_char2obj('\t')); } else if(strlen(p->tok_buf) > 1 && p->tok_buf[0] == 'x') { const char *buf = p->tok_buf + 1; - int result = strtol(buf, NULL, 16); + char_type result = strtol(buf, NULL, 16); return_thread_runnable(data, obj_char2obj(result)); } else { char buf[31]; From 118822f353444a43a9a7615b956e6e5268e7a931 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 30 Oct 2017 16:57:39 +0000 Subject: [PATCH 33/61] WIP --- runtime.c | 14 ++++++++++++-- test.c | 2 ++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/runtime.c b/runtime.c index 5be2037e..e57009a0 100644 --- a/runtime.c +++ b/runtime.c @@ -6001,8 +6001,15 @@ void _read_string(void *data, object cont, port_type *p) } buf[i] = '\0'; { - int result = (int)strtol(buf, NULL, 16); - p->tok_buf[p->tok_end++] = (char)result; + char_type result = strtol(buf, NULL, 16); + char cbuf[5]; + int i; + Cyc_utf8_encode_char(cbuf, 5, result); +// TODO: infinite loop here or above if ; is not provided??? + for (i = 0; cbuf[i] != 0; i++) { + _read_add_to_tok_buf(p, cbuf[i]); + } + //p->tok_buf[p->tok_end++] = (char)result; } break; } @@ -6014,7 +6021,10 @@ void _read_string(void *data, object cont, port_type *p) p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full? p->tok_end = 0; // Reset for next atom { +// TODO: need to change this below, but run into trouble in icyc, eg: +// (string-ref "ab\x3bb;" 2) crashes make_string(str, p->tok_buf); + //make_utf8_string(data, str, p->tok_buf); return_thread_runnable(data, &str); } } else if (c == '\\') { diff --git a/test.c b/test.c index 5f7996e8..ac48fef5 100644 --- a/test.c +++ b/test.c @@ -155,5 +155,7 @@ void main(){ encode(0x3bb); encode(65); encode(0xcebb); + + printf("%06X\n", 0x0fff); return; } From a38295b22b05b8eddcf67806c479cec9a5c8e938 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 30 Oct 2017 17:52:16 +0000 Subject: [PATCH 34/61] WIP --- include/cyclone/types.h | 2 +- runtime.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/cyclone/types.h b/include/cyclone/types.h index 903b4ab7..65dc2d99 100644 --- a/include/cyclone/types.h +++ b/include/cyclone/types.h @@ -776,7 +776,7 @@ typedef struct { cs.hdr.mark = gc_color_red; \ cs.hdr.grayed = 0; \ cs.tag = string_tag; \ - cs.num_cp = Cyc_utf8_count_code_points(s); \ + cs.num_cp = Cyc_utf8_count_code_points((uint8_t *)s); \ if (cs.num_cp < 0) { \ Cyc_rt_raise_msg(data, "Invalid UTF-8 characters in string"); \ } \ diff --git a/runtime.c b/runtime.c index e57009a0..a0910378 100644 --- a/runtime.c +++ b/runtime.c @@ -2143,7 +2143,7 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le int i = 0, count, start_len = 0, start_cp = 0; for (count = 0; *tmp; ++tmp){ - if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){ + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ if (count < idx) { start_len = i; start_cp = count; @@ -2193,7 +2193,7 @@ object Cyc_string_ref(void *data, object str, object k) int count; for (count = 0; *raw; ++raw){ - if (!Cyc_utf8_decode(&state, &codepoint, *raw)){ + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*raw)){ if (count == idx) break; // Reached requested index count += 1; } @@ -2241,7 +2241,7 @@ object Cyc_substring(void *data, object cont, object str, object start, int count, start_i = 0, end_i = 0; for (count = 0; *tmp; ++tmp){ - if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){ + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ if (count == s) { start_i = end_i; } else if (count == e) { @@ -2637,7 +2637,7 @@ object Cyc_string2utf8(void *data, object cont, object str, object start, char_type codepoint; uint32_t state = 0; for (i = 0; *tmp; ++tmp) { - if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){ + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ if (i == s) { start_i = i; } else if (i == e) { @@ -6023,8 +6023,8 @@ void _read_string(void *data, object cont, port_type *p) { // TODO: need to change this below, but run into trouble in icyc, eg: // (string-ref "ab\x3bb;" 2) crashes - make_string(str, p->tok_buf); - //make_utf8_string(data, str, p->tok_buf); + //make_string(str, p->tok_buf); + make_utf8_string(data, str, p->tok_buf); return_thread_runnable(data, &str); } } else if (c == '\\') { From 8585a9f3ccf92b7135426cf4b5c6a8dd7a8d27ad Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 30 Oct 2017 18:58:47 -0400 Subject: [PATCH 35/61] Test scaffold for Cyc_substring --- test.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test.c b/test.c index ac48fef5..1f2ef29c 100644 --- a/test.c +++ b/test.c @@ -125,6 +125,29 @@ void multi_byte_memset(char *buf, int blen, char *src, int slen) } } +void substring(int s, int e) { + uint8_t raw[] = {65, 66, 0xCE, 0xBB}; + + const char *tmp = raw; + uint32_t codepoint; + uint32_t state = 0; + int count, start_i = 0, end_i = 0; + + for (count = 0; *tmp; ++tmp){ + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ + if (count == s) { + start_i = end_i; + } else if (count == e) { + break; + } + count += 1; + } + end_i++; + } + raw[end_i] = '\0'; + printf("raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", raw, s, e, start_i, end_i); +} + void main(){ char c[128]; uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE @@ -157,5 +180,9 @@ void main(){ encode(0xcebb); printf("%06X\n", 0x0fff); + substring(0, 1); + substring(0, 2); + substring(1, 3); + substring(1, 4); return; } From 950d92615b2ff4e1bc735c8b3466e35116d9c6f9 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 31 Oct 2017 16:46:14 +0000 Subject: [PATCH 36/61] WIP --- test.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/test.c b/test.c index 1f2ef29c..c4d3e693 100644 --- a/test.c +++ b/test.c @@ -125,27 +125,38 @@ void multi_byte_memset(char *buf, int blen, char *src, int slen) } } -void substring(int s, int e) { - uint8_t raw[] = {65, 66, 0xCE, 0xBB}; - +void substring(int s, int e, const char *expected) { + uint8_t raw[] = {65, 66, 0xCE, 0xBB, 67}; const char *tmp = raw; uint32_t codepoint; uint32_t state = 0; - int count, start_i = 0, end_i = 0; - - for (count = 0; *tmp; ++tmp){ + int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0; + for (num_ch = 0; *tmp; ++tmp){ + //printf("char = %d\n", (int)*tmp); if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ - if (count == s) { + end_i += cur_ch_bytes; + num_ch += 1; + cur_ch_bytes = 0; + + if (num_ch == s) { start_i = end_i; - } else if (count == e) { + } + if (num_ch == e) { break; } - count += 1; + + //if (num_ch == s) { + // start_i = end_i; + //} else if (num_ch == (e - 1)) { + // end_i += cur_ch_bytes; + // if (s == e) start_i = end_i; + // break; + //} } - end_i++; + cur_ch_bytes++; } - raw[end_i] = '\0'; - printf("raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", raw, s, e, start_i, end_i); + raw[end_i + 1] = '\0'; + printf("expected=%s, raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", expected, raw + start_i, s, e, start_i, end_i); } void main(){ @@ -180,9 +191,12 @@ void main(){ encode(0xcebb); printf("%06X\n", 0x0fff); - substring(0, 1); - substring(0, 2); - substring(1, 3); - substring(1, 4); + substring(0, 1, "A "); + substring(0, 2, "AB "); + substring(1, 3, "Bx "); + substring(1, 4, "BxC "); + substring(2, 2, " "); + substring(2, 3, "x "); + substring(2, 4, "xC "); return; } From 509fd430224113efb0d804bd292fc43d4fbe1ee9 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 31 Oct 2017 17:58:17 -0400 Subject: [PATCH 37/61] Fixed substring --- runtime.c | 29 ++++++++++++++++++++++------- test.c | 13 ++----------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/runtime.c b/runtime.c index a0910378..c830e294 100644 --- a/runtime.c +++ b/runtime.c @@ -2238,19 +2238,34 @@ object Cyc_substring(void *data, object cont, object str, object start, const char *tmp = raw; char_type codepoint; uint32_t state = 0; - int count, start_i = 0, end_i = 0; - - for (count = 0; *tmp; ++tmp){ + int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0; + for (num_ch = 0; *tmp; ++tmp){ + cur_ch_bytes++; if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ - if (count == s) { + end_i += cur_ch_bytes; + num_ch += 1; + cur_ch_bytes = 0; + + if (num_ch == s) { start_i = end_i; - } else if (count == e) { + } + if (num_ch == e) { break; } - count += 1; } - end_i++; } + //int count, start_i = 0, end_i = 0; + //for (count = 0; *tmp; ++tmp){ + // if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ + // if (count == s) { + // start_i = end_i; + // } else if (count == e) { + // break; + // } + // count += 1; + // } + // end_i++; + //} if (state != CYC_UTF8_ACCEPT) Cyc_rt_raise2(data, "substring - invalid character in string", str); make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s); diff --git a/test.c b/test.c index c4d3e693..434c33cd 100644 --- a/test.c +++ b/test.c @@ -132,7 +132,7 @@ void substring(int s, int e, const char *expected) { uint32_t state = 0; int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0; for (num_ch = 0; *tmp; ++tmp){ - //printf("char = %d\n", (int)*tmp); + cur_ch_bytes++; if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ end_i += cur_ch_bytes; num_ch += 1; @@ -144,18 +144,9 @@ void substring(int s, int e, const char *expected) { if (num_ch == e) { break; } - - //if (num_ch == s) { - // start_i = end_i; - //} else if (num_ch == (e - 1)) { - // end_i += cur_ch_bytes; - // if (s == e) start_i = end_i; - // break; - //} } - cur_ch_bytes++; } - raw[end_i + 1] = '\0'; + raw[end_i] = '\0'; printf("expected=%s, raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", expected, raw + start_i, s, e, start_i, end_i); } From b1ea22c940f6d4325a9456e533d8d4734436e0d2 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 31 Oct 2017 18:41:52 -0400 Subject: [PATCH 38/61] Fixed (string->utf8) --- runtime.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/runtime.c b/runtime.c index c830e294..f5da4c46 100644 --- a/runtime.c +++ b/runtime.c @@ -2254,18 +2254,6 @@ object Cyc_substring(void *data, object cont, object str, object start, } } } - //int count, start_i = 0, end_i = 0; - //for (count = 0; *tmp; ++tmp){ - // if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ - // if (count == s) { - // start_i = end_i; - // } else if (count == e) { - // break; - // } - // count += 1; - // } - // end_i++; - //} if (state != CYC_UTF8_ACCEPT) Cyc_rt_raise2(data, "substring - invalid character in string", str); make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s); @@ -2647,21 +2635,25 @@ object Cyc_string2utf8(void *data, object cont, object str, object start, memcpy(&result.data[0], &(string_str(str))[s], len); _return_closcall1(data, cont, &result); } else { - int i, start_i = 0, end_i = 0; const char *tmp = string_str(str); char_type codepoint; uint32_t state = 0; - for (i = 0; *tmp; ++tmp) { + int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0; + for (num_ch = 0; *tmp; ++tmp){ + cur_ch_bytes++; if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ - if (i == s) { - start_i = i; - } else if (i == e) { + end_i += cur_ch_bytes; + num_ch += 1; + cur_ch_bytes = 0; + + if (num_ch == s) { + start_i = end_i; + } + if (num_ch == e) { break; } } - i++; } - end_i = i; result.len = end_i - start_i; result.data = alloca(sizeof(char) * result.len); memcpy(&result.data[0], &(string_str(str))[start_i], result.len); From 734a6e1911ce7d94319472ab9dc070756ad5a0e5 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 31 Oct 2017 20:54:21 +0000 Subject: [PATCH 39/61] Allow read-char to handle unicode characters --- runtime.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/runtime.c b/runtime.c index f5da4c46..bb1fc51b 100644 --- a/runtime.c +++ b/runtime.c @@ -6319,17 +6319,21 @@ object Cyc_io_peek_char(void *data, object cont, object port) object Cyc_io_read_char(void *data, object cont, object port) { port_type *p = (port_type *)port; - int c; Cyc_check_port(data, port); if (p->fp == NULL) { Cyc_rt_raise2(data, "Unable to read from closed port: ", port); } { + uint32_t state = CYC_UTF8_ACCEPT; + char_type codepoint; + int c; set_thread_blocked(data, cont); - _read_next_char(data, cont, p); - c = p->mem_buf[p->buf_idx++]; + do { + _read_next_char(data, cont, p); + c = p->mem_buf[p->buf_idx++]; + } while(Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)); p->col_num++; - return_thread_runnable(data, (c != EOF) ? obj_char2obj(c) : Cyc_EOF); + return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF); } return Cyc_EOF; } From 3aa2a159b7eab99713a9dedb59b3882049d6a4d0 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Thu, 2 Nov 2017 17:41:26 -0400 Subject: [PATCH 40/61] Bugfix: Cyc_utf8_encode returns char count, not bytes --- runtime.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/runtime.c b/runtime.c index bb1fc51b..690488ea 100644 --- a/runtime.c +++ b/runtime.c @@ -1860,7 +1860,8 @@ object Cyc_list2string(void *data, object cont, object lst) if (!obj_is_char(cbox)) { Cyc_rt_raise2(data, "Expected character but received", cbox); } - len += Cyc_utf8_encode_char(cbuf, 5, ch); + Cyc_utf8_encode_char(cbuf, 5, ch); + len += strlen(cbuf); tmp = cdr(tmp); } @@ -1870,7 +1871,8 @@ object Cyc_list2string(void *data, object cont, object lst) while ((lst != NULL)) { cbox = car(lst); ch = obj_obj2char(cbox); // Already validated, can assume chars now - i += Cyc_utf8_encode_char(&(buf[i]), 5, ch); + Cyc_utf8_encode_char(&(buf[i]), 5, ch); + i += strlen(buf+i); lst = cdr(lst); } buf[i] = '\0'; @@ -6013,6 +6015,7 @@ void _read_string(void *data, object cont, port_type *p) int i; Cyc_utf8_encode_char(cbuf, 5, result); // TODO: infinite loop here or above if ; is not provided??? +// only because it is still waiting for the ; after it reads the closing quote for (i = 0; cbuf[i] != 0; i++) { _read_add_to_tok_buf(p, cbuf[i]); } From bbe8fbb97070cd81eadb8b4d10756ae8cefbbbe3 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Thu, 2 Nov 2017 18:00:10 -0400 Subject: [PATCH 41/61] Allow read_return_character to parse UTF8 chars --- runtime.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/runtime.c b/runtime.c index 690488ea..801857b7 100644 --- a/runtime.c +++ b/runtime.c @@ -6118,9 +6118,23 @@ void _read_return_character(void *data, port_type *p) char_type result = strtol(buf, NULL, 16); return_thread_runnable(data, obj_char2obj(result)); } else { - char buf[31]; - snprintf(buf, 30, "Unable to parse character %s", p->tok_buf); - _read_error(data, p, buf); + uint32_t state = CYC_UTF8_ACCEPT; + char_type codepoint; + uint8_t *s = (uint8_t *)p->tok_buf; + while(s) { + if (!Cyc_utf8_decode(&state, &codepoint, *s)) { + s++; + break; + } + s++; + } + if (state == CYC_UTF8_ACCEPT && *s == '\0') { + return_thread_runnable(data, obj_char2obj(codepoint)); + } else { + char buf[31]; + snprintf(buf, 30, "Unable to parse character %s", p->tok_buf); + _read_error(data, p, buf); + } } } From 67398186d0aa26eecd5ab873c9458d0d32939417 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 3 Nov 2017 14:41:58 +0000 Subject: [PATCH 42/61] Added comments --- runtime.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime.c b/runtime.c index 801857b7..bfd0f64b 100644 --- a/runtime.c +++ b/runtime.c @@ -6094,6 +6094,7 @@ void _read_return_character(void *data, port_type *p) p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full? p->tok_end = 0; // Reset for next atom if (strlen(p->tok_buf) == 1) { + // ASCII char, consider merging with below? return_thread_runnable(data, obj_char2obj(p->tok_buf[0])); } else if(strncmp(p->tok_buf, "alarm", 5) == 0) { return_thread_runnable(data, obj_char2obj('\a')); @@ -6118,6 +6119,7 @@ void _read_return_character(void *data, port_type *p) char_type result = strtol(buf, NULL, 16); return_thread_runnable(data, obj_char2obj(result)); } else { + // Try to read a UTF-8 char and if so return it, otherwise throw an error uint32_t state = CYC_UTF8_ACCEPT; char_type codepoint; uint8_t *s = (uint8_t *)p->tok_buf; From 6910e3e4cb31cfe56e21069f1827d615eb28b671 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Fri, 3 Nov 2017 14:51:34 +0000 Subject: [PATCH 43/61] Added TODO --- runtime.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/runtime.c b/runtime.c index bfd0f64b..a0e140e4 100644 --- a/runtime.c +++ b/runtime.c @@ -5944,6 +5944,28 @@ static void _read_add_to_tok_buf(port_type *p, char c) p->tok_buf[p->tok_end++] = c; } +/** + * @brief Determine if given string is numeric + */ +int _read_is_numeric(const char *tok) +{ + int len = strlen(tok); + return (len && + ((isdigit(tok[0])) || + ((len > 1) && tok[0] == '.' && isdigit(tok[1])) || + ((len > 1) && (tok[1] == '.' || isdigit(tok[1])) && (tok[0] == '-' || tok[0] == '+')))); +} + +/** + * @brief Helper function, determine if given number is a hex digit + * @param c Character to check + */ +int _read_is_hex_digit(char c) +{ + return (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F'); +} + /** * @brief Helper function to read a string * @param data Thread data object @@ -6003,6 +6025,10 @@ void _read_string(void *data, object cont, port_type *p) p->buf_idx++; break; } + // TODO: verify if hex digit is valid + //if (!isdigit(p->buf_idx) && !_read_is_hex_digit(p->buf_idx)) { + // _read_error(data, p, "invalid hex digit in string"); + //} buf[i] = p->mem_buf[p->buf_idx]; p->buf_idx++; p->col_num++; @@ -6168,28 +6194,6 @@ void _read_character(void *data, port_type *p) } } -/** - * @brief Determine if given string is numeric - */ -int _read_is_numeric(const char *tok) -{ - int len = strlen(tok); - return (len && - ((isdigit(tok[0])) || - ((len > 1) && tok[0] == '.' && isdigit(tok[1])) || - ((len > 1) && (tok[1] == '.' || isdigit(tok[1])) && (tok[0] == '-' || tok[0] == '+')))); -} - -/** - * @brief Helper function, determine if given number is a hex digit - * @param c Character to check - */ -int _read_is_hex_digit(char c) -{ - return (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F'); -} - /** * @brief Helper function, return read number. * @param data Thread data object From d431b2af1c4835b8c13703677dac2e0a46840865 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 6 Nov 2017 13:19:31 +0000 Subject: [PATCH 44/61] Updated Cyc_io_read_line to prevent truncation Ensure last codepoint is fully-read before returning --- include/cyclone/runtime.h | 2 +- runtime.c | 32 ++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 5c31471d..4529fa85 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -725,7 +725,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd); int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz); uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); int Cyc_utf8_count_code_points(uint8_t* s); -int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes); +int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); uint32_t Cyc_utf8_validate(char *str, size_t len); /**@}*/ diff --git a/runtime.c b/runtime.c index a0e140e4..a7de71fd 100644 --- a/runtime.c +++ b/runtime.c @@ -6365,8 +6365,10 @@ object Cyc_io_read_char(void *data, object cont, object port) object Cyc_io_read_line(void *data, object cont, object port) { FILE *stream = ((port_type *) port)->fp; - char buf[1024]; - int len, num_cp; + char buf[1027]; + int len, num_cp, i = 0; + char_type codepoint; + uint32_t state; Cyc_check_port(data, port); if (stream == NULL) { @@ -6375,10 +6377,21 @@ object Cyc_io_read_line(void *data, object cont, object port) set_thread_blocked(data, cont); errno = 0; if (fgets(buf, 1023, stream) != NULL) { - // TODO: not good enough for UTF-8, what if we stopped reading in the middle of a code point? - // should reserve 3 extra bytes and, if last code point is not complete, read one byte at a - // time until it has been read - Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &num_cp, &len); + state = Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &codepoint, &num_cp, &len); + // Check if we stopped reading in the middle of a code point and + // if so, read one byte at a time until that code point is finished. + while (state != CYC_UTF8_ACCEPT && i < 3) { + int c = fgetc(stream); + buf[len] = c; + len++; + Cyc_utf8_decode(&state, &codepoint, (uint8_t)c); + if (state == CYC_UTF8_ACCEPT) { + num_cp++; + break; + } + i++; + } + { // Remove any trailing CR / newline chars while (len > 0 && (buf[len - 1] == '\n' || @@ -6596,19 +6609,18 @@ int Cyc_utf8_count_code_points(uint8_t* s) { return count; } -int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes) { - uint32_t codepoint; +int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes) { uint32_t state = 0; *cpts = 0; *bytes = 0; for (; *s; ++s){ *bytes += 1; - if (!Cyc_utf8_decode(&state, &codepoint, *s)) + if (!Cyc_utf8_decode(&state, codepoint, *s)) *cpts += 1; } if (state != CYC_UTF8_ACCEPT) - return -1; + return state; return 0; } From 9962bca854e749ee6cf4309700333f4c752bf020 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 6 Nov 2017 14:12:21 +0000 Subject: [PATCH 45/61] Validate hex digits in string with the \x; syntax --- runtime.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/runtime.c b/runtime.c index a7de71fd..dccdc380 100644 --- a/runtime.c +++ b/runtime.c @@ -6025,10 +6025,12 @@ void _read_string(void *data, object cont, port_type *p) p->buf_idx++; break; } - // TODO: verify if hex digit is valid - //if (!isdigit(p->buf_idx) && !_read_is_hex_digit(p->buf_idx)) { - // _read_error(data, p, "invalid hex digit in string"); - //} + // Verify if hex digit is valid + if (!isdigit(p->mem_buf[p->buf_idx]) && + !_read_is_hex_digit(p->mem_buf[p->buf_idx])) { + p->buf_idx++; + _read_error(data, p, "invalid hex digit in string"); + } buf[i] = p->mem_buf[p->buf_idx]; p->buf_idx++; p->col_num++; @@ -6040,8 +6042,6 @@ void _read_string(void *data, object cont, port_type *p) char cbuf[5]; int i; Cyc_utf8_encode_char(cbuf, 5, result); -// TODO: infinite loop here or above if ; is not provided??? -// only because it is still waiting for the ; after it reads the closing quote for (i = 0; cbuf[i] != 0; i++) { _read_add_to_tok_buf(p, cbuf[i]); } From 471f0d4b5042a0782faac44bc5d4ab0e88db3c77 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 6 Nov 2017 16:00:11 +0000 Subject: [PATCH 46/61] UTF8 support --- scheme/process-context.sld | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scheme/process-context.sld b/scheme/process-context.sld index f8ce64f2..e5cfa5c3 100644 --- a/scheme/process-context.sld +++ b/scheme/process-context.sld @@ -24,7 +24,7 @@ for (i = _cyc_argc; i > 0; i--) { object ps = alloca(sizeof(string_type)); object pl = alloca(sizeof(pair_type)); - make_string(s, _cyc_argv[i - 1]); + make_utf8_string(data, s, _cyc_argv[i - 1]); memcpy(ps, &s, sizeof(string_type)); ((list)pl)->hdr.mark = gc_color_red; ((list)pl)->hdr.grayed = 0; @@ -44,7 +44,7 @@ if (v == NULL) { return_closcall1(data, k, boolean_f); } else { - make_string(str, v); + make_utf8_string(data, str, v); return_closcall1(data, k, &str); } ") From ec5ef86b6ae7ac2ce3fd16bf2809d49d0848fe50 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 6 Nov 2017 16:00:55 +0000 Subject: [PATCH 47/61] Do not use make_string for UTF8 strings --- runtime.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/runtime.c b/runtime.c index dccdc380..e96f8a37 100644 --- a/runtime.c +++ b/runtime.c @@ -560,7 +560,7 @@ void Cyc_rt_raise(void *data, object err) void Cyc_rt_raise2(void *data, const char *msg, object err) { - make_string(s, msg); + make_utf8_string(data, s, msg); make_pair(c3, err, NULL); make_pair(c2, &s, &c3); make_pair(c1, boolean_f, &c2); @@ -573,7 +573,7 @@ void Cyc_rt_raise2(void *data, const char *msg, object err) void Cyc_rt_raise_msg(void *data, const char *err) { - make_string(s, err); + make_utf8_string(data, s, err); Cyc_rt_raise(data, &s); } @@ -1826,7 +1826,7 @@ object Cyc_symbol2string(void *data, object cont, object sym) Cyc_check_sym(data, sym); { const char *desc = symbol_desc(sym); - make_string(str, desc); + make_utf8_string(data, str, desc); _return_closcall1(data, cont, &str); }} @@ -2273,22 +2273,22 @@ object Cyc_installation_dir(void *data, object cont, object type) strncmp(((symbol) type)->desc, "sld", 5) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_SLD); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (Cyc_is_symbol(type) == boolean_t && strncmp(((symbol) type)->desc, "lib", 5) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_LIB); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (Cyc_is_symbol(type) == boolean_t && strncmp(((symbol) type)->desc, "inc", 5) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_INC); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else { - make_string(str, CYC_INSTALL_DIR); + make_utf8_string(data, str, CYC_INSTALL_DIR); _return_closcall1(data, cont, &str); } } @@ -2302,22 +2302,22 @@ object Cyc_compilation_environment(void *data, object cont, object var) if (strncmp(((symbol) var)->desc, "cc-prog", 8) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_CC_PROG); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (strncmp(((symbol) var)->desc, "cc-exec", 8) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_CC_EXEC); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (strncmp(((symbol) var)->desc, "cc-lib", 7) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_CC_LIB); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (strncmp(((symbol) var)->desc, "cc-so", 6) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_CC_SO); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } } @@ -2343,7 +2343,7 @@ object Cyc_command_line_arguments(void *data, object cont) for (i = _cyc_argc; i > 1; i--) { // skip program name object ps = alloca(sizeof(string_type)); object pl = alloca(sizeof(pair_type)); - make_string(s, _cyc_argv[i - 1]); + make_utf8_string(data, s, _cyc_argv[i - 1]); memcpy(ps, &s, sizeof(string_type)); ((list) pl)->hdr.mark = gc_color_red; ((list) pl)->hdr.grayed = 0; @@ -5775,7 +5775,7 @@ void Cyc_import_shared_object(void *data, object cont, object filename, object e handle = dlopen(string_str(filename), RTLD_GLOBAL | RTLD_LAZY); if (handle == NULL) { snprintf(buffer, 256, "%s", dlerror()); - make_string(s, buffer); + make_utf8_string(data, s, buffer); Cyc_rt_raise2(data, "Unable to import library", &s); } dlerror(); /* Clear any existing error */ @@ -5783,7 +5783,7 @@ void Cyc_import_shared_object(void *data, object cont, object filename, object e entry_pt = (function_type) dlsym(handle, string_str(entry_pt_fnc)); if (entry_pt == NULL) { snprintf(buffer, 256, "%s, %s, %s", string_str(filename), string_str(entry_pt_fnc), dlerror()); - make_string(s, buffer); + make_utf8_string(data, s, buffer); Cyc_rt_raise2(data, "Unable to load symbol", &s); } mclosure1(clo, entry_pt, cont); @@ -5832,6 +5832,7 @@ void _read_error(void *data, port_type *p, const char *msg) // the cont could receive an error and raise it though //Cyc_rt_raise_msg(data, buf); make_string(str, buf); + str.num_cp = Cyc_utf8_count_code_points((uint8_t *)buf); make_empty_vector(vec); vec.num_elements = 1; vec.elements = (object *) alloca(sizeof(object) * vec.num_elements); @@ -6057,9 +6058,6 @@ void _read_string(void *data, object cont, port_type *p) p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full? p->tok_end = 0; // Reset for next atom { -// TODO: need to change this below, but run into trouble in icyc, eg: -// (string-ref "ab\x3bb;" 2) crashes - //make_string(str, p->tok_buf); make_utf8_string(data, str, p->tok_buf); return_thread_runnable(data, &str); } @@ -6273,6 +6271,7 @@ void _read_return_atom(void *data, object cont, port_type *p) if (_read_is_numeric(p->tok_buf)) { make_string(str, p->tok_buf); + str.num_cp = Cyc_utf8_count_code_points((uint8_t *)(p->tok_buf)); make_c_opaque(opq, &str); return_thread_runnable(data, &opq); } else if (strncmp("+inf.0", p->tok_buf, 6) == 0 || From d43d019c2077fe7adf7e086f4afd5c648e9388d0 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 6 Nov 2017 17:06:12 +0000 Subject: [PATCH 48/61] Fix UTF8 support for pack_env_variables() --- runtime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime.c b/runtime.c index e96f8a37..81b48222 100644 --- a/runtime.c +++ b/runtime.c @@ -178,10 +178,10 @@ void pack_env_variables(void *data, object k) svar->hdr.grayed = 0; svar->tag = string_tag; svar->len = eqpos - e; - svar->num_cp = svar->len; // TODO: proper UTF-8 support! svar->str = alloca(sizeof(char) * (svar->len)); strncpy(svar->str, e, svar->len); (svar->str)[svar->len] = '\0'; + svar->num_cp = Cyc_utf8_count_code_points((uint8_t *)svar->str); if (eqpos) { eqpos++; @@ -190,7 +190,7 @@ void pack_env_variables(void *data, object k) sval->hdr.grayed = 0; sval->tag = string_tag; sval->len = strlen(eqpos); - sval->num_cp = sval->len; // TODO: proper UTF-8 support! + svar->num_cp = Cyc_utf8_count_code_points((uint8_t *)eqpos); sval->str = eqpos; set_pair(tmp, svar, sval); set_pair(p, tmp, NULL); From 348ed7205c49d81f1d0fd66cb9e813335d602f00 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 6 Nov 2017 17:46:56 +0000 Subject: [PATCH 49/61] Added a TODO for peek-char and UTF8 --- runtime.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/runtime.c b/runtime.c index 81b48222..8318ed93 100644 --- a/runtime.c +++ b/runtime.c @@ -6304,6 +6304,8 @@ object Cyc_io_peek_char(void *data, object cont, object port) { FILE *stream; port_type *p; + uint32_t state = CYC_UTF8_ACCEPT; + char_type codepoint; int c; Cyc_check_port(data, port); @@ -6316,7 +6318,13 @@ object Cyc_io_peek_char(void *data, object cont, object port) set_thread_blocked(data, cont); _read_next_char(data, cont, p); c = p->mem_buf[p->buf_idx]; - return_thread_runnable(data, (c != EOF) ? obj_char2obj(c) : Cyc_EOF); + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) { + // TODO: only have a partial UTF8 code point, read more chars. + // Problem is that there may not be enough space to store them + // and do need to set them aside since we are just peeking here + // and not actually supposed to be reading past chars. + } + return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF); } return Cyc_EOF; } From 42507606a53374a330dcc4b162ef7944eb546c22 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 6 Nov 2017 17:54:00 +0000 Subject: [PATCH 50/61] Added Cyc_string_byte_length() --- include/cyclone/runtime.h | 1 + runtime.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 4529fa85..80ea3f87 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -194,6 +194,7 @@ int binstr2int(const char *str); int octstr2int(const char *str); object Cyc_string_append(void *data, object cont, int argc, object str1, ...); object Cyc_string_length(void *data, object str); +object Cyc_string_byte_length(void *data, object str); object Cyc_substring(void *data, object cont, object str, object start, object end); object Cyc_string_ref(void *data, object str, object k); diff --git a/runtime.c b/runtime.c index 8318ed93..e7792702 100644 --- a/runtime.c +++ b/runtime.c @@ -2109,6 +2109,12 @@ object Cyc_string_length(void *data, object str) return obj_int2obj(string_num_cp(str)); } +object Cyc_string_byte_length(void *data, object str) +{ + Cyc_check_str(data, str); + return obj_int2obj(string_len(str)); +} + object Cyc_string_set(void *data, object str, object k, object chr) { char *raw; From cfdec73d78b88ecc2af7f63eaa99dad224ab5782 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Mon, 6 Nov 2017 18:57:56 +0000 Subject: [PATCH 51/61] Emit strings with char/byte lengths --- scheme/cyclone/cgen.sld | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scheme/cyclone/cgen.sld b/scheme/cyclone/cgen.sld index 7cdd74f0..d3456d11 100644 --- a/scheme/cyclone/cgen.sld +++ b/scheme/cyclone/cgen.sld @@ -521,7 +521,15 @@ (string-append "&" cvar-name) ; Code is just the variable name (list ; Allocate integer on the C stack (string-append - "make_string(" cvar-name ", " (->cstr exp) ");"))))) + "make_utf8_string_with_len(" + cvar-name + ", " + (->cstr exp) + ", " + (number->string (string-byte-length exp)) + ", " + (number->string (string-length exp)) + ");"))))) ;TODO: not good enough, need to store new symbols in a table so they can ;be inserted into the C program ((symbol? exp) @@ -536,6 +544,10 @@ (define (->cstr str) (string-append "\"" (cstr:escape-chars str) "\"")) +(define-c string-byte-length + "(void *data, int argc, closure _, object k, object s)" + " return_closcall1(data, k, Cyc_string_byte_length(data, s)); ") + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Primitives From f5787184dae2e98a736f63225aafe90a68923514 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 7 Nov 2017 18:18:56 +0000 Subject: [PATCH 52/61] WIP - string-set! --- runtime.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/runtime.c b/runtime.c index e7792702..f129899a 100644 --- a/runtime.c +++ b/runtime.c @@ -2117,8 +2117,10 @@ object Cyc_string_byte_length(void *data, object str) object Cyc_string_set(void *data, object str, object k, object chr) { + char buf[5]; char *raw; - int idx, len; + int idx, len, buf_len; + char_type input_char; Cyc_check_str(data, str); Cyc_check_num(data, k); @@ -2127,6 +2129,10 @@ object Cyc_string_set(void *data, object str, object k, object chr) Cyc_rt_raise2(data, "Expected char but received", chr); } + input_char = obj_obj2char(chr); + Cyc_utf8_encode_char(buf, 5, input_char); + buf_len = strlen(buf); + raw = string_str(str); idx = unbox_number(k); len = string_len(str); @@ -2134,8 +2140,7 @@ object Cyc_string_set(void *data, object str, object k, object chr) Cyc_check_bounds(data, "string-set!", len, idx); // Take fast path if all chars are just 1 byte - if (string_num_cp(str) == string_len(str)) { - // TODO: not good enough, chr could be multi-byte + if (string_num_cp(str) == string_len(str) && buf_len == 1) { raw[idx] = obj_obj2char(chr); } else { fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), len); @@ -2148,9 +2153,10 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le char *tmp = raw; char_type codepoint; uint32_t state = 0; - int i = 0, count, start_len = 0, start_cp = 0; + int i = 0, count, start_len = 0, start_cp = 0, bytes = 0; for (count = 0; *tmp; ++tmp){ + bytes++; if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ if (count < idx) { start_len = i; @@ -2159,6 +2165,7 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le break; } count += 1; + bytes = 0; } i++; } @@ -2171,6 +2178,13 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le // and we know the codepoint to be replaced. by calculating its length // we can compute where the end portion starts, and by using str we can // figure out how many remaining bytes/codepoints are in end + // + // 3 cases: + // - buf_len = bytes, just straight replace + // - buf_len > bytes, will need to allocate more memory (!!) + // - buf_len < bytes, just replace, but pad with NULL chars. + // in this case need to ensure string_len is not + // reduced because original value still matters for GC purposes } return str; From 02014322b736dc451df61d835ac8172c80e93dbd Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 7 Nov 2017 17:47:08 -0500 Subject: [PATCH 53/61] Properly count bytes in make-string --- scheme/base.sld | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scheme/base.sld b/scheme/base.sld index f5451340..5ab42fd0 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -958,7 +958,8 @@ Cyc_rt_raise2(data, \"Expected character buf received\", fill); } c = obj_obj2char(fill); - buflen = Cyc_utf8_encode_char(ch_buf, 5, c); + Cyc_utf8_encode_char(ch_buf, 5, c); + buflen = strlen(ch_buf); num_cp = obj_obj2int(count); len = num_cp * buflen; if (len >= MAX_STACK_OBJ) { From d584cf059ec09acb752c543c1ded6b49ed46ac67 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 7 Nov 2017 18:13:12 -0500 Subject: [PATCH 54/61] Partial fixes to string-set! --- runtime.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/runtime.c b/runtime.c index f129899a..dbb1b448 100644 --- a/runtime.c +++ b/runtime.c @@ -2150,27 +2150,26 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le // or don't allocate if chr uses as many or fewer bytes // than the codepoint it is replacing - char *tmp = raw; + char *tmp = raw, *this_cp = raw; char_type codepoint; uint32_t state = 0; - int i = 0, count, start_len = 0, start_cp = 0, bytes = 0; + int i = 0, count, bytes = 0; for (count = 0; *tmp; ++tmp){ bytes++; if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ - if (count < idx) { - start_len = i; - start_cp = count; - } else if (count == idx) { + if (count == idx) { break; } + this_cp = tmp + 1; count += 1; bytes = 0; } i++; } - if (state != CYC_UTF8_ACCEPT) + if (state != CYC_UTF8_ACCEPT) { Cyc_rt_raise2(data, "string-set! - invalid character at index", k); + } // TODO: perform actual mutation // @@ -2181,11 +2180,18 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le // // 3 cases: // - buf_len = bytes, just straight replace + if (buf_len == bytes) { + for (i = 0; i < buf_len; i++) { + this_cp[i] = buf[i]; + } + } // - buf_len > bytes, will need to allocate more memory (!!) // - buf_len < bytes, just replace, but pad with NULL chars. // in this case need to ensure string_len is not // reduced because original value still matters for GC purposes - + else { + Cyc_rt_raise2(data, "string-set! - unable to modify character", chr); + } } return str; } From 61a18d8fb35ea1e59c8889119a7fd97c41c15c7d Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Tue, 7 Nov 2017 18:39:12 -0500 Subject: [PATCH 55/61] WIP --- runtime.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/runtime.c b/runtime.c index dbb1b448..8e6a045a 100644 --- a/runtime.c +++ b/runtime.c @@ -2189,6 +2189,12 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le // - buf_len < bytes, just replace, but pad with NULL chars. // in this case need to ensure string_len is not // reduced because original value still matters for GC purposes + //else if (buf_len < bytes) { + // for (i = 0; i < buf_len; i++) { + // this_cp[i] = buf[i]; + // } + // TODO: memcpy remaining string, ensure trailing null is setup correctly, consolidate with above?? + //} else { Cyc_rt_raise2(data, "string-set! - unable to modify character", chr); } From 0f4a7b30c1d123c6e9fca1db917adbc6661da8f2 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Wed, 8 Nov 2017 14:55:57 +0000 Subject: [PATCH 56/61] Fixes for string-set! Handle setting of a char to one that is represented using fewer bytes. --- runtime.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/runtime.c b/runtime.c index 8e6a045a..f2a8ce9f 100644 --- a/runtime.c +++ b/runtime.c @@ -2139,39 +2139,35 @@ object Cyc_string_set(void *data, object str, object k, object chr) Cyc_check_bounds(data, "string-set!", len, idx); - // Take fast path if all chars are just 1 byte if (string_num_cp(str) == string_len(str) && buf_len == 1) { + // Take fast path if all chars are just 1 byte raw[idx] = obj_obj2char(chr); } else { -fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), len); - // TODO: utf8 support - // find codepoint at k, figure out how many bytes it is, - // allocate a new string (start) + chr + (end) - // or don't allocate if chr uses as many or fewer bytes - // than the codepoint it is replacing - + // Slower path for UTF-8, need to handle replacement differently + // depending upon how the new char affects length of the string char *tmp = raw, *this_cp = raw; char_type codepoint; uint32_t state = 0; - int i = 0, count, bytes = 0; + int i = 0, count, prev_cp_bytes = 0, cp_idx; for (count = 0; *tmp; ++tmp){ - bytes++; + prev_cp_bytes++; if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ if (count == idx) { break; } this_cp = tmp + 1; count += 1; - bytes = 0; + prev_cp_bytes = 0; } i++; } + cp_idx = i; if (state != CYC_UTF8_ACCEPT) { Cyc_rt_raise2(data, "string-set! - invalid character at index", k); } - // TODO: perform actual mutation + // Perform actual mutation // // Now we know length of start (both in codepoints and bytes), // and we know the codepoint to be replaced. by calculating its length @@ -2179,22 +2175,26 @@ fprintf(stderr, "DEBUG %s, num_cp = %d, len = %d\n", raw, string_num_cp(str), le // figure out how many remaining bytes/codepoints are in end // // 3 cases: - // - buf_len = bytes, just straight replace - if (buf_len == bytes) { + // - 1) buf_len = prev_cp_bytes, just straight replace + if (buf_len == prev_cp_bytes) { for (i = 0; i < buf_len; i++) { this_cp[i] = buf[i]; } } - // - buf_len > bytes, will need to allocate more memory (!!) - // - buf_len < bytes, just replace, but pad with NULL chars. - // in this case need to ensure string_len is not - // reduced because original value still matters for GC purposes - //else if (buf_len < bytes) { - // for (i = 0; i < buf_len; i++) { - // this_cp[i] = buf[i]; - // } - // TODO: memcpy remaining string, ensure trailing null is setup correctly, consolidate with above?? - //} + // - 2) buf_len < prev_cp_bytes, replace and shift chars down + else if (buf_len < prev_cp_bytes) { + // Replace code point with shorter one + for (i = 0; i < buf_len; i++) { + this_cp[i] = buf[i]; + } + // Move string down to eliminate unneeded chars + memmove(this_cp + buf_len, this_cp + prev_cp_bytes, len - cp_idx); + // Null terminate the shorter string. + // Ensure string_len is not reduced because original + // value still matters for GC purposes + raw[len - (prev_cp_bytes - buf_len)] = '\0'; + } + // - 3) TODO: buf_len > prev_cp_bytes, will need to allocate more memory (!!) else { Cyc_rt_raise2(data, "string-set! - unable to modify character", chr); } From 40b729e11bfdce80db77e80f8b4a1efc7d21bc15 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Wed, 8 Nov 2017 18:38:31 +0000 Subject: [PATCH 57/61] WIP - peek-char UTF8 support --- runtime.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/runtime.c b/runtime.c index f2a8ce9f..524e0b5e 100644 --- a/runtime.c +++ b/runtime.c @@ -6338,7 +6338,8 @@ object Cyc_io_peek_char(void *data, object cont, object port) port_type *p; uint32_t state = CYC_UTF8_ACCEPT; char_type codepoint; - int c; + int c, i = 0, at_mem_buf_end = 0; + char buf[5]; Cyc_check_port(data, port); { @@ -6348,14 +6349,39 @@ object Cyc_io_peek_char(void *data, object cont, object port) Cyc_rt_raise2(data, "Unable to read from closed port: ", port); } set_thread_blocked(data, cont); - _read_next_char(data, cont, p); + if (p->mem_buf_len == 0 || p->mem_buf_len == p->buf_idx) { + _read_next_char(data, cont, p); + } c = p->mem_buf[p->buf_idx]; - if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) { - // TODO: only have a partial UTF8 code point, read more chars. + if (Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) { + // Only have a partial UTF8 code point, read more chars. // Problem is that there may not be enough space to store them // and do need to set them aside since we are just peeking here // and not actually supposed to be reading past chars. + + buf[0] = c; + i = 1; + while (1) { // TODO: limit to 4 chars?? + if (p->mem_buf_len == p->buf_idx + i) { + // No more buffered chars + at_mem_buf_end = 1; + c = fgetc(stream); + if (c == EOF) break; // TODO: correct to do this here???? + } else { + c = p->mem_buf[p->buf_idx + i]; + } + buf[i++] = c; + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) { + break; + } + } } + if (at_mem_buf_end && c != EOF) { + p->buf_idx = 0; + p->mem_buf_len = i; + memmove(p->mem_buf, buf, i); + } + return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF); } return Cyc_EOF; @@ -6393,7 +6419,9 @@ object Cyc_io_read_char(void *data, object cont, object port) do { _read_next_char(data, cont, p); c = p->mem_buf[p->buf_idx++]; + if (c == EOF) break; } while(Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)); +// TODO: limit above to 4 chars and then thrown an error? p->col_num++; return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF); } From a492ca379d2117f8a8cb3fb1fef43df65413ed3d Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Thu, 9 Nov 2017 19:00:21 -0500 Subject: [PATCH 58/61] Handle the null character --- runtime.c | 24 ++++++++++++++++++------ scheme/base.sld | 8 ++++++-- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/runtime.c b/runtime.c index 524e0b5e..bf9d9e76 100644 --- a/runtime.c +++ b/runtime.c @@ -1860,8 +1860,12 @@ object Cyc_list2string(void *data, object cont, object lst) if (!obj_is_char(cbox)) { Cyc_rt_raise2(data, "Expected character but received", cbox); } - Cyc_utf8_encode_char(cbuf, 5, ch); - len += strlen(cbuf); + if (!ch) { + len++; + } else { + Cyc_utf8_encode_char(cbuf, 5, ch); + len += strlen(cbuf); + } tmp = cdr(tmp); } @@ -1871,8 +1875,12 @@ object Cyc_list2string(void *data, object cont, object lst) while ((lst != NULL)) { cbox = car(lst); ch = obj_obj2char(cbox); // Already validated, can assume chars now - Cyc_utf8_encode_char(&(buf[i]), 5, ch); - i += strlen(buf+i); + if (!ch) { + i++; + } else { + Cyc_utf8_encode_char(&(buf[i]), 5, ch); + i += strlen(buf+i); + } lst = cdr(lst); } buf[i] = '\0'; @@ -2130,8 +2138,12 @@ object Cyc_string_set(void *data, object str, object k, object chr) } input_char = obj_obj2char(chr); - Cyc_utf8_encode_char(buf, 5, input_char); - buf_len = strlen(buf); + if (!input_char) { + buf_len = 1; + } else { + Cyc_utf8_encode_char(buf, 5, input_char); + buf_len = strlen(buf); + } raw = string_str(str); idx = unbox_number(k); diff --git a/scheme/base.sld b/scheme/base.sld index 5ab42fd0..c13cde79 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -958,8 +958,12 @@ Cyc_rt_raise2(data, \"Expected character buf received\", fill); } c = obj_obj2char(fill); - Cyc_utf8_encode_char(ch_buf, 5, c); - buflen = strlen(ch_buf); + if (!c) { + buflen = 1; + } else { + Cyc_utf8_encode_char(ch_buf, 5, c); + buflen = strlen(ch_buf); + } num_cp = obj_obj2int(count); len = num_cp * buflen; if (len >= MAX_STACK_OBJ) { From 39d3be81419ba8752e491244c0ea722f60b5e77e Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Sat, 11 Nov 2017 18:16:14 -0500 Subject: [PATCH 59/61] Removing --- opt-test.data | 3 --- opt-test.scm | 45 --------------------------------------------- 2 files changed, 48 deletions(-) delete mode 100644 opt-test.data delete mode 100644 opt-test.scm diff --git a/opt-test.data b/opt-test.data deleted file mode 100644 index 01e79c32..00000000 --- a/opt-test.data +++ /dev/null @@ -1,3 +0,0 @@ -1 -2 -3 diff --git a/opt-test.scm b/opt-test.scm deleted file mode 100644 index d98394a1..00000000 --- a/opt-test.scm +++ /dev/null @@ -1,45 +0,0 @@ -;; Testing the next set of optimizations -;; To run: ./opt-test < opt-test.data -;; -;; Timings: T430 -;; Baseline - 2.511 -;; Dyadic - 1.409 -;; -(import (scheme base) - (scheme write) - (scheme read)) -(let ((x (read)) - (y (read)) - (z (read)) - (iterations 10000000) - (sum 0)) - (do ((i iterations (- i 1))) - ((zero? i)) - (set! sum (+ sum sum (* x y z))) - (set! sum (- sum sum (* x y z)))) - (write sum)) - -;;; Take an expression containing a single function call and break it up -;;; into many calls of 2 arguments each. -;(define (->dyadic expr) -; (cond -; ((< (length expr) 4) -; expr) -; (else -; (let ((fnc (car expr))) -; (foldl -; (lambda (x acc) -; (list fnc acc x)) -; `(,fnc ,(cadr expr) ,(caddr expr)) -; (cdddr expr)))))) -; -;(write (->dyadic '(+ 1))) -;(write (->dyadic '(+ 1 2))) -;(write (->dyadic '(+ 1 2 3))) -;(write (->dyadic '(+ 1 2 3 4))) -;;(write -;; (foldl -;; (lambda (x acc) -;; (list 'Cyc-fast-plus acc x)) -;; '(Cyc-fast-plus 1 2) -;; '(3 4 5))) From 9cfb80677a13a5306dae6abe4216ab478c795ec1 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Sun, 12 Nov 2017 18:29:32 -0500 Subject: [PATCH 60/61] Cleanup --- runtime.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runtime.c b/runtime.c index bf9d9e76..d60de898 100644 --- a/runtime.c +++ b/runtime.c @@ -2162,6 +2162,7 @@ object Cyc_string_set(void *data, object str, object k, object chr) uint32_t state = 0; int i = 0, count, prev_cp_bytes = 0, cp_idx; + // Find index to change, and how many bytes it is for (count = 0; *tmp; ++tmp){ prev_cp_bytes++; if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ @@ -2208,7 +2209,8 @@ object Cyc_string_set(void *data, object str, object k, object chr) } // - 3) TODO: buf_len > prev_cp_bytes, will need to allocate more memory (!!) else { - Cyc_rt_raise2(data, "string-set! - unable to modify character", chr); + // TODO: maybe we can try a little harder here, at least in some cases + Cyc_rt_raise2(data, "string-set! - Unable to allocate memory to store multibyte character", chr); } } return str; From 1e8819d57ecfed13426e34c17c6a9ea822c6ddc6 Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Sun, 12 Nov 2017 18:45:04 -0500 Subject: [PATCH 61/61] Limit iteration in Cyc_io_peek_char --- runtime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime.c b/runtime.c index d60de898..9b9e09af 100644 --- a/runtime.c +++ b/runtime.c @@ -6375,7 +6375,7 @@ object Cyc_io_peek_char(void *data, object cont, object port) buf[0] = c; i = 1; - while (1) { // TODO: limit to 4 chars?? + while (i < 5) { // TODO: limit to 4 chars?? if (p->mem_buf_len == p->buf_idx + i) { // No more buffered chars at_mem_buf_end = 1;