diff --git a/gc.c b/gc.c index 6fdf7f82..0c0e57f4 100644 --- a/gc.c +++ b/gc.c @@ -451,6 +451,7 @@ char *gc_copy_obj(object dest, char *obj, gc_thread_data * thd) memcpy(s, string_str(obj), string_len(obj) + 1); mark(hp) = thd->gc_alloc_color; type_of(hp) = string_tag; + string_num_cp(hp) = string_num_cp(obj); string_len(hp) = string_len(obj); string_str(hp) = s; return (char *)hp; diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 6a8027dd..80ea3f87 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -194,6 +194,7 @@ int binstr2int(const char *str); int octstr2int(const char *str); object Cyc_string_append(void *data, object cont, int argc, object str1, ...); object Cyc_string_length(void *data, object str); +object Cyc_string_byte_length(void *data, object str); object Cyc_substring(void *data, object cont, object str, object start, object end); object Cyc_string_ref(void *data, object str, object k); @@ -707,4 +708,27 @@ void add_global(object * glo); void Cyc_set_globals_changed(gc_thread_data *thd); /**@}*/ +/** + * \defgroup prim_utf8 UTF-8 + * + * @brief Unicode processing using UTF-8 + */ +/**@{*/ +#define CYC_UTF8_ACCEPT 0 +#define CYC_UTF8_REJECT 1 + +/** + * Simple macro to make it more convenient to convert a single char + */ +#define Cyc_utf8_encode_char(dest, dest_size, char_value) \ + Cyc_utf8_encode(dest, dest_size, &char_value, 1) + +int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz); +uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); +int Cyc_utf8_count_code_points(uint8_t* s); +int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes); +uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); +uint32_t Cyc_utf8_validate(char *str, size_t len); +/**@}*/ + #endif /* CYCLONE_RUNTIME_H */ diff --git a/include/cyclone/types.h b/include/cyclone/types.h index 647f4ba3..65dc2d99 100644 --- a/include/cyclone/types.h +++ b/include/cyclone/types.h @@ -465,6 +465,12 @@ void clear_mutations(void *data); /** Minimum allowed value of a fixnum */ #define CYC_FIXNUM_MIN -1073741824 +/** + * Explicit character type now that we are using UTF-8. + * Chars are still value types though + */ +typedef uint32_t char_type; + /** * Determine if an object is an integer. */ @@ -488,12 +494,12 @@ void clear_mutations(void *data); /** * Convert from an object to a char. */ -#define obj_obj2char(x) (char)((long)(x)>>2) +#define obj_obj2char(x) (char_type)((uintmax_t)(x)>>2) /** * Convert from a char to an object. */ -#define obj_char2obj(c) ((void *)((((unsigned long)c)<<2) | 2)) +#define obj_char2obj(c) ((void *)((((uintmax_t)c)<<2) | 2)) /** * Is the given object a value type? @@ -721,16 +727,21 @@ typedef enum { typedef struct { gc_header_type hdr; tag_type tag; + int num_cp; int len; char *str; } string_type; +// TODO: below macros are obsolete, need new ones that populate num_cp and +// raise an error if an invalid UTF-8 char is detected + /** Create a new string in the nursery */ #define make_string(cs, s) string_type cs; \ { int len = strlen(s); \ cs.hdr.mark = gc_color_red; \ cs.hdr.grayed = 0; \ cs.tag = string_tag; \ + cs.num_cp = len; \ cs.len = len; \ cs.str = alloca(sizeof(char) * (len + 1)); \ memcpy(cs.str, s, len + 1);} @@ -744,6 +755,7 @@ typedef struct { cs.hdr.mark = gc_color_red; \ cs.hdr.grayed = 0; \ cs.tag = string_tag; cs.len = len; \ + cs.num_cp = len; \ cs.str = alloca(sizeof(char) * (len + 1)); \ memcpy(cs.str, s, len); \ cs.str[len] = '\0';} @@ -755,9 +767,51 @@ typedef struct { #define make_string_noalloc(cs, s, length) string_type cs; \ { cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \ cs.tag = string_tag; cs.len = length; \ + cs.num_cp = length; \ cs.str = s; } -/** Get the length of a string */ +/** Create a new string in the nursery */ +#define make_utf8_string(data, cs, s) string_type cs; \ +{ int len = strlen(s); \ + cs.hdr.mark = gc_color_red; \ + cs.hdr.grayed = 0; \ + cs.tag = string_tag; \ + cs.num_cp = Cyc_utf8_count_code_points((uint8_t *)s); \ + if (cs.num_cp < 0) { \ + Cyc_rt_raise_msg(data, "Invalid UTF-8 characters in string"); \ + } \ + cs.len = len; \ + cs.str = alloca(sizeof(char) * (len + 1)); \ + memcpy(cs.str, s, len + 1);} + +/** + * Create a new string with the given length + * (so it does not need to be computed) + */ +#define make_utf8_string_with_len(cs, s, length, num_code_points) string_type cs; \ +{ int len = length; \ + cs.hdr.mark = gc_color_red; \ + cs.hdr.grayed = 0; \ + cs.tag = string_tag; cs.len = len; \ + cs.num_cp = num_code_points; \ + cs.str = alloca(sizeof(char) * (len + 1)); \ + memcpy(cs.str, s, len); \ + cs.str[len] = '\0';} + +/** + * Create a string object using the given C string and length. + * No allocation is done for the given C string. + */ +#define make_utf8_string_noalloc(cs, s, length) string_type cs; \ +{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \ + cs.tag = string_tag; cs.len = length; \ + cs.num_cp = length; \ + cs.str = s; } + +/** Get the length of a string, in characters (code points) */ +#define string_num_cp(x) (((string_type *) x)->num_cp) + +/** Get the length of a string, in bytes */ #define string_len(x) (((string_type *) x)->len) /** Get a string object's C string */ diff --git a/mstreams.c b/mstreams.c index f1b1c50e..939d4047 100644 --- a/mstreams.c +++ b/mstreams.c @@ -102,6 +102,7 @@ void Cyc_io_get_output_string(void *data, object cont, object port) } { make_string_with_len(s, p->str_bv_in_mem_buf, p->str_bv_in_mem_buf_len); + s.num_cp = Cyc_utf8_count_code_points((uint8_t *)string_str(&s)); return_closcall1(data, cont, &s); } } diff --git a/runtime.c b/runtime.c index 1525f2d0..9b9e09af 100644 --- a/runtime.c +++ b/runtime.c @@ -181,6 +181,7 @@ void pack_env_variables(void *data, object k) svar->str = alloca(sizeof(char) * (svar->len)); strncpy(svar->str, e, svar->len); (svar->str)[svar->len] = '\0'; + svar->num_cp = Cyc_utf8_count_code_points((uint8_t *)svar->str); if (eqpos) { eqpos++; @@ -189,6 +190,7 @@ void pack_env_variables(void *data, object k) sval->hdr.grayed = 0; sval->tag = string_tag; sval->len = strlen(eqpos); + svar->num_cp = Cyc_utf8_count_code_points((uint8_t *)eqpos); sval->str = eqpos; set_pair(tmp, svar, sval); set_pair(p, tmp, NULL); @@ -558,7 +560,7 @@ void Cyc_rt_raise(void *data, object err) void Cyc_rt_raise2(void *data, const char *msg, object err) { - make_string(s, msg); + make_utf8_string(data, s, msg); make_pair(c3, err, NULL); make_pair(c2, &s, &c3); make_pair(c1, boolean_f, &c2); @@ -571,7 +573,7 @@ void Cyc_rt_raise2(void *data, const char *msg, object err) void Cyc_rt_raise_msg(void *data, const char *err) { - make_string(s, err); + make_utf8_string(data, s, err); Cyc_rt_raise(data, &s); } @@ -802,7 +804,10 @@ object Cyc_display(void *data, object x, FILE * port) return quote_void; } if (obj_is_char(x)) { - fprintf(port, "%c", obj_obj2char(x)); + char cbuf[5]; + char_type unbox = obj_obj2char(x); + Cyc_utf8_encode_char(cbuf, 5, unbox); + fprintf(port, "%s", cbuf); return quote_void; } if (obj_is_int(x)) { @@ -982,7 +987,7 @@ static object _Cyc_write(void *data, object x, FILE * port) return quote_void; } if (obj_is_char(x)) { - char c = obj_obj2char(x); + char_type c = obj_obj2char(x); switch (c) { case 0: fprintf(port, "#\\null"); break; case 7: fprintf(port, "#\\alarm"); break; @@ -993,11 +998,13 @@ static object _Cyc_write(void *data, object x, FILE * port) case 27: fprintf(port, "#\\escape"); break; case 32: fprintf(port, "#\\space"); break; case 127: fprintf(port, "#\\delete"); break; - default: - fprintf(port, "#\\%c", obj_obj2char(x)); + default: { + char cbuf[5]; + Cyc_utf8_encode_char(cbuf, 5, c); + fprintf(port, "#\\%s", cbuf); break; + } } - //fprintf(port, "#\\%c", obj_obj2char(x)); return quote_void; } if (obj_is_int(x)) { @@ -1095,7 +1102,10 @@ object Cyc_write_char(void *data, object c, object port) if (obj_is_char(c)) { FILE *fp = ((port_type *) port)->fp; if (fp){ - fprintf(fp, "%c", obj_obj2char(c)); + char cbuf[5]; + char_type unbox = obj_obj2char(c); + Cyc_utf8_encode_char(cbuf, 5, unbox); + fprintf(fp, "%s", cbuf); } } else { Cyc_rt_raise2(data, "Argument is not a character", c); @@ -1816,7 +1826,7 @@ object Cyc_symbol2string(void *data, object cont, object sym) Cyc_check_sym(data, sym); { const char *desc = symbol_desc(sym); - make_string(str, desc); + make_utf8_string(data, str, desc); _return_closcall1(data, cont, &str); }} @@ -1833,21 +1843,44 @@ object Cyc_string2symbol(void *data, object str) object Cyc_list2string(void *data, object cont, object lst) { - char *buf; - int i = 0; - object len; + char *buf, cbuf[5]; + int i = 0, len = 0; + object cbox, tmp = lst; + char_type ch; Cyc_check_pair_or_null(data, lst); - len = Cyc_length(data, lst); // Inefficient, walks whole list + + // Need to walk the list of chars to compute multibyte length + while (tmp) { + if (is_value_type(tmp) || ((list) tmp)->tag != pair_tag) { + Cyc_rt_raise2(data, "length - invalid parameter, expected list", tmp); + } + cbox = car(tmp); + ch = obj_obj2char(cbox); + if (!obj_is_char(cbox)) { + Cyc_rt_raise2(data, "Expected character but received", cbox); + } + if (!ch) { + len++; + } else { + Cyc_utf8_encode_char(cbuf, 5, ch); + len += strlen(cbuf); + } + tmp = cdr(tmp); + } { - make_string_noalloc(str, NULL, (obj_obj2int(len))); - str.str = buf = alloca(sizeof(char) * (obj_obj2int(len) + 1)); + make_string_noalloc(str, NULL, len); + str.str = buf = alloca(sizeof(char) * (len + 1)); while ((lst != NULL)) { - if (!obj_is_char(car(lst))) { - Cyc_rt_raise2(data, "Expected character but received", car(lst)); + cbox = car(lst); + ch = obj_obj2char(cbox); // Already validated, can assume chars now + if (!ch) { + i++; + } else { + Cyc_utf8_encode_char(&(buf[i]), 5, ch); + i += strlen(buf+i); } - buf[i++] = obj_obj2char(car(lst)); lst = cdr(lst); } buf[i] = '\0'; @@ -2032,7 +2065,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) } #define Cyc_string_append_va_list(data, argc) { \ - int i = 0, total_len = 1; \ + int i = 0, total_cp = 0, total_len = 1; \ int *len = alloca(sizeof(int) * argc); \ char *buffer, *bufferp, **str = alloca(sizeof(char *) * argc); \ object tmp; \ @@ -2041,6 +2074,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) str[i] = ((string_type *)str1)->str; \ len[i] = string_len((str1)); \ total_len += len[i]; \ + total_cp += string_num_cp((str1)); \ } \ for (i = 1; i < argc; i++) { \ tmp = va_arg(ap, object); \ @@ -2048,6 +2082,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) str[i] = ((string_type *)tmp)->str; \ len[i] = string_len((tmp)); \ total_len += len[i]; \ + total_cp += string_num_cp((tmp)); \ } \ buffer = bufferp = alloca(sizeof(char) * total_len); \ for (i = 0; i < argc; i++) { \ @@ -2056,6 +2091,7 @@ object Cyc_string_cmp(void *data, object str1, object str2) } \ *bufferp = '\0'; \ make_string(result, buffer); \ + string_num_cp((&result)) = total_cp; \ va_end(ap); \ _return_closcall1(data, cont, &result); \ } @@ -2076,6 +2112,12 @@ object Cyc_string_append(void *data, object cont, int _argc, object str1, ...) } object Cyc_string_length(void *data, object str) +{ + Cyc_check_str(data, str); + return obj_int2obj(string_num_cp(str)); +} + +object Cyc_string_byte_length(void *data, object str) { Cyc_check_str(data, str); return obj_int2obj(string_len(str)); @@ -2083,8 +2125,10 @@ object Cyc_string_length(void *data, object str) object Cyc_string_set(void *data, object str, object k, object chr) { + char buf[5]; char *raw; - int idx, len; + int idx, len, buf_len; + char_type input_char; Cyc_check_str(data, str); Cyc_check_num(data, k); @@ -2093,12 +2137,82 @@ object Cyc_string_set(void *data, object str, object k, object chr) Cyc_rt_raise2(data, "Expected char but received", chr); } + input_char = obj_obj2char(chr); + if (!input_char) { + buf_len = 1; + } else { + Cyc_utf8_encode_char(buf, 5, input_char); + buf_len = strlen(buf); + } + raw = string_str(str); idx = unbox_number(k); len = string_len(str); Cyc_check_bounds(data, "string-set!", len, idx); - raw[idx] = obj_obj2char(chr); + + if (string_num_cp(str) == string_len(str) && buf_len == 1) { + // Take fast path if all chars are just 1 byte + raw[idx] = obj_obj2char(chr); + } else { + // Slower path for UTF-8, need to handle replacement differently + // depending upon how the new char affects length of the string + char *tmp = raw, *this_cp = raw; + char_type codepoint; + uint32_t state = 0; + int i = 0, count, prev_cp_bytes = 0, cp_idx; + + // Find index to change, and how many bytes it is + for (count = 0; *tmp; ++tmp){ + prev_cp_bytes++; + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ + if (count == idx) { + break; + } + this_cp = tmp + 1; + count += 1; + prev_cp_bytes = 0; + } + i++; + } + cp_idx = i; + if (state != CYC_UTF8_ACCEPT) { + Cyc_rt_raise2(data, "string-set! - invalid character at index", k); + } + + // Perform actual mutation + // + // Now we know length of start (both in codepoints and bytes), + // and we know the codepoint to be replaced. by calculating its length + // we can compute where the end portion starts, and by using str we can + // figure out how many remaining bytes/codepoints are in end + // + // 3 cases: + // - 1) buf_len = prev_cp_bytes, just straight replace + if (buf_len == prev_cp_bytes) { + for (i = 0; i < buf_len; i++) { + this_cp[i] = buf[i]; + } + } + // - 2) buf_len < prev_cp_bytes, replace and shift chars down + else if (buf_len < prev_cp_bytes) { + // Replace code point with shorter one + for (i = 0; i < buf_len; i++) { + this_cp[i] = buf[i]; + } + // Move string down to eliminate unneeded chars + memmove(this_cp + buf_len, this_cp + prev_cp_bytes, len - cp_idx); + // Null terminate the shorter string. + // Ensure string_len is not reduced because original + // value still matters for GC purposes + raw[len - (prev_cp_bytes - buf_len)] = '\0'; + } + // - 3) TODO: buf_len > prev_cp_bytes, will need to allocate more memory (!!) + else { + // TODO: maybe we can try a little harder here, at least in some cases + Cyc_rt_raise2(data, "string-set! - Unable to allocate memory to store multibyte character", chr); + } + } return str; } @@ -2112,13 +2226,30 @@ object Cyc_string_ref(void *data, object str, object k) raw = string_str(str); idx = unbox_number(k); - len = string_len(str); + len = string_num_cp(str); if (idx < 0 || idx >= len) { Cyc_rt_raise2(data, "string-ref - invalid index", k); } - return obj_char2obj(raw[idx]); + // Take fast path if all chars are just 1 byte + if (string_num_cp(str) == string_len(str)) { + return obj_char2obj(raw[idx]); + } else { + char_type codepoint; + uint32_t state = 0; + int count; + + for (count = 0; *raw; ++raw){ + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*raw)){ + if (count == idx) break; // Reached requested index + count += 1; + } + } + if (state != CYC_UTF8_ACCEPT) + Cyc_rt_raise2(data, "string-ref - invalid character at index", k); + return obj_char2obj(codepoint); + } } object Cyc_substring(void *data, object cont, object str, object start, @@ -2134,7 +2265,7 @@ object Cyc_substring(void *data, object cont, object str, object start, raw = string_str(str); s = unbox_number(start); e = unbox_number(end); - len = string_len(str); + len = string_num_cp(str); if (s > e) { Cyc_rt_raise2(data, "substring - start cannot be greater than end", start); @@ -2148,9 +2279,33 @@ object Cyc_substring(void *data, object cont, object str, object start, e = len; } - { + if (string_num_cp(str) == string_len(str)){ // Fast path for ASCII make_string_with_len(sub, raw + s, e - s); _return_closcall1(data, cont, &sub); + } else { + const char *tmp = raw; + char_type codepoint; + uint32_t state = 0; + int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0; + for (num_ch = 0; *tmp; ++tmp){ + cur_ch_bytes++; + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ + end_i += cur_ch_bytes; + num_ch += 1; + cur_ch_bytes = 0; + + if (num_ch == s) { + start_i = end_i; + } + if (num_ch == e) { + break; + } + } + } + if (state != CYC_UTF8_ACCEPT) + Cyc_rt_raise2(data, "substring - invalid character in string", str); + make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s); + _return_closcall1(data, cont, &sub); } } @@ -2164,22 +2319,22 @@ object Cyc_installation_dir(void *data, object cont, object type) strncmp(((symbol) type)->desc, "sld", 5) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_SLD); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (Cyc_is_symbol(type) == boolean_t && strncmp(((symbol) type)->desc, "lib", 5) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_LIB); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (Cyc_is_symbol(type) == boolean_t && strncmp(((symbol) type)->desc, "inc", 5) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_INC); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else { - make_string(str, CYC_INSTALL_DIR); + make_utf8_string(data, str, CYC_INSTALL_DIR); _return_closcall1(data, cont, &str); } } @@ -2193,22 +2348,22 @@ object Cyc_compilation_environment(void *data, object cont, object var) if (strncmp(((symbol) var)->desc, "cc-prog", 8) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_CC_PROG); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (strncmp(((symbol) var)->desc, "cc-exec", 8) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_CC_EXEC); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (strncmp(((symbol) var)->desc, "cc-lib", 7) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_CC_LIB); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } else if (strncmp(((symbol) var)->desc, "cc-so", 6) == 0) { char buf[1024]; snprintf(buf, sizeof(buf), "%s", CYC_CC_SO); - make_string(str, buf); + make_utf8_string(data, str, buf); _return_closcall1(data, cont, &str); } } @@ -2234,7 +2389,7 @@ object Cyc_command_line_arguments(void *data, object cont) for (i = _cyc_argc; i > 1; i--) { // skip program name object ps = alloca(sizeof(string_type)); object pl = alloca(sizeof(pair_type)); - make_string(s, _cyc_argv[i - 1]); + make_utf8_string(data, s, _cyc_argv[i - 1]); memcpy(ps, &s, sizeof(string_type)); ((list) pl)->hdr.mark = gc_color_red; ((list) pl)->hdr.grayed = 0; @@ -2490,6 +2645,10 @@ object Cyc_utf82string(void *data, object cont, object bv, object start, st.str = alloca(sizeof(char) * (len + 1)); memcpy(st.str, &buf[s], len); st.str[len] = '\0'; + st.num_cp = Cyc_utf8_count_code_points((uint8_t *)(st.str)); + if (st.num_cp < 0) { + Cyc_rt_raise2(data, "utf8->string - error decoding UTF 8", bv); + } _return_closcall1(data, cont, &st); } } @@ -2509,18 +2668,45 @@ object Cyc_string2utf8(void *data, object cont, object str, object start, e = unbox_number(end); len = e - s; - if (s < 0 || (s >= string_len(str) && len > 0)) { + if (s < 0 || (s >= string_num_cp(str) && len > 0)) { Cyc_rt_raise2(data, "string->utf8 - invalid start", start); } - if (e < 0 || e < s || e > string_len(str)) { + if (e < 0 || e < s || e > string_num_cp(str)) { Cyc_rt_raise2(data, "string->utf8 - invalid end", end); } - result.len = len; - result.data = alloca(sizeof(char) * len); - memcpy(&result.data[0], &(string_str(str))[s], len); - _return_closcall1(data, cont, &result); + // Fast path + if (string_num_cp(str) == string_len(str)) { // TODO: disable for testing purposes + result.len = len; + result.data = alloca(sizeof(char) * len); + memcpy(&result.data[0], &(string_str(str))[s], len); + _return_closcall1(data, cont, &result); + } else { + const char *tmp = string_str(str); + char_type codepoint; + uint32_t state = 0; + int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0; + for (num_ch = 0; *tmp; ++tmp){ + cur_ch_bytes++; + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ + end_i += cur_ch_bytes; + num_ch += 1; + cur_ch_bytes = 0; + + if (num_ch == s) { + start_i = end_i; + } + if (num_ch == e) { + break; + } + } + } + result.len = end_i - start_i; + result.data = alloca(sizeof(char) * result.len); + memcpy(&result.data[0], &(string_str(str))[start_i], result.len); + _return_closcall1(data, cont, &result); + } } object Cyc_bytevector_u8_ref(void *data, object bv, object k) @@ -2624,7 +2810,7 @@ object Cyc_char2integer(object chr) object Cyc_integer2char(void *data, object n) { - int val = 0; + char_type val = 0; Cyc_check_num(data, n); val = unbox_number(n); @@ -5635,7 +5821,7 @@ void Cyc_import_shared_object(void *data, object cont, object filename, object e handle = dlopen(string_str(filename), RTLD_GLOBAL | RTLD_LAZY); if (handle == NULL) { snprintf(buffer, 256, "%s", dlerror()); - make_string(s, buffer); + make_utf8_string(data, s, buffer); Cyc_rt_raise2(data, "Unable to import library", &s); } dlerror(); /* Clear any existing error */ @@ -5643,7 +5829,7 @@ void Cyc_import_shared_object(void *data, object cont, object filename, object e entry_pt = (function_type) dlsym(handle, string_str(entry_pt_fnc)); if (entry_pt == NULL) { snprintf(buffer, 256, "%s, %s, %s", string_str(filename), string_str(entry_pt_fnc), dlerror()); - make_string(s, buffer); + make_utf8_string(data, s, buffer); Cyc_rt_raise2(data, "Unable to load symbol", &s); } mclosure1(clo, entry_pt, cont); @@ -5692,6 +5878,7 @@ void _read_error(void *data, port_type *p, const char *msg) // the cont could receive an error and raise it though //Cyc_rt_raise_msg(data, buf); make_string(str, buf); + str.num_cp = Cyc_utf8_count_code_points((uint8_t *)buf); make_empty_vector(vec); vec.num_elements = 1; vec.elements = (object *) alloca(sizeof(object) * vec.num_elements); @@ -5804,6 +5991,28 @@ static void _read_add_to_tok_buf(port_type *p, char c) p->tok_buf[p->tok_end++] = c; } +/** + * @brief Determine if given string is numeric + */ +int _read_is_numeric(const char *tok) +{ + int len = strlen(tok); + return (len && + ((isdigit(tok[0])) || + ((len > 1) && tok[0] == '.' && isdigit(tok[1])) || + ((len > 1) && (tok[1] == '.' || isdigit(tok[1])) && (tok[0] == '-' || tok[0] == '+')))); +} + +/** + * @brief Helper function, determine if given number is a hex digit + * @param c Character to check + */ +int _read_is_hex_digit(char c) +{ + return (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F'); +} + /** * @brief Helper function to read a string * @param data Thread data object @@ -5863,6 +6072,12 @@ void _read_string(void *data, object cont, port_type *p) p->buf_idx++; break; } + // Verify if hex digit is valid + if (!isdigit(p->mem_buf[p->buf_idx]) && + !_read_is_hex_digit(p->mem_buf[p->buf_idx])) { + p->buf_idx++; + _read_error(data, p, "invalid hex digit in string"); + } buf[i] = p->mem_buf[p->buf_idx]; p->buf_idx++; p->col_num++; @@ -5870,8 +6085,14 @@ void _read_string(void *data, object cont, port_type *p) } buf[i] = '\0'; { - int result = (int)strtol(buf, NULL, 16); - p->tok_buf[p->tok_end++] = (char)result; + char_type result = strtol(buf, NULL, 16); + char cbuf[5]; + int i; + Cyc_utf8_encode_char(cbuf, 5, result); + for (i = 0; cbuf[i] != 0; i++) { + _read_add_to_tok_buf(p, cbuf[i]); + } + //p->tok_buf[p->tok_end++] = (char)result; } break; } @@ -5883,7 +6104,7 @@ void _read_string(void *data, object cont, port_type *p) p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full? p->tok_end = 0; // Reset for next atom { - make_string(str, p->tok_buf); + make_utf8_string(data, str, p->tok_buf); return_thread_runnable(data, &str); } } else if (c == '\\') { @@ -5943,6 +6164,7 @@ void _read_return_character(void *data, port_type *p) p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full? p->tok_end = 0; // Reset for next atom if (strlen(p->tok_buf) == 1) { + // ASCII char, consider merging with below? return_thread_runnable(data, obj_char2obj(p->tok_buf[0])); } else if(strncmp(p->tok_buf, "alarm", 5) == 0) { return_thread_runnable(data, obj_char2obj('\a')); @@ -5964,12 +6186,27 @@ void _read_return_character(void *data, port_type *p) return_thread_runnable(data, obj_char2obj('\t')); } else if(strlen(p->tok_buf) > 1 && p->tok_buf[0] == 'x') { const char *buf = p->tok_buf + 1; - int result = strtol(buf, NULL, 16); + char_type result = strtol(buf, NULL, 16); return_thread_runnable(data, obj_char2obj(result)); } else { - char buf[31]; - snprintf(buf, 30, "Unable to parse character %s", p->tok_buf); - _read_error(data, p, buf); + // Try to read a UTF-8 char and if so return it, otherwise throw an error + uint32_t state = CYC_UTF8_ACCEPT; + char_type codepoint; + uint8_t *s = (uint8_t *)p->tok_buf; + while(s) { + if (!Cyc_utf8_decode(&state, &codepoint, *s)) { + s++; + break; + } + s++; + } + if (state == CYC_UTF8_ACCEPT && *s == '\0') { + return_thread_runnable(data, obj_char2obj(codepoint)); + } else { + char buf[31]; + snprintf(buf, 30, "Unable to parse character %s", p->tok_buf); + _read_error(data, p, buf); + } } } @@ -6001,28 +6238,6 @@ void _read_character(void *data, port_type *p) } } -/** - * @brief Determine if given string is numeric - */ -int _read_is_numeric(const char *tok) -{ - int len = strlen(tok); - return (len && - ((isdigit(tok[0])) || - ((len > 1) && tok[0] == '.' && isdigit(tok[1])) || - ((len > 1) && (tok[1] == '.' || isdigit(tok[1])) && (tok[0] == '-' || tok[0] == '+')))); -} - -/** - * @brief Helper function, determine if given number is a hex digit - * @param c Character to check - */ -int _read_is_hex_digit(char c) -{ - return (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F'); -} - /** * @brief Helper function, return read number. * @param data Thread data object @@ -6102,6 +6317,7 @@ void _read_return_atom(void *data, object cont, port_type *p) if (_read_is_numeric(p->tok_buf)) { make_string(str, p->tok_buf); + str.num_cp = Cyc_utf8_count_code_points((uint8_t *)(p->tok_buf)); make_c_opaque(opq, &str); return_thread_runnable(data, &opq); } else if (strncmp("+inf.0", p->tok_buf, 6) == 0 || @@ -6134,7 +6350,10 @@ object Cyc_io_peek_char(void *data, object cont, object port) { FILE *stream; port_type *p; - int c; + uint32_t state = CYC_UTF8_ACCEPT; + char_type codepoint; + int c, i = 0, at_mem_buf_end = 0; + char buf[5]; Cyc_check_port(data, port); { @@ -6144,9 +6363,40 @@ object Cyc_io_peek_char(void *data, object cont, object port) Cyc_rt_raise2(data, "Unable to read from closed port: ", port); } set_thread_blocked(data, cont); - _read_next_char(data, cont, p); + if (p->mem_buf_len == 0 || p->mem_buf_len == p->buf_idx) { + _read_next_char(data, cont, p); + } c = p->mem_buf[p->buf_idx]; - return_thread_runnable(data, (c != EOF) ? obj_char2obj(c) : Cyc_EOF); + if (Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) { + // Only have a partial UTF8 code point, read more chars. + // Problem is that there may not be enough space to store them + // and do need to set them aside since we are just peeking here + // and not actually supposed to be reading past chars. + + buf[0] = c; + i = 1; + while (i < 5) { // TODO: limit to 4 chars?? + if (p->mem_buf_len == p->buf_idx + i) { + // No more buffered chars + at_mem_buf_end = 1; + c = fgetc(stream); + if (c == EOF) break; // TODO: correct to do this here???? + } else { + c = p->mem_buf[p->buf_idx + i]; + } + buf[i++] = c; + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) { + break; + } + } + } + if (at_mem_buf_end && c != EOF) { + p->buf_idx = 0; + p->mem_buf_len = i; + memmove(p->mem_buf, buf, i); + } + + return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF); } return Cyc_EOF; } @@ -6171,17 +6421,23 @@ object Cyc_io_peek_char(void *data, object cont, object port) object Cyc_io_read_char(void *data, object cont, object port) { port_type *p = (port_type *)port; - int c; Cyc_check_port(data, port); if (p->fp == NULL) { Cyc_rt_raise2(data, "Unable to read from closed port: ", port); } { + uint32_t state = CYC_UTF8_ACCEPT; + char_type codepoint; + int c; set_thread_blocked(data, cont); - _read_next_char(data, cont, p); - c = p->mem_buf[p->buf_idx++]; + do { + _read_next_char(data, cont, p); + c = p->mem_buf[p->buf_idx++]; + if (c == EOF) break; + } while(Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)); +// TODO: limit above to 4 chars and then thrown an error? p->col_num++; - return_thread_runnable(data, (c != EOF) ? obj_char2obj(c) : Cyc_EOF); + return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF); } return Cyc_EOF; } @@ -6190,8 +6446,10 @@ object Cyc_io_read_char(void *data, object cont, object port) object Cyc_io_read_line(void *data, object cont, object port) { FILE *stream = ((port_type *) port)->fp; - char buf[1024]; - int len; + char buf[1027]; + int len, num_cp, i = 0; + char_type codepoint; + uint32_t state; Cyc_check_port(data, port); if (stream == NULL) { @@ -6200,7 +6458,21 @@ object Cyc_io_read_line(void *data, object cont, object port) set_thread_blocked(data, cont); errno = 0; if (fgets(buf, 1023, stream) != NULL) { - len = strlen(buf); + state = Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &codepoint, &num_cp, &len); + // Check if we stopped reading in the middle of a code point and + // if so, read one byte at a time until that code point is finished. + while (state != CYC_UTF8_ACCEPT && i < 3) { + int c = fgetc(stream); + buf[len] = c; + len++; + Cyc_utf8_decode(&state, &codepoint, (uint8_t)c); + if (state == CYC_UTF8_ACCEPT) { + num_cp++; + break; + } + i++; + } + { // Remove any trailing CR / newline chars while (len > 0 && (buf[len - 1] == '\n' || @@ -6209,6 +6481,7 @@ object Cyc_io_read_line(void *data, object cont, object port) } buf[len] = '\0'; make_string_noalloc(s, buf, len); + s.num_cp = num_cp; return_thread_runnable(data, &s); } } else { @@ -6364,3 +6637,189 @@ void Cyc_io_read_token(void *data, object cont, object port) } } +////////////// UTF-8 Section ////////////// + +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +//uint32_t inline +uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != CYC_UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} +// END Bjoern Hoehrmann + +/** + * @brief + * Count the number of code points in a string. + * Based on example code from Bjoern Hoehrmann. + */ +int Cyc_utf8_count_code_points(uint8_t* s) { + uint32_t codepoint; + uint32_t state = 0; + int count; + + for (count = 0; *s; ++s) + if (!Cyc_utf8_decode(&state, &codepoint, *s)) + count += 1; + + if (state != CYC_UTF8_ACCEPT) + return -1; + return count; +} + +int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes) { + uint32_t state = 0; + *cpts = 0; + *bytes = 0; + for (; *s; ++s){ + *bytes += 1; + if (!Cyc_utf8_decode(&state, codepoint, *s)) + *cpts += 1; + } + + if (state != CYC_UTF8_ACCEPT) + return state; + return 0; +} + +// TODO: index into X codepoint in a string + +/** + * @brief + * Use this when validating from a stream, as it may be that the stream stopped + * in the middle of a codepoint, hence state passed in as an arg, so it can be + * tested in a loop and also after the loop has finished. + * + * From https://stackoverflow.com/a/22135005/101258 + */ +uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) { + size_t i; + uint32_t type; + + for (i = 0; i < len; i++) { + // We don't care about the codepoint, so this is + // a simplified version of the decode function. + type = utf8d[(uint8_t)str[i]]; + *state = utf8d[256 + (*state) * 16 + type]; + + if (*state == CYC_UTF8_REJECT) + break; + } + + return *state; +} + +/** + * @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer. + */ +uint32_t Cyc_utf8_validate(char *str, size_t len) { + size_t i; + uint32_t state = CYC_UTF8_ACCEPT, type; + + for (i = 0; i < len; i++) { + // We don't care about the codepoint, so this is + // a simplified version of the decode function. + type = utf8d[(uint8_t)str[i]]; + state = utf8d[256 + (state) * 16 + type]; + + if (state == CYC_UTF8_REJECT) + break; + } + + return state; +} + +//int uint32_num_bytes(uint32_t x) { +// // TODO: could compute log(val) / log(256) +// if (x < 0x100) return 1; +// if (x < 0x10000) return 2; +// if (x < 0x1000000) return 3; +// return 4; +//} + +/** + * This function takes one or more 32-bit chars and encodes them + * as an array of UTF-8 bytes. + * FROM: https://www.cprogramming.com/tutorial/utf8.c + * + * @param dest Destination byte buffer + * @param sz size of dest buffer in bytes + * @param src Buffer of source data, in 32-bit characters + * @param srcsz number of source characters, or -1 if 0-terminated + * + * @return Number of characters converted + * + * dest will only be '\0'-terminated if there is enough space. this is + * for consistency; imagine there are 2 bytes of space left, but the next + * character requires 3 bytes. in this case we could NUL-terminate, but in + * general we can't when there's insufficient space. therefore this function + * only NUL-terminates if all the characters fit, and there's space for + * the NUL as well. + * the destination string will never be bigger than the source string. + */ +int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz) +{ + u_int32_t ch; + int i = 0; + char *dest_end = dest + sz; + + while (srcsz<0 ? src[i]!=0 : i < srcsz) { + ch = src[i]; + if (ch < 0x80) { + if (dest >= dest_end) + return i; + *dest++ = (char)ch; + } + else if (ch < 0x800) { + if (dest >= dest_end-1) + return i; + *dest++ = (ch>>6) | 0xC0; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x10000) { + if (dest >= dest_end-2) + return i; + *dest++ = (ch>>12) | 0xE0; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x110000) { + if (dest >= dest_end-3) + return i; + *dest++ = (ch>>18) | 0xF0; + *dest++ = ((ch>>12) & 0x3F) | 0x80; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + i++; + } + if (dest < dest_end) + *dest = '\0'; + return i; +} + + +////////////// END UTF-8 Section ////////////// diff --git a/scheme/base.sld b/scheme/base.sld index 75ecc2f1..c13cde79 100644 --- a/scheme/base.sld +++ b/scheme/base.sld @@ -950,9 +950,22 @@ (define-c Cyc-make-string "(void *data, int argc, closure _, object k, object count, object fill)" " object s = NULL; + char ch_buf[5]; + char_type c; + int buflen, num_cp, len; Cyc_check_int(data, count); - char c = obj_obj2char(fill); - int len = obj_obj2int(count); + if (!obj_is_char(fill)) { + Cyc_rt_raise2(data, \"Expected character buf received\", fill); + } + c = obj_obj2char(fill); + if (!c) { + buflen = 1; + } else { + Cyc_utf8_encode_char(ch_buf, 5, c); + buflen = strlen(ch_buf); + } + num_cp = obj_obj2int(count); + len = num_cp * buflen; if (len >= MAX_STACK_OBJ) { int heap_grown; s = gc_alloc(((gc_thread_data *)data)->heap, @@ -964,6 +977,7 @@ ((string_type *) s)->hdr.grayed = 0; ((string_type *) s)->tag = string_tag; ((string_type *) s)->len = len; + ((string_type *) s)->num_cp = num_cp; ((string_type *) s)->str = (((char *)s) + sizeof(string_type)); } else { s = alloca(sizeof(string_type)); @@ -971,9 +985,18 @@ ((string_type *)s)->hdr.grayed = 0; ((string_type *)s)->tag = string_tag; ((string_type *)s)->len = len; + ((string_type *)s)->num_cp = num_cp; ((string_type *)s)->str = alloca(sizeof(char) * (len + 1)); } - memset(((string_type *)s)->str, c, len); + if (num_cp == 1) { /* Fast path */ + memset(((string_type *)s)->str, ch_buf[0], len); + } else { + char *buf = ((string_type *)s)->str; + int bi, si, slen = buflen; + for (bi = 0, si = 0; bi < len; bi++, si++) { + buf[bi] = ch_buf[si % slen]; + } + } ((string_type *)s)->str[len] = '\\0'; return_closcall1(data, k, s); ") diff --git a/scheme/cyclone/cgen.sld b/scheme/cyclone/cgen.sld index 7cdd74f0..d3456d11 100644 --- a/scheme/cyclone/cgen.sld +++ b/scheme/cyclone/cgen.sld @@ -521,7 +521,15 @@ (string-append "&" cvar-name) ; Code is just the variable name (list ; Allocate integer on the C stack (string-append - "make_string(" cvar-name ", " (->cstr exp) ");"))))) + "make_utf8_string_with_len(" + cvar-name + ", " + (->cstr exp) + ", " + (number->string (string-byte-length exp)) + ", " + (number->string (string-length exp)) + ");"))))) ;TODO: not good enough, need to store new symbols in a table so they can ;be inserted into the C program ((symbol? exp) @@ -536,6 +544,10 @@ (define (->cstr str) (string-append "\"" (cstr:escape-chars str) "\"")) +(define-c string-byte-length + "(void *data, int argc, closure _, object k, object s)" + " return_closcall1(data, k, Cyc_string_byte_length(data, s)); ") + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Primitives diff --git a/scheme/process-context.sld b/scheme/process-context.sld index f8ce64f2..e5cfa5c3 100644 --- a/scheme/process-context.sld +++ b/scheme/process-context.sld @@ -24,7 +24,7 @@ for (i = _cyc_argc; i > 0; i--) { object ps = alloca(sizeof(string_type)); object pl = alloca(sizeof(pair_type)); - make_string(s, _cyc_argv[i - 1]); + make_utf8_string(data, s, _cyc_argv[i - 1]); memcpy(ps, &s, sizeof(string_type)); ((list)pl)->hdr.mark = gc_color_red; ((list)pl)->hdr.grayed = 0; @@ -44,7 +44,7 @@ if (v == NULL) { return_closcall1(data, k, boolean_f); } else { - make_string(str, v); + make_utf8_string(data, str, v); return_closcall1(data, k, &str); } ") diff --git a/test.c b/test.c new file mode 100644 index 00000000..434c33cd --- /dev/null +++ b/test.c @@ -0,0 +1,193 @@ +// A temporary test file +#include +#include +#include +#include + +#define CYC_UTF8_ACCEPT 0 + +// Copyright (c) 2008-2009 Bjoern Hoehrmann +// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +//uint32_t inline +uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != CYC_UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} + +/** + * Simple macro to make it more convenient to convert a single char + */ +#define Cyc_utf8_encode_char(dest, dest_size, char_value) \ + Cyc_utf8_encode(dest, dest_size, &char_value, 1) + +/** + * This function takes one or more 32-bit chars and encodes them + * as an array of UTF-8 bytes. + * FROM: https://www.cprogramming.com/tutorial/utf8.c + * + * @param dest Destination byte buffer + * @param sz size of dest buffer in bytes + * @param src Buffer of source data, in 32-bit characters + * @param srcsz number of source characters, or -1 if 0-terminated + * + * @return Number of characters converted + * + * dest will only be '\0'-terminated if there is enough space. this is + * for consistency; imagine there are 2 bytes of space left, but the next + * character requires 3 bytes. in this case we could NUL-terminate, but in + * general we can't when there's insufficient space. therefore this function + * only NUL-terminates if all the characters fit, and there's space for + * the NUL as well. + * the destination string will never be bigger than the source string. + */ +int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz) +{ + u_int32_t ch; + int i = 0; + char *dest_end = dest + sz; + + while (srcsz<0 ? src[i]!=0 : i < srcsz) { + ch = src[i]; + if (ch < 0x80) { + if (dest >= dest_end) + return i; + *dest++ = (char)ch; + } + else if (ch < 0x800) { + if (dest >= dest_end-1) + return i; + *dest++ = (ch>>6) | 0xC0; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x10000) { + if (dest >= dest_end-2) + return i; + *dest++ = (ch>>12) | 0xE0; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x110000) { + if (dest >= dest_end-3) + return i; + *dest++ = (ch>>18) | 0xF0; + *dest++ = ((ch>>12) & 0x3F) | 0x80; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + i++; + } + if (dest < dest_end) + *dest = '\0'; + return i; +} + +void encode(uint32_t val) { + char dest[5]; + int rv, i; + + rv = Cyc_utf8_encode_char(dest, 5, val); + printf("%x %d \n", val, rv); + for(i = 0; i < 5; i++) { + printf("[%x] ", (uint8_t)dest[i]); + } + printf("\n"); + return; +} + +void multi_byte_memset(char *buf, int blen, char *src, int slen) +{ + int bi, si; + for (bi = 0, si = 0; bi < blen; bi++, si++) { + buf[bi] = src[si % slen]; + } +} + +void substring(int s, int e, const char *expected) { + uint8_t raw[] = {65, 66, 0xCE, 0xBB, 67}; + const char *tmp = raw; + uint32_t codepoint; + uint32_t state = 0; + int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0; + for (num_ch = 0; *tmp; ++tmp){ + cur_ch_bytes++; + if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){ + end_i += cur_ch_bytes; + num_ch += 1; + cur_ch_bytes = 0; + + if (num_ch == s) { + start_i = end_i; + } + if (num_ch == e) { + break; + } + } + } + raw[end_i] = '\0'; + printf("expected=%s, raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", expected, raw + start_i, s, e, start_i, end_i); +} + +void main(){ + char c[128]; + uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE + uint8_t cv2[] = {0xCE, 0xBB}; // Lambda (0x03bb) is encoded with leading 0xCE + //uint8_t cv2[] = {0xEC, 0xBA, 0xBB}; // Lambda (0x03bb) is encoded with leading 0xCE +// uint8_t cv[] = {0xCE, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE + char *cptr; + uint32_t state = CYC_UTF8_ACCEPT, codepoint, val = 0x32363435; + uint8_t *ptr = (uint8_t *)&val; + int i, j = 0; +// //memset(c, 0x34, 128); +// for (i = 0; i < 127; i++) { +// c[i] = ptr[j++]; +// if (j == 4) j = 0; +// } +// c[127] = '\0'; +// printf("%s\n", c); + multi_byte_memset(c, 126, cv2, 2); + c[127] = '\0'; + printf("TEST: %s\n", c); + + ptr = cv; + for (i = 0; i < 3; i++) { + Cyc_utf8_decode(&state, &codepoint, ptr[i]); + } + printf("state = %d, cp = %d\n", state, codepoint); + + encode(0x3bb); + encode(65); + encode(0xcebb); + + printf("%06X\n", 0x0fff); + substring(0, 1, "A "); + substring(0, 2, "AB "); + substring(1, 3, "Bx "); + substring(1, 4, "BxC "); + substring(2, 2, " "); + substring(2, 3, "x "); + substring(2, 4, "xC "); + return; +}