diff --git a/include/cyclone/types.h b/include/cyclone/types.h index 3667d661..d91a1aa0 100644 --- a/include/cyclone/types.h +++ b/include/cyclone/types.h @@ -788,12 +788,12 @@ typedef struct { * Create a new string with the given length * (so it does not need to be computed) */ -#define make_utf8_string_with_len(cs, s, length, num_cp) string_type cs; \ +#define make_utf8_string_with_len(cs, s, length, num_code_points) string_type cs; \ { int len = length; \ cs.hdr.mark = gc_color_red; \ cs.hdr.grayed = 0; \ cs.tag = string_tag; cs.len = len; \ - cs.num_cp = num_cp; \ + cs.num_cp = num_code_points; \ cs.str = alloca(sizeof(char) * (len + 1)); \ memcpy(cs.str, s, len); \ cs.str[len] = '\0';} diff --git a/runtime.c b/runtime.c index 67f51bf3..607f935d 100644 --- a/runtime.c +++ b/runtime.c @@ -2121,9 +2121,10 @@ object Cyc_string_ref(void *data, object str, object k) Cyc_rt_raise2(data, "string-ref - invalid index", k); } -TODO: we can take the fast path if num_cp == len, since that implies all chars are just 1 byte. 
-      would be the case for all string functions that need to be updated to be (possibly) O(n)
-  {
+  // Take fast path if all chars are just 1 byte
+  if (string_num_cp(str) == string_len(str)) {
+    return obj_char2obj(raw[idx]);
+  } else {
     char_type codepoint;
     uint32_t state = 0;
     int count;
@@ -2153,7 +2154,7 @@ object Cyc_substring(void *data, object cont, object str, object start,
   raw = string_str(str);
   s = unbox_number(start);
   e = unbox_number(end);
-  len = string_len(str);
+  len = string_num_cp(str);
 
   if (s > e) {
     Cyc_rt_raise2(data, "substring - start cannot be greater than end", start);
@@ -2167,9 +2168,40 @@ object Cyc_substring(void *data, object cont, object str, object start,
     e = len;
   }
 
-  {
+  if (string_num_cp(str) == string_len(str)){ // Fast path for ASCII
     make_string_with_len(sub, raw + s, e - s);
     _return_closcall1(data, cont, &sub);
+  } else {
+    // Slow path: s and e count code points, so scan the UTF-8 bytes to find
+    // the byte offsets where code points s and e begin.
+    const char *tmp = raw;
+    char_type codepoint;
+    uint32_t state = CYC_UTF8_ACCEPT;
+    int count = 0, start_i = 0, end_i = 0;
+
+    for (; *tmp; ++tmp, ++end_i){
+      // An accepting state before a byte is decoded means that byte starts
+      // a new code point, so offsets are only ever recorded on boundaries.
+      if (state == CYC_UTF8_ACCEPT) {
+        if (count == s) {
+          start_i = end_i;
+        }
+        if (count == e) { // checked separately so that s == e also stops here
+          break;
+        }
+      }
+      // Cast so bytes >= 0x80 are not passed as negative values
+      if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
+        count += 1;
+      }
+    }
+    if (state != CYC_UTF8_ACCEPT)
+      Cyc_rt_raise2(data, "substring - invalid character in string", str);
+    if (count == s) { // s equals the number of code points scanned: empty tail
+      start_i = end_i;
+    }
+    make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s);
+    _return_closcall1(data, cont, &sub);
   }
 }
 