mirror of
https://github.com/justinethier/cyclone.git
synced 2025-05-24 20:45:06 +02:00
Added UTF8 support to Cyc_substring
This commit is contained in:
parent
424592ad8b
commit
3e64420101
2 changed files with 29 additions and 7 deletions
|
@@ -788,12 +788,12 @@ typedef struct {
|
||||||
* Create a new string with the given length
|
* Create a new string with the given length
|
||||||
* (so it does not need to be computed)
|
* (so it does not need to be computed)
|
||||||
*/
|
*/
|
||||||
#define make_utf8_string_with_len(cs, s, length, num_cp) string_type cs; \
|
#define make_utf8_string_with_len(cs, s, length, num_code_points) string_type cs; \
|
||||||
{ int len = length; \
|
{ int len = length; \
|
||||||
cs.hdr.mark = gc_color_red; \
|
cs.hdr.mark = gc_color_red; \
|
||||||
cs.hdr.grayed = 0; \
|
cs.hdr.grayed = 0; \
|
||||||
cs.tag = string_tag; cs.len = len; \
|
cs.tag = string_tag; cs.len = len; \
|
||||||
cs.num_cp = num_cp; \
|
cs.num_cp = num_code_points; \
|
||||||
cs.str = alloca(sizeof(char) * (len + 1)); \
|
cs.str = alloca(sizeof(char) * (len + 1)); \
|
||||||
memcpy(cs.str, s, len); \
|
memcpy(cs.str, s, len); \
|
||||||
cs.str[len] = '\0';}
|
cs.str[len] = '\0';}
|
||||||
|
|
32
runtime.c
32
runtime.c
|
@@ -2121,9 +2121,10 @@ object Cyc_string_ref(void *data, object str, object k)
|
||||||
Cyc_rt_raise2(data, "string-ref - invalid index", k);
|
Cyc_rt_raise2(data, "string-ref - invalid index", k);
|
||||||
}
|
}
|
||||||
|
|
||||||
TODO: we can take the fast path if num_cp == len, since that implies all chars are just 1 byte.
|
// Take fast path if all chars are just 1 byte
|
||||||
would be the case for all string functions that need to be updated to be (possibly) O(n)
|
if (string_num_cp(str) == string_len(str)) {
|
||||||
{
|
return obj_char2obj(raw[idx]);
|
||||||
|
} else {
|
||||||
char_type codepoint;
|
char_type codepoint;
|
||||||
uint32_t state = 0;
|
uint32_t state = 0;
|
||||||
int count;
|
int count;
|
||||||
|
@@ -2153,7 +2154,7 @@ object Cyc_substring(void *data, object cont, object str, object start,
|
||||||
raw = string_str(str);
|
raw = string_str(str);
|
||||||
s = unbox_number(start);
|
s = unbox_number(start);
|
||||||
e = unbox_number(end);
|
e = unbox_number(end);
|
||||||
len = string_len(str);
|
len = string_num_cp(str);
|
||||||
|
|
||||||
if (s > e) {
|
if (s > e) {
|
||||||
Cyc_rt_raise2(data, "substring - start cannot be greater than end", start);
|
Cyc_rt_raise2(data, "substring - start cannot be greater than end", start);
|
||||||
|
@@ -2167,9 +2168,30 @@ object Cyc_substring(void *data, object cont, object str, object start,
|
||||||
e = len;
|
e = len;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
if (string_num_cp(str) == string_len(str)){ // Fast path for ASCII
|
||||||
make_string_with_len(sub, raw + s, e - s);
|
make_string_with_len(sub, raw + s, e - s);
|
||||||
_return_closcall1(data, cont, &sub);
|
_return_closcall1(data, cont, &sub);
|
||||||
|
} else {
|
||||||
|
const char *tmp = raw;
|
||||||
|
char_type codepoint;
|
||||||
|
uint32_t state = 0;
|
||||||
|
int count, start_i = 0, end_i = 0;
|
||||||
|
|
||||||
|
for (count = 0; *tmp; ++tmp){
|
||||||
|
if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
|
||||||
|
if (count == s) {
|
||||||
|
start_i = end_i;
|
||||||
|
} else if (count == e) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
end_i++;
|
||||||
|
}
|
||||||
|
if (state != CYC_UTF8_ACCEPT)
|
||||||
|
Cyc_rt_raise2(data, "substring - invalid character in string", str);
|
||||||
|
make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s);
|
||||||
|
_return_closcall1(data, cont, &sub);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue