mirror of
https://github.com/justinethier/cyclone.git
synced 2025-07-15 08:47:35 +02:00
Merge remote-tracking branch 'origin/utf8-dev'
This commit is contained in:
commit
9186af6db4
9 changed files with 858 additions and 91 deletions
1
gc.c
1
gc.c
|
@ -451,6 +451,7 @@ char *gc_copy_obj(object dest, char *obj, gc_thread_data * thd)
|
|||
memcpy(s, string_str(obj), string_len(obj) + 1);
|
||||
mark(hp) = thd->gc_alloc_color;
|
||||
type_of(hp) = string_tag;
|
||||
string_num_cp(hp) = string_num_cp(obj);
|
||||
string_len(hp) = string_len(obj);
|
||||
string_str(hp) = s;
|
||||
return (char *)hp;
|
||||
|
|
|
@ -194,6 +194,7 @@ int binstr2int(const char *str);
|
|||
int octstr2int(const char *str);
|
||||
object Cyc_string_append(void *data, object cont, int argc, object str1, ...);
|
||||
object Cyc_string_length(void *data, object str);
|
||||
object Cyc_string_byte_length(void *data, object str);
|
||||
object Cyc_substring(void *data, object cont, object str, object start,
|
||||
object end);
|
||||
object Cyc_string_ref(void *data, object str, object k);
|
||||
|
@ -707,4 +708,27 @@ void add_global(object * glo);
|
|||
void Cyc_set_globals_changed(gc_thread_data *thd);
|
||||
/**@}*/
|
||||
|
||||
/**
|
||||
* \defgroup prim_utf8 UTF-8
|
||||
*
|
||||
* @brief Unicode processing using UTF-8
|
||||
*/
|
||||
/**@{*/
|
||||
#define CYC_UTF8_ACCEPT 0
|
||||
#define CYC_UTF8_REJECT 1
|
||||
|
||||
/**
|
||||
* Simple macro to make it more convenient to convert a single char
|
||||
*/
|
||||
#define Cyc_utf8_encode_char(dest, dest_size, char_value) \
|
||||
Cyc_utf8_encode(dest, dest_size, &char_value, 1)
|
||||
|
||||
int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz);
|
||||
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
|
||||
int Cyc_utf8_count_code_points(uint8_t* s);
|
||||
int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes);
|
||||
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len);
|
||||
uint32_t Cyc_utf8_validate(char *str, size_t len);
|
||||
/**@}*/
|
||||
|
||||
#endif /* CYCLONE_RUNTIME_H */
|
||||
|
|
|
@ -465,6 +465,12 @@ void clear_mutations(void *data);
|
|||
/** Minimum allowed value of a fixnum */
|
||||
#define CYC_FIXNUM_MIN -1073741824
|
||||
|
||||
/**
|
||||
* Explicit character type now that we are using UTF-8.
|
||||
* Chars are still value types though
|
||||
*/
|
||||
typedef uint32_t char_type;
|
||||
|
||||
/**
|
||||
* Determine if an object is an integer.
|
||||
*/
|
||||
|
@ -488,12 +494,12 @@ void clear_mutations(void *data);
|
|||
/**
|
||||
* Convert from an object to a char.
|
||||
*/
|
||||
#define obj_obj2char(x) (char)((long)(x)>>2)
|
||||
#define obj_obj2char(x) (char_type)((uintmax_t)(x)>>2)
|
||||
|
||||
/**
|
||||
* Convert from a char to an object.
|
||||
*/
|
||||
#define obj_char2obj(c) ((void *)((((unsigned long)c)<<2) | 2))
|
||||
#define obj_char2obj(c) ((void *)((((uintmax_t)c)<<2) | 2))
|
||||
|
||||
/**
|
||||
* Is the given object a value type?
|
||||
|
@ -721,16 +727,21 @@ typedef enum {
|
|||
typedef struct {
|
||||
gc_header_type hdr;
|
||||
tag_type tag;
|
||||
int num_cp;
|
||||
int len;
|
||||
char *str;
|
||||
} string_type;
|
||||
|
||||
// TODO: below macros are obsolete, need new ones that populate num_cp and
|
||||
// raise an error if an invalid UTF-8 char is detected
|
||||
|
||||
/** Create a new string in the nursery */
|
||||
#define make_string(cs, s) string_type cs; \
|
||||
{ int len = strlen(s); \
|
||||
cs.hdr.mark = gc_color_red; \
|
||||
cs.hdr.grayed = 0; \
|
||||
cs.tag = string_tag; \
|
||||
cs.num_cp = len; \
|
||||
cs.len = len; \
|
||||
cs.str = alloca(sizeof(char) * (len + 1)); \
|
||||
memcpy(cs.str, s, len + 1);}
|
||||
|
@ -744,6 +755,7 @@ typedef struct {
|
|||
cs.hdr.mark = gc_color_red; \
|
||||
cs.hdr.grayed = 0; \
|
||||
cs.tag = string_tag; cs.len = len; \
|
||||
cs.num_cp = len; \
|
||||
cs.str = alloca(sizeof(char) * (len + 1)); \
|
||||
memcpy(cs.str, s, len); \
|
||||
cs.str[len] = '\0';}
|
||||
|
@ -755,9 +767,51 @@ typedef struct {
|
|||
#define make_string_noalloc(cs, s, length) string_type cs; \
|
||||
{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \
|
||||
cs.tag = string_tag; cs.len = length; \
|
||||
cs.num_cp = length; \
|
||||
cs.str = s; }
|
||||
|
||||
/** Get the length of a string */
|
||||
/** Create a new string in the nursery */
|
||||
#define make_utf8_string(data, cs, s) string_type cs; \
|
||||
{ int len = strlen(s); \
|
||||
cs.hdr.mark = gc_color_red; \
|
||||
cs.hdr.grayed = 0; \
|
||||
cs.tag = string_tag; \
|
||||
cs.num_cp = Cyc_utf8_count_code_points((uint8_t *)s); \
|
||||
if (cs.num_cp < 0) { \
|
||||
Cyc_rt_raise_msg(data, "Invalid UTF-8 characters in string"); \
|
||||
} \
|
||||
cs.len = len; \
|
||||
cs.str = alloca(sizeof(char) * (len + 1)); \
|
||||
memcpy(cs.str, s, len + 1);}
|
||||
|
||||
/**
|
||||
* Create a new string with the given length
|
||||
* (so it does not need to be computed)
|
||||
*/
|
||||
#define make_utf8_string_with_len(cs, s, length, num_code_points) string_type cs; \
|
||||
{ int len = length; \
|
||||
cs.hdr.mark = gc_color_red; \
|
||||
cs.hdr.grayed = 0; \
|
||||
cs.tag = string_tag; cs.len = len; \
|
||||
cs.num_cp = num_code_points; \
|
||||
cs.str = alloca(sizeof(char) * (len + 1)); \
|
||||
memcpy(cs.str, s, len); \
|
||||
cs.str[len] = '\0';}
|
||||
|
||||
/**
|
||||
* Create a string object using the given C string and length.
|
||||
* No allocation is done for the given C string.
|
||||
*/
|
||||
#define make_utf8_string_noalloc(cs, s, length) string_type cs; \
|
||||
{ cs.hdr.mark = gc_color_red; cs.hdr.grayed = 0; \
|
||||
cs.tag = string_tag; cs.len = length; \
|
||||
cs.num_cp = length; \
|
||||
cs.str = s; }
|
||||
|
||||
/** Get the length of a string, in characters (code points) */
|
||||
#define string_num_cp(x) (((string_type *) x)->num_cp)
|
||||
|
||||
/** Get the length of a string, in bytes */
|
||||
#define string_len(x) (((string_type *) x)->len)
|
||||
|
||||
/** Get a string object's C string */
|
||||
|
|
|
@ -102,6 +102,7 @@ void Cyc_io_get_output_string(void *data, object cont, object port)
|
|||
}
|
||||
{
|
||||
make_string_with_len(s, p->str_bv_in_mem_buf, p->str_bv_in_mem_buf_len);
|
||||
s.num_cp = Cyc_utf8_count_code_points((uint8_t *)string_str(&s));
|
||||
return_closcall1(data, cont, &s);
|
||||
}
|
||||
}
|
||||
|
|
599
runtime.c
599
runtime.c
|
@ -181,6 +181,7 @@ void pack_env_variables(void *data, object k)
|
|||
svar->str = alloca(sizeof(char) * (svar->len));
|
||||
strncpy(svar->str, e, svar->len);
|
||||
(svar->str)[svar->len] = '\0';
|
||||
svar->num_cp = Cyc_utf8_count_code_points((uint8_t *)svar->str);
|
||||
|
||||
if (eqpos) {
|
||||
eqpos++;
|
||||
|
@ -189,6 +190,7 @@ void pack_env_variables(void *data, object k)
|
|||
sval->hdr.grayed = 0;
|
||||
sval->tag = string_tag;
|
||||
sval->len = strlen(eqpos);
|
||||
svar->num_cp = Cyc_utf8_count_code_points((uint8_t *)eqpos);
|
||||
sval->str = eqpos;
|
||||
set_pair(tmp, svar, sval);
|
||||
set_pair(p, tmp, NULL);
|
||||
|
@ -558,7 +560,7 @@ void Cyc_rt_raise(void *data, object err)
|
|||
|
||||
void Cyc_rt_raise2(void *data, const char *msg, object err)
|
||||
{
|
||||
make_string(s, msg);
|
||||
make_utf8_string(data, s, msg);
|
||||
make_pair(c3, err, NULL);
|
||||
make_pair(c2, &s, &c3);
|
||||
make_pair(c1, boolean_f, &c2);
|
||||
|
@ -571,7 +573,7 @@ void Cyc_rt_raise2(void *data, const char *msg, object err)
|
|||
|
||||
void Cyc_rt_raise_msg(void *data, const char *err)
|
||||
{
|
||||
make_string(s, err);
|
||||
make_utf8_string(data, s, err);
|
||||
Cyc_rt_raise(data, &s);
|
||||
}
|
||||
|
||||
|
@ -802,7 +804,10 @@ object Cyc_display(void *data, object x, FILE * port)
|
|||
return quote_void;
|
||||
}
|
||||
if (obj_is_char(x)) {
|
||||
fprintf(port, "%c", obj_obj2char(x));
|
||||
char cbuf[5];
|
||||
char_type unbox = obj_obj2char(x);
|
||||
Cyc_utf8_encode_char(cbuf, 5, unbox);
|
||||
fprintf(port, "%s", cbuf);
|
||||
return quote_void;
|
||||
}
|
||||
if (obj_is_int(x)) {
|
||||
|
@ -982,7 +987,7 @@ static object _Cyc_write(void *data, object x, FILE * port)
|
|||
return quote_void;
|
||||
}
|
||||
if (obj_is_char(x)) {
|
||||
char c = obj_obj2char(x);
|
||||
char_type c = obj_obj2char(x);
|
||||
switch (c) {
|
||||
case 0: fprintf(port, "#\\null"); break;
|
||||
case 7: fprintf(port, "#\\alarm"); break;
|
||||
|
@ -993,11 +998,13 @@ static object _Cyc_write(void *data, object x, FILE * port)
|
|||
case 27: fprintf(port, "#\\escape"); break;
|
||||
case 32: fprintf(port, "#\\space"); break;
|
||||
case 127: fprintf(port, "#\\delete"); break;
|
||||
default:
|
||||
fprintf(port, "#\\%c", obj_obj2char(x));
|
||||
default: {
|
||||
char cbuf[5];
|
||||
Cyc_utf8_encode_char(cbuf, 5, c);
|
||||
fprintf(port, "#\\%s", cbuf);
|
||||
break;
|
||||
}
|
||||
//fprintf(port, "#\\%c", obj_obj2char(x));
|
||||
}
|
||||
return quote_void;
|
||||
}
|
||||
if (obj_is_int(x)) {
|
||||
|
@ -1095,7 +1102,10 @@ object Cyc_write_char(void *data, object c, object port)
|
|||
if (obj_is_char(c)) {
|
||||
FILE *fp = ((port_type *) port)->fp;
|
||||
if (fp){
|
||||
fprintf(fp, "%c", obj_obj2char(c));
|
||||
char cbuf[5];
|
||||
char_type unbox = obj_obj2char(c);
|
||||
Cyc_utf8_encode_char(cbuf, 5, unbox);
|
||||
fprintf(fp, "%s", cbuf);
|
||||
}
|
||||
} else {
|
||||
Cyc_rt_raise2(data, "Argument is not a character", c);
|
||||
|
@ -1816,7 +1826,7 @@ object Cyc_symbol2string(void *data, object cont, object sym)
|
|||
Cyc_check_sym(data, sym);
|
||||
{
|
||||
const char *desc = symbol_desc(sym);
|
||||
make_string(str, desc);
|
||||
make_utf8_string(data, str, desc);
|
||||
_return_closcall1(data, cont, &str);
|
||||
}}
|
||||
|
||||
|
@ -1833,21 +1843,44 @@ object Cyc_string2symbol(void *data, object str)
|
|||
|
||||
object Cyc_list2string(void *data, object cont, object lst)
|
||||
{
|
||||
char *buf;
|
||||
int i = 0;
|
||||
object len;
|
||||
char *buf, cbuf[5];
|
||||
int i = 0, len = 0;
|
||||
object cbox, tmp = lst;
|
||||
char_type ch;
|
||||
|
||||
Cyc_check_pair_or_null(data, lst);
|
||||
len = Cyc_length(data, lst); // Inefficient, walks whole list
|
||||
|
||||
// Need to walk the list of chars to compute multibyte length
|
||||
while (tmp) {
|
||||
if (is_value_type(tmp) || ((list) tmp)->tag != pair_tag) {
|
||||
Cyc_rt_raise2(data, "length - invalid parameter, expected list", tmp);
|
||||
}
|
||||
cbox = car(tmp);
|
||||
ch = obj_obj2char(cbox);
|
||||
if (!obj_is_char(cbox)) {
|
||||
Cyc_rt_raise2(data, "Expected character but received", cbox);
|
||||
}
|
||||
if (!ch) {
|
||||
len++;
|
||||
} else {
|
||||
Cyc_utf8_encode_char(cbuf, 5, ch);
|
||||
len += strlen(cbuf);
|
||||
}
|
||||
tmp = cdr(tmp);
|
||||
}
|
||||
|
||||
{
|
||||
make_string_noalloc(str, NULL, (obj_obj2int(len)));
|
||||
str.str = buf = alloca(sizeof(char) * (obj_obj2int(len) + 1));
|
||||
make_string_noalloc(str, NULL, len);
|
||||
str.str = buf = alloca(sizeof(char) * (len + 1));
|
||||
while ((lst != NULL)) {
|
||||
if (!obj_is_char(car(lst))) {
|
||||
Cyc_rt_raise2(data, "Expected character but received", car(lst));
|
||||
cbox = car(lst);
|
||||
ch = obj_obj2char(cbox); // Already validated, can assume chars now
|
||||
if (!ch) {
|
||||
i++;
|
||||
} else {
|
||||
Cyc_utf8_encode_char(&(buf[i]), 5, ch);
|
||||
i += strlen(buf+i);
|
||||
}
|
||||
buf[i++] = obj_obj2char(car(lst));
|
||||
lst = cdr(lst);
|
||||
}
|
||||
buf[i] = '\0';
|
||||
|
@ -2032,7 +2065,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
|
|||
}
|
||||
|
||||
#define Cyc_string_append_va_list(data, argc) { \
|
||||
int i = 0, total_len = 1; \
|
||||
int i = 0, total_cp = 0, total_len = 1; \
|
||||
int *len = alloca(sizeof(int) * argc); \
|
||||
char *buffer, *bufferp, **str = alloca(sizeof(char *) * argc); \
|
||||
object tmp; \
|
||||
|
@ -2041,6 +2074,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
|
|||
str[i] = ((string_type *)str1)->str; \
|
||||
len[i] = string_len((str1)); \
|
||||
total_len += len[i]; \
|
||||
total_cp += string_num_cp((str1)); \
|
||||
} \
|
||||
for (i = 1; i < argc; i++) { \
|
||||
tmp = va_arg(ap, object); \
|
||||
|
@ -2048,6 +2082,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
|
|||
str[i] = ((string_type *)tmp)->str; \
|
||||
len[i] = string_len((tmp)); \
|
||||
total_len += len[i]; \
|
||||
total_cp += string_num_cp((tmp)); \
|
||||
} \
|
||||
buffer = bufferp = alloca(sizeof(char) * total_len); \
|
||||
for (i = 0; i < argc; i++) { \
|
||||
|
@ -2056,6 +2091,7 @@ object Cyc_string_cmp(void *data, object str1, object str2)
|
|||
} \
|
||||
*bufferp = '\0'; \
|
||||
make_string(result, buffer); \
|
||||
string_num_cp((&result)) = total_cp; \
|
||||
va_end(ap); \
|
||||
_return_closcall1(data, cont, &result); \
|
||||
}
|
||||
|
@ -2076,6 +2112,12 @@ object Cyc_string_append(void *data, object cont, int _argc, object str1, ...)
|
|||
}
|
||||
|
||||
object Cyc_string_length(void *data, object str)
|
||||
{
|
||||
Cyc_check_str(data, str);
|
||||
return obj_int2obj(string_num_cp(str));
|
||||
}
|
||||
|
||||
object Cyc_string_byte_length(void *data, object str)
|
||||
{
|
||||
Cyc_check_str(data, str);
|
||||
return obj_int2obj(string_len(str));
|
||||
|
@ -2083,8 +2125,10 @@ object Cyc_string_length(void *data, object str)
|
|||
|
||||
object Cyc_string_set(void *data, object str, object k, object chr)
|
||||
{
|
||||
char buf[5];
|
||||
char *raw;
|
||||
int idx, len;
|
||||
int idx, len, buf_len;
|
||||
char_type input_char;
|
||||
|
||||
Cyc_check_str(data, str);
|
||||
Cyc_check_num(data, k);
|
||||
|
@ -2093,12 +2137,82 @@ object Cyc_string_set(void *data, object str, object k, object chr)
|
|||
Cyc_rt_raise2(data, "Expected char but received", chr);
|
||||
}
|
||||
|
||||
input_char = obj_obj2char(chr);
|
||||
if (!input_char) {
|
||||
buf_len = 1;
|
||||
} else {
|
||||
Cyc_utf8_encode_char(buf, 5, input_char);
|
||||
buf_len = strlen(buf);
|
||||
}
|
||||
|
||||
raw = string_str(str);
|
||||
idx = unbox_number(k);
|
||||
len = string_len(str);
|
||||
|
||||
Cyc_check_bounds(data, "string-set!", len, idx);
|
||||
|
||||
if (string_num_cp(str) == string_len(str) && buf_len == 1) {
|
||||
// Take fast path if all chars are just 1 byte
|
||||
raw[idx] = obj_obj2char(chr);
|
||||
} else {
|
||||
// Slower path for UTF-8, need to handle replacement differently
|
||||
// depending upon how the new char affects length of the string
|
||||
char *tmp = raw, *this_cp = raw;
|
||||
char_type codepoint;
|
||||
uint32_t state = 0;
|
||||
int i = 0, count, prev_cp_bytes = 0, cp_idx;
|
||||
|
||||
// Find index to change, and how many bytes it is
|
||||
for (count = 0; *tmp; ++tmp){
|
||||
prev_cp_bytes++;
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
|
||||
if (count == idx) {
|
||||
break;
|
||||
}
|
||||
this_cp = tmp + 1;
|
||||
count += 1;
|
||||
prev_cp_bytes = 0;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
cp_idx = i;
|
||||
if (state != CYC_UTF8_ACCEPT) {
|
||||
Cyc_rt_raise2(data, "string-set! - invalid character at index", k);
|
||||
}
|
||||
|
||||
// Perform actual mutation
|
||||
//
|
||||
// Now we know length of start (both in codepoints and bytes),
|
||||
// and we know the codepoint to be replaced. by calculating its length
|
||||
// we can compute where the end portion starts, and by using str we can
|
||||
// figure out how many remaining bytes/codepoints are in end
|
||||
//
|
||||
// 3 cases:
|
||||
// - 1) buf_len = prev_cp_bytes, just straight replace
|
||||
if (buf_len == prev_cp_bytes) {
|
||||
for (i = 0; i < buf_len; i++) {
|
||||
this_cp[i] = buf[i];
|
||||
}
|
||||
}
|
||||
// - 2) buf_len < prev_cp_bytes, replace and shift chars down
|
||||
else if (buf_len < prev_cp_bytes) {
|
||||
// Replace code point with shorter one
|
||||
for (i = 0; i < buf_len; i++) {
|
||||
this_cp[i] = buf[i];
|
||||
}
|
||||
// Move string down to eliminate unneeded chars
|
||||
memmove(this_cp + buf_len, this_cp + prev_cp_bytes, len - cp_idx);
|
||||
// Null terminate the shorter string.
|
||||
// Ensure string_len is not reduced because original
|
||||
// value still matters for GC purposes
|
||||
raw[len - (prev_cp_bytes - buf_len)] = '\0';
|
||||
}
|
||||
// - 3) TODO: buf_len > prev_cp_bytes, will need to allocate more memory (!!)
|
||||
else {
|
||||
// TODO: maybe we can try a little harder here, at least in some cases
|
||||
Cyc_rt_raise2(data, "string-set! - Unable to allocate memory to store multibyte character", chr);
|
||||
}
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
|
@ -2112,13 +2226,30 @@ object Cyc_string_ref(void *data, object str, object k)
|
|||
|
||||
raw = string_str(str);
|
||||
idx = unbox_number(k);
|
||||
len = string_len(str);
|
||||
len = string_num_cp(str);
|
||||
|
||||
if (idx < 0 || idx >= len) {
|
||||
Cyc_rt_raise2(data, "string-ref - invalid index", k);
|
||||
}
|
||||
|
||||
// Take fast path if all chars are just 1 byte
|
||||
if (string_num_cp(str) == string_len(str)) {
|
||||
return obj_char2obj(raw[idx]);
|
||||
} else {
|
||||
char_type codepoint;
|
||||
uint32_t state = 0;
|
||||
int count;
|
||||
|
||||
for (count = 0; *raw; ++raw){
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*raw)){
|
||||
if (count == idx) break; // Reached requested index
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
if (state != CYC_UTF8_ACCEPT)
|
||||
Cyc_rt_raise2(data, "string-ref - invalid character at index", k);
|
||||
return obj_char2obj(codepoint);
|
||||
}
|
||||
}
|
||||
|
||||
object Cyc_substring(void *data, object cont, object str, object start,
|
||||
|
@ -2134,7 +2265,7 @@ object Cyc_substring(void *data, object cont, object str, object start,
|
|||
raw = string_str(str);
|
||||
s = unbox_number(start);
|
||||
e = unbox_number(end);
|
||||
len = string_len(str);
|
||||
len = string_num_cp(str);
|
||||
|
||||
if (s > e) {
|
||||
Cyc_rt_raise2(data, "substring - start cannot be greater than end", start);
|
||||
|
@ -2148,9 +2279,33 @@ object Cyc_substring(void *data, object cont, object str, object start,
|
|||
e = len;
|
||||
}
|
||||
|
||||
{
|
||||
if (string_num_cp(str) == string_len(str)){ // Fast path for ASCII
|
||||
make_string_with_len(sub, raw + s, e - s);
|
||||
_return_closcall1(data, cont, &sub);
|
||||
} else {
|
||||
const char *tmp = raw;
|
||||
char_type codepoint;
|
||||
uint32_t state = 0;
|
||||
int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0;
|
||||
for (num_ch = 0; *tmp; ++tmp){
|
||||
cur_ch_bytes++;
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
|
||||
end_i += cur_ch_bytes;
|
||||
num_ch += 1;
|
||||
cur_ch_bytes = 0;
|
||||
|
||||
if (num_ch == s) {
|
||||
start_i = end_i;
|
||||
}
|
||||
if (num_ch == e) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (state != CYC_UTF8_ACCEPT)
|
||||
Cyc_rt_raise2(data, "substring - invalid character in string", str);
|
||||
make_utf8_string_with_len(sub, raw + start_i, end_i - start_i, e - s);
|
||||
_return_closcall1(data, cont, &sub);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2164,22 +2319,22 @@ object Cyc_installation_dir(void *data, object cont, object type)
|
|||
strncmp(((symbol) type)->desc, "sld", 5) == 0) {
|
||||
char buf[1024];
|
||||
snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_SLD);
|
||||
make_string(str, buf);
|
||||
make_utf8_string(data, str, buf);
|
||||
_return_closcall1(data, cont, &str);
|
||||
} else if (Cyc_is_symbol(type) == boolean_t &&
|
||||
strncmp(((symbol) type)->desc, "lib", 5) == 0) {
|
||||
char buf[1024];
|
||||
snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_LIB);
|
||||
make_string(str, buf);
|
||||
make_utf8_string(data, str, buf);
|
||||
_return_closcall1(data, cont, &str);
|
||||
} else if (Cyc_is_symbol(type) == boolean_t &&
|
||||
strncmp(((symbol) type)->desc, "inc", 5) == 0) {
|
||||
char buf[1024];
|
||||
snprintf(buf, sizeof(buf), "%s", CYC_INSTALL_INC);
|
||||
make_string(str, buf);
|
||||
make_utf8_string(data, str, buf);
|
||||
_return_closcall1(data, cont, &str);
|
||||
} else {
|
||||
make_string(str, CYC_INSTALL_DIR);
|
||||
make_utf8_string(data, str, CYC_INSTALL_DIR);
|
||||
_return_closcall1(data, cont, &str);
|
||||
}
|
||||
}
|
||||
|
@ -2193,22 +2348,22 @@ object Cyc_compilation_environment(void *data, object cont, object var)
|
|||
if (strncmp(((symbol) var)->desc, "cc-prog", 8) == 0) {
|
||||
char buf[1024];
|
||||
snprintf(buf, sizeof(buf), "%s", CYC_CC_PROG);
|
||||
make_string(str, buf);
|
||||
make_utf8_string(data, str, buf);
|
||||
_return_closcall1(data, cont, &str);
|
||||
} else if (strncmp(((symbol) var)->desc, "cc-exec", 8) == 0) {
|
||||
char buf[1024];
|
||||
snprintf(buf, sizeof(buf), "%s", CYC_CC_EXEC);
|
||||
make_string(str, buf);
|
||||
make_utf8_string(data, str, buf);
|
||||
_return_closcall1(data, cont, &str);
|
||||
} else if (strncmp(((symbol) var)->desc, "cc-lib", 7) == 0) {
|
||||
char buf[1024];
|
||||
snprintf(buf, sizeof(buf), "%s", CYC_CC_LIB);
|
||||
make_string(str, buf);
|
||||
make_utf8_string(data, str, buf);
|
||||
_return_closcall1(data, cont, &str);
|
||||
} else if (strncmp(((symbol) var)->desc, "cc-so", 6) == 0) {
|
||||
char buf[1024];
|
||||
snprintf(buf, sizeof(buf), "%s", CYC_CC_SO);
|
||||
make_string(str, buf);
|
||||
make_utf8_string(data, str, buf);
|
||||
_return_closcall1(data, cont, &str);
|
||||
}
|
||||
}
|
||||
|
@ -2234,7 +2389,7 @@ object Cyc_command_line_arguments(void *data, object cont)
|
|||
for (i = _cyc_argc; i > 1; i--) { // skip program name
|
||||
object ps = alloca(sizeof(string_type));
|
||||
object pl = alloca(sizeof(pair_type));
|
||||
make_string(s, _cyc_argv[i - 1]);
|
||||
make_utf8_string(data, s, _cyc_argv[i - 1]);
|
||||
memcpy(ps, &s, sizeof(string_type));
|
||||
((list) pl)->hdr.mark = gc_color_red;
|
||||
((list) pl)->hdr.grayed = 0;
|
||||
|
@ -2490,6 +2645,10 @@ object Cyc_utf82string(void *data, object cont, object bv, object start,
|
|||
st.str = alloca(sizeof(char) * (len + 1));
|
||||
memcpy(st.str, &buf[s], len);
|
||||
st.str[len] = '\0';
|
||||
st.num_cp = Cyc_utf8_count_code_points((uint8_t *)(st.str));
|
||||
if (st.num_cp < 0) {
|
||||
Cyc_rt_raise2(data, "utf8->string - error decoding UTF 8", bv);
|
||||
}
|
||||
_return_closcall1(data, cont, &st);
|
||||
}
|
||||
}
|
||||
|
@ -2509,18 +2668,45 @@ object Cyc_string2utf8(void *data, object cont, object str, object start,
|
|||
e = unbox_number(end);
|
||||
len = e - s;
|
||||
|
||||
if (s < 0 || (s >= string_len(str) && len > 0)) {
|
||||
if (s < 0 || (s >= string_num_cp(str) && len > 0)) {
|
||||
Cyc_rt_raise2(data, "string->utf8 - invalid start", start);
|
||||
}
|
||||
|
||||
if (e < 0 || e < s || e > string_len(str)) {
|
||||
if (e < 0 || e < s || e > string_num_cp(str)) {
|
||||
Cyc_rt_raise2(data, "string->utf8 - invalid end", end);
|
||||
}
|
||||
|
||||
// Fast path
|
||||
if (string_num_cp(str) == string_len(str)) { // TODO: disable for testing purposes
|
||||
result.len = len;
|
||||
result.data = alloca(sizeof(char) * len);
|
||||
memcpy(&result.data[0], &(string_str(str))[s], len);
|
||||
_return_closcall1(data, cont, &result);
|
||||
} else {
|
||||
const char *tmp = string_str(str);
|
||||
char_type codepoint;
|
||||
uint32_t state = 0;
|
||||
int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0;
|
||||
for (num_ch = 0; *tmp; ++tmp){
|
||||
cur_ch_bytes++;
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
|
||||
end_i += cur_ch_bytes;
|
||||
num_ch += 1;
|
||||
cur_ch_bytes = 0;
|
||||
|
||||
if (num_ch == s) {
|
||||
start_i = end_i;
|
||||
}
|
||||
if (num_ch == e) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
result.len = end_i - start_i;
|
||||
result.data = alloca(sizeof(char) * result.len);
|
||||
memcpy(&result.data[0], &(string_str(str))[start_i], result.len);
|
||||
_return_closcall1(data, cont, &result);
|
||||
}
|
||||
}
|
||||
|
||||
object Cyc_bytevector_u8_ref(void *data, object bv, object k)
|
||||
|
@ -2624,7 +2810,7 @@ object Cyc_char2integer(object chr)
|
|||
|
||||
object Cyc_integer2char(void *data, object n)
|
||||
{
|
||||
int val = 0;
|
||||
char_type val = 0;
|
||||
|
||||
Cyc_check_num(data, n);
|
||||
val = unbox_number(n);
|
||||
|
@ -5635,7 +5821,7 @@ void Cyc_import_shared_object(void *data, object cont, object filename, object e
|
|||
handle = dlopen(string_str(filename), RTLD_GLOBAL | RTLD_LAZY);
|
||||
if (handle == NULL) {
|
||||
snprintf(buffer, 256, "%s", dlerror());
|
||||
make_string(s, buffer);
|
||||
make_utf8_string(data, s, buffer);
|
||||
Cyc_rt_raise2(data, "Unable to import library", &s);
|
||||
}
|
||||
dlerror(); /* Clear any existing error */
|
||||
|
@ -5643,7 +5829,7 @@ void Cyc_import_shared_object(void *data, object cont, object filename, object e
|
|||
entry_pt = (function_type) dlsym(handle, string_str(entry_pt_fnc));
|
||||
if (entry_pt == NULL) {
|
||||
snprintf(buffer, 256, "%s, %s, %s", string_str(filename), string_str(entry_pt_fnc), dlerror());
|
||||
make_string(s, buffer);
|
||||
make_utf8_string(data, s, buffer);
|
||||
Cyc_rt_raise2(data, "Unable to load symbol", &s);
|
||||
}
|
||||
mclosure1(clo, entry_pt, cont);
|
||||
|
@ -5692,6 +5878,7 @@ void _read_error(void *data, port_type *p, const char *msg)
|
|||
// the cont could receive an error and raise it though
|
||||
//Cyc_rt_raise_msg(data, buf);
|
||||
make_string(str, buf);
|
||||
str.num_cp = Cyc_utf8_count_code_points((uint8_t *)buf);
|
||||
make_empty_vector(vec);
|
||||
vec.num_elements = 1;
|
||||
vec.elements = (object *) alloca(sizeof(object) * vec.num_elements);
|
||||
|
@ -5804,6 +5991,28 @@ static void _read_add_to_tok_buf(port_type *p, char c)
|
|||
p->tok_buf[p->tok_end++] = c;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determine if given string is numeric
|
||||
*/
|
||||
int _read_is_numeric(const char *tok)
|
||||
{
|
||||
int len = strlen(tok);
|
||||
return (len &&
|
||||
((isdigit(tok[0])) ||
|
||||
((len > 1) && tok[0] == '.' && isdigit(tok[1])) ||
|
||||
((len > 1) && (tok[1] == '.' || isdigit(tok[1])) && (tok[0] == '-' || tok[0] == '+'))));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper function, determine if given number is a hex digit
|
||||
* @param c Character to check
|
||||
*/
|
||||
int _read_is_hex_digit(char c)
|
||||
{
|
||||
return (c >= 'a' && c <= 'f') ||
|
||||
(c >= 'A' && c <= 'F');
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper function to read a string
|
||||
* @param data Thread data object
|
||||
|
@ -5863,6 +6072,12 @@ void _read_string(void *data, object cont, port_type *p)
|
|||
p->buf_idx++;
|
||||
break;
|
||||
}
|
||||
// Verify if hex digit is valid
|
||||
if (!isdigit(p->mem_buf[p->buf_idx]) &&
|
||||
!_read_is_hex_digit(p->mem_buf[p->buf_idx])) {
|
||||
p->buf_idx++;
|
||||
_read_error(data, p, "invalid hex digit in string");
|
||||
}
|
||||
buf[i] = p->mem_buf[p->buf_idx];
|
||||
p->buf_idx++;
|
||||
p->col_num++;
|
||||
|
@ -5870,8 +6085,14 @@ void _read_string(void *data, object cont, port_type *p)
|
|||
}
|
||||
buf[i] = '\0';
|
||||
{
|
||||
int result = (int)strtol(buf, NULL, 16);
|
||||
p->tok_buf[p->tok_end++] = (char)result;
|
||||
char_type result = strtol(buf, NULL, 16);
|
||||
char cbuf[5];
|
||||
int i;
|
||||
Cyc_utf8_encode_char(cbuf, 5, result);
|
||||
for (i = 0; cbuf[i] != 0; i++) {
|
||||
_read_add_to_tok_buf(p, cbuf[i]);
|
||||
}
|
||||
//p->tok_buf[p->tok_end++] = (char)result;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -5883,7 +6104,7 @@ void _read_string(void *data, object cont, port_type *p)
|
|||
p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full?
|
||||
p->tok_end = 0; // Reset for next atom
|
||||
{
|
||||
make_string(str, p->tok_buf);
|
||||
make_utf8_string(data, str, p->tok_buf);
|
||||
return_thread_runnable(data, &str);
|
||||
}
|
||||
} else if (c == '\\') {
|
||||
|
@ -5943,6 +6164,7 @@ void _read_return_character(void *data, port_type *p)
|
|||
p->tok_buf[p->tok_end] = '\0'; // TODO: what if buffer is full?
|
||||
p->tok_end = 0; // Reset for next atom
|
||||
if (strlen(p->tok_buf) == 1) {
|
||||
// ASCII char, consider merging with below?
|
||||
return_thread_runnable(data, obj_char2obj(p->tok_buf[0]));
|
||||
} else if(strncmp(p->tok_buf, "alarm", 5) == 0) {
|
||||
return_thread_runnable(data, obj_char2obj('\a'));
|
||||
|
@ -5964,13 +6186,28 @@ void _read_return_character(void *data, port_type *p)
|
|||
return_thread_runnable(data, obj_char2obj('\t'));
|
||||
} else if(strlen(p->tok_buf) > 1 && p->tok_buf[0] == 'x') {
|
||||
const char *buf = p->tok_buf + 1;
|
||||
int result = strtol(buf, NULL, 16);
|
||||
char_type result = strtol(buf, NULL, 16);
|
||||
return_thread_runnable(data, obj_char2obj(result));
|
||||
} else {
|
||||
// Try to read a UTF-8 char and if so return it, otherwise throw an error
|
||||
uint32_t state = CYC_UTF8_ACCEPT;
|
||||
char_type codepoint;
|
||||
uint8_t *s = (uint8_t *)p->tok_buf;
|
||||
while(s) {
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, *s)) {
|
||||
s++;
|
||||
break;
|
||||
}
|
||||
s++;
|
||||
}
|
||||
if (state == CYC_UTF8_ACCEPT && *s == '\0') {
|
||||
return_thread_runnable(data, obj_char2obj(codepoint));
|
||||
} else {
|
||||
char buf[31];
|
||||
snprintf(buf, 30, "Unable to parse character %s", p->tok_buf);
|
||||
_read_error(data, p, buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -6001,28 +6238,6 @@ void _read_character(void *data, port_type *p)
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determine if given string is numeric
|
||||
*/
|
||||
int _read_is_numeric(const char *tok)
|
||||
{
|
||||
int len = strlen(tok);
|
||||
return (len &&
|
||||
((isdigit(tok[0])) ||
|
||||
((len > 1) && tok[0] == '.' && isdigit(tok[1])) ||
|
||||
((len > 1) && (tok[1] == '.' || isdigit(tok[1])) && (tok[0] == '-' || tok[0] == '+'))));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper function, determine if given number is a hex digit
|
||||
* @param c Character to check
|
||||
*/
|
||||
int _read_is_hex_digit(char c)
|
||||
{
|
||||
return (c >= 'a' && c <= 'f') ||
|
||||
(c >= 'A' && c <= 'F');
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper function, return read number.
|
||||
* @param data Thread data object
|
||||
|
@ -6102,6 +6317,7 @@ void _read_return_atom(void *data, object cont, port_type *p)
|
|||
|
||||
if (_read_is_numeric(p->tok_buf)) {
|
||||
make_string(str, p->tok_buf);
|
||||
str.num_cp = Cyc_utf8_count_code_points((uint8_t *)(p->tok_buf));
|
||||
make_c_opaque(opq, &str);
|
||||
return_thread_runnable(data, &opq);
|
||||
} else if (strncmp("+inf.0", p->tok_buf, 6) == 0 ||
|
||||
|
@ -6134,7 +6350,10 @@ object Cyc_io_peek_char(void *data, object cont, object port)
|
|||
{
|
||||
FILE *stream;
|
||||
port_type *p;
|
||||
int c;
|
||||
uint32_t state = CYC_UTF8_ACCEPT;
|
||||
char_type codepoint;
|
||||
int c, i = 0, at_mem_buf_end = 0;
|
||||
char buf[5];
|
||||
|
||||
Cyc_check_port(data, port);
|
||||
{
|
||||
|
@ -6144,9 +6363,40 @@ object Cyc_io_peek_char(void *data, object cont, object port)
|
|||
Cyc_rt_raise2(data, "Unable to read from closed port: ", port);
|
||||
}
|
||||
set_thread_blocked(data, cont);
|
||||
if (p->mem_buf_len == 0 || p->mem_buf_len == p->buf_idx) {
|
||||
_read_next_char(data, cont, p);
|
||||
}
|
||||
c = p->mem_buf[p->buf_idx];
|
||||
return_thread_runnable(data, (c != EOF) ? obj_char2obj(c) : Cyc_EOF);
|
||||
if (Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) {
|
||||
// Only have a partial UTF8 code point, read more chars.
|
||||
// Problem is that there may not be enough space to store them
|
||||
// and do need to set them aside since we are just peeking here
|
||||
// and not actually supposed to be reading past chars.
|
||||
|
||||
buf[0] = c;
|
||||
i = 1;
|
||||
while (i < 5) { // TODO: limit to 4 chars??
|
||||
if (p->mem_buf_len == p->buf_idx + i) {
|
||||
// No more buffered chars
|
||||
at_mem_buf_end = 1;
|
||||
c = fgetc(stream);
|
||||
if (c == EOF) break; // TODO: correct to do this here????
|
||||
} else {
|
||||
c = p->mem_buf[p->buf_idx + i];
|
||||
}
|
||||
buf[i++] = c;
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)c)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (at_mem_buf_end && c != EOF) {
|
||||
p->buf_idx = 0;
|
||||
p->mem_buf_len = i;
|
||||
memmove(p->mem_buf, buf, i);
|
||||
}
|
||||
|
||||
return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF);
|
||||
}
|
||||
return Cyc_EOF;
|
||||
}
|
||||
|
@ -6171,17 +6421,23 @@ object Cyc_io_peek_char(void *data, object cont, object port)
|
|||
object Cyc_io_read_char(void *data, object cont, object port)
|
||||
{
|
||||
port_type *p = (port_type *)port;
|
||||
int c;
|
||||
Cyc_check_port(data, port);
|
||||
if (p->fp == NULL) {
|
||||
Cyc_rt_raise2(data, "Unable to read from closed port: ", port);
|
||||
}
|
||||
{
|
||||
uint32_t state = CYC_UTF8_ACCEPT;
|
||||
char_type codepoint;
|
||||
int c;
|
||||
set_thread_blocked(data, cont);
|
||||
do {
|
||||
_read_next_char(data, cont, p);
|
||||
c = p->mem_buf[p->buf_idx++];
|
||||
if (c == EOF) break;
|
||||
} while(Cyc_utf8_decode(&state, &codepoint, (uint8_t)c));
|
||||
// TODO: limit above to 4 chars and then thrown an error?
|
||||
p->col_num++;
|
||||
return_thread_runnable(data, (c != EOF) ? obj_char2obj(c) : Cyc_EOF);
|
||||
return_thread_runnable(data, (c != EOF) ? obj_char2obj(codepoint) : Cyc_EOF);
|
||||
}
|
||||
return Cyc_EOF;
|
||||
}
|
||||
|
@ -6190,8 +6446,10 @@ object Cyc_io_read_char(void *data, object cont, object port)
|
|||
object Cyc_io_read_line(void *data, object cont, object port)
|
||||
{
|
||||
FILE *stream = ((port_type *) port)->fp;
|
||||
char buf[1024];
|
||||
int len;
|
||||
char buf[1027];
|
||||
int len, num_cp, i = 0;
|
||||
char_type codepoint;
|
||||
uint32_t state;
|
||||
|
||||
Cyc_check_port(data, port);
|
||||
if (stream == NULL) {
|
||||
|
@ -6200,7 +6458,21 @@ object Cyc_io_read_line(void *data, object cont, object port)
|
|||
set_thread_blocked(data, cont);
|
||||
errno = 0;
|
||||
if (fgets(buf, 1023, stream) != NULL) {
|
||||
len = strlen(buf);
|
||||
state = Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &codepoint, &num_cp, &len);
|
||||
// Check if we stopped reading in the middle of a code point and
|
||||
// if so, read one byte at a time until that code point is finished.
|
||||
while (state != CYC_UTF8_ACCEPT && i < 3) {
|
||||
int c = fgetc(stream);
|
||||
buf[len] = c;
|
||||
len++;
|
||||
Cyc_utf8_decode(&state, &codepoint, (uint8_t)c);
|
||||
if (state == CYC_UTF8_ACCEPT) {
|
||||
num_cp++;
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
{
|
||||
// Remove any trailing CR / newline chars
|
||||
while (len > 0 && (buf[len - 1] == '\n' ||
|
||||
|
@ -6209,6 +6481,7 @@ object Cyc_io_read_line(void *data, object cont, object port)
|
|||
}
|
||||
buf[len] = '\0';
|
||||
make_string_noalloc(s, buf, len);
|
||||
s.num_cp = num_cp;
|
||||
return_thread_runnable(data, &s);
|
||||
}
|
||||
} else {
|
||||
|
@ -6364,3 +6637,189 @@ void Cyc_io_read_token(void *data, object cont, object port)
|
|||
}
|
||||
}
|
||||
|
||||
////////////// UTF-8 Section //////////////
|
||||
|
||||
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||
static const uint8_t utf8d[] = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
|
||||
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
|
||||
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
|
||||
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
|
||||
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
|
||||
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
|
||||
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
|
||||
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
|
||||
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
||||
};
|
||||
|
||||
//uint32_t inline
|
||||
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
||||
uint32_t type = utf8d[byte];
|
||||
|
||||
*codep = (*state != CYC_UTF8_ACCEPT) ?
|
||||
(byte & 0x3fu) | (*codep << 6) :
|
||||
(0xff >> type) & (byte);
|
||||
|
||||
*state = utf8d[256 + *state*16 + type];
|
||||
return *state;
|
||||
}
|
||||
// END Bjoern Hoehrmann
|
||||
|
||||
/**
|
||||
* @brief
|
||||
* Count the number of code points in a string.
|
||||
* Based on example code from Bjoern Hoehrmann.
|
||||
*/
|
||||
int Cyc_utf8_count_code_points(uint8_t* s) {
|
||||
uint32_t codepoint;
|
||||
uint32_t state = 0;
|
||||
int count;
|
||||
|
||||
for (count = 0; *s; ++s)
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, *s))
|
||||
count += 1;
|
||||
|
||||
if (state != CYC_UTF8_ACCEPT)
|
||||
return -1;
|
||||
return count;
|
||||
}
|
||||
|
||||
int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes) {
|
||||
uint32_t state = 0;
|
||||
*cpts = 0;
|
||||
*bytes = 0;
|
||||
for (; *s; ++s){
|
||||
*bytes += 1;
|
||||
if (!Cyc_utf8_decode(&state, codepoint, *s))
|
||||
*cpts += 1;
|
||||
}
|
||||
|
||||
if (state != CYC_UTF8_ACCEPT)
|
||||
return state;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// TODO: index into X codepoint in a string
|
||||
|
||||
/**
|
||||
* @brief
|
||||
* Use this when validating from a stream, as it may be that the stream stopped
|
||||
* in the middle of a codepoint, hence state passed in as an arg, so it can be
|
||||
* tested in a loop and also after the loop has finished.
|
||||
*
|
||||
* From https://stackoverflow.com/a/22135005/101258
|
||||
*/
|
||||
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) {
|
||||
size_t i;
|
||||
uint32_t type;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
// We don't care about the codepoint, so this is
|
||||
// a simplified version of the decode function.
|
||||
type = utf8d[(uint8_t)str[i]];
|
||||
*state = utf8d[256 + (*state) * 16 + type];
|
||||
|
||||
if (*state == CYC_UTF8_REJECT)
|
||||
break;
|
||||
}
|
||||
|
||||
return *state;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer.
|
||||
*/
|
||||
uint32_t Cyc_utf8_validate(char *str, size_t len) {
|
||||
size_t i;
|
||||
uint32_t state = CYC_UTF8_ACCEPT, type;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
// We don't care about the codepoint, so this is
|
||||
// a simplified version of the decode function.
|
||||
type = utf8d[(uint8_t)str[i]];
|
||||
state = utf8d[256 + (state) * 16 + type];
|
||||
|
||||
if (state == CYC_UTF8_REJECT)
|
||||
break;
|
||||
}
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
//int uint32_num_bytes(uint32_t x) {
|
||||
// // TODO: could compute log(val) / log(256)
|
||||
// if (x < 0x100) return 1;
|
||||
// if (x < 0x10000) return 2;
|
||||
// if (x < 0x1000000) return 3;
|
||||
// return 4;
|
||||
//}
|
||||
|
||||
/**
|
||||
* This function takes one or more 32-bit chars and encodes them
|
||||
* as an array of UTF-8 bytes.
|
||||
* FROM: https://www.cprogramming.com/tutorial/utf8.c
|
||||
*
|
||||
* @param dest Destination byte buffer
|
||||
* @param sz size of dest buffer in bytes
|
||||
* @param src Buffer of source data, in 32-bit characters
|
||||
* @param srcsz number of source characters, or -1 if 0-terminated
|
||||
*
|
||||
* @return Number of characters converted
|
||||
*
|
||||
* dest will only be '\0'-terminated if there is enough space. this is
|
||||
* for consistency; imagine there are 2 bytes of space left, but the next
|
||||
* character requires 3 bytes. in this case we could NUL-terminate, but in
|
||||
* general we can't when there's insufficient space. therefore this function
|
||||
* only NUL-terminates if all the characters fit, and there's space for
|
||||
* the NUL as well.
|
||||
* the destination string will never be bigger than the source string.
|
||||
*/
|
||||
int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz)
|
||||
{
|
||||
u_int32_t ch;
|
||||
int i = 0;
|
||||
char *dest_end = dest + sz;
|
||||
|
||||
while (srcsz<0 ? src[i]!=0 : i < srcsz) {
|
||||
ch = src[i];
|
||||
if (ch < 0x80) {
|
||||
if (dest >= dest_end)
|
||||
return i;
|
||||
*dest++ = (char)ch;
|
||||
}
|
||||
else if (ch < 0x800) {
|
||||
if (dest >= dest_end-1)
|
||||
return i;
|
||||
*dest++ = (ch>>6) | 0xC0;
|
||||
*dest++ = (ch & 0x3F) | 0x80;
|
||||
}
|
||||
else if (ch < 0x10000) {
|
||||
if (dest >= dest_end-2)
|
||||
return i;
|
||||
*dest++ = (ch>>12) | 0xE0;
|
||||
*dest++ = ((ch>>6) & 0x3F) | 0x80;
|
||||
*dest++ = (ch & 0x3F) | 0x80;
|
||||
}
|
||||
else if (ch < 0x110000) {
|
||||
if (dest >= dest_end-3)
|
||||
return i;
|
||||
*dest++ = (ch>>18) | 0xF0;
|
||||
*dest++ = ((ch>>12) & 0x3F) | 0x80;
|
||||
*dest++ = ((ch>>6) & 0x3F) | 0x80;
|
||||
*dest++ = (ch & 0x3F) | 0x80;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
if (dest < dest_end)
|
||||
*dest = '\0';
|
||||
return i;
|
||||
}
|
||||
|
||||
|
||||
////////////// END UTF-8 Section //////////////
|
||||
|
|
|
@ -950,9 +950,22 @@
|
|||
(define-c Cyc-make-string
|
||||
"(void *data, int argc, closure _, object k, object count, object fill)"
|
||||
" object s = NULL;
|
||||
char ch_buf[5];
|
||||
char_type c;
|
||||
int buflen, num_cp, len;
|
||||
Cyc_check_int(data, count);
|
||||
char c = obj_obj2char(fill);
|
||||
int len = obj_obj2int(count);
|
||||
if (!obj_is_char(fill)) {
|
||||
Cyc_rt_raise2(data, \"Expected character buf received\", fill);
|
||||
}
|
||||
c = obj_obj2char(fill);
|
||||
if (!c) {
|
||||
buflen = 1;
|
||||
} else {
|
||||
Cyc_utf8_encode_char(ch_buf, 5, c);
|
||||
buflen = strlen(ch_buf);
|
||||
}
|
||||
num_cp = obj_obj2int(count);
|
||||
len = num_cp * buflen;
|
||||
if (len >= MAX_STACK_OBJ) {
|
||||
int heap_grown;
|
||||
s = gc_alloc(((gc_thread_data *)data)->heap,
|
||||
|
@ -964,6 +977,7 @@
|
|||
((string_type *) s)->hdr.grayed = 0;
|
||||
((string_type *) s)->tag = string_tag;
|
||||
((string_type *) s)->len = len;
|
||||
((string_type *) s)->num_cp = num_cp;
|
||||
((string_type *) s)->str = (((char *)s) + sizeof(string_type));
|
||||
} else {
|
||||
s = alloca(sizeof(string_type));
|
||||
|
@ -971,9 +985,18 @@
|
|||
((string_type *)s)->hdr.grayed = 0;
|
||||
((string_type *)s)->tag = string_tag;
|
||||
((string_type *)s)->len = len;
|
||||
((string_type *)s)->num_cp = num_cp;
|
||||
((string_type *)s)->str = alloca(sizeof(char) * (len + 1));
|
||||
}
|
||||
memset(((string_type *)s)->str, c, len);
|
||||
if (num_cp == 1) { /* Fast path */
|
||||
memset(((string_type *)s)->str, ch_buf[0], len);
|
||||
} else {
|
||||
char *buf = ((string_type *)s)->str;
|
||||
int bi, si, slen = buflen;
|
||||
for (bi = 0, si = 0; bi < len; bi++, si++) {
|
||||
buf[bi] = ch_buf[si % slen];
|
||||
}
|
||||
}
|
||||
((string_type *)s)->str[len] = '\\0';
|
||||
return_closcall1(data, k, s);
|
||||
")
|
||||
|
|
|
@ -521,7 +521,15 @@
|
|||
(string-append "&" cvar-name) ; Code is just the variable name
|
||||
(list ; Allocate integer on the C stack
|
||||
(string-append
|
||||
"make_string(" cvar-name ", " (->cstr exp) ");")))))
|
||||
"make_utf8_string_with_len("
|
||||
cvar-name
|
||||
", "
|
||||
(->cstr exp)
|
||||
", "
|
||||
(number->string (string-byte-length exp))
|
||||
", "
|
||||
(number->string (string-length exp))
|
||||
");")))))
|
||||
;TODO: not good enough, need to store new symbols in a table so they can
|
||||
;be inserted into the C program
|
||||
((symbol? exp)
|
||||
|
@ -536,6 +544,10 @@
|
|||
(define (->cstr str)
|
||||
(string-append "\"" (cstr:escape-chars str) "\""))
|
||||
|
||||
(define-c string-byte-length
|
||||
"(void *data, int argc, closure _, object k, object s)"
|
||||
" return_closcall1(data, k, Cyc_string_byte_length(data, s)); ")
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Primitives
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@
|
|||
for (i = _cyc_argc; i > 0; i--) {
|
||||
object ps = alloca(sizeof(string_type));
|
||||
object pl = alloca(sizeof(pair_type));
|
||||
make_string(s, _cyc_argv[i - 1]);
|
||||
make_utf8_string(data, s, _cyc_argv[i - 1]);
|
||||
memcpy(ps, &s, sizeof(string_type));
|
||||
((list)pl)->hdr.mark = gc_color_red;
|
||||
((list)pl)->hdr.grayed = 0;
|
||||
|
@ -44,7 +44,7 @@
|
|||
if (v == NULL) {
|
||||
return_closcall1(data, k, boolean_f);
|
||||
} else {
|
||||
make_string(str, v);
|
||||
make_utf8_string(data, str, v);
|
||||
return_closcall1(data, k, &str);
|
||||
}
|
||||
")
|
||||
|
|
193
test.c
Normal file
193
test.c
Normal file
|
@ -0,0 +1,193 @@
|
|||
// A temporary test file
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#define CYC_UTF8_ACCEPT 0
|
||||
|
||||
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||
static const uint8_t utf8d[] = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
|
||||
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
|
||||
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
|
||||
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
|
||||
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
|
||||
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
|
||||
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
|
||||
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
|
||||
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
||||
};
|
||||
|
||||
//uint32_t inline
|
||||
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
||||
uint32_t type = utf8d[byte];
|
||||
|
||||
*codep = (*state != CYC_UTF8_ACCEPT) ?
|
||||
(byte & 0x3fu) | (*codep << 6) :
|
||||
(0xff >> type) & (byte);
|
||||
|
||||
*state = utf8d[256 + *state*16 + type];
|
||||
return *state;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple macro to make it more convenient to convert a single char
|
||||
*/
|
||||
#define Cyc_utf8_encode_char(dest, dest_size, char_value) \
|
||||
Cyc_utf8_encode(dest, dest_size, &char_value, 1)
|
||||
|
||||
/**
|
||||
* This function takes one or more 32-bit chars and encodes them
|
||||
* as an array of UTF-8 bytes.
|
||||
* FROM: https://www.cprogramming.com/tutorial/utf8.c
|
||||
*
|
||||
* @param dest Destination byte buffer
|
||||
* @param sz size of dest buffer in bytes
|
||||
* @param src Buffer of source data, in 32-bit characters
|
||||
* @param srcsz number of source characters, or -1 if 0-terminated
|
||||
*
|
||||
* @return Number of characters converted
|
||||
*
|
||||
* dest will only be '\0'-terminated if there is enough space. this is
|
||||
* for consistency; imagine there are 2 bytes of space left, but the next
|
||||
* character requires 3 bytes. in this case we could NUL-terminate, but in
|
||||
* general we can't when there's insufficient space. therefore this function
|
||||
* only NUL-terminates if all the characters fit, and there's space for
|
||||
* the NUL as well.
|
||||
* the destination string will never be bigger than the source string.
|
||||
*/
|
||||
int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz)
|
||||
{
|
||||
u_int32_t ch;
|
||||
int i = 0;
|
||||
char *dest_end = dest + sz;
|
||||
|
||||
while (srcsz<0 ? src[i]!=0 : i < srcsz) {
|
||||
ch = src[i];
|
||||
if (ch < 0x80) {
|
||||
if (dest >= dest_end)
|
||||
return i;
|
||||
*dest++ = (char)ch;
|
||||
}
|
||||
else if (ch < 0x800) {
|
||||
if (dest >= dest_end-1)
|
||||
return i;
|
||||
*dest++ = (ch>>6) | 0xC0;
|
||||
*dest++ = (ch & 0x3F) | 0x80;
|
||||
}
|
||||
else if (ch < 0x10000) {
|
||||
if (dest >= dest_end-2)
|
||||
return i;
|
||||
*dest++ = (ch>>12) | 0xE0;
|
||||
*dest++ = ((ch>>6) & 0x3F) | 0x80;
|
||||
*dest++ = (ch & 0x3F) | 0x80;
|
||||
}
|
||||
else if (ch < 0x110000) {
|
||||
if (dest >= dest_end-3)
|
||||
return i;
|
||||
*dest++ = (ch>>18) | 0xF0;
|
||||
*dest++ = ((ch>>12) & 0x3F) | 0x80;
|
||||
*dest++ = ((ch>>6) & 0x3F) | 0x80;
|
||||
*dest++ = (ch & 0x3F) | 0x80;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
if (dest < dest_end)
|
||||
*dest = '\0';
|
||||
return i;
|
||||
}
|
||||
|
||||
void encode(uint32_t val) {
|
||||
char dest[5];
|
||||
int rv, i;
|
||||
|
||||
rv = Cyc_utf8_encode_char(dest, 5, val);
|
||||
printf("%x %d \n", val, rv);
|
||||
for(i = 0; i < 5; i++) {
|
||||
printf("[%x] ", (uint8_t)dest[i]);
|
||||
}
|
||||
printf("\n");
|
||||
return;
|
||||
}
|
||||
|
||||
void multi_byte_memset(char *buf, int blen, char *src, int slen)
|
||||
{
|
||||
int bi, si;
|
||||
for (bi = 0, si = 0; bi < blen; bi++, si++) {
|
||||
buf[bi] = src[si % slen];
|
||||
}
|
||||
}
|
||||
|
||||
void substring(int s, int e, const char *expected) {
|
||||
uint8_t raw[] = {65, 66, 0xCE, 0xBB, 67};
|
||||
const char *tmp = raw;
|
||||
uint32_t codepoint;
|
||||
uint32_t state = 0;
|
||||
int num_ch, cur_ch_bytes = 0, start_i = 0, end_i = 0;
|
||||
for (num_ch = 0; *tmp; ++tmp){
|
||||
cur_ch_bytes++;
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, (uint8_t)*tmp)){
|
||||
end_i += cur_ch_bytes;
|
||||
num_ch += 1;
|
||||
cur_ch_bytes = 0;
|
||||
|
||||
if (num_ch == s) {
|
||||
start_i = end_i;
|
||||
}
|
||||
if (num_ch == e) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
raw[end_i] = '\0';
|
||||
printf("expected=%s, raw=%s, s=%d, e=%d, start_i=%d, end_i=%d\n", expected, raw + start_i, s, e, start_i, end_i);
|
||||
}
|
||||
|
||||
void main(){
|
||||
char c[128];
|
||||
uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE
|
||||
uint8_t cv2[] = {0xCE, 0xBB}; // Lambda (0x03bb) is encoded with leading 0xCE
|
||||
//uint8_t cv2[] = {0xEC, 0xBA, 0xBB}; // Lambda (0x03bb) is encoded with leading 0xCE
|
||||
// uint8_t cv[] = {0xCE, 0xBB, 0x00}; // Lambda (0x03bb) is encoded with leading 0xCE
|
||||
char *cptr;
|
||||
uint32_t state = CYC_UTF8_ACCEPT, codepoint, val = 0x32363435;
|
||||
uint8_t *ptr = (uint8_t *)&val;
|
||||
int i, j = 0;
|
||||
// //memset(c, 0x34, 128);
|
||||
// for (i = 0; i < 127; i++) {
|
||||
// c[i] = ptr[j++];
|
||||
// if (j == 4) j = 0;
|
||||
// }
|
||||
// c[127] = '\0';
|
||||
// printf("%s\n", c);
|
||||
multi_byte_memset(c, 126, cv2, 2);
|
||||
c[127] = '\0';
|
||||
printf("TEST: %s\n", c);
|
||||
|
||||
ptr = cv;
|
||||
for (i = 0; i < 3; i++) {
|
||||
Cyc_utf8_decode(&state, &codepoint, ptr[i]);
|
||||
}
|
||||
printf("state = %d, cp = %d\n", state, codepoint);
|
||||
|
||||
encode(0x3bb);
|
||||
encode(65);
|
||||
encode(0xcebb);
|
||||
|
||||
printf("%06X\n", 0x0fff);
|
||||
substring(0, 1, "A ");
|
||||
substring(0, 2, "AB ");
|
||||
substring(1, 3, "Bx ");
|
||||
substring(1, 4, "BxC ");
|
||||
substring(2, 2, " ");
|
||||
substring(2, 3, "x ");
|
||||
substring(2, 4, "xC ");
|
||||
return;
|
||||
}
|
Loading…
Add table
Reference in a new issue