From aa0b0a75678b2b6134c02ba24c2e10d2ee12fd7d Mon Sep 17 00:00:00 2001 From: Justin Ethier Date: Wed, 25 Oct 2017 18:35:11 -0400 Subject: [PATCH] Added UTF8 encoder, final version of string->utf8 --- include/cyclone/runtime.h | 7 +++ runtime.c | 102 ++++++++++++++++++++++++++++++++++---- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 119338e9..6720002b 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -715,6 +715,13 @@ void Cyc_set_globals_changed(gc_thread_data *thd); /**@{*/ #define CYC_UTF8_ACCEPT 0 #define CYC_UTF8_REJECT 1 + +/** + * Simple macro to make it more convenient to convert a single char + */ +#define Cyc_utf8_encode_char(dest, dest_size, char_value) \ + Cyc_utf8_encode(dest, dest_size, &char_value, 1) + uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); int Cyc_utf8_count_code_points(uint8_t* s); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); diff --git a/runtime.c b/runtime.c index 79badbf9..d0e804e9 100644 --- a/runtime.c +++ b/runtime.c @@ -2595,23 +2595,41 @@ object Cyc_string2utf8(void *data, object cont, object str, object start, e = unbox_number(end); len = e - s; - if (s < 0 || (s >= string_len(str) && len > 0)) { + if (s < 0 || (s >= string_num_cp(str) && len > 0)) { Cyc_rt_raise2(data, "string->utf8 - invalid start", start); } - if (e < 0 || e < s || e > string_len(str)) { + if (e < 0 || e < s || e > string_num_cp(str)) { Cyc_rt_raise2(data, "string->utf8 - invalid end", end); } - // TODO: we have code point positions s, e, and length. We need to take those - // and walk the string to figure out the starting and ending BYTE positions - - // TODO: fast path, can keep below if string_num_cp(str) == string_len(str) - - result.len = len; - result.data = alloca(sizeof(char) * len); - memcpy(&result.data[0], &(string_str(str))[s], len); - _return_closcall1(data, cont, &result); + // Fast path + if (string_num_cp(str) == string_len(str)) { // TODO: disable for testing purposes + result.len = len; + result.data = alloca(sizeof(char) * len); + memcpy(&result.data[0], &(string_str(str))[s], len); + _return_closcall1(data, cont, &result); + } else { + int i, start_i = 0, end_i = 0; + const char *tmp = string_str(str); + char_type codepoint; + uint32_t state = 0; + for (i = 0; *tmp; ++tmp) { + if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){ + if (i == s) { + start_i = i; + } else if (i == e) { + break; + } + } + i++; + } + end_i = i; + result.len = end_i - start_i; + result.data = alloca(sizeof(char) * result.len); + memcpy(&result.data[0], &(string_str(str))[start_i], result.len); + _return_closcall1(data, cont, &result); + } } object Cyc_bytevector_u8_ref(void *data, object bv, object k) @@ -6563,4 +6581,66 @@ int uint32_num_bytes(uint32_t x) { return 4; } +/** + * This function takes one or more 32-bit chars and encodes them + * as an array of UTF-8 bytes. + * FROM: https://www.cprogramming.com/tutorial/utf8.c + * + * @param dest Destination byte buffer + * @param sz size of dest buffer in bytes + * @param src Buffer of source data, in 32-bit characters + * @param srcsz number of source characters, or -1 if 0-terminated + * + * @return Number of characters converted + * + * dest will only be '\0'-terminated if there is enough space. this is + * for consistency; imagine there are 2 bytes of space left, but the next + * character requires 3 bytes. in this case we could NUL-terminate, but in + * general we can't when there's insufficient space. therefore this function + * only NUL-terminates if all the characters fit, and there's space for + * the NUL as well. + * the destination string will never be bigger than the source string. + */ +int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz) +{ + u_int32_t ch; + int i = 0; + char *dest_end = dest + sz; + + while (srcsz<0 ? src[i]!=0 : i < srcsz) { + ch = src[i]; + if (ch < 0x80) { + if (dest >= dest_end) + return i; + *dest++ = (char)ch; + } + else if (ch < 0x800) { + if (dest >= dest_end-1) + return i; + *dest++ = (ch>>6) | 0xC0; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x10000) { + if (dest >= dest_end-2) + return i; + *dest++ = (ch>>12) | 0xE0; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x110000) { + if (dest >= dest_end-3) + return i; + *dest++ = (ch>>18) | 0xF0; + *dest++ = ((ch>>12) & 0x3F) | 0x80; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + i++; + } + if (dest < dest_end) + *dest = '\0'; + return i; +} + + ////////////// END UTF-8 Section //////////////