Added UTF8 encoder, final version of string->utf8

This commit is contained in:
Justin Ethier 2017-10-25 18:35:11 -04:00
parent ccfde220ff
commit aa0b0a7567
2 changed files with 98 additions and 11 deletions

View file

@ -715,6 +715,13 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
/**@{*/ /**@{*/
#define CYC_UTF8_ACCEPT 0 #define CYC_UTF8_ACCEPT 0
#define CYC_UTF8_REJECT 1 #define CYC_UTF8_REJECT 1
/**
* Simple macro to make it more convenient to convert a single char
*/
#define Cyc_utf8_encode_char(dest, dest_size, char_value) \
Cyc_utf8_encode(dest, dest_size, &char_value, 1)
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
int Cyc_utf8_count_code_points(uint8_t* s); int Cyc_utf8_count_code_points(uint8_t* s);
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len);

View file

@ -2595,23 +2595,41 @@ object Cyc_string2utf8(void *data, object cont, object str, object start,
e = unbox_number(end); e = unbox_number(end);
len = e - s; len = e - s;
if (s < 0 || (s >= string_len(str) && len > 0)) { if (s < 0 || (s >= string_num_cp(str) && len > 0)) {
Cyc_rt_raise2(data, "string->utf8 - invalid start", start); Cyc_rt_raise2(data, "string->utf8 - invalid start", start);
} }
if (e < 0 || e < s || e > string_len(str)) { if (e < 0 || e < s || e > string_num_cp(str)) {
Cyc_rt_raise2(data, "string->utf8 - invalid end", end); Cyc_rt_raise2(data, "string->utf8 - invalid end", end);
} }
// TODO: we have code point positions s, e, and length. We need to take those // Fast path
// and walk the string to figure out the starting and ending BYTE positions if (string_num_cp(str) == string_len(str)) { // TODO: disable for testing purposes
// TODO: fast path, can keep below if string_num_cp(str) == string_len(str)
result.len = len; result.len = len;
result.data = alloca(sizeof(char) * len); result.data = alloca(sizeof(char) * len);
memcpy(&result.data[0], &(string_str(str))[s], len); memcpy(&result.data[0], &(string_str(str))[s], len);
_return_closcall1(data, cont, &result); _return_closcall1(data, cont, &result);
} else {
int i, start_i = 0, end_i = 0;
const char *tmp = string_str(str);
char_type codepoint;
uint32_t state = 0;
for (i = 0; *tmp; ++tmp) {
if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
if (i == s) {
start_i = i;
} else if (i == e) {
break;
}
}
i++;
}
end_i = i;
result.len = end_i - start_i;
result.data = alloca(sizeof(char) * result.len);
memcpy(&result.data[0], &(string_str(str))[start_i], result.len);
_return_closcall1(data, cont, &result);
}
} }
object Cyc_bytevector_u8_ref(void *data, object bv, object k) object Cyc_bytevector_u8_ref(void *data, object bv, object k)
@ -6563,4 +6581,66 @@ int uint32_num_bytes(uint32_t x) {
return 4; return 4;
} }
/**
* This function takes one or more 32-bit chars and encodes them
* as an array of UTF-8 bytes.
* FROM: https://www.cprogramming.com/tutorial/utf8.c
*
* @param dest Destination byte buffer
* @param sz size of dest buffer in bytes
* @param src Buffer of source data, in 32-bit characters
* @param srcsz number of source characters, or -1 if 0-terminated
*
* @return Number of characters converted
*
* dest will only be '\0'-terminated if there is enough space. this is
* for consistency; imagine there are 2 bytes of space left, but the next
* character requires 3 bytes. in this case we could NUL-terminate, but in
* general we can't when there's insufficient space. therefore this function
* only NUL-terminates if all the characters fit, and there's space for
* the NUL as well.
* the destination string will never be bigger than the source string.
*/
int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz)
{
u_int32_t ch;
int i = 0;
char *dest_end = dest + sz;
while (srcsz<0 ? src[i]!=0 : i < srcsz) {
ch = src[i];
if (ch < 0x80) {
if (dest >= dest_end)
return i;
*dest++ = (char)ch;
}
else if (ch < 0x800) {
if (dest >= dest_end-1)
return i;
*dest++ = (ch>>6) | 0xC0;
*dest++ = (ch & 0x3F) | 0x80;
}
else if (ch < 0x10000) {
if (dest >= dest_end-2)
return i;
*dest++ = (ch>>12) | 0xE0;
*dest++ = ((ch>>6) & 0x3F) | 0x80;
*dest++ = (ch & 0x3F) | 0x80;
}
else if (ch < 0x110000) {
if (dest >= dest_end-3)
return i;
*dest++ = (ch>>18) | 0xF0;
*dest++ = ((ch>>12) & 0x3F) | 0x80;
*dest++ = ((ch>>6) & 0x3F) | 0x80;
*dest++ = (ch & 0x3F) | 0x80;
}
i++;
}
if (dest < dest_end)
*dest = '\0';
return i;
}
////////////// END UTF-8 Section ////////////// ////////////// END UTF-8 Section //////////////