mirror of
https://github.com/justinethier/cyclone.git
synced 2025-07-09 14:07:34 +02:00
Added UTF8 encoder, final version of string->utf8
This commit is contained in:
parent
ccfde220ff
commit
aa0b0a7567
2 changed files with 98 additions and 11 deletions
|
@ -715,6 +715,13 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
|
|||
/**@{*/
|
||||
#define CYC_UTF8_ACCEPT 0
|
||||
#define CYC_UTF8_REJECT 1
|
||||
|
||||
/**
 * Convenience macro for encoding a single character as UTF-8.
 *
 * Expands to a call to Cyc_utf8_encode() with a source length of 1.
 *
 * @param dest        Destination byte buffer
 * @param dest_size   Size of the destination buffer
 * @param char_value  Character to encode; must be an lvalue because the
 *                    macro takes its address
 */
#define Cyc_utf8_encode_char(dest, dest_size, char_value) \
  Cyc_utf8_encode((dest), (dest_size), &(char_value), 1)
|
||||
|
||||
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
|
||||
int Cyc_utf8_count_code_points(uint8_t* s);
|
||||
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len);
|
||||
|
|
102
runtime.c
102
runtime.c
|
@ -2595,23 +2595,41 @@ object Cyc_string2utf8(void *data, object cont, object str, object start,
|
|||
e = unbox_number(end);
|
||||
len = e - s;
|
||||
|
||||
if (s < 0 || (s >= string_len(str) && len > 0)) {
|
||||
if (s < 0 || (s >= string_num_cp(str) && len > 0)) {
|
||||
Cyc_rt_raise2(data, "string->utf8 - invalid start", start);
|
||||
}
|
||||
|
||||
if (e < 0 || e < s || e > string_len(str)) {
|
||||
if (e < 0 || e < s || e > string_num_cp(str)) {
|
||||
Cyc_rt_raise2(data, "string->utf8 - invalid end", end);
|
||||
}
|
||||
|
||||
// TODO: we have code point positions s, e, and length. We need to take those
|
||||
// and walk the string to figure out the starting and ending BYTE positions
|
||||
|
||||
// TODO: fast path, can keep below if string_num_cp(str) == string_len(str)
|
||||
|
||||
result.len = len;
|
||||
result.data = alloca(sizeof(char) * len);
|
||||
memcpy(&result.data[0], &(string_str(str))[s], len);
|
||||
_return_closcall1(data, cont, &result);
|
||||
// Fast path
|
||||
if (string_num_cp(str) == string_len(str)) { // TODO: disable for testing purposes
|
||||
result.len = len;
|
||||
result.data = alloca(sizeof(char) * len);
|
||||
memcpy(&result.data[0], &(string_str(str))[s], len);
|
||||
_return_closcall1(data, cont, &result);
|
||||
} else {
|
||||
int i, start_i = 0, end_i = 0;
|
||||
const char *tmp = string_str(str);
|
||||
char_type codepoint;
|
||||
uint32_t state = 0;
|
||||
for (i = 0; *tmp; ++tmp) {
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, *tmp)){
|
||||
if (i == s) {
|
||||
start_i = i;
|
||||
} else if (i == e) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
end_i = i;
|
||||
result.len = end_i - start_i;
|
||||
result.data = alloca(sizeof(char) * result.len);
|
||||
memcpy(&result.data[0], &(string_str(str))[start_i], result.len);
|
||||
_return_closcall1(data, cont, &result);
|
||||
}
|
||||
}
|
||||
|
||||
object Cyc_bytevector_u8_ref(void *data, object bv, object k)
|
||||
|
@ -6563,4 +6581,66 @@ int uint32_num_bytes(uint32_t x) {
|
|||
return 4;
|
||||
}
|
||||
|
||||
/**
 * Encode one or more 32-bit characters as an array of UTF-8 bytes.
 * FROM: https://www.cprogramming.com/tutorial/utf8.c
 *
 * @param dest  Destination byte buffer
 * @param sz    Size of dest buffer in bytes
 * @param src   Buffer of source data, in 32-bit characters
 * @param srcsz Number of source characters, or negative if src is
 *              0-terminated
 *
 * @return Number of source characters consumed. Encoding stops early, and
 *         the count so far is returned, as soon as the next character does
 *         not fit in the space remaining in dest.
 *
 * dest is only '\0'-terminated when every character fit and there is still
 * at least one byte left over for the terminator.
 *
 * Values of 0x110000 and above are outside the Unicode range; they produce
 * no output bytes but are still counted in the return value.
 */
int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz)
{
  uint32_t ch;
  int i = 0;
  char *dest_end = dest + sz;

  while (srcsz < 0 ? src[i] != 0 : i < srcsz) {
    ch = src[i];
    /* Space checks compare the remaining byte count rather than computing
     * dest_end - k, which could form a pointer before the buffer start. */
    if (ch < 0x80) {
      /* 1 byte: 0xxxxxxx */
      if (dest_end - dest < 1) {
        return i;
      }
      *dest++ = (char)ch;
    } else if (ch < 0x800) {
      /* 2 bytes: 110xxxxx 10xxxxxx */
      if (dest_end - dest < 2) {
        return i;
      }
      *dest++ = (char)((ch >> 6) | 0xC0);
      *dest++ = (char)((ch & 0x3F) | 0x80);
    } else if (ch < 0x10000) {
      /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
      if (dest_end - dest < 3) {
        return i;
      }
      *dest++ = (char)((ch >> 12) | 0xE0);
      *dest++ = (char)(((ch >> 6) & 0x3F) | 0x80);
      *dest++ = (char)((ch & 0x3F) | 0x80);
    } else if (ch < 0x110000) {
      /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      if (dest_end - dest < 4) {
        return i;
      }
      *dest++ = (char)((ch >> 18) | 0xF0);
      *dest++ = (char)(((ch >> 12) & 0x3F) | 0x80);
      *dest++ = (char)(((ch >> 6) & 0x3F) | 0x80);
      *dest++ = (char)((ch & 0x3F) | 0x80);
    }
    /* ch >= 0x110000 falls through: nothing is written, but it is counted. */
    i++;
  }
  if (dest < dest_end) {
    *dest = '\0';
  }
  return i;
}
|
||||
|
||||
|
||||
////////////// END UTF-8 Section //////////////
|
||||
|
|
Loading…
Add table
Reference in a new issue