diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h index 6a8027dd..dbbfb8d9 100644 --- a/include/cyclone/runtime.h +++ b/include/cyclone/runtime.h @@ -707,4 +707,18 @@ void add_global(object * glo); void Cyc_set_globals_changed(gc_thread_data *thd); /**@}*/ +/** + * \defgroup prim_utf8 UTF-8 + * + * @brief Unicode processing using UTF-8 + */ +/**@{*/ +#define CYC_UTF8_ACCEPT 0 +#define CYC_UTF8_REJECT 1 +uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte); +int Cyc_utf8_count_code_points(uint8_t* s, size_t* count); +uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); +uint32_t Cyc_utf8_validate(char *str, size_t len); +/**@}*/ + #endif /* CYCLONE_RUNTIME_H */ diff --git a/runtime.c b/runtime.c index dd799d88..a0662e81 100644 --- a/runtime.c +++ b/runtime.c @@ -6368,10 +6368,6 @@ void Cyc_io_read_token(void *data, object cont, object port) // Copyright (c) 2008-2009 Bjoern Hoehrmann // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 1 - static const uint8_t utf8d[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f @@ -6389,11 +6385,11 @@ static const uint8_t utf8d[] = { 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 }; -uint32_t inline -decode(uint32_t* state, uint32_t* codep, uint32_t byte) { +//uint32_t inline +uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) { uint32_t type = utf8d[byte]; - *codep = (*state != UTF8_ACCEPT) ? + *codep = (*state != CYC_UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) : (0xff >> type) & (byte); @@ -6407,15 +6403,15 @@ decode(uint32_t* state, uint32_t* codep, uint32_t byte) { * Count the number of code points in a string. * Based on example code from Bjoern Hoehrmann. */ -int countCodePoints(uint8_t* s, size_t* count) { +int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) { uint32_t codepoint; uint32_t state = 0; for (*count = 0; *s; ++s) - if (!decode(&state, &codepoint, *s)) + if (!Cyc_utf8_decode(&state, &codepoint, *s)) *count += 1; - return state != UTF8_ACCEPT; + return state != CYC_UTF8_ACCEPT; } // TODO: index into X codepoint in a string @@ -6428,7 +6424,7 @@ int countCodePoints(uint8_t* s, size_t* count) { * * From https://stackoverflow.com/a/22135005/101258 */ -uint32_t validate_utf8(uint32_t *state, char *str, size_t len) { +uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) { size_t i; uint32_t type; @@ -6438,7 +6434,7 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) { type = utf8d[(uint8_t)str[i]]; *state = utf8d[256 + (*state) * 16 + type]; - if (*state == UTF8_REJECT) + if (*state == CYC_UTF8_REJECT) break; } @@ -6446,11 +6442,11 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) { } /** - * @brief Simplified version of above, always called with a complete string buffer + * @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer. */ -uint32_t valid_utf8(char *str, size_t len) { +uint32_t Cyc_utf8_validate(char *str, size_t len) { size_t i; - uint32_t state = UTF8_ACCEPT, type; + uint32_t state = CYC_UTF8_ACCEPT, type; for (i = 0; i < len; i++) { // We don't care about the codepoint, so this is @@ -6458,7 +6454,7 @@ uint32_t valid_utf8(char *str, size_t len) { type = utf8d[(uint8_t)str[i]]; state = utf8d[256 + (state) * 16 + type]; - if (state == UTF8_REJECT) + if (state == CYC_UTF8_REJECT) break; }