Clean up and add UTF-8 definitions to header file

This commit is contained in:
Justin Ethier 2017-10-20 12:54:13 +00:00
parent ae3aa1941d
commit 71c7ed3e7f
2 changed files with 26 additions and 16 deletions

View file

@ -707,4 +707,18 @@ void add_global(object * glo);
void Cyc_set_globals_changed(gc_thread_data *thd); void Cyc_set_globals_changed(gc_thread_data *thd);
/**@}*/ /**@}*/
/**
* \defgroup prim_utf8 UTF-8
*
* @brief Unicode processing using UTF-8
*/
/**@{*/
#define CYC_UTF8_ACCEPT 0
#define CYC_UTF8_REJECT 1
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
int Cyc_utf8_count_code_points(uint8_t* s, size_t* count);
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len);
uint32_t Cyc_utf8_validate(char *str, size_t len);
/**@}*/
#endif /* CYCLONE_RUNTIME_H */ #endif /* CYCLONE_RUNTIME_H */

View file

@ -6368,10 +6368,6 @@ void Cyc_io_read_token(void *data, object cont, object port)
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
static const uint8_t utf8d[] = { static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
@ -6389,11 +6385,11 @@ static const uint8_t utf8d[] = {
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
}; };
uint32_t inline //uint32_t inline
decode(uint32_t* state, uint32_t* codep, uint32_t byte) { uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
uint32_t type = utf8d[byte]; uint32_t type = utf8d[byte];
*codep = (*state != UTF8_ACCEPT) ? *codep = (*state != CYC_UTF8_ACCEPT) ?
(byte & 0x3fu) | (*codep << 6) : (byte & 0x3fu) | (*codep << 6) :
(0xff >> type) & (byte); (0xff >> type) & (byte);
@ -6407,15 +6403,15 @@ decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
* Count the number of code points in a string. * Count the number of code points in a string.
* Based on example code from Bjoern Hoehrmann. * Based on example code from Bjoern Hoehrmann.
*/ */
int countCodePoints(uint8_t* s, size_t* count) { int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) {
uint32_t codepoint; uint32_t codepoint;
uint32_t state = 0; uint32_t state = 0;
for (*count = 0; *s; ++s) for (*count = 0; *s; ++s)
if (!decode(&state, &codepoint, *s)) if (!Cyc_utf8_decode(&state, &codepoint, *s))
*count += 1; *count += 1;
return state != UTF8_ACCEPT; return state != CYC_UTF8_ACCEPT;
} }
// TODO: index into X codepoint in a string // TODO: index into X codepoint in a string
@ -6428,7 +6424,7 @@ int countCodePoints(uint8_t* s, size_t* count) {
* *
* From https://stackoverflow.com/a/22135005/101258 * From https://stackoverflow.com/a/22135005/101258
*/ */
uint32_t validate_utf8(uint32_t *state, char *str, size_t len) { uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) {
size_t i; size_t i;
uint32_t type; uint32_t type;
@ -6438,7 +6434,7 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
type = utf8d[(uint8_t)str[i]]; type = utf8d[(uint8_t)str[i]];
*state = utf8d[256 + (*state) * 16 + type]; *state = utf8d[256 + (*state) * 16 + type];
if (*state == UTF8_REJECT) if (*state == CYC_UTF8_REJECT)
break; break;
} }
@ -6446,11 +6442,11 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
} }
/** /**
* @brief Simplified version of above, always called with a complete string buffer * @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer.
*/ */
uint32_t valid_utf8(char *str, size_t len) { uint32_t Cyc_utf8_validate(char *str, size_t len) {
size_t i; size_t i;
uint32_t state = UTF8_ACCEPT, type; uint32_t state = CYC_UTF8_ACCEPT, type;
for (i = 0; i < len; i++) { for (i = 0; i < len; i++) {
// We don't care about the codepoint, so this is // We don't care about the codepoint, so this is
@ -6458,7 +6454,7 @@ uint32_t valid_utf8(char *str, size_t len) {
type = utf8d[(uint8_t)str[i]]; type = utf8d[(uint8_t)str[i]];
state = utf8d[256 + (state) * 16 + type]; state = utf8d[256 + (state) * 16 + type];
if (state == UTF8_REJECT) if (state == CYC_UTF8_REJECT)
break; break;
} }