mirror of
https://github.com/justinethier/cyclone.git
synced 2025-07-15 16:57:35 +02:00
Clean up and add UTF-8 definitions to header file
This commit is contained in:
parent
ae3aa1941d
commit
71c7ed3e7f
2 changed files with 26 additions and 16 deletions
|
@ -707,4 +707,18 @@ void add_global(object * glo);
|
||||||
void Cyc_set_globals_changed(gc_thread_data *thd);
|
void Cyc_set_globals_changed(gc_thread_data *thd);
|
||||||
/**@}*/
|
/**@}*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* \defgroup prim_utf8 UTF-8
|
||||||
|
*
|
||||||
|
* @brief Unicode processing using UTF-8
|
||||||
|
*/
|
||||||
|
/**@{*/
|
||||||
|
#define CYC_UTF8_ACCEPT 0
|
||||||
|
#define CYC_UTF8_REJECT 1
|
||||||
|
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
|
||||||
|
int Cyc_utf8_count_code_points(uint8_t* s, size_t* count);
|
||||||
|
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len);
|
||||||
|
uint32_t Cyc_utf8_validate(char *str, size_t len);
|
||||||
|
/**@}*/
|
||||||
|
|
||||||
#endif /* CYCLONE_RUNTIME_H */
|
#endif /* CYCLONE_RUNTIME_H */
|
||||||
|
|
28
runtime.c
28
runtime.c
|
@ -6368,10 +6368,6 @@ void Cyc_io_read_token(void *data, object cont, object port)
|
||||||
|
|
||||||
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||||
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||||
|
|
||||||
#define UTF8_ACCEPT 0
|
|
||||||
#define UTF8_REJECT 1
|
|
||||||
|
|
||||||
static const uint8_t utf8d[] = {
|
static const uint8_t utf8d[] = {
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
||||||
|
@ -6389,11 +6385,11 @@ static const uint8_t utf8d[] = {
|
||||||
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
||||||
};
|
};
|
||||||
|
|
||||||
uint32_t inline
|
//uint32_t inline
|
||||||
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
||||||
uint32_t type = utf8d[byte];
|
uint32_t type = utf8d[byte];
|
||||||
|
|
||||||
*codep = (*state != UTF8_ACCEPT) ?
|
*codep = (*state != CYC_UTF8_ACCEPT) ?
|
||||||
(byte & 0x3fu) | (*codep << 6) :
|
(byte & 0x3fu) | (*codep << 6) :
|
||||||
(0xff >> type) & (byte);
|
(0xff >> type) & (byte);
|
||||||
|
|
||||||
|
@ -6407,15 +6403,15 @@ decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
||||||
* Count the number of code points in a string.
|
* Count the number of code points in a string.
|
||||||
* Based on example code from Bjoern Hoehrmann.
|
* Based on example code from Bjoern Hoehrmann.
|
||||||
*/
|
*/
|
||||||
int countCodePoints(uint8_t* s, size_t* count) {
|
int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) {
|
||||||
uint32_t codepoint;
|
uint32_t codepoint;
|
||||||
uint32_t state = 0;
|
uint32_t state = 0;
|
||||||
|
|
||||||
for (*count = 0; *s; ++s)
|
for (*count = 0; *s; ++s)
|
||||||
if (!decode(&state, &codepoint, *s))
|
if (!Cyc_utf8_decode(&state, &codepoint, *s))
|
||||||
*count += 1;
|
*count += 1;
|
||||||
|
|
||||||
return state != UTF8_ACCEPT;
|
return state != CYC_UTF8_ACCEPT;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: index into X codepoint in a string
|
// TODO: index into X codepoint in a string
|
||||||
|
@ -6428,7 +6424,7 @@ int countCodePoints(uint8_t* s, size_t* count) {
|
||||||
*
|
*
|
||||||
* From https://stackoverflow.com/a/22135005/101258
|
* From https://stackoverflow.com/a/22135005/101258
|
||||||
*/
|
*/
|
||||||
uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
|
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) {
|
||||||
size_t i;
|
size_t i;
|
||||||
uint32_t type;
|
uint32_t type;
|
||||||
|
|
||||||
|
@ -6438,7 +6434,7 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
|
||||||
type = utf8d[(uint8_t)str[i]];
|
type = utf8d[(uint8_t)str[i]];
|
||||||
*state = utf8d[256 + (*state) * 16 + type];
|
*state = utf8d[256 + (*state) * 16 + type];
|
||||||
|
|
||||||
if (*state == UTF8_REJECT)
|
if (*state == CYC_UTF8_REJECT)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6446,11 +6442,11 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Simplified version of above, always called with a complete string buffer
|
* @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer.
|
||||||
*/
|
*/
|
||||||
uint32_t valid_utf8(char *str, size_t len) {
|
uint32_t Cyc_utf8_validate(char *str, size_t len) {
|
||||||
size_t i;
|
size_t i;
|
||||||
uint32_t state = UTF8_ACCEPT, type;
|
uint32_t state = CYC_UTF8_ACCEPT, type;
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
for (i = 0; i < len; i++) {
|
||||||
// We don't care about the codepoint, so this is
|
// We don't care about the codepoint, so this is
|
||||||
|
@ -6458,7 +6454,7 @@ uint32_t valid_utf8(char *str, size_t len) {
|
||||||
type = utf8d[(uint8_t)str[i]];
|
type = utf8d[(uint8_t)str[i]];
|
||||||
state = utf8d[256 + (state) * 16 + type];
|
state = utf8d[256 + (state) * 16 + type];
|
||||||
|
|
||||||
if (state == UTF8_REJECT)
|
if (state == CYC_UTF8_REJECT)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue