mirror of
https://github.com/justinethier/cyclone.git
synced 2025-07-15 08:47:35 +02:00
Clean up and add UTF-8 definitions to header file
This commit is contained in:
parent
ae3aa1941d
commit
71c7ed3e7f
2 changed files with 26 additions and 16 deletions
|
@ -707,4 +707,18 @@ void add_global(object * glo);
|
|||
void Cyc_set_globals_changed(gc_thread_data *thd);
|
||||
/**@}*/
|
||||
|
||||
/**
|
||||
* \defgroup prim_utf8 UTF-8
|
||||
*
|
||||
* @brief Unicode processing using UTF-8
|
||||
*/
|
||||
/**@{*/
|
||||
#define CYC_UTF8_ACCEPT 0
|
||||
#define CYC_UTF8_REJECT 1
|
||||
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
|
||||
int Cyc_utf8_count_code_points(uint8_t* s, size_t* count);
|
||||
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len);
|
||||
uint32_t Cyc_utf8_validate(char *str, size_t len);
|
||||
/**@}*/
|
||||
|
||||
#endif /* CYCLONE_RUNTIME_H */
|
||||
|
|
28
runtime.c
28
runtime.c
|
@ -6368,10 +6368,6 @@ void Cyc_io_read_token(void *data, object cont, object port)
|
|||
|
||||
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||
|
||||
#define UTF8_ACCEPT 0
|
||||
#define UTF8_REJECT 1
|
||||
|
||||
static const uint8_t utf8d[] = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
||||
|
@ -6389,11 +6385,11 @@ static const uint8_t utf8d[] = {
|
|||
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
|
||||
};
|
||||
|
||||
uint32_t inline
|
||||
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
||||
//uint32_t inline
|
||||
uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
||||
uint32_t type = utf8d[byte];
|
||||
|
||||
*codep = (*state != UTF8_ACCEPT) ?
|
||||
*codep = (*state != CYC_UTF8_ACCEPT) ?
|
||||
(byte & 0x3fu) | (*codep << 6) :
|
||||
(0xff >> type) & (byte);
|
||||
|
||||
|
@ -6407,15 +6403,15 @@ decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
|||
* Count the number of code points in a string.
|
||||
* Based on example code from Bjoern Hoehrmann.
|
||||
*/
|
||||
int countCodePoints(uint8_t* s, size_t* count) {
|
||||
int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) {
|
||||
uint32_t codepoint;
|
||||
uint32_t state = 0;
|
||||
|
||||
for (*count = 0; *s; ++s)
|
||||
if (!decode(&state, &codepoint, *s))
|
||||
if (!Cyc_utf8_decode(&state, &codepoint, *s))
|
||||
*count += 1;
|
||||
|
||||
return state != UTF8_ACCEPT;
|
||||
return state != CYC_UTF8_ACCEPT;
|
||||
}
|
||||
|
||||
// TODO: look up the code point at a given index in a string
|
||||
|
@ -6428,7 +6424,7 @@ int countCodePoints(uint8_t* s, size_t* count) {
|
|||
*
|
||||
* From https://stackoverflow.com/a/22135005/101258
|
||||
*/
|
||||
uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
|
||||
uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) {
|
||||
size_t i;
|
||||
uint32_t type;
|
||||
|
||||
|
@ -6438,7 +6434,7 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
|
|||
type = utf8d[(uint8_t)str[i]];
|
||||
*state = utf8d[256 + (*state) * 16 + type];
|
||||
|
||||
if (*state == UTF8_REJECT)
|
||||
if (*state == CYC_UTF8_REJECT)
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -6446,11 +6442,11 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
|
|||
}
|
||||
|
||||
/**
|
||||
* @brief Simplified version of above, always called with a complete string buffer
|
||||
* @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer.
|
||||
*/
|
||||
uint32_t valid_utf8(char *str, size_t len) {
|
||||
uint32_t Cyc_utf8_validate(char *str, size_t len) {
|
||||
size_t i;
|
||||
uint32_t state = UTF8_ACCEPT, type;
|
||||
uint32_t state = CYC_UTF8_ACCEPT, type;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
// We don't care about the codepoint, so this is
|
||||
|
@ -6458,7 +6454,7 @@ uint32_t valid_utf8(char *str, size_t len) {
|
|||
type = utf8d[(uint8_t)str[i]];
|
||||
state = utf8d[256 + (state) * 16 + type];
|
||||
|
||||
if (state == UTF8_REJECT)
|
||||
if (state == CYC_UTF8_REJECT)
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue