// Reconstructed from a patch hunk against runtime.c
// (@@ -6364,3 +6364,105 @@, appended after Cyc_io_read_token).

////////////// UTF-8 Section //////////////

// Copyright (c) 2008-2009 Bjoern Hoehrmann
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

// DFA states with special meaning. Any other state value means the decoder
// is in the middle of a multi-byte sequence.
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

// Lookup table for the decoder:
//  - entries [0, 256) map a byte value to its character class ("type");
//  - entries [256, 400) are the transition table, 16 entries per state,
//    indexed as 256 + state * 16 + type.
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/**
 * @brief Feed one byte to the UTF-8 decoder.
 *
 * @param state  In/out DFA state; start a new string with UTF8_ACCEPT.
 * @param codep  In/out code point accumulator. Holds a complete code point
 *               only when the returned state is UTF8_ACCEPT.
 * @param byte   The next input byte; used as an index into the first 256
 *               table entries, so it must be in the range 0..255.
 * @return The new DFA state (also stored in *state).
 *
 * Deliberately NOT declared plain `inline`: in C99/C11 an `inline` definition
 * without `static` or `extern` emits no external symbol, so any call the
 * compiler chooses not to inline fails to link.
 */
uint32_t
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  uint32_t type = utf8d[byte];

  // A lead byte contributes its payload bits (mask depends on its class);
  // a continuation byte appends its low six bits to the accumulator.
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

  *state = utf8d[256 + *state*16 + type];
  return *state;
}
// END Bjoern Hoehrmann

/**
 * @brief
 * Count the number of code points in a NUL-terminated string.
 * Based on example code from Bjoern Hoehrmann.
 *
 * @param s      Bytes to scan; scanning stops at the first zero byte.
 * @param count  Out: number of complete code points decoded.
 * @return 0 if the decoder finished in the ACCEPT state (the input is
 *         well-formed and complete), 1 otherwise.
 */
int countCodePoints(uint8_t* s, size_t* count) {
  uint32_t codepoint;
  uint32_t state = UTF8_ACCEPT;

  for (*count = 0; *s; ++s) {
    uint32_t next = decode(&state, &codepoint, *s);

    if (next == UTF8_ACCEPT) {
      *count += 1;
    } else if (next == UTF8_REJECT) {
      // REJECT is absorbing (every transition out of it is REJECT again),
      // so no further code point can be counted; stop scanning.
      break;
    }
  }

  return state != UTF8_ACCEPT;
}

// TODO: index into X codepoint in a string

/**
 * @brief
 * Use this when validating from a stream, as it may be that the stream stopped
 * in the middle of a codepoint, hence state passed in as an arg, so it can be
 * tested in a loop and also after the loop has finished.
 *
 * From https://stackoverflow.com/a/22135005/101258
 *
 * @param state  In/out DFA state carried across calls; initialise to
 *               UTF8_ACCEPT before the first chunk.
 * @param str    Buffer holding the next chunk of bytes.
 * @param len    Number of bytes in str to examine.
 * @return The DFA state after the chunk (also stored in *state):
 *         UTF8_ACCEPT if the data so far ends on a code point boundary,
 *         UTF8_REJECT if an invalid sequence was found (scanning stops there),
 *         any other value if the chunk ended in the middle of a sequence.
 */
uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
  size_t i;
  uint32_t type;

  for (i = 0; i < len; i++) {
    // We don't care about the codepoint, so this is
    // a simplified version of the decode function.
    // The cast keeps bytes >= 0x80 from becoming negative table indices
    // on platforms where char is signed.
    type = utf8d[(uint8_t)str[i]];
    *state = utf8d[256 + (*state) * 16 + type];

    if (*state == UTF8_REJECT) {
      break;
    }
  }

  return *state;
}

/**
 * @brief Simplified version of above, always called with a complete string buffer
 *
 * @param str  Buffer to validate.
 * @param len  Number of bytes in str.
 * @return The final DFA state, NOT a boolean: UTF8_ACCEPT (0) means the
 *         buffer is valid, complete UTF-8; any non-zero value means it is
 *         invalid or ends in the middle of a sequence.
 */
uint32_t valid_utf8(char *str, size_t len) {
  uint32_t state = UTF8_ACCEPT;

  return validate_utf8(&state, str, len);
}

////////////// END UTF-8 Section //////////////