/*
 * test.c -- scratch program exercising a UTF-8 decoder and encoder.
 *
 * Reconstructed from the post-image of the patch
 *   commit  556f97dd5fc5a6d50699d64f9404b3c176931060
 *   author  Justin Ethier, Wed, 25 Oct 2017 13:54:36 +0000
 *   subject [PATCH] WIP, testing encoders/decoders
 *
 * The pre-image main() removed by that patch (it filled a 128-byte buffer
 * with the repeating bytes of 0x32363435 and printed it) is not carried over.
 *
 * NOTE(review): the #include lines visible in the patch context had lost
 * their header names; the headers below are the ones this code requires.
 */
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CYC_UTF8_ACCEPT 0

// Copyright (c) 2008-2009 Bjoern Hoehrmann
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//
// Layout: entries 0..255 map an input byte to a character class (0..11);
// entries 256.. hold the transition table, 16 slots per state, indexed by
// 256 + state*16 + class.
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feed one byte into the UTF-8 decoding state machine.
 *
 * state  in/out: decoder state; start with CYC_UTF8_ACCEPT.
 * codep  in/out: code point being accumulated. It is overwritten (not read)
 *        when *state is CYC_UTF8_ACCEPT on entry, otherwise six more payload
 *        bits are shifted in.
 * byte   the next input byte; must be in 0..255 because it indexes the
 *        first 256 entries of utf8d directly.
 *
 * Returns the new *state. CYC_UTF8_ACCEPT means *codep now holds a complete
 * code point.
 * NOTE(review): in Hoehrmann's published decoder state 1 is the reject
 * state; this file defines no constant for it -- confirm before relying
 * on that value.
 */
uint32_t Cyc_utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
{
  uint32_t type = utf8d[byte];

  *codep = (*state != CYC_UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

  *state = utf8d[256 + *state * 16 + type];
  return *state;
}

// FROM: https://www.cprogramming.com/tutorial/utf8.c
/*
 * Encode an array of code points as UTF-8.
 *
 * dest   output buffer.
 * sz     size of dest in bytes; values <= 0 are treated as "no space".
 * src    source code points.
 * srcsz  number of source characters, or a negative value if src is
 *        0-terminated.
 *
 * Returns the number of source characters consumed. Encoding stops early,
 * without writing a partial sequence, as soon as the next character does
 * not fit in the remaining space.
 *
 * dest is only '\0'-terminated if all characters fit and there is still
 * room for the terminator; callers that need a C string must check for
 * this themselves.
 *
 * Values >= 0x110000 produce no output but are still counted as consumed.
 * Surrogate code points are not rejected; they are encoded as three bytes.
 */
int u8_toutf8(char *dest, int sz, const uint32_t *src, int srcsz)
{
  /* Track remaining space as a count instead of comparing against
     dest_end - k, which would form an out-of-bounds pointer (undefined
     behaviour) whenever sz < k. */
  size_t avail = sz > 0 ? (size_t)sz : 0;
  size_t used = 0;
  int i = 0;

  while (srcsz < 0 ? src[i] != 0 : i < srcsz) {
    uint32_t ch = src[i];
    size_t need;
    char *out;

    if (ch < 0x80) {
      need = 1;
    } else if (ch < 0x800) {
      need = 2;
    } else if (ch < 0x10000) {
      need = 3;
    } else if (ch < 0x110000) {
      need = 4;
    } else {
      need = 0; /* out of range: emit nothing, but still consume it */
    }

    if (need > avail - used) {
      return i;
    }

    out = dest + used;
    switch (need) {
    case 1:
      out[0] = (char)ch;
      break;
    case 2:
      out[0] = (char)((ch >> 6) | 0xC0);
      out[1] = (char)((ch & 0x3F) | 0x80);
      break;
    case 3:
      out[0] = (char)((ch >> 12) | 0xE0);
      out[1] = (char)(((ch >> 6) & 0x3F) | 0x80);
      out[2] = (char)((ch & 0x3F) | 0x80);
      break;
    case 4:
      out[0] = (char)((ch >> 18) | 0xF0);
      out[1] = (char)(((ch >> 12) & 0x3F) | 0x80);
      out[2] = (char)(((ch >> 6) & 0x3F) | 0x80);
      out[3] = (char)((ch & 0x3F) | 0x80);
      break;
    default:
      break;
    }

    used += need;
    i++;
  }

  if (used < avail) {
    dest[used] = '\0';
  }
  return i;
}

/*
 * Encode lambda (U+03BB) and print the number of characters converted
 * followed by the encoded bytes in hex. Expected output: "1 cebb".
 *
 * The earlier version passed the buffer itself to a %x conversion, which
 * printed an address rather than the bytes; that is why the result looked
 * wrong even though the encoder output was already CE BB.
 */
void encoding(void)
{
  char dest[5] = {0}; /* zeroed so the print loop stops even if the encoder
                         had no room to write a terminator */
  uint32_t val = 0x03bb;
  int rv = u8_toutf8(dest, (int)sizeof dest, &val, 1);

  printf("%d ", rv);
  for (size_t k = 0; k < sizeof dest && dest[k] != '\0'; k++) {
    /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
    printf("%02x", (unsigned)(unsigned char)dest[k]);
  }
  printf("\n");
}

/*
 * Decode one hard-coded UTF-8 sequence, print the final decoder state and
 * code point, then run the encoder check.
 */
int main(void)
{
  /* EC BA BB is the UTF-8 encoding of U+CEBB (prints cp = 52923).
     Lambda, U+03BB, is the two-byte sequence CE BB -- the same bytes
     encoding() prints. Swap in {0xCE, 0xBB, 0x00} to decode lambda. */
  static const uint8_t cv[] = {0xEC, 0xBA, 0xBB, 0x00};
  uint32_t state = CYC_UTF8_ACCEPT;
  uint32_t codepoint = 0;

  /* Stop at the NUL terminator rather than a hard-coded length, so a
     shorter test vector does not feed the terminator to the decoder and
     reset the code point to 0. */
  for (size_t i = 0; cv[i] != 0x00; i++) {
    Cyc_utf8_decode(&state, &codepoint, cv[i]);
  }
  printf("state = %" PRIu32 ", cp = %" PRIu32 "\n", state, codepoint);

  encoding();
  return 0;
}