From d431b2af1c4835b8c13703677dac2e0a46840865 Mon Sep 17 00:00:00 2001
From: Justin Ethier
Date: Mon, 6 Nov 2017 13:19:31 +0000
Subject: [PATCH] Updated Cyc_io_read_line to prevent truncation

Ensure last codepoint is fully-read before returning
---
Review changes relative to the original commit: the continuation loop in
Cyc_io_read_line now stops when fgetc() returns EOF instead of storing EOF
in the line buffer, and Cyc_utf8_count_code_points_and_bytes gained a header
comment. Hunk offsets and the diffstat below were updated to match.

 include/cyclone/runtime.h |  2 +-
 runtime.c                 | 46 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/include/cyclone/runtime.h b/include/cyclone/runtime.h
index 5c31471d..4529fa85 100644
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -725,7 +725,7 @@ void Cyc_set_globals_changed(gc_thread_data *thd);
 int Cyc_utf8_encode(char *dest, int sz, uint32_t *src, int srcsz);
 uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
 int Cyc_utf8_count_code_points(uint8_t* s);
-int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes);
+int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes);
 uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len);
 uint32_t Cyc_utf8_validate(char *str, size_t len);
 /**@}*/
diff --git a/runtime.c b/runtime.c
index a0e140e4..a7de71fd 100644
--- a/runtime.c
+++ b/runtime.c
@@ -6365,8 +6365,10 @@ object Cyc_io_read_char(void *data, object cont, object port)
 object Cyc_io_read_line(void *data, object cont, object port)
 {
   FILE *stream = ((port_type *) port)->fp;
-  char buf[1024];
-  int len, num_cp;
+  char buf[1027];
+  int len, num_cp, i = 0;
+  char_type codepoint;
+  uint32_t state;
 
   Cyc_check_port(data, port);
   if (stream == NULL) {
@@ -6375,10 +6377,26 @@ object Cyc_io_read_line(void *data, object cont, object port)
   set_thread_blocked(data, cont);
   errno = 0;
   if (fgets(buf, 1023, stream) != NULL) {
-    // TODO: not good enough for UTF-8, what if we stopped reading in the middle of a code point?
-    // should reserve 3 extra bytes and, if last code point is not complete, read one byte at a
-    // time until it has been read
-    Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &num_cp, &len);
+    state = Cyc_utf8_count_code_points_and_bytes((uint8_t *)buf, &codepoint, &num_cp, &len);
+    // Check if we stopped reading in the middle of a code point and
+    // if so, read one byte at a time until that code point is finished.
+    while (state != CYC_UTF8_ACCEPT && i < 3) {
+      int c = fgetc(stream);
+      // EOF (or a read error) means the code point can never be completed;
+      // stop here rather than storing EOF in buf as if it were a data byte.
+      if (c == EOF) {
+        break;
+      }
+      buf[len] = c;
+      len++;
+      Cyc_utf8_decode(&state, &codepoint, (uint8_t)c);
+      if (state == CYC_UTF8_ACCEPT) {
+        num_cp++;
+        break;
+      }
+      i++;
+    }
+
     {
       // Remove any trailing CR / newline chars
       while (len > 0 && (buf[len - 1] == '\n' ||
@@ -6596,19 +6614,27 @@ int Cyc_utf8_count_code_points(uint8_t* s) {
   return count;
 }
 
-int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, int *cpts, int *bytes) {
-  uint32_t codepoint;
+/**
+ * @brief Count the bytes and code points in a NUL-terminated UTF-8 string.
+ * @param s         String to scan, up to the first zero byte
+ * @param codepoint Decoder scratch value, left as Cyc_utf8_decode last set it
+ *                  so the caller can keep decoding from the returned state
+ * @param cpts      Out: times Cyc_utf8_decode returned 0 while scanning
+ * @param bytes     Out: number of bytes scanned
+ * @return 0 if the final decoder state is CYC_UTF8_ACCEPT, else that state
+ */
+int Cyc_utf8_count_code_points_and_bytes(uint8_t* s, char_type *codepoint, int *cpts, int *bytes) {
   uint32_t state = 0;
   *cpts = 0;
   *bytes = 0;
   for (; *s; ++s){
     *bytes += 1;
-    if (!Cyc_utf8_decode(&state, &codepoint, *s))
+    if (!Cyc_utf8_decode(&state, codepoint, *s))
       *cpts += 1;
   }
 
   if (state != CYC_UTF8_ACCEPT)
-    return -1;
+    return state;
   return 0;
 }
 