Clean up and add UTF-8 definitions to the header file

2025-07-15 08:47:35 +02:00 · 2017-10-20 12:54:13 +00:00 · 2017-10-20 12:54:13 +00:00 · 71c7ed3e7f
commit 71c7ed3e7f
parent ae3aa1941d
2 changed files with 26 additions and 16 deletions
--- a/include/cyclone/runtime.h
+++ b/include/cyclone/runtime.h
@@ -707,4 +707,18 @@ void add_global(object * glo);
 void Cyc_set_globals_changed(gc_thread_data *thd);
 /**@}*/

+/**
+ * \defgroup prim_utf8 UTF-8
+ *
+ * @brief Unicode processing using UTF-8
+ */
+/**@{*/
+#define CYC_UTF8_ACCEPT 0
+#define CYC_UTF8_REJECT 1
+uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte);
+int Cyc_utf8_count_code_points(uint8_t* s, size_t* count);
+uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len); 
+uint32_t Cyc_utf8_validate(char *str, size_t len);
+/**@}*/
+
 #endif                          /* CYCLONE_RUNTIME_H */
--- a/runtime.c
+++ b/runtime.c
@@ -6368,10 +6368,6 @@ void Cyc_io_read_token(void *data, object cont, object port)

 // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
-
-#define UTF8_ACCEPT 0
-#define UTF8_REJECT 1
-
 static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
@@ -6389,11 +6385,11 @@ static const uint8_t utf8d[] = {
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
 };

-uint32_t inline
-decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
+//uint32_t inline
+uint32_t Cyc_utf8_decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  uint32_t type = utf8d[byte];

-  *codep = (*state != UTF8_ACCEPT) ?
+  *codep = (*state != CYC_UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xff >> type) & (byte);

@@ -6407,15 +6403,15 @@ decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
 * Count the number of code points in a string.
 * Based on example code from Bjoern Hoehrmann.
 */
-int countCodePoints(uint8_t* s, size_t* count) {
+int Cyc_utf8_count_code_points(uint8_t* s, size_t* count) {
  uint32_t codepoint;
  uint32_t state = 0;

  for (*count = 0; *s; ++s)
-    if (!decode(&state, &codepoint, *s))
+    if (!Cyc_utf8_decode(&state, &codepoint, *s))
      *count += 1;

-  return state != UTF8_ACCEPT;
+  return state != CYC_UTF8_ACCEPT;
 }

 // TODO: index into X codepoint in a string 
@@ -6428,7 +6424,7 @@ int countCodePoints(uint8_t* s, size_t* count) {
 *
 * From https://stackoverflow.com/a/22135005/101258
 */
-uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
+uint32_t Cyc_utf8_validate_stream(uint32_t *state, char *str, size_t len) {
   size_t i;
   uint32_t type;

@@ -6438,7 +6434,7 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
        type = utf8d[(uint8_t)str[i]];
        *state = utf8d[256 + (*state) * 16 + type];

-        if (*state == UTF8_REJECT)
+        if (*state == CYC_UTF8_REJECT)
            break;
    }

@@ -6446,11 +6442,11 @@ uint32_t validate_utf8(uint32_t *state, char *str, size_t len) {
 }

 /**
- * @brief Simplified version of above, always called with a complete string buffer
+ * @brief Simplified version of Cyc_utf8_validate_stream that must always be called with a complete string buffer.
 */
-uint32_t valid_utf8(char *str, size_t len) {
+uint32_t Cyc_utf8_validate(char *str, size_t len) {
   size_t i;
-   uint32_t state = UTF8_ACCEPT, type;
+   uint32_t state = CYC_UTF8_ACCEPT, type;

    for (i = 0; i < len; i++) {
        // We don't care about the codepoint, so this is
@@ -6458,7 +6454,7 @@ uint32_t valid_utf8(char *str, size_t len) {
        type = utf8d[(uint8_t)str[i]];
        state = utf8d[256 + (state) * 16 + type];

-        if (state == UTF8_REJECT)
+        if (state == CYC_UTF8_REJECT)
            break;
    }