From 9569460a58629b12366ad86ca0e63b5951e67df5 Mon Sep 17 00:00:00 2001 From: Alex Shinn Date: Sat, 26 Jan 2019 05:35:27 +0800 Subject: [PATCH] add compile-time option to store precomputed index->cursor tables for strings --- eval.c | 2 ++ include/chibi/features.h | 26 ++++++++++++++---- include/chibi/sexp.h | 12 +++++++- sexp.c | 56 ++++++++++++++++++++++++++++++++++++-- tests/build/build-opts.txt | 1 + 5 files changed, 88 insertions(+), 9 deletions(-) diff --git a/eval.c b/eval.c index a3b2d66e..59aad5c3 100644 --- a/eval.c +++ b/eval.c @@ -1947,6 +1947,8 @@ void sexp_string_utf8_set (sexp ctx, sexp str, sexp index, sexp ch) { sexp_string_size(str) += new_len - old_len; } sexp_utf8_encode_char(p, new_len, c); + if (old_len != new_len) + sexp_update_string_index_lookup(ctx, str); } sexp sexp_string_utf8_index_set (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp i, sexp ch) { diff --git a/include/chibi/features.h b/include/chibi/features.h index 6daa103b..d23858e0 100644 --- a/include/chibi/features.h +++ b/include/chibi/features.h @@ -210,12 +210,14 @@ /* Making them immutable allows for packed UTF-8 strings. */ /* #define SEXP_USE_MUTABLE_STRINGS 0 */ -/* uncomment this to make string cursors just fixnum offsets */ -/* The default when using UTF-8 is to have a disjoint string */ -/* cursor type. This is an immediate type with no loss in */ -/* performance, and prevents confusion mixing indexes and */ -/* cursors. */ -/* #define SEXP_USE_DISJOINT_STRING_CURSORS 0 */ +/* uncomment this to enable precomputed index->cursor tables for strings */ +/* This makes string-ref faster at the expense of making string */ +/* construction (including string-append and I/O) slower. */ +/* You can configure with SEXP_STRING_INDEX_TABLE_CHUNK_SIZE below, */ +/* the default is caching every 64th index (<=12.5% string overhead). */ +/* With a minimum of 1 you'd have up to 8x string overhead, and */ +/* string-ref would still be slightly slower than string-cursors. 
*/ +/* #define SEXP_USE_STRING_INDEX_TABLE 1 */ /* uncomment this to disable automatic closing of ports */ /* If enabled, the underlying FILE* for file ports will be */ @@ -647,6 +649,18 @@ #define SEXP_USE_PACKED_STRINGS 1 #endif +#if SEXP_USE_PACKED_STRINGS +#define SEXP_USE_STRING_INDEX_TABLE 0 +#endif +#ifndef SEXP_USE_STRING_INDEX_TABLE +#define SEXP_USE_STRING_INDEX_TABLE 0 +#endif + +/* for every chunk_size indexes store the precomputed offset */ +#ifndef SEXP_STRING_INDEX_TABLE_CHUNK_SIZE +#define SEXP_STRING_INDEX_TABLE_CHUNK_SIZE 64 +#endif + #ifndef SEXP_USE_DISJOINT_STRING_CURSORS #define SEXP_USE_DISJOINT_STRING_CURSORS SEXP_USE_UTF8_STRINGS #endif diff --git a/include/chibi/sexp.h b/include/chibi/sexp.h index f2aacf6c..3909f060 100644 --- a/include/chibi/sexp.h +++ b/include/chibi/sexp.h @@ -443,6 +443,9 @@ struct sexp_struct { #else sexp_uint_t offset, length; sexp bytes; +#if SEXP_USE_STRING_INDEX_TABLE + sexp charlens; +#endif #endif } string; struct { @@ -1122,7 +1125,8 @@ enum sexp_uniform_vector_type { #define sexp_bit_ref(u1v, i) (((sexp_uvector_data(u1v)[i/8])>>(i%8))&1) #define sexp_bit_set(u1v, i, x) (x ? 
(sexp_uvector_data(u1v)[i/8]|=(1<<(i%8))) : (sexp_uvector_data(u1v)[i/8]&=~(1<<(i%8)))) -#define sexp_string_size(x) (sexp_field(x, string, SEXP_STRING, length)) +#define sexp_string_size(x) (sexp_field(x, string, SEXP_STRING, length)) +#define sexp_string_charlens(x) (sexp_field(x, string, SEXP_STRING, charlens)) #if SEXP_USE_PACKED_STRINGS #define sexp_string_data(x) (sexp_field(x, string, SEXP_STRING, data)) #define sexp_string_bytes(x) (x) @@ -1722,6 +1726,12 @@ SEXP_API int sexp_write_utf8_char (sexp ctx, int c, sexp out); #define sexp_substring_cursor(ctx, s, i, j) sexp_substring_op(ctx, NULL, 3, s, i, j) #endif +#if SEXP_USE_STRING_INDEX_TABLE +SEXP_API void sexp_update_string_index_lookup(sexp ctx, sexp s); +#else +#define sexp_update_string_index_lookup(ctx, s) +#endif + #if SEXP_USE_GREEN_THREADS SEXP_API int sexp_maybe_block_port (sexp ctx, sexp in, int forcep); SEXP_API void sexp_maybe_unblock_port (sexp ctx, sexp in); diff --git a/sexp.c b/sexp.c index 324f103a..78f83a12 100644 --- a/sexp.c +++ b/sexp.c @@ -262,7 +262,7 @@ static struct sexp_type_struct _sexp_type_specs[] = { #if SEXP_USE_PACKED_STRINGS {SEXP_STRING, 0, 0, 0, 0, 0, sexp_sizeof(string)+1, sexp_offsetof(string, length), 1, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, #else - {SEXP_STRING, sexp_offsetof(string, bytes), 1, 1, 0, 0, sexp_sizeof(string), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, + {SEXP_STRING, sexp_offsetof(string, bytes), 1, 1+SEXP_USE_STRING_INDEX_TABLE, 0, 0, sexp_sizeof(string), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, #endif {SEXP_VECTOR, sexp_offsetof(vector, data), 0, 0, sexp_offsetof(vector, length), 1, sexp_sizeof(vector), sexp_offsetof(vector, length), sizeof(sexp), 0, 0, 0, 0, 0, 0, (sexp)"Vector", SEXP_FALSE, SEXP_FALSE, 
SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, {SEXP_FLONUM, 0, 0, 0, 0, 0, sexp_sizeof(flonum), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"Flonum", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, @@ -1198,13 +1198,31 @@ void sexp_utf8_encode_char (unsigned char* p, int len, int c) { } sexp sexp_string_index_to_cursor (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp index) { +#if SEXP_USE_STRING_INDEX_TABLE + sexp charlens; + sexp_sint_t* chunklens; + sexp_sint_t chunk; +#endif sexp_sint_t i, j, limit; unsigned char *p; sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str); sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index); p = (unsigned char*)sexp_string_data(str); limit = sexp_string_size(str); - for (j=0, i=sexp_unbox_fixnum(index); i>0 && j<limit; i--) + i = sexp_unbox_fixnum(index); + j = 0; +#if SEXP_USE_STRING_INDEX_TABLE + if (i > SEXP_STRING_INDEX_TABLE_CHUNK_SIZE) { + charlens = sexp_string_charlens(str); + if (charlens) { + chunklens = (sexp_sint_t*)sexp_bytes_data(charlens); + chunk = i / SEXP_STRING_INDEX_TABLE_CHUNK_SIZE - 1; + j = chunklens[chunk]; + i -= (chunk+1) * SEXP_STRING_INDEX_TABLE_CHUNK_SIZE; + } + } +#endif + for ( ; i>0 && j<limit; i--) j += sexp_utf8_initial_byte_count(p[j]); if (i != 0) return sexp_user_exception(ctx, self, "string-index->cursor: index out of range", index); @@ -1227,6 +1245,36 @@ sexp sexp_string_cursor_offset (sexp ctx, sexp self, sexp_sint_t n, sexp cur) { #endif +#if SEXP_USE_STRING_INDEX_TABLE +void sexp_update_string_index_lookup(sexp ctx, sexp s) { + char *p; + sexp_sint_t numchunks, len, i, *chunks; + sexp_gc_var1(tmp); + if (sexp_string_size(s) < SEXP_STRING_INDEX_TABLE_CHUNK_SIZE*1.2) { + sexp_string_charlens(s) = NULL; /* don't build table for just a few chars */ + return; + } + sexp_gc_preserve1(ctx, tmp); + tmp = s; + len = sexp_string_utf8_length((unsigned char*) sexp_string_data(s), sexp_string_size(s)); + numchunks = ((len + SEXP_STRING_INDEX_TABLE_CHUNK_SIZE - 1) / SEXP_STRING_INDEX_TABLE_CHUNK_SIZE) - 1; + sexp_string_charlens(s) = + sexp_make_bytes_op(ctx, NULL, 2, sexp_make_fixnum(numchunks * sizeof(sexp_sint_t)), SEXP_VOID); + chunks = 
(sexp_sint_t*)sexp_bytes_data(sexp_string_charlens(s)); + p = sexp_string_data(s); + i = 0; + while (1) { + p += sexp_utf8_initial_byte_count(*p); + if (++i % SEXP_STRING_INDEX_TABLE_CHUNK_SIZE == 0) { + chunks[i/SEXP_STRING_INDEX_TABLE_CHUNK_SIZE - 1] = p - sexp_string_data(s); + if (i / SEXP_STRING_INDEX_TABLE_CHUNK_SIZE >= numchunks-1) + break; + } + } + sexp_gc_release1(ctx); +} +#endif + sexp sexp_make_string_op (sexp ctx, sexp self, sexp_sint_t n, sexp len, sexp ch) { sexp i = (sexp_charp(ch) ? sexp_make_fixnum(sexp_unbox_character(ch)) : ch); @@ -1259,6 +1307,7 @@ sexp sexp_make_string_op (sexp ctx, sexp self, sexp_sint_t n, sexp len, sexp ch) sexp_string_bytes(s) = b; sexp_string_offset(s) = 0; sexp_string_size(s) = sexp_bytes_length(b); + sexp_update_string_index_lookup(ctx, s); sexp_gc_release2(ctx); return s; #endif @@ -1273,6 +1322,7 @@ sexp sexp_c_string (sexp ctx, const char *str, sexp_sint_t slen) { if (sexp_exceptionp(s)) return s; memcpy(sexp_string_data(s), str, len); sexp_string_data(s)[len] = '\0'; + sexp_update_string_index_lookup(ctx, s); return s; } @@ -1294,6 +1344,7 @@ sexp sexp_substring_op (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp start sexp_string_data(str)+sexp_unbox_string_cursor(start), sexp_string_size(res)); sexp_string_data(res)[sexp_string_size(res)] = '\0'; + sexp_update_string_index_lookup(ctx, res); return res; } @@ -1360,6 +1411,7 @@ sexp sexp_string_concatenate_op (sexp ctx, sexp self, sexp_sint_t n, sexp str_ls } } *p = '\0'; + sexp_update_string_index_lookup(ctx, res); return res; } diff --git a/tests/build/build-opts.txt b/tests/build/build-opts.txt index a90f97ec..585bb379 100644 --- a/tests/build/build-opts.txt +++ b/tests/build/build-opts.txt @@ -33,5 +33,6 @@ CPPFLAGS=-DSEXP_USE_UTF8_STRINGS=0 CPPFLAGS=-DSEXP_USE_DISJOINT_STRING_CURSORS=0 CFLAGS=-DSEXP_USE_STATIC_LIBS_NO_INCLUDE=0;CPPFLAGS=-DSEXP_USE_STATIC_LIBS=1 CPPFLAGS=-DSEXP_USE_MUTABLE_STRINGS=0 +CPPFLAGS=-DSEXP_USE_STRING_INDEX_TABLE=1 
CPPFLAGS=-DSEXP_USE_STRICT_TOPLEVEL_BINDINGS=1 CPPFLAGS=-DSEXP_USE_NO_FEATURES=1