add compile-time option to store precomputed index->cursor tables for strings

This commit is contained in:
Alex Shinn 2019-01-26 05:35:27 +08:00
parent 677ccdce68
commit 9569460a58
5 changed files with 88 additions and 9 deletions

2
eval.c
View file

@ -1947,6 +1947,8 @@ void sexp_string_utf8_set (sexp ctx, sexp str, sexp index, sexp ch) {
sexp_string_size(str) += new_len - old_len; sexp_string_size(str) += new_len - old_len;
} }
sexp_utf8_encode_char(p, new_len, c); sexp_utf8_encode_char(p, new_len, c);
if (old_len != new_len)
sexp_update_string_index_lookup(ctx, str);
} }
sexp sexp_string_utf8_index_set (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp i, sexp ch) { sexp sexp_string_utf8_index_set (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp i, sexp ch) {

View file

@ -210,12 +210,14 @@
/* Making them immutable allows for packed UTF-8 strings. */ /* Making them immutable allows for packed UTF-8 strings. */
/* #define SEXP_USE_MUTABLE_STRINGS 0 */ /* #define SEXP_USE_MUTABLE_STRINGS 0 */
/* uncomment this to make string cursors just fixnum offsets */ /* uncomment this to enable precomputed index->cursor tables for strings */
/* The default when using UTF-8 is to have a disjoint string */ /* This makes string-ref faster at the expense of making string */
/* cursor type. This is an immediate type with no loss in */ /* construction (including string-append and I/O) slower. */
/* performance, and prevents confusion mixing indexes and */ /* You can configure with SEXP_STRING_INDEX_TABLE_CHUNK_SIZE below, */
/* cursors. */ /* the default is caching every 64th index (<=12.5% string overhead). */
/* #define SEXP_USE_DISJOINT_STRING_CURSORS 0 */ /* With a minimum of 1 you'd have up to 8x string overhead, and */
/* string-ref would still be slightly slower than string-cursors. */
/* #define SEXP_USE_STRING_INDEX_TABLE 1 */
/* uncomment this to disable automatic closing of ports */ /* uncomment this to disable automatic closing of ports */
/* If enabled, the underlying FILE* for file ports will be */ /* If enabled, the underlying FILE* for file ports will be */
@ -647,6 +649,18 @@
#define SEXP_USE_PACKED_STRINGS 1 #define SEXP_USE_PACKED_STRINGS 1
#endif #endif
/* The index table lives in a field that only exists in the unpacked */
/* string layout, so packed strings force the feature off.  #undef first */
/* so that an explicit -DSEXP_USE_STRING_INDEX_TABLE=1 is overridden */
/* cleanly instead of triggering a macro redefinition diagnostic. */
#if SEXP_USE_PACKED_STRINGS
#undef SEXP_USE_STRING_INDEX_TABLE
#define SEXP_USE_STRING_INDEX_TABLE 0
#endif
#ifndef SEXP_USE_STRING_INDEX_TABLE
#define SEXP_USE_STRING_INDEX_TABLE 0
#endif
/* for every chunk_size indexes store the precomputed offset */
#ifndef SEXP_STRING_INDEX_TABLE_CHUNK_SIZE
#define SEXP_STRING_INDEX_TABLE_CHUNK_SIZE 64
#endif
#ifndef SEXP_USE_DISJOINT_STRING_CURSORS #ifndef SEXP_USE_DISJOINT_STRING_CURSORS
#define SEXP_USE_DISJOINT_STRING_CURSORS SEXP_USE_UTF8_STRINGS #define SEXP_USE_DISJOINT_STRING_CURSORS SEXP_USE_UTF8_STRINGS
#endif #endif

View file

@ -443,6 +443,9 @@ struct sexp_struct {
#else #else
sexp_uint_t offset, length; sexp_uint_t offset, length;
sexp bytes; sexp bytes;
#if SEXP_USE_STRING_INDEX_TABLE
sexp charlens;
#endif
#endif #endif
} string; } string;
struct { struct {
@ -1123,6 +1126,7 @@ enum sexp_uniform_vector_type {
#define sexp_bit_set(u1v, i, x) (x ? (sexp_uvector_data(u1v)[i/8]|=(1<<(i%8))) : (sexp_uvector_data(u1v)[i/8]&=~(1<<(i%8)))) #define sexp_bit_set(u1v, i, x) (x ? (sexp_uvector_data(u1v)[i/8]|=(1<<(i%8))) : (sexp_uvector_data(u1v)[i/8]&=~(1<<(i%8))))
#define sexp_string_size(x) (sexp_field(x, string, SEXP_STRING, length)) #define sexp_string_size(x) (sexp_field(x, string, SEXP_STRING, length))
#define sexp_string_charlens(x) (sexp_field(x, string, SEXP_STRING, charlens))
#if SEXP_USE_PACKED_STRINGS #if SEXP_USE_PACKED_STRINGS
#define sexp_string_data(x) (sexp_field(x, string, SEXP_STRING, data)) #define sexp_string_data(x) (sexp_field(x, string, SEXP_STRING, data))
#define sexp_string_bytes(x) (x) #define sexp_string_bytes(x) (x)
@ -1722,6 +1726,12 @@ SEXP_API int sexp_write_utf8_char (sexp ctx, int c, sexp out);
#define sexp_substring_cursor(ctx, s, i, j) sexp_substring_op(ctx, NULL, 3, s, i, j) #define sexp_substring_cursor(ctx, s, i, j) sexp_substring_op(ctx, NULL, 3, s, i, j)
#endif #endif
/* Rebuilds the precomputed index->cursor table of the string s. */
/* With SEXP_USE_STRING_INDEX_TABLE disabled the call expands to nothing, */
/* so call sites do not need their own #if guards. */
#if SEXP_USE_STRING_INDEX_TABLE
SEXP_API void sexp_update_string_index_lookup(sexp ctx, sexp s);
#else
#define sexp_update_string_index_lookup(ctx, s)
#endif
#if SEXP_USE_GREEN_THREADS #if SEXP_USE_GREEN_THREADS
SEXP_API int sexp_maybe_block_port (sexp ctx, sexp in, int forcep); SEXP_API int sexp_maybe_block_port (sexp ctx, sexp in, int forcep);
SEXP_API void sexp_maybe_unblock_port (sexp ctx, sexp in); SEXP_API void sexp_maybe_unblock_port (sexp ctx, sexp in);

56
sexp.c
View file

@ -262,7 +262,7 @@ static struct sexp_type_struct _sexp_type_specs[] = {
#if SEXP_USE_PACKED_STRINGS #if SEXP_USE_PACKED_STRINGS
{SEXP_STRING, 0, 0, 0, 0, 0, sexp_sizeof(string)+1, sexp_offsetof(string, length), 1, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, {SEXP_STRING, 0, 0, 0, 0, 0, sexp_sizeof(string)+1, sexp_offsetof(string, length), 1, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
#else #else
{SEXP_STRING, sexp_offsetof(string, bytes), 1, 1, 0, 0, sexp_sizeof(string), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, {SEXP_STRING, sexp_offsetof(string, bytes), 1, 1+SEXP_USE_STRING_INDEX_TABLE, 0, 0, sexp_sizeof(string), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
#endif #endif
{SEXP_VECTOR, sexp_offsetof(vector, data), 0, 0, sexp_offsetof(vector, length), 1, sexp_sizeof(vector), sexp_offsetof(vector, length), sizeof(sexp), 0, 0, 0, 0, 0, 0, (sexp)"Vector", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, {SEXP_VECTOR, sexp_offsetof(vector, data), 0, 0, sexp_offsetof(vector, length), 1, sexp_sizeof(vector), sexp_offsetof(vector, length), sizeof(sexp), 0, 0, 0, 0, 0, 0, (sexp)"Vector", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
{SEXP_FLONUM, 0, 0, 0, 0, 0, sexp_sizeof(flonum), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"Flonum", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL}, {SEXP_FLONUM, 0, 0, 0, 0, 0, sexp_sizeof(flonum), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"Flonum", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
@ -1198,13 +1198,31 @@ void sexp_utf8_encode_char (unsigned char* p, int len, int c) {
} }
sexp sexp_string_index_to_cursor (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp index) { sexp sexp_string_index_to_cursor (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp index) {
#if SEXP_USE_STRING_INDEX_TABLE
sexp charlens;
sexp_sint_t* chunklens;
sexp_sint_t chunk;
#endif
sexp_sint_t i, j, limit; sexp_sint_t i, j, limit;
unsigned char *p; unsigned char *p;
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str); sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index); sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index);
p = (unsigned char*)sexp_string_data(str); p = (unsigned char*)sexp_string_data(str);
limit = sexp_string_size(str); limit = sexp_string_size(str);
for (j=0, i=sexp_unbox_fixnum(index); i>0 && j<limit; i--) i = sexp_unbox_fixnum(index);
j = 0;
#if SEXP_USE_STRING_INDEX_TABLE
if (i > SEXP_STRING_INDEX_TABLE_CHUNK_SIZE) {
charlens = sexp_string_charlens(str);
if (charlens) {
chunklens = (sexp_sint_t*)sexp_bytes_data(charlens);
chunk = i / SEXP_STRING_INDEX_TABLE_CHUNK_SIZE - 1;
j = chunklens[chunk];
i -= (chunk+1) * SEXP_STRING_INDEX_TABLE_CHUNK_SIZE;
}
}
#endif
for ( ; i>0 && j<limit; i--)
j += sexp_utf8_initial_byte_count(p[j]); j += sexp_utf8_initial_byte_count(p[j]);
if (i != 0) if (i != 0)
return sexp_user_exception(ctx, self, "string-index->cursor: index out of range", index); return sexp_user_exception(ctx, self, "string-index->cursor: index out of range", index);
@ -1227,6 +1245,36 @@ sexp sexp_string_cursor_offset (sexp ctx, sexp self, sexp_sint_t n, sexp cur) {
#endif #endif
#if SEXP_USE_STRING_INDEX_TABLE
/* Rebuild the precomputed index->cursor table of the string s.
 *
 * The table is a bytevector of sexp_sint_t stored in
 * sexp_string_charlens(s).  Entry k holds the byte offset of the
 * character at index (k+1)*SEXP_STRING_INDEX_TABLE_CHUNK_SIZE, which is
 * the layout sexp_string_index_to_cursor reads.  One entry is stored for
 * every full chunk of characters, so every index from 0 up to and
 * including the character length maps to an initialized entry.
 *
 * The table is left NULL when the string is too short to benefit, has
 * fewer characters than one chunk, or the table cannot be allocated;
 * the reader then falls back to a linear scan.
 */
void sexp_update_string_index_lookup(sexp ctx, sexp s) {
  unsigned char *data, *end, *p;
  sexp_sint_t numchunks, len, size, i, filled, *chunks;
  sexp table;
  sexp_gc_var1(tmp);
  /* Clear the slot before anything can allocate: it is traced by the GC, */
  /* and a stale table must not outlive a change to the string contents.  */
  sexp_string_charlens(s) = NULL;
  size = sexp_string_size(s);
  if (size < SEXP_STRING_INDEX_TABLE_CHUNK_SIZE*1.2)
    return;                    /* don't build table for just a few chars */
  len = sexp_string_utf8_length((unsigned char*) sexp_string_data(s), size);
  /* One entry per complete chunk.  A byte-long but character-short */
  /* string (multi-byte text) can still have no complete chunk at all. */
  numchunks = len / SEXP_STRING_INDEX_TABLE_CHUNK_SIZE;
  if (numchunks <= 0)
    return;
  sexp_gc_preserve1(ctx, tmp);
  tmp = s;                     /* keep s alive across the allocation */
  table = sexp_make_bytes_op(ctx, NULL, 2, sexp_make_fixnum(numchunks * sizeof(sexp_sint_t)), SEXP_VOID);
  if (table && !sexp_exceptionp(table)) {
    chunks = (sexp_sint_t*)sexp_bytes_data(table);
    /* unsigned bytes, matching how sexp_string_index_to_cursor walks */
    data = (unsigned char*) sexp_string_data(s);
    end = data + size;
    p = data;
    filled = 0;
    /* i counts characters consumed; stop at the end of the data even if */
    /* the character count and the byte walk disagree on malformed input */
    for (i = 1; filled < numchunks && p < end; i++) {
      p += sexp_utf8_initial_byte_count(*p);
      if (i % SEXP_STRING_INDEX_TABLE_CHUNK_SIZE == 0)
        chunks[filled++] = (p > end) ? size : (sexp_sint_t)(p - data);
    }
    /* never leave an entry uninitialized: point leftovers at the end */
    while (filled < numchunks)
      chunks[filled++] = size;
    sexp_string_charlens(s) = table;
  }
  sexp_gc_release1(ctx);
}
#endif
sexp sexp_make_string_op (sexp ctx, sexp self, sexp_sint_t n, sexp len, sexp ch) sexp sexp_make_string_op (sexp ctx, sexp self, sexp_sint_t n, sexp len, sexp ch)
{ {
sexp i = (sexp_charp(ch) ? sexp_make_fixnum(sexp_unbox_character(ch)) : ch); sexp i = (sexp_charp(ch) ? sexp_make_fixnum(sexp_unbox_character(ch)) : ch);
@ -1259,6 +1307,7 @@ sexp sexp_make_string_op (sexp ctx, sexp self, sexp_sint_t n, sexp len, sexp ch)
sexp_string_bytes(s) = b; sexp_string_bytes(s) = b;
sexp_string_offset(s) = 0; sexp_string_offset(s) = 0;
sexp_string_size(s) = sexp_bytes_length(b); sexp_string_size(s) = sexp_bytes_length(b);
sexp_update_string_index_lookup(ctx, s);
sexp_gc_release2(ctx); sexp_gc_release2(ctx);
return s; return s;
#endif #endif
@ -1273,6 +1322,7 @@ sexp sexp_c_string (sexp ctx, const char *str, sexp_sint_t slen) {
if (sexp_exceptionp(s)) return s; if (sexp_exceptionp(s)) return s;
memcpy(sexp_string_data(s), str, len); memcpy(sexp_string_data(s), str, len);
sexp_string_data(s)[len] = '\0'; sexp_string_data(s)[len] = '\0';
sexp_update_string_index_lookup(ctx, s);
return s; return s;
} }
@ -1294,6 +1344,7 @@ sexp sexp_substring_op (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp start
sexp_string_data(str)+sexp_unbox_string_cursor(start), sexp_string_data(str)+sexp_unbox_string_cursor(start),
sexp_string_size(res)); sexp_string_size(res));
sexp_string_data(res)[sexp_string_size(res)] = '\0'; sexp_string_data(res)[sexp_string_size(res)] = '\0';
sexp_update_string_index_lookup(ctx, res);
return res; return res;
} }
@ -1360,6 +1411,7 @@ sexp sexp_string_concatenate_op (sexp ctx, sexp self, sexp_sint_t n, sexp str_ls
} }
} }
*p = '\0'; *p = '\0';
sexp_update_string_index_lookup(ctx, res);
return res; return res;
} }

View file

@ -33,5 +33,6 @@ CPPFLAGS=-DSEXP_USE_UTF8_STRINGS=0
CPPFLAGS=-DSEXP_USE_DISJOINT_STRING_CURSORS=0 CPPFLAGS=-DSEXP_USE_DISJOINT_STRING_CURSORS=0
CFLAGS=-DSEXP_USE_STATIC_LIBS_NO_INCLUDE=0;CPPFLAGS=-DSEXP_USE_STATIC_LIBS=1 CFLAGS=-DSEXP_USE_STATIC_LIBS_NO_INCLUDE=0;CPPFLAGS=-DSEXP_USE_STATIC_LIBS=1
CPPFLAGS=-DSEXP_USE_MUTABLE_STRINGS=0 CPPFLAGS=-DSEXP_USE_MUTABLE_STRINGS=0
CPPFLAGS=-DSEXP_USE_STRING_INDEX_TABLE=1
CPPFLAGS=-DSEXP_USE_STRICT_TOPLEVEL_BINDINGS=1 CPPFLAGS=-DSEXP_USE_STRICT_TOPLEVEL_BINDINGS=1
CPPFLAGS=-DSEXP_USE_NO_FEATURES=1 CPPFLAGS=-DSEXP_USE_NO_FEATURES=1