add compile-time option to store precomputed index->cursor tables for strings

This commit is contained in:
Alex Shinn 2019-01-26 05:35:27 +08:00
parent 677ccdce68
commit 9569460a58
5 changed files with 88 additions and 9 deletions

2
eval.c
View file

@ -1947,6 +1947,8 @@ void sexp_string_utf8_set (sexp ctx, sexp str, sexp index, sexp ch) {
sexp_string_size(str) += new_len - old_len;
}
sexp_utf8_encode_char(p, new_len, c);
if (old_len != new_len)
sexp_update_string_index_lookup(ctx, str);
}
sexp sexp_string_utf8_index_set (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp i, sexp ch) {

View file

@ -210,12 +210,14 @@
/* Making them immutable allows for packed UTF-8 strings. */
/* #define SEXP_USE_MUTABLE_STRINGS 0 */
/* uncomment this to make string cursors just fixnum offsets */
/* The default when using UTF-8 is to have a disjoint string */
/* cursor type. This is an immediate type with no loss in */
/* performance, and prevents confusion mixing indexes and */
/* cursors. */
/* #define SEXP_USE_DISJOINT_STRING_CURSORS 0 */
/* uncomment this to enable precomputed index->cursor tables for strings */
/* This makes string-ref faster at the expense of making string */
/* construction (including string-append and I/O) slower. */
/* You can configure with SEXP_STRING_INDEX_TABLE_CHUNK_SIZE below, */
/* the default is caching every 64th index (<=12.5% string overhead). */
/* With a minimum of 1 you'd have up to 8x string overhead, and */
/* string-ref would still be slightly slower than string-cursors. */
/* #define SEXP_USE_STRING_INDEX_TABLE 1 */
/* uncomment this to disable automatic closing of ports */
/* If enabled, the underlying FILE* for file ports will be */
@ -647,6 +649,18 @@
#define SEXP_USE_PACKED_STRINGS 1
#endif
/* Index tables are forced off when strings are packed.  #undef first so */
/* that a user-supplied -DSEXP_USE_STRING_INDEX_TABLE=1 is overridden    */
/* cleanly instead of triggering an incompatible macro redefinition.     */
#if SEXP_USE_PACKED_STRINGS
#undef SEXP_USE_STRING_INDEX_TABLE
#define SEXP_USE_STRING_INDEX_TABLE 0
#endif
/* Otherwise the feature is opt-in and defaults to disabled. */
#ifndef SEXP_USE_STRING_INDEX_TABLE
#define SEXP_USE_STRING_INDEX_TABLE 0
#endif
/* for every chunk_size indexes store the precomputed offset */
#ifndef SEXP_STRING_INDEX_TABLE_CHUNK_SIZE
#define SEXP_STRING_INDEX_TABLE_CHUNK_SIZE 64
#endif
#ifndef SEXP_USE_DISJOINT_STRING_CURSORS
#define SEXP_USE_DISJOINT_STRING_CURSORS SEXP_USE_UTF8_STRINGS
#endif

View file

@ -443,6 +443,9 @@ struct sexp_struct {
#else
sexp_uint_t offset, length;
sexp bytes;
#if SEXP_USE_STRING_INDEX_TABLE
sexp charlens;
#endif
#endif
} string;
struct {
@ -1122,7 +1125,8 @@ enum sexp_uniform_vector_type {
#define sexp_bit_ref(u1v, i) (((sexp_uvector_data(u1v)[i/8])>>(i%8))&1)
#define sexp_bit_set(u1v, i, x) (x ? (sexp_uvector_data(u1v)[i/8]|=(1<<(i%8))) : (sexp_uvector_data(u1v)[i/8]&=~(1<<(i%8))))
#define sexp_string_size(x) (sexp_field(x, string, SEXP_STRING, length))
#define sexp_string_size(x) (sexp_field(x, string, SEXP_STRING, length))
#define sexp_string_charlens(x) (sexp_field(x, string, SEXP_STRING, charlens))
#if SEXP_USE_PACKED_STRINGS
#define sexp_string_data(x) (sexp_field(x, string, SEXP_STRING, data))
#define sexp_string_bytes(x) (x)
@ -1722,6 +1726,12 @@ SEXP_API int sexp_write_utf8_char (sexp ctx, int c, sexp out);
#define sexp_substring_cursor(ctx, s, i, j) sexp_substring_op(ctx, NULL, 3, s, i, j)
#endif
/* sexp_update_string_index_lookup(ctx, s) rebuilds the index->cursor  */
/* table of string s; call it after the bytes of s have been changed.  */
/* Without SEXP_USE_STRING_INDEX_TABLE it is a no-op that neither      */
/* evaluates its arguments nor leaves an empty statement behind, so it */
/* is safe as the sole body of an if/else.                             */
#if SEXP_USE_STRING_INDEX_TABLE
SEXP_API void sexp_update_string_index_lookup(sexp ctx, sexp s);
#else
#define sexp_update_string_index_lookup(ctx, s) ((void)0)
#endif
#if SEXP_USE_GREEN_THREADS
SEXP_API int sexp_maybe_block_port (sexp ctx, sexp in, int forcep);
SEXP_API void sexp_maybe_unblock_port (sexp ctx, sexp in);

56
sexp.c
View file

@ -262,7 +262,7 @@ static struct sexp_type_struct _sexp_type_specs[] = {
#if SEXP_USE_PACKED_STRINGS
{SEXP_STRING, 0, 0, 0, 0, 0, sexp_sizeof(string)+1, sexp_offsetof(string, length), 1, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
#else
{SEXP_STRING, sexp_offsetof(string, bytes), 1, 1, 0, 0, sexp_sizeof(string), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
{SEXP_STRING, sexp_offsetof(string, bytes), 1, 1+SEXP_USE_STRING_INDEX_TABLE, 0, 0, sexp_sizeof(string), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"String", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
#endif
{SEXP_VECTOR, sexp_offsetof(vector, data), 0, 0, sexp_offsetof(vector, length), 1, sexp_sizeof(vector), sexp_offsetof(vector, length), sizeof(sexp), 0, 0, 0, 0, 0, 0, (sexp)"Vector", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
{SEXP_FLONUM, 0, 0, 0, 0, 0, sexp_sizeof(flonum), 0, 0, 0, 0, 0, 0, 0, 0, (sexp)"Flonum", SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, SEXP_FALSE, NULL, NULL, NULL, NULL},
@ -1198,13 +1198,31 @@ void sexp_utf8_encode_char (unsigned char* p, int len, int c) {
}
sexp sexp_string_index_to_cursor (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp index) {
#if SEXP_USE_STRING_INDEX_TABLE
sexp charlens;
sexp_sint_t* chunklens;
sexp_sint_t chunk;
#endif
sexp_sint_t i, j, limit;
unsigned char *p;
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index);
p = (unsigned char*)sexp_string_data(str);
limit = sexp_string_size(str);
for (j=0, i=sexp_unbox_fixnum(index); i>0 && j<limit; i--)
i = sexp_unbox_fixnum(index);
j = 0;
#if SEXP_USE_STRING_INDEX_TABLE
if (i > SEXP_STRING_INDEX_TABLE_CHUNK_SIZE) {
charlens = sexp_string_charlens(str);
if (charlens) {
chunklens = (sexp_sint_t*)sexp_bytes_data(charlens);
chunk = i / SEXP_STRING_INDEX_TABLE_CHUNK_SIZE - 1;
j = chunklens[chunk];
i -= (chunk+1) * SEXP_STRING_INDEX_TABLE_CHUNK_SIZE;
}
}
#endif
for ( ; i>0 && j<limit; i--)
j += sexp_utf8_initial_byte_count(p[j]);
if (i != 0)
return sexp_user_exception(ctx, self, "string-index->cursor: index out of range", index);
@ -1227,6 +1245,36 @@ sexp sexp_string_cursor_offset (sexp ctx, sexp self, sexp_sint_t n, sexp cur) {
#endif
#if SEXP_USE_STRING_INDEX_TABLE
/* Rebuild the index->cursor lookup table of the string s.
 *
 * The table is an array of sexp_sint_t stored in a bytes object in
 * sexp_string_charlens(s).  Entry k holds the byte offset of character
 * index (k+1)*SEXP_STRING_INDEX_TABLE_CHUNK_SIZE, matching the way
 * sexp_string_index_to_cursor reads it back
 * (chunks[index/SEXP_STRING_INDEX_TABLE_CHUNK_SIZE - 1]).
 *
 * The table has len/CHUNK_SIZE entries, where len is the length in
 * characters, so every index up to and including len (the end cursor)
 * is covered.  Strings that are short in bytes, or that contain fewer
 * than CHUNK_SIZE characters, get a NULL table; readers then fall back
 * to scanning from the start of the string.
 *
 * NOTE(review): the result of sexp_make_bytes_op is used without being
 * checked for an allocation failure, as in the previous version --
 * confirm whether an exception object can be returned here.
 */
void sexp_update_string_index_lookup(sexp ctx, sexp s) {
  unsigned char *start, *p, *end;
  sexp_sint_t numchunks, len, i, filled, *chunks;
  sexp_gc_var1(tmp);
  if (sexp_string_size(s) < SEXP_STRING_INDEX_TABLE_CHUNK_SIZE*1.2) {
    sexp_string_charlens(s) = NULL; /* don't build table for just a few chars */
    return;
  }
  /* Walk the bytes as unsigned char, like sexp_string_index_to_cursor   */
  /* does, so lead bytes >= 0x80 are never seen as negative values.      */
  start = (unsigned char*) sexp_string_data(s);
  len = sexp_string_utf8_length(start, sexp_string_size(s));
  numchunks = len / SEXP_STRING_INDEX_TABLE_CHUNK_SIZE;
  /* Drop any previous table first, so the field never holds a stale or  */
  /* unset value while the new table is being allocated.                 */
  sexp_string_charlens(s) = NULL;
  if (numchunks <= 0)
    return; /* many bytes but few characters: nothing worth caching */
  sexp_gc_preserve1(ctx, tmp);
  tmp = s; /* keep s alive across the allocation below */
  sexp_string_charlens(s) =
    sexp_make_bytes_op(ctx, NULL, 2, sexp_make_fixnum(numchunks * sizeof(sexp_sint_t)), SEXP_VOID);
  chunks = (sexp_sint_t*)sexp_bytes_data(sexp_string_charlens(s));
  /* Fetch the data pointer again after allocating, as the original did. */
  start = (unsigned char*) sexp_string_data(s);
  end = start + sexp_string_size(s);
  p = start;
  i = 0;
  filled = 0;
  /* Record the byte offset reached after every CHUNK_SIZE characters,   */
  /* never stepping past the end of the string or of the table.          */
  while (filled < numchunks && p < end) {
    p += sexp_utf8_initial_byte_count(*p);
    if (++i % SEXP_STRING_INDEX_TABLE_CHUNK_SIZE == 0)
      chunks[filled++] = p - start;
  }
  /* Defensive: if the walk stopped early (e.g. malformed UTF-8), point  */
  /* the remaining entries at the end of the string instead of leaving   */
  /* them uninitialised.                                                 */
  while (filled < numchunks)
    chunks[filled++] = sexp_string_size(s);
  sexp_gc_release1(ctx);
}
#endif
sexp sexp_make_string_op (sexp ctx, sexp self, sexp_sint_t n, sexp len, sexp ch)
{
sexp i = (sexp_charp(ch) ? sexp_make_fixnum(sexp_unbox_character(ch)) : ch);
@ -1259,6 +1307,7 @@ sexp sexp_make_string_op (sexp ctx, sexp self, sexp_sint_t n, sexp len, sexp ch)
sexp_string_bytes(s) = b;
sexp_string_offset(s) = 0;
sexp_string_size(s) = sexp_bytes_length(b);
sexp_update_string_index_lookup(ctx, s);
sexp_gc_release2(ctx);
return s;
#endif
@ -1273,6 +1322,7 @@ sexp sexp_c_string (sexp ctx, const char *str, sexp_sint_t slen) {
if (sexp_exceptionp(s)) return s;
memcpy(sexp_string_data(s), str, len);
sexp_string_data(s)[len] = '\0';
sexp_update_string_index_lookup(ctx, s);
return s;
}
@ -1294,6 +1344,7 @@ sexp sexp_substring_op (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp start
sexp_string_data(str)+sexp_unbox_string_cursor(start),
sexp_string_size(res));
sexp_string_data(res)[sexp_string_size(res)] = '\0';
sexp_update_string_index_lookup(ctx, res);
return res;
}
@ -1360,6 +1411,7 @@ sexp sexp_string_concatenate_op (sexp ctx, sexp self, sexp_sint_t n, sexp str_ls
}
}
*p = '\0';
sexp_update_string_index_lookup(ctx, res);
return res;
}

View file

@ -33,5 +33,6 @@ CPPFLAGS=-DSEXP_USE_UTF8_STRINGS=0
CPPFLAGS=-DSEXP_USE_DISJOINT_STRING_CURSORS=0
CFLAGS=-DSEXP_USE_STATIC_LIBS_NO_INCLUDE=0;CPPFLAGS=-DSEXP_USE_STATIC_LIBS=1
CPPFLAGS=-DSEXP_USE_MUTABLE_STRINGS=0
CPPFLAGS=-DSEXP_USE_STRING_INDEX_TABLE=1
CPPFLAGS=-DSEXP_USE_STRICT_TOPLEVEL_BINDINGS=1
CPPFLAGS=-DSEXP_USE_NO_FEATURES=1