Add a feature to cache the most recent string index->cursor result

This is lighter-weight than building a full index->cursor table for
the string, adding a constant two words to the memory required to
store a string, as opposed to one word for every n characters. The
cached cursor is used for any string-ref operation requesting an index at
or after the most-recently-requested index, making potentially quadratic
repeated string-ref procedures run in linear time. In theory, it could
also use a heuristic to speed up moving backwards through the string
when moving the old cursor backwards would likely be faster
than restarting from the beginning of the string. In practice, my
logging of when the cached cursor is actually reused during the Chibi
compilation and startup process shows that the most common case of
moving backwards is going back to the start of the string anyway.

Benchmarks to follow.
This commit is contained in:
Daphne Preston-Kendal 2021-12-10 21:24:05 +01:00
parent 3080087d8c
commit c09897c449
4 changed files with 39 additions and 2 deletions

8
eval.c
View file

@ -1988,8 +1988,14 @@ void sexp_string_utf8_set (sexp ctx, sexp str, sexp index, sexp ch) {
sexp_string_size(str) += new_len - old_len;
}
sexp_utf8_encode_char(p, new_len, c);
if (old_len != new_len)
if (old_len != new_len) {
#if SEXP_USE_STRING_INDEX_TABLE
sexp_update_string_index_lookup(ctx, str);
#elif SEXP_USE_STRING_REF_CACHE
sexp_cached_char_idx(str) = 0;
sexp_cached_cursor(str) = sexp_make_string_cursor(0);
#endif
}
}
sexp sexp_string_utf8_index_set (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp i, sexp ch) {

View file

@ -252,6 +252,12 @@
/* */
/* #define SEXP_USE_STRING_INDEX_TABLE 1 */
/* uncomment this to cache a string cursor for string-ref calls */
/* The default is not to use a cache. The goal of caching is to */
/* soften the performance impact of repeated O(n) string-ref */
/* operations on the same string. */
/* #define SEXP_USE_STRING_REF_CACHE 1 */
/* uncomment this to disable automatic closing of ports */
/* If enabled, the underlying FILE* for file ports will be */
/* automatically closed when they're garbage collected. Doesn't */

View file

@ -481,6 +481,9 @@ struct sexp_struct {
sexp bytes;
#if SEXP_USE_STRING_INDEX_TABLE
sexp charlens;
#elif SEXP_USE_STRING_REF_CACHE
sexp_uint_t cached_char_idx;
sexp cached_cursor;
#endif
sexp_uint_t offset, length;
#endif
@ -1198,6 +1201,10 @@ enum sexp_uniform_vector_type {
#define sexp_string_offset(x) (sexp_field(x, string, SEXP_STRING, offset))
#define sexp_string_data(x) (sexp_bytes_data(sexp_string_bytes(x))+sexp_string_offset(x))
#endif
#if SEXP_USE_STRING_REF_CACHE
#define sexp_cached_char_idx(x) (sexp_field(x, string, SEXP_STRING, cached_char_idx))
#define sexp_cached_cursor(x) (sexp_field(x, string, SEXP_STRING, cached_cursor))
#endif
#define sexp_string_maybe_null_data(x) (sexp_not(x) ? NULL : sexp_string_data(x))
#if SEXP_USE_PACKED_STRINGS

20
sexp.c
View file

@ -500,6 +500,9 @@ static const char* sexp_initial_features[] = {
#if SEXP_USE_STRING_INDEX_TABLE
"string-index",
#endif
#if SEXP_USE_STRING_REF_CACHE
"string-ref-cache",
#endif
#if SEXP_USE_GREEN_THREADS
"threads",
#endif
@ -1254,6 +1257,7 @@ sexp sexp_string_index_to_cursor (sexp ctx, sexp self, sexp_sint_t n, sexp str,
sexp_sint_t* chunklens;
sexp_sint_t chunk;
#endif
sexp cursor;
sexp_sint_t i, j, limit;
unsigned char *p;
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
@ -1272,12 +1276,22 @@ sexp sexp_string_index_to_cursor (sexp ctx, sexp self, sexp_sint_t n, sexp str,
i -= (chunk+1) * SEXP_STRING_INDEX_TABLE_CHUNK_SIZE;
}
}
#elif SEXP_USE_STRING_REF_CACHE
if (i >= sexp_cached_char_idx(str)) {
j = sexp_unbox_string_cursor(sexp_cached_cursor(str));
i -= sexp_cached_char_idx(str);
}
#endif
for ( ; i>0 && j<limit; i--)
j += sexp_utf8_initial_byte_count(p[j]);
if (i != 0)
return sexp_user_exception(ctx, self, "string-index->cursor: index out of range", index);
return sexp_make_string_cursor(j);
cursor = sexp_make_string_cursor(j);
#if SEXP_USE_STRING_REF_CACHE
sexp_cached_char_idx(str) = sexp_unbox_fixnum(index);
sexp_cached_cursor(str) = cursor;
#endif
return cursor;
}
sexp sexp_string_cursor_to_index (sexp ctx, sexp self, sexp_sint_t n, sexp str, sexp offset) {
@ -1358,6 +1372,10 @@ sexp sexp_make_string_op (sexp ctx, sexp self, sexp_sint_t n, sexp len, sexp ch)
sexp_string_bytes(s) = b;
sexp_string_offset(s) = 0;
sexp_string_size(s) = sexp_bytes_length(b);
#if SEXP_USE_STRING_REF_CACHE
sexp_cached_char_idx(s) = 0;
sexp_cached_cursor(s) = sexp_make_string_cursor(0);
#endif
sexp_update_string_index_lookup(ctx, s);
sexp_gc_release2(ctx);
return s;