adding unicode tests, fixing substring & make-string

This commit is contained in:
Alex Shinn 2010-12-08 22:57:13 -08:00
parent 6a10550dad
commit 840ef090bd
6 changed files with 135 additions and 44 deletions

View file

@ -206,6 +206,9 @@ test-records: chibi-scheme$(EXE)
test-weak: chibi-scheme$(EXE)
LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/weak-tests.scm
test-unicode: chibi-scheme$(EXE)
LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/unicode-tests.scm
test-libs: chibi-scheme$(EXE)
LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/lib-tests.scm

29
eval.c
View file

@ -1142,13 +1142,13 @@ static sexp sexp_string_cmp_op (sexp ctx sexp_api_params(self, n), sexp str1, se
#if SEXP_USE_UTF8_STRINGS
static int sexp_utf8_initial_byte_count(int c) {
static int sexp_utf8_initial_byte_count (int c) {
if (c < 0xC0) return 1;
if (c < 0xE0) return 2;
return ((c>>4)&1)+3;
}
static int sexp_utf8_char_byte_count(int c) {
static int sexp_utf8_char_byte_count (int c) {
if (c < 0x80) return 1;
if (c < 0x800) return 2;
if (c < 0x10000) return 3;
@ -1163,20 +1163,6 @@ static int sexp_string_utf8_length (unsigned char *p, int len) {
return i;
}
sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index) {
sexp_sint_t i, j, limit;
unsigned char *p;
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index);
p = (unsigned char*)sexp_string_data(str);
limit = sexp_string_length(str);
for (j=0, i=sexp_unbox_fixnum(index); i>0 && j<limit; i--)
j += sexp_utf8_initial_byte_count(p[j]);
if (i>0)
return sexp_user_exception(ctx, self, "string-index->offset: index out of range", index);
return sexp_make_fixnum(j);
}
sexp sexp_string_utf8_ref (sexp ctx, sexp str, sexp i) {
unsigned char *p=(unsigned char*)sexp_string_data(str) + sexp_unbox_fixnum(i);
if (*p < 0x80)
@ -1200,17 +1186,6 @@ sexp sexp_string_utf8_index_ref (sexp ctx sexp_api_params(self, n), sexp str, se
return sexp_string_utf8_ref(ctx, str, off);
}
void sexp_utf8_encode_char (unsigned char* p, int len, int c) {
switch (len) {
case 4: *p++ = (0xF0 + ((c)>>18)); *p++ = (0x80 + ((c>>12)&0x3F));
*p++ = (0x80 + ((c>>6)&0x3F)); *p = (0x80 + (c&0x3F)); break;
case 3: *p++ = (0xE0 + ((c)>>12)); *p++ = (0x80 + ((c>>6)&0x3F));
*p = (0x80 + (c&0x3F)); break;
case 2: *p++ = (0xC0 + ((c)>>6)); *p = (0x80 + (c&0x3F)); break;
default: *p = c; break;
}
}
void sexp_write_utf8_char (sexp ctx, int c, sexp out) {
unsigned char buf[8];
int len = sexp_utf8_char_byte_count(c);

View file

@ -1034,6 +1034,12 @@ SEXP_API sexp sexp_range_exception (sexp ctx, sexp obj, sexp start, sexp end);
SEXP_API sexp sexp_print_exception_op (sexp ctx sexp_api_params(self, n), sexp exn, sexp out);
SEXP_API void sexp_init(void);
#if SEXP_USE_UTF8_STRINGS
SEXP_API sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index);
SEXP_API sexp sexp_utf8_substring_op (sexp ctx sexp_api_params(self, n), sexp str, sexp start, sexp end);
SEXP_API void sexp_utf8_encode_char (unsigned char* p, int len, int c);
#endif
#define sexp_assert_type(ctx, pred, type_id, obj) if (! pred(obj)) return sexp_type_exception(ctx, self, type_id, obj)
#define SEXP_COPY_DEFAULT SEXP_ZERO

View file

@ -10,6 +10,7 @@
#define _FN2OPT(rt, a1, a2, s, d, f) _FN(SEXP_OP_FCALL2, 1, 1, rt, a1, a2, SEXP_FALSE, s, d, f)
#define _FN2OPTP(rt, a1, a2, s, d, f) _FN(SEXP_OP_FCALL2, 1, 3, rt, a1, a2, SEXP_FALSE, s, d, f)
#define _FN3(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL3, 3, 0, rt, a1, a2, a3, s, d, f)
#define _FN3OPT(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL3, 2, 1, rt, a1, a2, a3, s, d, f)
#define _FN4(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL4, 4, 0, rt, a1, a2, a3, s, d, f)
#define _PARAM(n, t) _OP(SEXP_OPC_PARAMETER, SEXP_OP_PARAMETER_REF, 0, 1, t, t, SEXP_FALSE, SEXP_FALSE, 0, n, SEXP_FALSE, 0)
@ -120,7 +121,6 @@ _FN2OPT(_I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_CHAR), "make-string", sexp_mak
_FN2OPT(_I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "make-byte-vector", SEXP_ZERO, sexp_make_bytes_op),
_FN2OPT(_I(SEXP_NUMBER), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string->number", SEXP_TEN, sexp_string_to_number_op),
_FN3(_I(SEXP_FIXNUM), _I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_BOOLEAN), "string-cmp", 0, sexp_string_cmp_op),
_FN3(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", 0, sexp_substring_op),
_FN1(_I(SEXP_SYMBOL), _I(SEXP_STRING), "string->symbol", 0, sexp_string_to_symbol_op),
_FN2OPT(_I(SEXP_STRING), SEXP_NULL, _I(SEXP_STRING), "string-concatenate", SEXP_FALSE, sexp_string_concatenate_op),
_FN2(_I(SEXP_OBJECT), _I(SEXP_OBJECT), SEXP_NULL, "memq", 0, sexp_memq_op),
@ -150,6 +150,10 @@ _FN2(_I(SEXP_NUMBER), _I(SEXP_NUMBER), _I(SEXP_NUMBER), "expt", 0, sexp_expt_op)
_FN2(_I(SEXP_FIXNUM), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string-index->offset", 0, sexp_string_index_to_offset),
_FN2(_I(SEXP_CHAR), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string-ref", 0, sexp_string_utf8_index_ref),
_FN3(SEXP_VOID, _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_CHAR), "string-set!", 0, sexp_string_utf8_index_set),
_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring-cursor", SEXP_FALSE, sexp_substring_op),
_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", SEXP_FALSE, sexp_utf8_substring_op),
#else
_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", SEXP_FALSE, sexp_substring_op),
#endif
#if SEXP_USE_TYPE_DEFS
_FN3(_I(SEXP_TYPE), _I(SEXP_STRING), _I(SEXP_TYPE), SEXP_NULL, "register-simple-type", 0, sexp_register_simple_type_op),

92
sexp.c
View file

@ -720,10 +720,64 @@ sexp sexp_make_bytes_op (sexp ctx sexp_api_params(self, n), sexp len, sexp i) {
return s;
}
#if SEXP_USE_UTF8_STRINGS
static int sexp_utf8_initial_byte_count (int c) {
if (c < 0xC0) return 1;
if (c < 0xE0) return 2;
return ((c>>4)&1)+3;
}
static int sexp_utf8_char_byte_count (int c) {
if (c < 0x80) return 1;
if (c < 0x800) return 2;
if (c < 0x10000) return 3;
return 4;
}
void sexp_utf8_encode_char (unsigned char* p, int len, int c) {
switch (len) {
case 4: *p++ = (0xF0 + ((c)>>18)); *p++ = (0x80 + ((c>>12)&0x3F));
*p++ = (0x80 + ((c>>6)&0x3F)); *p = (0x80 + (c&0x3F)); break;
case 3: *p++ = (0xE0 + ((c)>>12)); *p++ = (0x80 + ((c>>6)&0x3F));
*p = (0x80 + (c&0x3F)); break;
case 2: *p++ = (0xC0 + ((c)>>6)); *p = (0x80 + (c&0x3F)); break;
default: *p = c; break;
}
}
sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index) {
sexp_sint_t i, j, limit;
unsigned char *p;
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index);
p = (unsigned char*)sexp_string_data(str);
limit = sexp_string_length(str);
for (j=0, i=sexp_unbox_fixnum(index); i>0 && j<limit; i--)
j += sexp_utf8_initial_byte_count(p[j]);
if (i>0)
return sexp_user_exception(ctx, self, "string-index->offset: index out of range", index);
return sexp_make_fixnum(j);
}
#endif
sexp sexp_make_string_op (sexp ctx sexp_api_params(self, n), sexp len, sexp ch)
{
sexp i = (sexp_charp(ch) ? sexp_make_fixnum(sexp_unbox_character(ch)) : ch);
sexp_gc_var2(b, s);
#if SEXP_USE_UTF8_STRINGS
int j, clen;
if (sexp_charp(ch) && (sexp_unbox_character(ch) >= 0x80)) {
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, len);
clen = sexp_utf8_char_byte_count(sexp_unbox_character(ch));
b = sexp_make_bytes_op(ctx sexp_api_pass(self, n),
sexp_fx_mul(len, sexp_make_fixnum(clen)), SEXP_VOID);
for (j=0; j<sexp_unbox_fixnum(len); j++)
sexp_utf8_encode_char((unsigned char*)sexp_bytes_data(b)+(j*clen), clen,
sexp_unbox_character(ch));
} else
#endif
b = sexp_make_bytes_op(ctx sexp_api_pass(self, n), len, i);
if (sexp_exceptionp(b)) return b;
#if SEXP_USE_PACKED_STRINGS
@ -734,7 +788,7 @@ sexp sexp_make_string_op (sexp ctx sexp_api_params(self, n), sexp len, sexp ch)
s = sexp_alloc_type(ctx, string, SEXP_STRING);
sexp_string_bytes(s) = b;
sexp_string_offset(s) = 0;
sexp_string_length(s) = sexp_unbox_fixnum(len);
sexp_string_length(s) = sexp_bytes_length(b);
sexp_gc_release2(ctx);
return s;
#endif
@ -769,6 +823,17 @@ sexp sexp_substring_op (sexp ctx sexp_api_params(self, n), sexp str, sexp start,
return res;
}
#if SEXP_USE_UTF8_STRINGS
sexp sexp_utf8_substring_op (sexp ctx sexp_api_params(self, n), sexp str, sexp start, sexp end) {
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, start);
start = sexp_string_index_to_offset(ctx sexp_api_pass(self, n), str, start);
if (sexp_fixnump(end))
end = sexp_string_index_to_offset(ctx sexp_api_pass(self, n), str, end);
return sexp_substring_op(ctx sexp_api_pass(self, n), str, start, end);
}
#endif
sexp sexp_string_concatenate_op (sexp ctx sexp_api_params(self, n), sexp str_ls, sexp sep) {
sexp res, ls;
sexp_uint_t len=0, i=0, sep_len=0;
@ -1540,23 +1605,18 @@ sexp sexp_read_number (sexp ctx, sexp in, int base) {
}
#if SEXP_USE_UTF8_STRINGS
static int sexp_decode_utf8_char(const unsigned char* str) {
const unsigned char* tail=str;
int i = str[0], res = -1;
if ((i >= 0xC0) && (i <= 0xF7)) {
if (i < 0xE0) {
res = ((i&0x3F)<<6) + (str[1]&0x3F);
tail = str + 2;
} else if (i < 0xF0) {
res = ((i&0x1F)<<12) + ((str[1]&0x3F)<<6) + (str[2]&0x3F);
tail = str + 3;
} else {
res = ((i&0x0F)<<16) + ((str[1]&0x3F)<<6)
+ ((str[2]&0x3F)<<6) + (str[3]&0x3F);
tail = str + 4;
static int sexp_decode_utf8_char(const unsigned char* s) {
int i = s[0], len = strlen((const char*)s);
if ((i >= 0xC0) && (i <= 0xF7) && (s[1]>>6 == 2)) {
if ((i < 0xE0) && (len == 2)) {
return ((i&0x3F)<<6) + (s[1]&0x3F);
} else if ((i < 0xF0) && (len == 3) && (s[2]>>6 == 2)) {
return ((i&0x1F)<<12) + ((s[1]&0x3F)<<6) + (s[2]&0x3F);
} else if ((len == 4) && (s[2]>>6 == 2) && (s[3]>>6 == 2)) {
return ((i&0x0F)<<16) + ((s[1]&0x3F)<<6) + ((s[2]&0x3F)<<6) + (s[3]&0x3F);
}
}
return *tail ? -1 : res;
return -1;
}
#endif

43
tests/unicode-tests.scm Normal file
View file

@ -0,0 +1,43 @@
;; These tests are only valid if chibi-scheme is compiled with Unicode
;; support (SEXP_USE_UTF8_STRINGS).
(import (chibi test))
(test-begin "unicode")
(test #\Р (string-ref "Русский" 0))
(test #\и (string-ref "Русский" 5))
(test #\й (string-ref "Русский" 6))
(test 7 (string-length "Русский"))
(test #\二 (string-ref "二本語" 0))
(test #\本 (string-ref "二本語" 1))
(test #\語 (string-ref "二本語" 2))
(test 3 (string-length "二本語"))
(test '(#\二 #\本 #\語) (string->list "二本語"))
(test "二本語" (list->string '(#\二 #\本 #\語)))
(test "二本" (substring "二本語" 0 2))
(test "本語" (substring "二本語" 1 3))
(test "二-語"
(let ((s (substring "二本語" 0 3)))
(string-set! s 1 #\-)
s))
(test "二本人"
(let ((s (substring "二本語" 0 3)))
(string-set! s 2 #\人)
s))
(test "字字字" (make-string 3 #\字))
(test "字字字"
(let ((s (make-string 3)))
(string-fill! s #\字)
s))
(test-end)