mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-20 14:19:18 +02:00
adding unicode tests, fixing substring & make-string
This commit is contained in:
parent
6a10550dad
commit
840ef090bd
6 changed files with 135 additions and 44 deletions
3
Makefile
3
Makefile
|
@ -206,6 +206,9 @@ test-records: chibi-scheme$(EXE)
|
|||
test-weak: chibi-scheme$(EXE)
|
||||
LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/weak-tests.scm
|
||||
|
||||
test-unicode: chibi-scheme$(EXE)
|
||||
LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/unicode-tests.scm
|
||||
|
||||
test-libs: chibi-scheme$(EXE)
|
||||
LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/lib-tests.scm
|
||||
|
||||
|
|
29
eval.c
29
eval.c
|
@ -1142,13 +1142,13 @@ static sexp sexp_string_cmp_op (sexp ctx sexp_api_params(self, n), sexp str1, se
|
|||
|
||||
#if SEXP_USE_UTF8_STRINGS
|
||||
|
||||
static int sexp_utf8_initial_byte_count(int c) {
|
||||
static int sexp_utf8_initial_byte_count (int c) {
|
||||
if (c < 0xC0) return 1;
|
||||
if (c < 0xE0) return 2;
|
||||
return ((c>>4)&1)+3;
|
||||
}
|
||||
|
||||
static int sexp_utf8_char_byte_count(int c) {
|
||||
static int sexp_utf8_char_byte_count (int c) {
|
||||
if (c < 0x80) return 1;
|
||||
if (c < 0x800) return 2;
|
||||
if (c < 0x10000) return 3;
|
||||
|
@ -1163,20 +1163,6 @@ static int sexp_string_utf8_length (unsigned char *p, int len) {
|
|||
return i;
|
||||
}
|
||||
|
||||
sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index) {
|
||||
sexp_sint_t i, j, limit;
|
||||
unsigned char *p;
|
||||
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index);
|
||||
p = (unsigned char*)sexp_string_data(str);
|
||||
limit = sexp_string_length(str);
|
||||
for (j=0, i=sexp_unbox_fixnum(index); i>0 && j<limit; i--)
|
||||
j += sexp_utf8_initial_byte_count(p[j]);
|
||||
if (i>0)
|
||||
return sexp_user_exception(ctx, self, "string-index->offset: index out of range", index);
|
||||
return sexp_make_fixnum(j);
|
||||
}
|
||||
|
||||
sexp sexp_string_utf8_ref (sexp ctx, sexp str, sexp i) {
|
||||
unsigned char *p=(unsigned char*)sexp_string_data(str) + sexp_unbox_fixnum(i);
|
||||
if (*p < 0x80)
|
||||
|
@ -1200,17 +1186,6 @@ sexp sexp_string_utf8_index_ref (sexp ctx sexp_api_params(self, n), sexp str, se
|
|||
return sexp_string_utf8_ref(ctx, str, off);
|
||||
}
|
||||
|
||||
void sexp_utf8_encode_char (unsigned char* p, int len, int c) {
|
||||
switch (len) {
|
||||
case 4: *p++ = (0xF0 + ((c)>>18)); *p++ = (0x80 + ((c>>12)&0x3F));
|
||||
*p++ = (0x80 + ((c>>6)&0x3F)); *p = (0x80 + (c&0x3F)); break;
|
||||
case 3: *p++ = (0xE0 + ((c)>>12)); *p++ = (0x80 + ((c>>6)&0x3F));
|
||||
*p = (0x80 + (c&0x3F)); break;
|
||||
case 2: *p++ = (0xC0 + ((c)>>6)); *p = (0x80 + (c&0x3F)); break;
|
||||
default: *p = c; break;
|
||||
}
|
||||
}
|
||||
|
||||
void sexp_write_utf8_char (sexp ctx, int c, sexp out) {
|
||||
unsigned char buf[8];
|
||||
int len = sexp_utf8_char_byte_count(c);
|
||||
|
|
|
@ -1034,6 +1034,12 @@ SEXP_API sexp sexp_range_exception (sexp ctx, sexp obj, sexp start, sexp end);
|
|||
SEXP_API sexp sexp_print_exception_op (sexp ctx sexp_api_params(self, n), sexp exn, sexp out);
|
||||
SEXP_API void sexp_init(void);
|
||||
|
||||
#if SEXP_USE_UTF8_STRINGS
|
||||
SEXP_API sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index);
|
||||
SEXP_API sexp sexp_utf8_substring_op (sexp ctx sexp_api_params(self, n), sexp str, sexp start, sexp end);
|
||||
SEXP_API void sexp_utf8_encode_char (unsigned char* p, int len, int c);
|
||||
#endif
|
||||
|
||||
#define sexp_assert_type(ctx, pred, type_id, obj) if (! pred(obj)) return sexp_type_exception(ctx, self, type_id, obj)
|
||||
|
||||
#define SEXP_COPY_DEFAULT SEXP_ZERO
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#define _FN2OPT(rt, a1, a2, s, d, f) _FN(SEXP_OP_FCALL2, 1, 1, rt, a1, a2, SEXP_FALSE, s, d, f)
|
||||
#define _FN2OPTP(rt, a1, a2, s, d, f) _FN(SEXP_OP_FCALL2, 1, 3, rt, a1, a2, SEXP_FALSE, s, d, f)
|
||||
#define _FN3(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL3, 3, 0, rt, a1, a2, a3, s, d, f)
|
||||
#define _FN3OPT(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL3, 2, 1, rt, a1, a2, a3, s, d, f)
|
||||
#define _FN4(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL4, 4, 0, rt, a1, a2, a3, s, d, f)
|
||||
#define _PARAM(n, t) _OP(SEXP_OPC_PARAMETER, SEXP_OP_PARAMETER_REF, 0, 1, t, t, SEXP_FALSE, SEXP_FALSE, 0, n, SEXP_FALSE, 0)
|
||||
|
||||
|
@ -120,7 +121,6 @@ _FN2OPT(_I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_CHAR), "make-string", sexp_mak
|
|||
_FN2OPT(_I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "make-byte-vector", SEXP_ZERO, sexp_make_bytes_op),
|
||||
_FN2OPT(_I(SEXP_NUMBER), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string->number", SEXP_TEN, sexp_string_to_number_op),
|
||||
_FN3(_I(SEXP_FIXNUM), _I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_BOOLEAN), "string-cmp", 0, sexp_string_cmp_op),
|
||||
_FN3(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", 0, sexp_substring_op),
|
||||
_FN1(_I(SEXP_SYMBOL), _I(SEXP_STRING), "string->symbol", 0, sexp_string_to_symbol_op),
|
||||
_FN2OPT(_I(SEXP_STRING), SEXP_NULL, _I(SEXP_STRING), "string-concatenate", SEXP_FALSE, sexp_string_concatenate_op),
|
||||
_FN2(_I(SEXP_OBJECT), _I(SEXP_OBJECT), SEXP_NULL, "memq", 0, sexp_memq_op),
|
||||
|
@ -150,6 +150,10 @@ _FN2(_I(SEXP_NUMBER), _I(SEXP_NUMBER), _I(SEXP_NUMBER), "expt", 0, sexp_expt_op)
|
|||
_FN2(_I(SEXP_FIXNUM), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string-index->offset", 0, sexp_string_index_to_offset),
|
||||
_FN2(_I(SEXP_CHAR), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string-ref", 0, sexp_string_utf8_index_ref),
|
||||
_FN3(SEXP_VOID, _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_CHAR), "string-set!", 0, sexp_string_utf8_index_set),
|
||||
_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring-cursor", SEXP_FALSE, sexp_substring_op),
|
||||
_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", SEXP_FALSE, sexp_utf8_substring_op),
|
||||
#else
|
||||
_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", SEXP_FALSE, sexp_substring_op),
|
||||
#endif
|
||||
#if SEXP_USE_TYPE_DEFS
|
||||
_FN3(_I(SEXP_TYPE), _I(SEXP_STRING), _I(SEXP_TYPE), SEXP_NULL, "register-simple-type", 0, sexp_register_simple_type_op),
|
||||
|
|
92
sexp.c
92
sexp.c
|
@ -720,10 +720,64 @@ sexp sexp_make_bytes_op (sexp ctx sexp_api_params(self, n), sexp len, sexp i) {
|
|||
return s;
|
||||
}
|
||||
|
||||
#if SEXP_USE_UTF8_STRINGS
|
||||
|
||||
static int sexp_utf8_initial_byte_count (int c) {
|
||||
if (c < 0xC0) return 1;
|
||||
if (c < 0xE0) return 2;
|
||||
return ((c>>4)&1)+3;
|
||||
}
|
||||
|
||||
static int sexp_utf8_char_byte_count (int c) {
|
||||
if (c < 0x80) return 1;
|
||||
if (c < 0x800) return 2;
|
||||
if (c < 0x10000) return 3;
|
||||
return 4;
|
||||
}
|
||||
|
||||
void sexp_utf8_encode_char (unsigned char* p, int len, int c) {
|
||||
switch (len) {
|
||||
case 4: *p++ = (0xF0 + ((c)>>18)); *p++ = (0x80 + ((c>>12)&0x3F));
|
||||
*p++ = (0x80 + ((c>>6)&0x3F)); *p = (0x80 + (c&0x3F)); break;
|
||||
case 3: *p++ = (0xE0 + ((c)>>12)); *p++ = (0x80 + ((c>>6)&0x3F));
|
||||
*p = (0x80 + (c&0x3F)); break;
|
||||
case 2: *p++ = (0xC0 + ((c)>>6)); *p = (0x80 + (c&0x3F)); break;
|
||||
default: *p = c; break;
|
||||
}
|
||||
}
|
||||
|
||||
sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index) {
|
||||
sexp_sint_t i, j, limit;
|
||||
unsigned char *p;
|
||||
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index);
|
||||
p = (unsigned char*)sexp_string_data(str);
|
||||
limit = sexp_string_length(str);
|
||||
for (j=0, i=sexp_unbox_fixnum(index); i>0 && j<limit; i--)
|
||||
j += sexp_utf8_initial_byte_count(p[j]);
|
||||
if (i>0)
|
||||
return sexp_user_exception(ctx, self, "string-index->offset: index out of range", index);
|
||||
return sexp_make_fixnum(j);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
sexp sexp_make_string_op (sexp ctx sexp_api_params(self, n), sexp len, sexp ch)
|
||||
{
|
||||
sexp i = (sexp_charp(ch) ? sexp_make_fixnum(sexp_unbox_character(ch)) : ch);
|
||||
sexp_gc_var2(b, s);
|
||||
#if SEXP_USE_UTF8_STRINGS
|
||||
int j, clen;
|
||||
if (sexp_charp(ch) && (sexp_unbox_character(ch) >= 0x80)) {
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, len);
|
||||
clen = sexp_utf8_char_byte_count(sexp_unbox_character(ch));
|
||||
b = sexp_make_bytes_op(ctx sexp_api_pass(self, n),
|
||||
sexp_fx_mul(len, sexp_make_fixnum(clen)), SEXP_VOID);
|
||||
for (j=0; j<sexp_unbox_fixnum(len); j++)
|
||||
sexp_utf8_encode_char((unsigned char*)sexp_bytes_data(b)+(j*clen), clen,
|
||||
sexp_unbox_character(ch));
|
||||
} else
|
||||
#endif
|
||||
b = sexp_make_bytes_op(ctx sexp_api_pass(self, n), len, i);
|
||||
if (sexp_exceptionp(b)) return b;
|
||||
#if SEXP_USE_PACKED_STRINGS
|
||||
|
@ -734,7 +788,7 @@ sexp sexp_make_string_op (sexp ctx sexp_api_params(self, n), sexp len, sexp ch)
|
|||
s = sexp_alloc_type(ctx, string, SEXP_STRING);
|
||||
sexp_string_bytes(s) = b;
|
||||
sexp_string_offset(s) = 0;
|
||||
sexp_string_length(s) = sexp_unbox_fixnum(len);
|
||||
sexp_string_length(s) = sexp_bytes_length(b);
|
||||
sexp_gc_release2(ctx);
|
||||
return s;
|
||||
#endif
|
||||
|
@ -769,6 +823,17 @@ sexp sexp_substring_op (sexp ctx sexp_api_params(self, n), sexp str, sexp start,
|
|||
return res;
|
||||
}
|
||||
|
||||
#if SEXP_USE_UTF8_STRINGS
|
||||
sexp sexp_utf8_substring_op (sexp ctx sexp_api_params(self, n), sexp str, sexp start, sexp end) {
|
||||
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, start);
|
||||
start = sexp_string_index_to_offset(ctx sexp_api_pass(self, n), str, start);
|
||||
if (sexp_fixnump(end))
|
||||
end = sexp_string_index_to_offset(ctx sexp_api_pass(self, n), str, end);
|
||||
return sexp_substring_op(ctx sexp_api_pass(self, n), str, start, end);
|
||||
}
|
||||
#endif
|
||||
|
||||
sexp sexp_string_concatenate_op (sexp ctx sexp_api_params(self, n), sexp str_ls, sexp sep) {
|
||||
sexp res, ls;
|
||||
sexp_uint_t len=0, i=0, sep_len=0;
|
||||
|
@ -1540,23 +1605,18 @@ sexp sexp_read_number (sexp ctx, sexp in, int base) {
|
|||
}
|
||||
|
||||
#if SEXP_USE_UTF8_STRINGS
|
||||
static int sexp_decode_utf8_char(const unsigned char* str) {
|
||||
const unsigned char* tail=str;
|
||||
int i = str[0], res = -1;
|
||||
if ((i >= 0xC0) && (i <= 0xF7)) {
|
||||
if (i < 0xE0) {
|
||||
res = ((i&0x3F)<<6) + (str[1]&0x3F);
|
||||
tail = str + 2;
|
||||
} else if (i < 0xF0) {
|
||||
res = ((i&0x1F)<<12) + ((str[1]&0x3F)<<6) + (str[2]&0x3F);
|
||||
tail = str + 3;
|
||||
} else {
|
||||
res = ((i&0x0F)<<16) + ((str[1]&0x3F)<<6)
|
||||
+ ((str[2]&0x3F)<<6) + (str[3]&0x3F);
|
||||
tail = str + 4;
|
||||
static int sexp_decode_utf8_char(const unsigned char* s) {
|
||||
int i = s[0], len = strlen((const char*)s);
|
||||
if ((i >= 0xC0) && (i <= 0xF7) && (s[1]>>6 == 2)) {
|
||||
if ((i < 0xE0) && (len == 2)) {
|
||||
return ((i&0x3F)<<6) + (s[1]&0x3F);
|
||||
} else if ((i < 0xF0) && (len == 3) && (s[2]>>6 == 2)) {
|
||||
return ((i&0x1F)<<12) + ((s[1]&0x3F)<<6) + (s[2]&0x3F);
|
||||
} else if ((len == 4) && (s[2]>>6 == 2) && (s[3]>>6 == 2)) {
|
||||
return ((i&0x0F)<<16) + ((s[1]&0x3F)<<6) + ((s[2]&0x3F)<<6) + (s[3]&0x3F);
|
||||
}
|
||||
}
|
||||
return *tail ? -1 : res;
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
43
tests/unicode-tests.scm
Normal file
43
tests/unicode-tests.scm
Normal file
|
@ -0,0 +1,43 @@
|
|||
;; These tests are only valid if chibi-scheme is compiled with Unicode
|
||||
;; support (SEXP_USE_UTF8_STRINGS).
|
||||
|
||||
(import (chibi test))
|
||||
|
||||
(test-begin "unicode")
|
||||
|
||||
(test #\Р (string-ref "Русский" 0))
|
||||
(test #\и (string-ref "Русский" 5))
|
||||
(test #\й (string-ref "Русский" 6))
|
||||
|
||||
(test 7 (string-length "Русский"))
|
||||
|
||||
(test #\二 (string-ref "二本語" 0))
|
||||
(test #\本 (string-ref "二本語" 1))
|
||||
(test #\語 (string-ref "二本語" 2))
|
||||
|
||||
(test 3 (string-length "二本語"))
|
||||
|
||||
(test '(#\二 #\本 #\語) (string->list "二本語"))
|
||||
(test "二本語" (list->string '(#\二 #\本 #\語)))
|
||||
|
||||
(test "二本" (substring "二本語" 0 2))
|
||||
(test "本語" (substring "二本語" 1 3))
|
||||
|
||||
(test "二-語"
|
||||
(let ((s (substring "二本語" 0 3)))
|
||||
(string-set! s 1 #\-)
|
||||
s))
|
||||
|
||||
(test "二本人"
|
||||
(let ((s (substring "二本語" 0 3)))
|
||||
(string-set! s 2 #\人)
|
||||
s))
|
||||
|
||||
(test "字字字" (make-string 3 #\字))
|
||||
|
||||
(test "字字字"
|
||||
(let ((s (make-string 3)))
|
||||
(string-fill! s #\字)
|
||||
s))
|
||||
|
||||
(test-end)
|
Loading…
Add table
Reference in a new issue