diff --git a/Makefile b/Makefile index b9b338b3..0c1ba328 100644 --- a/Makefile +++ b/Makefile @@ -206,6 +206,9 @@ test-records: chibi-scheme$(EXE) test-weak: chibi-scheme$(EXE) LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/weak-tests.scm +test-unicode: chibi-scheme$(EXE) + LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/unicode-tests.scm + test-libs: chibi-scheme$(EXE) LD_LIBRARY_PATH=".:$(LD_LIBRARY_PATH)" ./chibi-scheme$(EXE) tests/lib-tests.scm diff --git a/eval.c b/eval.c index ede8e5bd..4683edea 100644 --- a/eval.c +++ b/eval.c @@ -1142,13 +1142,13 @@ static sexp sexp_string_cmp_op (sexp ctx sexp_api_params(self, n), sexp str1, se #if SEXP_USE_UTF8_STRINGS -static int sexp_utf8_initial_byte_count(int c) { +static int sexp_utf8_initial_byte_count (int c) { if (c < 0xC0) return 1; if (c < 0xE0) return 2; return ((c>>4)&1)+3; } -static int sexp_utf8_char_byte_count(int c) { +static int sexp_utf8_char_byte_count (int c) { if (c < 0x80) return 1; if (c < 0x800) return 2; if (c < 0x10000) return 3; @@ -1163,20 +1163,6 @@ static int sexp_string_utf8_length (unsigned char *p, int len) { return i; } -sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index) { - sexp_sint_t i, j, limit; - unsigned char *p; - sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str); - sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index); - p = (unsigned char*)sexp_string_data(str); - limit = sexp_string_length(str); - for (j=0, i=sexp_unbox_fixnum(index); i>0 && j0) - return sexp_user_exception(ctx, self, "string-index->offset: index out of range", index); - return sexp_make_fixnum(j); -} - sexp sexp_string_utf8_ref (sexp ctx, sexp str, sexp i) { unsigned char *p=(unsigned char*)sexp_string_data(str) + sexp_unbox_fixnum(i); if (*p < 0x80) @@ -1200,17 +1186,6 @@ sexp sexp_string_utf8_index_ref (sexp ctx sexp_api_params(self, n), sexp str, se return sexp_string_utf8_ref(ctx, str, off); } -void sexp_utf8_encode_char (unsigned char* p, int len, int c) { - switch (len) { - case 4: *p++ = (0xF0 + ((c)>>18)); *p++ = (0x80 + ((c>>12)&0x3F)); - *p++ = (0x80 + ((c>>6)&0x3F)); *p = (0x80 + (c&0x3F)); break; - case 3: *p++ = (0xE0 + ((c)>>12)); *p++ = (0x80 + ((c>>6)&0x3F)); - *p = (0x80 + (c&0x3F)); break; - case 2: *p++ = (0xC0 + ((c)>>6)); *p = (0x80 + (c&0x3F)); break; - default: *p = c; break; - } -} - void sexp_write_utf8_char (sexp ctx, int c, sexp out) { unsigned char buf[8]; int len = sexp_utf8_char_byte_count(c); diff --git a/include/chibi/sexp.h b/include/chibi/sexp.h index 700f011b..cd2d4727 100644 --- a/include/chibi/sexp.h +++ b/include/chibi/sexp.h @@ -1034,6 +1034,12 @@ SEXP_API sexp sexp_range_exception (sexp ctx, sexp obj, sexp start, sexp end); SEXP_API sexp sexp_print_exception_op (sexp ctx sexp_api_params(self, n), sexp exn, sexp out); SEXP_API void sexp_init(void); +#if SEXP_USE_UTF8_STRINGS +SEXP_API sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index); +SEXP_API sexp sexp_utf8_substring_op (sexp ctx sexp_api_params(self, n), sexp str, sexp start, sexp end); +SEXP_API void sexp_utf8_encode_char (unsigned char* p, int len, int c); +#endif + #define sexp_assert_type(ctx, pred, type_id, obj) if (! pred(obj)) return sexp_type_exception(ctx, self, type_id, obj) #define SEXP_COPY_DEFAULT SEXP_ZERO diff --git a/opcodes.c b/opcodes.c index e7a13eee..9bfdeed7 100644 --- a/opcodes.c +++ b/opcodes.c @@ -10,6 +10,7 @@ #define _FN2OPT(rt, a1, a2, s, d, f) _FN(SEXP_OP_FCALL2, 1, 1, rt, a1, a2, SEXP_FALSE, s, d, f) #define _FN2OPTP(rt, a1, a2, s, d, f) _FN(SEXP_OP_FCALL2, 1, 3, rt, a1, a2, SEXP_FALSE, s, d, f) #define _FN3(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL3, 3, 0, rt, a1, a2, a3, s, d, f) +#define _FN3OPT(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL3, 2, 1, rt, a1, a2, a3, s, d, f) #define _FN4(rt, a1, a2, a3, s, d, f) _FN(SEXP_OP_FCALL4, 4, 0, rt, a1, a2, a3, s, d, f) #define _PARAM(n, t) _OP(SEXP_OPC_PARAMETER, SEXP_OP_PARAMETER_REF, 0, 1, t, t, SEXP_FALSE, SEXP_FALSE, 0, n, SEXP_FALSE, 0) @@ -120,7 +121,6 @@ _FN2OPT(_I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_CHAR), "make-string", sexp_mak _FN2OPT(_I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "make-byte-vector", SEXP_ZERO, sexp_make_bytes_op), _FN2OPT(_I(SEXP_NUMBER), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string->number", SEXP_TEN, sexp_string_to_number_op), _FN3(_I(SEXP_FIXNUM), _I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_BOOLEAN), "string-cmp", 0, sexp_string_cmp_op), -_FN3(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", 0, sexp_substring_op), _FN1(_I(SEXP_SYMBOL), _I(SEXP_STRING), "string->symbol", 0, sexp_string_to_symbol_op), _FN2OPT(_I(SEXP_STRING), SEXP_NULL, _I(SEXP_STRING), "string-concatenate", SEXP_FALSE, sexp_string_concatenate_op), _FN2(_I(SEXP_OBJECT), _I(SEXP_OBJECT), SEXP_NULL, "memq", 0, sexp_memq_op), @@ -150,6 +150,10 @@ _FN2(_I(SEXP_NUMBER), _I(SEXP_NUMBER), _I(SEXP_NUMBER), "expt", 0, sexp_expt_op) _FN2(_I(SEXP_FIXNUM), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string-index->offset", 0, sexp_string_index_to_offset), _FN2(_I(SEXP_CHAR), _I(SEXP_STRING), _I(SEXP_FIXNUM), "string-ref", 0, sexp_string_utf8_index_ref), _FN3(SEXP_VOID, _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_CHAR), "string-set!", 0, sexp_string_utf8_index_set), +_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring-cursor", SEXP_FALSE, sexp_substring_op), +_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", SEXP_FALSE, sexp_utf8_substring_op), +#else +_FN3OPT(_I(SEXP_STRING), _I(SEXP_STRING), _I(SEXP_FIXNUM), _I(SEXP_FIXNUM), "substring", SEXP_FALSE, sexp_substring_op), #endif #if SEXP_USE_TYPE_DEFS _FN3(_I(SEXP_TYPE), _I(SEXP_STRING), _I(SEXP_TYPE), SEXP_NULL, "register-simple-type", 0, sexp_register_simple_type_op), diff --git a/sexp.c b/sexp.c index 0d79e42c..68c35bf9 100644 --- a/sexp.c +++ b/sexp.c @@ -720,10 +720,64 @@ sexp sexp_make_bytes_op (sexp ctx sexp_api_params(self, n), sexp len, sexp i) { return s; } +#if SEXP_USE_UTF8_STRINGS + +static int sexp_utf8_initial_byte_count (int c) { + if (c < 0xC0) return 1; + if (c < 0xE0) return 2; + return ((c>>4)&1)+3; +} + +static int sexp_utf8_char_byte_count (int c) { + if (c < 0x80) return 1; + if (c < 0x800) return 2; + if (c < 0x10000) return 3; + return 4; +} + +void sexp_utf8_encode_char (unsigned char* p, int len, int c) { + switch (len) { + case 4: *p++ = (0xF0 + ((c)>>18)); *p++ = (0x80 + ((c>>12)&0x3F)); + *p++ = (0x80 + ((c>>6)&0x3F)); *p = (0x80 + (c&0x3F)); break; + case 3: *p++ = (0xE0 + ((c)>>12)); *p++ = (0x80 + ((c>>6)&0x3F)); + *p = (0x80 + (c&0x3F)); break; + case 2: *p++ = (0xC0 + ((c)>>6)); *p = (0x80 + (c&0x3F)); break; + default: *p = c; break; + } +} + +sexp sexp_string_index_to_offset (sexp ctx sexp_api_params(self, n), sexp str, sexp index) { + sexp_sint_t i, j, limit; + unsigned char *p; + sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str); + sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, index); + p = (unsigned char*)sexp_string_data(str); + limit = sexp_string_length(str); + for (j=0, i=sexp_unbox_fixnum(index); i>0 && j0) + return sexp_user_exception(ctx, self, "string-index->offset: index out of range", index); + return sexp_make_fixnum(j); +} + +#endif + sexp sexp_make_string_op (sexp ctx sexp_api_params(self, n), sexp len, sexp ch) { sexp i = (sexp_charp(ch) ? sexp_make_fixnum(sexp_unbox_character(ch)) : ch); sexp_gc_var2(b, s); +#if SEXP_USE_UTF8_STRINGS + int j, clen; + if (sexp_charp(ch) && (sexp_unbox_character(ch) >= 0x80)) { + sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, len); + clen = sexp_utf8_char_byte_count(sexp_unbox_character(ch)); + b = sexp_make_bytes_op(ctx sexp_api_pass(self, n), + sexp_fx_mul(len, sexp_make_fixnum(clen)), SEXP_VOID); + for (j=0; j= 0xC0) && (i <= 0xF7)) { - if (i < 0xE0) { - res = ((i&0x3F)<<6) + (str[1]&0x3F); - tail = str + 2; - } else if (i < 0xF0) { - res = ((i&0x1F)<<12) + ((str[1]&0x3F)<<6) + (str[2]&0x3F); - tail = str + 3; - } else { - res = ((i&0x0F)<<16) + ((str[1]&0x3F)<<6) - + ((str[2]&0x3F)<<6) + (str[3]&0x3F); - tail = str + 4; +static int sexp_decode_utf8_char(const unsigned char* s) { + int i = s[0], len = strlen((const char*)s); + if ((i >= 0xC0) && (i <= 0xF7) && (s[1]>>6 == 2)) { + if ((i < 0xE0) && (len == 2)) { + return ((i&0x3F)<<6) + (s[1]&0x3F); + } else if ((i < 0xF0) && (len == 3) && (s[2]>>6 == 2)) { + return ((i&0x1F)<<12) + ((s[1]&0x3F)<<6) + (s[2]&0x3F); + } else if ((len == 4) && (s[2]>>6 == 2) && (s[3]>>6 == 2)) { + return ((i&0x0F)<<16) + ((s[1]&0x3F)<<6) + ((s[2]&0x3F)<<6) + (s[3]&0x3F); } } - return *tail ? -1 : res; + return -1; } #endif diff --git a/tests/unicode-tests.scm b/tests/unicode-tests.scm new file mode 100644 index 00000000..faec75e7 --- /dev/null +++ b/tests/unicode-tests.scm @@ -0,0 +1,43 @@ +;; These tests are only valid if chibi-scheme is compiled with Unicode +;; support (SEXP_USE_UTF8_STRINGS). + +(import (chibi test)) + +(test-begin "unicode") + +(test #\Р (string-ref "Русский" 0)) +(test #\и (string-ref "Русский" 5)) +(test #\й (string-ref "Русский" 6)) + +(test 7 (string-length "Русский")) + +(test #\二 (string-ref "二本語" 0)) +(test #\本 (string-ref "二本語" 1)) +(test #\語 (string-ref "二本語" 2)) + +(test 3 (string-length "二本語")) + +(test '(#\二 #\本 #\語) (string->list "二本語")) +(test "二本語" (list->string '(#\二 #\本 #\語))) + +(test "二本" (substring "二本語" 0 2)) +(test "本語" (substring "二本語" 1 3)) + +(test "二-語" + (let ((s (substring "二本語" 0 3))) + (string-set! s 1 #\-) + s)) + +(test "二本人" + (let ((s (substring "二本語" 0 3))) + (string-set! s 2 #\人) + s)) + +(test "字字字" (make-string 3 #\字)) + +(test "字字字" + (let ((s (make-string 3))) + (string-fill! s #\字) + s)) + +(test-end)