From dbb4db17280c0ce6fb74d951b4ac38c8ace79ebb Mon Sep 17 00:00:00 2001 From: Alex Shinn Date: Mon, 12 Jul 2010 00:00:31 +0900 Subject: [PATCH] initial unicode support --- AUTHORS | 7 +- TODO | 21 +++++- eval.c | 157 ++++++++++++++++++++++++++++++++++++--- include/chibi/eval.h | 4 + include/chibi/features.h | 31 +++++++- include/chibi/sexp.h | 26 +++++++ opcodes.c | 19 +++++ opt/opcode_names.h | 19 +++-- sexp.c | 63 ++++++++++++---- vm.c | 43 +++++++++-- 10 files changed, 349 insertions(+), 41 deletions(-) diff --git a/AUTHORS b/AUTHORS index 1e15e0a5..fc0b8224 100644 --- a/AUTHORS +++ b/AUTHORS @@ -5,18 +5,23 @@ The `dynamic-wind' implementation is adapted from the implementation in the appendix to the Scheme48 reference manual, reportedly first written by Chris Hanson and John Lamping. -Thanks to the following people for patches: +Thanks to the following people for patches and bug reports: + * Alexander Shendi * Andreas Rottman * Bruno Deferrari * Derrick Eddington + * Eduardo Cavazos * Felix Winkelmann * Gregor Klinke + * Jeremy Wolff + * Jeronimo Pellegrini * John Cowan * John Samsa * Lars J Aas * Lorenzo Campedelli * Michal Kowalski (sladegen) + * Taylor Venable If you would prefer not to be listed, or are one of the users listed without a full name, please contact me. If you've made a contribution diff --git a/TODO b/TODO index 3e01c1f5..161ca82c 100644 --- a/TODO +++ b/TODO @@ -8,7 +8,12 @@ ** DONE exceptions - State "DONE" [2009-04-09 Thu 14:45] ** TODO native x86 backend + API redesign in preparation complete, initial + tests on native factorial and closures working. ** TODO fasl/image files + sexp_copy_context() can form the basis for images, + FASL for arbitrary modules will need additional + help with resolving external references. ** DONE shared stack on EVAL - State "DONE" [2009-12-26 Sat 08:22] @@ -58,8 +63,17 @@ * runtime ** DONE bignums - State "DONE" [2009-07-07 Tue 14:42] -** TODO unicode -** TODO threads +** DONE unicode + - State "DONE" from "TODO" [2010-07-11 Sun 23:58] + Supported with UTF-8 strings, string-ref is O(n) and + string-set! may need to reallocate the whole string. + string-cursor-ref can be used for O(1) string access. +** DONE threads + - State "DONE" from "TODO" [2010-07-11 Sun 15:31] + VM now supports an optional hook for green threads, + and a SRFI-18 interface is provided as a separate module. + I/O operations will currently block all threads though, + this needs to be addressed. ** DONE virtual ports - State "DONE" [2010-01-02 Sat 20:12] ** DONE dynamic-wind @@ -107,7 +121,8 @@ ** DONE loop library - State "DONE" [2009-12-08 Tue 14:54] ** TODO network interface -** TODO posix interface +** DONE posix interface + - State "DONE" from "TODO" [2010-07-11 Sun 15:36] Splitting this into several parts. *** DONE filesystem interface - State "DONE" [2009-12-26 Sat 01:50] diff --git a/eval.c b/eval.c index eb438301..50b6726d 100644 --- a/eval.c +++ b/eval.c @@ -43,7 +43,7 @@ void sexp_warn_undefs (sexp ctx, sexp from, sexp to) { sexp x; for (x=from; sexp_pairp(x) && x!=to; x=sexp_env_next_cell(x)) if (sexp_cdr(x) == SEXP_UNDEF) - sexp_warn(ctx, "reference to undefined variable", sexp_car(x)); + sexp_warn(ctx, "reference to undefined variable: ", sexp_car(x)); } @@ -299,14 +299,6 @@ static sexp sexp_make_lit (sexp ctx, sexp value) { return res; } -/************************* backend ***************************/ - -#if SEXP_USE_NATIVE_X86 -#include "opt/x86.c" -#else -#include "vm.c" -#endif - /****************************** contexts ******************************/ #define SEXP_STACK_SIZE (sexp_sizeof(stack)+sizeof(sexp)*SEXP_INIT_STACK_SIZE) @@ -1123,6 +1115,142 @@ static sexp sexp_string_cmp_op (sexp ctx sexp_api_params(self, n), sexp str1, se return sexp_make_fixnum(diff); } +#if SEXP_USE_UTF8_STRINGS + +static int sexp_utf8_initial_byte_count(int c) { + if (c < 0xC0) return 1; + if (c < 0xE0) return 2; + return ((c>>4)&1)+3; +} + +static int sexp_utf8_char_byte_count(int c) { + if (c < 0x80) return 1; + if (c < 0x800) return 2; + if (c < 0x10000) return 3; + return 4; +} + +static int sexp_string_utf8_length (unsigned char *p, int len) { + unsigned char *q = p+len; + int i; + for (i=0; p0 && j0) + return sexp_user_exception(ctx, self, "string-index->offset: index out of range", index); + return sexp_make_fixnum(j); +} + +sexp sexp_string_utf8_ref (sexp ctx, sexp str, sexp i) { + unsigned char *p=(unsigned char*)sexp_string_data(str) + sexp_unbox_fixnum(i); + if (*p < 0x80) + return sexp_make_character(*p); + else if ((*p < 0xC0) || (*p > 0xF7)) + return sexp_user_exception(ctx, NULL, "string-ref: invalid utf8 byte", i); + else if (*p < 0xE0) + return sexp_make_character(((p[0]&0x3F)<<6) + (p[1]&0x3F)); + else if (*p < 0xF0) + return sexp_make_character(((p[0]&0x1F)<<12) + ((p[1]&0x3F)<<6) + (p[2]&0x3F)); + else + return sexp_make_character(((p[0]&0x0F)<<16) + ((p[1]&0x3F)<<6) + ((p[2]&0x3F)<<6) + (p[2]&0x3F)); +} + +sexp sexp_string_utf8_index_ref (sexp ctx sexp_api_params(self, n), sexp str, sexp i) { + sexp off; + sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str); + sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, i); + off = sexp_string_index_to_offset(ctx sexp_api_pass(self, n), str, i); + if (sexp_exceptionp(off)) return off; + return sexp_string_utf8_ref(ctx, str, off); +} + +void sexp_utf8_encode_char (unsigned char* p, int len, int c) { + switch (len) { + case 4: *p++ = (0xF0 + ((c)>>18)); *p++ = (0x80 + ((c>>12)&0x3F)); + *p++ = (0x80 + ((c>>6)&0x3F)); *p = (0x80 + (c&0x3F)); break; + case 3: *p++ = (0xE0 + ((c)>>12)); *p++ = (0x80 + ((c>>6)&0x3F)); + *p = (0x80 + (c&0x3F)); break; + case 2: *p++ = (0xC0 + ((c)>>6)); *p = (0x80 + (c&0x3F)); break; + default: *p = c; break; + } +} + +void sexp_write_utf8_char (sexp ctx, int c, sexp out) { + unsigned char buf[8]; + int len = sexp_utf8_char_byte_count(c); + sexp_utf8_encode_char(buf, len, c); + buf[len+1] = 0; + sexp_write_string(ctx, (char*)buf, out); +} + +sexp sexp_read_utf8_char (sexp ctx, sexp port, int i) { + if (i >= 0x80) { + if ((i < 0xC0) || (i > 0xF7)) { + return sexp_user_exception(ctx, NULL, "read-char: invalid utf8 byte", sexp_make_fixnum(i)); + } else if (i < 0xE0) { + i = ((i&0x3F)<<6) + (sexp_read_char(ctx, port)&0x3F); + } else if (i < 0xF0) { + i = ((i&0x1F)<<12) + ((sexp_read_char(ctx, port)&0x3F)<<6); + i += sexp_read_char(ctx, port)&0x3F; + } else { + i = ((i&0x0F)<<16) + ((sexp_read_char(ctx, port)&0x3F)<<6); + i += (sexp_read_char(ctx, port)&0x3F)<<6; + i += sexp_read_char(ctx, port)&0x3F; + } + } + return sexp_make_character(i); +} + +#if SEXP_USE_MUTABLE_STRINGS + +void sexp_string_utf8_set (sexp ctx, sexp str, sexp index, sexp ch) { + sexp b; + unsigned char *p, *q; + int i = sexp_unbox_fixnum(index), c = sexp_unbox_character(ch), + old_len, new_len, len; + p = (unsigned char*)sexp_string_data(str) + i; + old_len = sexp_utf8_initial_byte_count(*p); + new_len = sexp_utf8_char_byte_count(c); + if (old_len != new_len) { /* resize bytes if needed */ + len = sexp_string_length(str)+(new_len-old_len); + b = sexp_make_bytes(ctx, sexp_make_fixnum(len), SEXP_VOID); + if (! sexp_exceptionp(b)) { + q = (unsigned char*)sexp_bytes_data(b); + memcpy(q, sexp_string_data(str), i); + memcpy(q+i+new_len, p+old_len, len-i-new_len+1); + sexp_string_bytes(str) = b; + p = q + i; + } + sexp_string_length(str) += new_len - old_len; + } + sexp_utf8_encode_char(p, new_len, c); +} + +sexp sexp_string_utf8_index_set (sexp ctx sexp_api_params(self, n), sexp str, sexp i, sexp ch) { + sexp off; + sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str); + sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, i); + sexp_assert_type(ctx, sexp_charp, SEXP_CHAR, ch); + off = sexp_string_index_to_offset(ctx sexp_api_pass(self, n), str, i); + if (sexp_exceptionp(off)) return off; + sexp_string_utf8_set(ctx, str, off, ch); + return SEXP_VOID; +} + +#endif +#endif + #ifdef PLAN9 #include "opt/plan9.c" #endif @@ -1438,6 +1566,9 @@ sexp sexp_load_standard_env (sexp ctx, sexp e, sexp version) { #endif #if SEXP_USE_BOEHM sexp_push(ctx, tmp, sym=sexp_intern(ctx, "boehm-gc", -1)); +#endif +#if SEXP_USE_UTF8_STRINGS + sexp_push(ctx, tmp, sym=sexp_intern(ctx, "boehm-gc", -1)); #endif sexp_push(ctx, tmp, sym=sexp_intern(ctx, "chibi", -1)); sexp_env_define(ctx, e, sexp_intern(ctx, "*features*", -1), tmp); @@ -1519,6 +1650,14 @@ sexp sexp_env_copy_op (sexp ctx sexp_api_params(self, n), sexp to, sexp from, se return SEXP_VOID; } +/************************* backend ***************************/ + +#if SEXP_USE_NATIVE_X86 +#include "opt/x86.c" +#else +#include "vm.c" +#endif + /************************** eval interface ****************************/ sexp sexp_compile (sexp ctx, sexp x) { diff --git a/include/chibi/eval.h b/include/chibi/eval.h index 938bf7c0..b21e3825 100644 --- a/include/chibi/eval.h +++ b/include/chibi/eval.h @@ -76,6 +76,9 @@ enum sexp_opcode_names { SEXP_OP_VECTOR_REF, SEXP_OP_VECTOR_SET, SEXP_OP_VECTOR_LENGTH, + SEXP_OP_BYTES_REF, + SEXP_OP_BYTES_SET, + SEXP_OP_BYTES_LENGTH, SEXP_OP_STRING_REF, SEXP_OP_STRING_SET, SEXP_OP_STRING_LENGTH, @@ -132,6 +135,7 @@ SEXP_API sexp sexp_compile_error (sexp ctx, const char *message, sexp obj); SEXP_API sexp sexp_analyze (sexp context, sexp x); SEXP_API void sexp_stack_trace (sexp ctx, sexp out); SEXP_API sexp sexp_apply (sexp context, sexp proc, sexp args); +SEXP_API sexp sexp_apply1 (sexp ctx, sexp f, sexp x); SEXP_API sexp sexp_free_vars (sexp context, sexp x, sexp fv); SEXP_API int sexp_param_index (sexp lambda, sexp name); SEXP_API sexp sexp_eval_op (sexp context sexp_api_params(self, n), sexp obj, sexp env); diff --git a/include/chibi/features.h b/include/chibi/features.h index d1b0c5e8..a1159c47 100644 --- a/include/chibi/features.h +++ b/include/chibi/features.h @@ -145,6 +145,16 @@ /* non-immediate symbols in a single list. */ /* #define SEXP_USE_HASH_SYMS 0 */ +/* uncomment this to disable UTF-8 string support */ +/* The default settings store strings in memory as UTF-8, */ +/* and assumes strings passed to/from the C FFI are UTF-8. */ +/* #define SEXP_USE_UTF8_STRINGS 0 */ + +/* uncomment this to disable the string-set! opcode */ +/* By default (non-literal) strings are mutable. */ +/* Making them immutable allows for packed UTF-8 strings. */ +/* #define SEXP_USE_MUTABLE_STRINGS 0 */ + /* uncomment this to disable string ports */ /* If disabled some basic functionality such as number->string */ /* will not be available by default. */ @@ -201,7 +211,7 @@ /* the default number of opcodes to run each thread for */ #ifndef SEXP_DEFAULT_QUANTUM -#define SEXP_DEFAULT_QUANTUM 1000 +#define SEXP_DEFAULT_QUANTUM 500 #endif /************************************************************************/ @@ -230,7 +240,7 @@ #endif #ifndef SEXP_USE_GREEN_THREADS -#define SEXP_USE_GREEN_THREADS 1 +#define SEXP_USE_GREEN_THREADS ! SEXP_USE_NO_FEATURES #endif #ifndef SEXP_USE_NATIVE_X86 @@ -314,7 +324,7 @@ #endif #ifndef SEXP_USE_EXTENDED_FCALL -#define SEXP_USE_EXTENDED_FCALL 1 +#define SEXP_USE_EXTENDED_FCALL ! SEXP_USE_NO_FEATURES #endif #ifndef SEXP_USE_FLONUMS @@ -361,6 +371,21 @@ #define SEXP_USE_DEBUG_VM 0 #endif +#ifndef SEXP_USE_UTF8_STRINGS +#define SEXP_USE_UTF8_STRINGS ! SEXP_USE_NO_FEATURES +#endif + +#ifndef SEXP_USE_MUTABLE_STRINGS +#define SEXP_USE_MUTABLE_STRINGS 1 +#endif + +#if (SEXP_USE_UTF8_STRINGS && SEXP_USE_MUTABLE_STRINGS) +#define SEXP_USE_PACKED_STRINGS 0 +#endif +#ifndef SEXP_USE_PACKED_STRINGS +#define SEXP_USE_PACKED_STRINGS 1 +#endif + #ifndef SEXP_USE_STRING_STREAMS #define SEXP_USE_STRING_STREAMS ! SEXP_USE_NO_FEATURES #endif diff --git a/include/chibi/sexp.h b/include/chibi/sexp.h index 3e66a297..ddcbb098 100644 --- a/include/chibi/sexp.h +++ b/include/chibi/sexp.h @@ -83,6 +83,7 @@ enum sexp_types { SEXP_BOOLEAN, SEXP_PAIR, SEXP_SYMBOL, + SEXP_BYTES, SEXP_STRING, SEXP_VECTOR, SEXP_FLONUM, @@ -210,6 +211,15 @@ struct sexp_struct { struct { sexp_uint_t length; char data[]; + } bytes; + struct { +#if SEXP_USE_PACKED_STRINGS + sexp_uint_t length; + char data[]; +#else + sexp_uint_t offset, length; + sexp bytes; +#endif } string; struct { sexp_uint_t length; @@ -578,12 +588,26 @@ SEXP_API sexp sexp_make_unsigned_integer(sexp ctx, sexp_luint_t x); #define sexp_procedure_code(x) ((x)->value.procedure.bc) #define sexp_procedure_vars(x) ((x)->value.procedure.vars) +#define sexp_bytes_length(x) ((x)->value.bytes.length) +#define sexp_bytes_data(x) ((x)->value.bytes.data) + #define sexp_string_length(x) ((x)->value.string.length) +#if SEXP_USE_PACKED_STRINGS #define sexp_string_data(x) ((x)->value.string.data) +#else +#define sexp_string_bytes(x) ((x)->value.string.bytes) +#define sexp_string_offset(x) ((x)->value.string.offset) +#define sexp_string_data(x) (sexp_bytes_data(sexp_string_bytes(x))+sexp_string_offset(x)) +#endif + +#define sexp_bytes_ref(x, i) (sexp_make_fixnum((unsigned char)sexp_bytes_data(x)[sexp_unbox_fixnum(i)])) +#define sexp_bytes_set(x, i, v) (sexp_bytes_data(x)[sexp_unbox_fixnum(i)] = sexp_unbox_fixnum(v)) #define sexp_string_ref(x, i) (sexp_make_character((unsigned char)sexp_string_data(x)[sexp_unbox_fixnum(i)])) #define sexp_string_set(x, i, v) (sexp_string_data(x)[sexp_unbox_fixnum(i)] = sexp_unbox_character(v)) +#define sexp_symbol_data(x) ((x)->value.symbol.data) +#define sexp_symbol_length(x) ((x)->value.symbol.length) #define sexp_symbol_string(x) (x) #define sexp_port_stream(p) ((p)->value.port.stream) @@ -902,6 +926,7 @@ SEXP_API sexp sexp_memq_op(sexp ctx sexp_api_params(self, n), sexp x, sexp ls); SEXP_API sexp sexp_assq_op(sexp ctx sexp_api_params(self, n), sexp x, sexp ls); SEXP_API sexp sexp_length_op(sexp ctx sexp_api_params(self, n), sexp ls); SEXP_API sexp sexp_c_string(sexp ctx, const char *str, sexp_sint_t slen); +SEXP_API sexp sexp_make_bytes_op(sexp ctx sexp_api_params(self, n), sexp len, sexp i); SEXP_API sexp sexp_make_string_op(sexp ctx sexp_api_params(self, n), sexp len, sexp ch); SEXP_API sexp sexp_substring_op (sexp ctx sexp_api_params(self, n), sexp str, sexp start, sexp end); SEXP_API sexp sexp_string_concatenate_op (sexp ctx sexp_api_params(self, n), sexp str_ls, sexp sep); @@ -980,6 +1005,7 @@ SEXP_API sexp sexp_finalize_c_type (sexp ctx sexp_api_params(self, n), sexp obj) #define sexp_list_to_vector(ctx, x) sexp_list_to_vector_op(ctx sexp_api_pass(NULL, 1), x) #define sexp_exception_type(ctx, x) sexp_exception_type_op(ctx sexp_api_pass(NULL, 1), x) #define sexp_string_to_number(ctx, s, b) sexp_make_string_op(ctx sexp_api_pass(NULL, 2), s, b) +#define sexp_make_bytes(ctx, l, i) sexp_make_bytes_op(ctx sexp_api_pass(NULL, 2), l, i) #define sexp_make_string(ctx, l, c) sexp_make_string_op(ctx sexp_api_pass(NULL, 2), l, c) #define sexp_string_cmp(ctx, a, b, c) sexp_string_cmp_op(ctx sexp_api_pass(NULL, 3), a, b, c) #define sexp_substring(ctx, a, b, c) sexp_substring_op(ctx sexp_api_pass(NULL, 3), a, b, c) diff --git a/opcodes.c b/opcodes.c index 3e74ce53..533052a6 100644 --- a/opcodes.c +++ b/opcodes.c @@ -24,8 +24,21 @@ _OP(SEXP_OPC_SETTER, SEXP_OP_SET_CDR, 2, 0, SEXP_PAIR, 0, 0, "set-cdr!", 0, NULL _OP(SEXP_OPC_GETTER, SEXP_OP_VECTOR_REF,2,0, SEXP_VECTOR, SEXP_FIXNUM, 0,"vector-ref", 0, NULL), _OP(SEXP_OPC_SETTER, SEXP_OP_VECTOR_SET,3,0, SEXP_VECTOR, SEXP_FIXNUM, 0,"vector-set!", 0, NULL), _OP(SEXP_OPC_GETTER, SEXP_OP_VECTOR_LENGTH,1,0, SEXP_VECTOR, 0, 0,"vector-length", 0, NULL), +_OP(SEXP_OPC_GETTER, SEXP_OP_BYTES_REF,2,0, SEXP_BYTES, SEXP_FIXNUM, 0,"byte-vector-ref", 0, NULL), +_OP(SEXP_OPC_SETTER, SEXP_OP_BYTES_SET,3,0, SEXP_BYTES, SEXP_FIXNUM, 0,"byte-vector-set!", 0, NULL), +_OP(SEXP_OPC_GETTER, SEXP_OP_BYTES_LENGTH,1,0, SEXP_BYTES, 0, 0,"byte-vector-length", 0, NULL), +#if SEXP_USE_UTF8_STRINGS +_OP(SEXP_OPC_GETTER, SEXP_OP_STRING_REF,2,0, SEXP_STRING, SEXP_FIXNUM, 0,"string-cursor-ref", 0, NULL), +#else _OP(SEXP_OPC_GETTER, SEXP_OP_STRING_REF,2,0, SEXP_STRING, SEXP_FIXNUM, 0,"string-ref", 0, NULL), +#endif +#if SEXP_USE_MUTABLE_STRINGS +#if SEXP_USE_UTF8_STRINGS +_OP(SEXP_OPC_SETTER, SEXP_OP_STRING_SET,3,0, SEXP_STRING, SEXP_FIXNUM, 0,"string-cursor-set!", 0, NULL), +#else _OP(SEXP_OPC_SETTER, SEXP_OP_STRING_SET,3,0, SEXP_STRING, SEXP_FIXNUM, 0,"string-set!", 0, NULL), +#endif +#endif _OP(SEXP_OPC_GETTER, SEXP_OP_STRING_LENGTH,1,0, SEXP_STRING, 0, 0,"string-length", 0, NULL), _OP(SEXP_OPC_GENERIC, SEXP_OP_FIX2FLO, 1, 0, 0, 0, 0, "exact->inexact", 0, NULL), _OP(SEXP_OPC_GENERIC, SEXP_OP_FLO2FIX, 1, 0, 0, 0, 0, "inexact->exact", 0, NULL), @@ -101,6 +114,7 @@ _FN4(SEXP_ENV, SEXP_ENV, "%env-copy!", 0, sexp_env_copy_op), _FN2(SEXP_EXCEPTION, SEXP_OPORT, "print-exception", 0, sexp_print_exception_op), _FN1(SEXP_EXCEPTION, "exception-type", 0, sexp_exception_type_op), _FN2OPT(SEXP_FIXNUM, SEXP_CHAR, "make-string", sexp_make_character(' '), sexp_make_string_op), +_FN2OPT(SEXP_FIXNUM, SEXP_FIXNUM, "make-byte-vector", SEXP_ZERO, sexp_make_bytes_op), _FN2OPT(SEXP_FIXNUM, SEXP_FIXNUM, "string->number", SEXP_TEN, sexp_string_to_number_op), _FN3(SEXP_STRING, SEXP_STRING, "string-cmp", 0, sexp_string_cmp_op), _FN3(SEXP_STRING, SEXP_FIXNUM, "substring", 0, sexp_substring_op), @@ -134,6 +148,11 @@ _FN1(0, "floor", 0, sexp_floor), _FN1(0, "ceiling", 0, sexp_ceiling), #endif _FN2(0, 0, "expt", 0, sexp_expt_op), +#if SEXP_USE_UTF8_STRINGS +_FN2(SEXP_STRING, SEXP_FIXNUM, "string-index->offset", 0, sexp_string_index_to_offset), +_FN2(SEXP_STRING, SEXP_FIXNUM, "string-ref", 0, sexp_string_utf8_index_ref), +_FN3(SEXP_STRING, SEXP_FIXNUM, "string-set!", 0, sexp_string_utf8_index_set), +#endif #if SEXP_USE_TYPE_DEFS _FN2(SEXP_STRING, SEXP_FIXNUM, "register-simple-type", 0, sexp_register_simple_type_op), _FN2(SEXP_STRING, SEXP_FIXNUM, "make-type-predicate", 0, sexp_make_type_predicate_op), diff --git a/opt/opcode_names.h b/opt/opcode_names.h index a8c06e9a..52c639f9 100644 --- a/opt/opcode_names.h +++ b/opt/opcode_names.h @@ -1,16 +1,21 @@ static const char* reverse_opcode_names[] = - {"NOOP", "RAISE", "RESUMECC", "CALLCC", "APPLY1", "TAIL-CALL", "CALL", - "FCALL0", "FCALL1", "FCALL2", "FCALL3", "FCALL4", "FCALL5", "FCALL6", "FCALLN", - "JUMP-UNLESS", "JUMP", "PUSH", "DROP", "GLOBAL-REF", "GLOBAL-KNOWN-REF", - "STACK-REF", "LOCAL-REF", "LOCAL-SET", - "CLOSURE-REF", "VECTOR-REF", "VECTOR-SET", "VECTOR-LENGTH", "STRING-REF", - "STRING-SET", "STRING-LENGTH", "MAKE-PROCEDURE", "MAKE-VECTOR", + {"NOOP", "RAISE", "RESUMECC", "CALLCC", "APPLY1", + "TAIL-CALL", "CALL", "FCALL0", "FCALL1", "FCALL2", + "FCALL3", "FCALL4", "FCALL5", "FCALL6", "FCALLN", + "JUMP-UNLESS", "JUMP", "PUSH", "DROP", + "GLOBAL-REF", "GLOBAL-KNOWN-REF", "STACK-REF", + "LOCAL-REF", "LOCAL-SET", "CLOSURE-REF", + "VECTOR-REF", "VECTOR-SET", "VECTOR-LENGTH", + "BYTES-REF", "BYTES-SET", "BYTES-LENGTH", + "STRING-REF", "STRING-SET", "STRING-LENGTH", + "MAKE-PROCEDURE", "MAKE-VECTOR", "MAKE-EXCEPTION", "AND", "NULL?", "FIXNUM?", "SYMBOL?", "CHAR?", "EOF?", "TYPEP", "MAKE", "SLOT-REF", "SLOT-SET", "CAR", "CDR", "SET-CAR", "SET-CDR", "CONS", "ADD", "SUB", "MUL", "DIV", "QUOTIENT", "REMAINDER", "LT", "LE", "EQN", "EQ", "EXACT->INEXACT", "INEXACT->EXACT", "CHAR->INTEGER", "INTEGER->CHAR", "CHAR-UPCASE", "CHAR-DOWNCASE", - "WRITE-CHAR", "NEWLINE", "READ-CHAR", "PEEK-CHAR", "YIELD", "RET", "DONE", + "WRITE-CHAR", "NEWLINE", "READ-CHAR", "PEEK-CHAR", + "YIELD", "RET", "DONE", }; diff --git a/sexp.c b/sexp.c index 1af3d9a0..01b9e0dd 100644 --- a/sexp.c +++ b/sexp.c @@ -88,7 +88,12 @@ static struct sexp_struct _sexp_type_specs[] = { _DEF_TYPE(SEXP_BOOLEAN, 0, 0, 0, 0, 0, 0, 0, 0, "boolean", NULL), _DEF_TYPE(SEXP_PAIR, sexp_offsetof(pair, car), 2, 3, 0, 0, sexp_sizeof(pair), 0, 0, "pair", NULL), _DEF_TYPE(SEXP_SYMBOL, 0, 0, 0, 0, 0, sexp_sizeof(symbol)+1, sexp_offsetof(symbol, length), 1, "symbol", NULL), + _DEF_TYPE(SEXP_BYTES, 0, 0, 0, 0, 0, sexp_sizeof(bytes)+1, sexp_offsetof(bytes, length), 1, "byte-vector", NULL), +#if SEXP_USE_PACKED_STRINGS _DEF_TYPE(SEXP_STRING, 0, 0, 0, 0, 0, sexp_sizeof(string)+1, sexp_offsetof(string, length), 1, "string", NULL), +#else + _DEF_TYPE(SEXP_STRING, sexp_offsetof(string, bytes), 1, 1, 0, 0, sexp_sizeof(string), 0, 0, "string", NULL), +#endif _DEF_TYPE(SEXP_VECTOR, sexp_offsetof(vector, data), 0, 0, sexp_offsetof(vector, length), 1, sexp_sizeof(vector), sexp_offsetof(vector, length), sizeof(sexp), "vector", NULL), _DEF_TYPE(SEXP_FLONUM, 0, 0, 0, 0, 0, sexp_sizeof(flonum), 0, 0, "real", NULL), _DEF_TYPE(SEXP_BIGNUM, 0, 0, 0, 0, 0, sexp_sizeof(bignum), sexp_offsetof(bignum, length), sizeof(sexp_uint_t), "bignum", NULL), @@ -666,24 +671,44 @@ sexp sexp_make_flonum (sexp ctx, float f) { #endif #endif -sexp sexp_make_string_op (sexp ctx sexp_api_params(self, n), sexp len, sexp ch) { +sexp sexp_make_bytes_op (sexp ctx sexp_api_params(self, n), sexp len, sexp i) { sexp_sint_t clen = sexp_unbox_fixnum(len); sexp s; sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, len); if (clen < 0) return sexp_xtype_exception(ctx, self, "negative length", len); - s = sexp_alloc_atomic(ctx, sexp_sizeof(string)+clen+1); + s = sexp_alloc_atomic(ctx, sexp_sizeof(bytes)+clen+1); if (sexp_exceptionp(s)) return s; - sexp_pointer_tag(s) = SEXP_STRING; + sexp_pointer_tag(s) = SEXP_BYTES; #if SEXP_USE_HEADER_MAGIC sexp_pointer_magic(s) = SEXP_POINTER_MAGIC; #endif - sexp_string_length(s) = clen; - if (sexp_charp(ch)) - memset(sexp_string_data(s), sexp_unbox_character(ch), clen); - sexp_string_data(s)[clen] = '\0'; + sexp_bytes_length(s) = clen; + if (sexp_fixnump(i)) + memset(sexp_bytes_data(s), sexp_unbox_fixnum(i), clen); + sexp_bytes_data(s)[clen] = '\0'; return s; } +sexp sexp_make_string_op (sexp ctx sexp_api_params(self, n), sexp len, sexp ch) +{ + sexp i = (sexp_charp(ch) ? sexp_make_fixnum(sexp_unbox_character(ch)) : ch); + sexp_gc_var2(b, s); + b = sexp_make_bytes_op(ctx sexp_api_pass(self, n), len, i); + if (sexp_exceptionp(b)) return b; +#if SEXP_USE_PACKED_STRINGS + sexp_pointer_tag(b) = SEXP_STRING; + return b; +#else + sexp_gc_preserve2(ctx, b, s); + s = sexp_alloc_type(ctx, string, SEXP_STRING); + sexp_string_bytes(s) = b; + sexp_string_offset(s) = 0; + sexp_string_length(s) = sexp_unbox_fixnum(len); + sexp_gc_release2(ctx); + return s; +#endif +} + sexp sexp_c_string (sexp ctx, const char *str, sexp_sint_t slen) { sexp_sint_t len = ((slen >= 0) ? slen : strlen(str)); sexp s = sexp_make_string(ctx, sexp_make_fixnum(len), SEXP_VOID); @@ -790,14 +815,17 @@ sexp sexp_intern(sexp ctx, const char *str, sexp_sint_t len) { bucket = 0; #endif for (ls=sexp_context_symbols(ctx)[bucket]; sexp_pairp(ls); ls=sexp_cdr(ls)) - if ((sexp_string_length(tmp=sexp_symbol_string(sexp_car(ls))) == len) - && ! strncmp(str, sexp_string_data(tmp), len)) + if ((sexp_symbol_length(tmp=sexp_car(ls)) == len) + && ! strncmp(str, sexp_symbol_data(tmp), len)) return sexp_car(ls); /* not found, make a new symbol */ sexp_gc_preserve1(ctx, sym); sym = sexp_c_string(ctx, str, len); if (sexp_exceptionp(sym)) return sym; +#if ! SEXP_USE_PACKED_STRINGS + sym = sexp_string_bytes(sym); +#endif sexp_pointer_tag(sym) = SEXP_SYMBOL; sexp_push(ctx, sexp_context_symbols(ctx)[bucket], sym); sexp_gc_release1(ctx); @@ -1190,8 +1218,8 @@ sexp sexp_write_one (sexp ctx, sexp obj, sexp out) { sexp_write_char(ctx, '"', out); break; case SEXP_SYMBOL: - i = sexp_string_length(sexp_symbol_string(obj)); - str = sexp_string_data(sexp_symbol_string(obj)); + i = sexp_symbol_length(obj); + str = sexp_symbol_data(obj); for ( ; i>0; str++, i--) { if ((str[0] == '\\') || is_separator(str[0])) sexp_write_char(ctx, '\\', out); @@ -1253,8 +1281,17 @@ sexp sexp_write_one (sexp ctx, sexp obj, sexp out) { sexp_write_char(ctx, sexp_unbox_character(obj), out); } else { sexp_write_string(ctx, "#\\x", out); - sexp_write_char(ctx, hex_digit(sexp_unbox_character(obj)>>4), out); - sexp_write_char(ctx, hex_digit(sexp_unbox_character(obj)&0xF), out); + c = sexp_unbox_character(obj); + if (c >= 0x100) { + if (c >= 0x10000) { + sexp_write_char(ctx, hex_digit((c>>20)&0x0F), out); + sexp_write_char(ctx, hex_digit((c>>16)&0x0F), out); + } + sexp_write_char(ctx, hex_digit((c>>12)&0x0F), out); + sexp_write_char(ctx, hex_digit((c>>8)&0x0F), out); + } + sexp_write_char(ctx, hex_digit((c>>4)&0x0F), out); + sexp_write_char(ctx, hex_digit(c&0x0F), out); } } else if (sexp_symbolp(obj)) { diff --git a/vm.c b/vm.c index 88e4e494..50f8e8b8 100644 --- a/vm.c +++ b/vm.c @@ -2,8 +2,6 @@ /* Copyright (c) 2009-2010 Alex Shinn. All rights reserved. */ /* BSD-style license: http://synthcode.com/license.txt */ -static sexp sexp_apply1 (sexp ctx, sexp f, sexp x); - #if SEXP_USE_DEBUG_VM > 1 static void sexp_print_stack (sexp ctx, sexp *stack, int top, int fp, sexp out) { int i; @@ -834,6 +832,7 @@ sexp sexp_vm (sexp ctx, sexp proc) { sexp_raise("vector-length: not a vector", sexp_list1(ctx, _ARG1)); _ARG1 = sexp_make_fixnum(sexp_vector_length(_ARG1)); break; + case SEXP_OP_BYTES_REF: case SEXP_OP_STRING_REF: if (! sexp_stringp(_ARG1)) sexp_raise("string-ref: not a string", sexp_list1(ctx, _ARG1)); @@ -842,9 +841,17 @@ sexp sexp_vm (sexp ctx, sexp proc) { i = sexp_unbox_fixnum(_ARG2); if ((i < 0) || (i >= sexp_string_length(_ARG1))) sexp_raise("string-ref: index out of range", sexp_list2(ctx, _ARG1, _ARG2)); - _ARG2 = sexp_string_ref(_ARG1, _ARG2); + if (ip[-1] == SEXP_OP_BYTES_REF) + _ARG2 = sexp_bytes_ref(_ARG1, _ARG2); + else +#if SEXP_USE_UTF8_STRINGS + _ARG2 = sexp_string_utf8_ref(ctx, _ARG1, _ARG2); +#else + _ARG2 = sexp_string_ref(_ARG1, _ARG2); +#endif top--; break; + case SEXP_OP_BYTES_SET: case SEXP_OP_STRING_SET: if (! sexp_stringp(_ARG1)) sexp_raise("string-set!: not a string", sexp_list1(ctx, _ARG1)); @@ -857,14 +864,30 @@ sexp sexp_vm (sexp ctx, sexp proc) { i = sexp_unbox_fixnum(_ARG2); if ((i < 0) || (i >= sexp_string_length(_ARG1))) sexp_raise("string-set!: index out of range", sexp_list2(ctx, _ARG1, _ARG2)); - sexp_string_set(_ARG1, _ARG2, _ARG3); + if (ip[-1] == SEXP_OP_BYTES_SET) + sexp_bytes_set(_ARG1, _ARG2, _ARG3); + else +#if SEXP_USE_UTF8_STRINGS + sexp_string_utf8_set(ctx, _ARG1, _ARG2, _ARG3); +#else + sexp_string_set(_ARG1, _ARG2, _ARG3); +#endif _ARG3 = SEXP_VOID; top-=2; break; + case SEXP_OP_BYTES_LENGTH: + if (! sexp_stringp(_ARG1)) + sexp_raise("bytes-length: not a byte-vector", sexp_list1(ctx, _ARG1)); + _ARG1 = sexp_make_fixnum(sexp_bytes_length(_ARG1)); + break; case SEXP_OP_STRING_LENGTH: if (! sexp_stringp(_ARG1)) sexp_raise("string-length: not a string", sexp_list1(ctx, _ARG1)); +#if SEXP_USE_UTF8_STRINGS + _ARG1 = sexp_make_fixnum(sexp_string_utf8_length((unsigned char*)sexp_string_data(_ARG1), sexp_string_length(_ARG1))); +#else _ARG1 = sexp_make_fixnum(sexp_string_length(_ARG1)); +#endif break; case SEXP_OP_MAKE_PROCEDURE: sexp_context_top(ctx) = top; @@ -1244,6 +1267,11 @@ sexp sexp_vm (sexp ctx, sexp proc) { sexp_raise("write-char: not a character", sexp_list1(ctx, _ARG1)); if (! sexp_oportp(_ARG2)) sexp_raise("write-char: not an output-port", sexp_list1(ctx, _ARG2)); +#if SEXP_USE_UTF8_STRINGS + if (sexp_unbox_character(_ARG1) >= 0x80) + sexp_write_utf8_char(ctx, sexp_unbox_character(_ARG1), _ARG2); + else +#endif sexp_write_char(ctx, sexp_unbox_character(_ARG1), _ARG2); _ARG2 = SEXP_VOID; top--; @@ -1258,6 +1286,11 @@ sexp sexp_vm (sexp ctx, sexp proc) { if (! sexp_iportp(_ARG1)) sexp_raise("read-char: not an input-port", sexp_list1(ctx, _ARG1)); i = sexp_read_char(ctx, _ARG1); +#if SEXP_USE_UTF8_STRINGS + if (i >= 0x80) + _ARG1 = sexp_read_utf8_char(ctx, _ARG1, i); + else +#endif _ARG1 = (i == EOF) ? SEXP_EOF : sexp_make_character(i); break; case SEXP_OP_PEEK_CHAR: @@ -1302,7 +1335,7 @@ sexp sexp_vm (sexp ctx, sexp proc) { /******************************* apply ********************************/ -static sexp sexp_apply1 (sexp ctx, sexp f, sexp x) { +sexp sexp_apply1 (sexp ctx, sexp f, sexp x) { sexp res; sexp_gc_var1(args); if (sexp_opcodep(f)) {