mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-18 21:29:19 +02:00
Add some extended utf8 IO utilities.
This commit is contained in:
parent
5b19aab107
commit
c4611cc33f
5 changed files with 101 additions and 28 deletions
|
@ -134,6 +134,20 @@
|
|||
(read-string 4096 in)
|
||||
(read-line in)))
|
||||
|
||||
(let ((bv (string->utf8 "日本語")))
|
||||
(test #\日 (utf8-ref bv 0))
|
||||
(test #\本 (utf8-ref bv 3))
|
||||
(test #\語 (utf8-ref bv 6))
|
||||
(test 3 (utf8-next bv 0 9))
|
||||
(test 6 (utf8-next bv 3 9))
|
||||
(test 9 (utf8-next bv 6 9))
|
||||
(test #f (utf8-next bv 9 9))
|
||||
(test 6 (utf8-prev bv 9 0))
|
||||
(test 3 (utf8-prev bv 6 0))
|
||||
(test 0 (utf8-prev bv 3 0))
|
||||
(test #f (utf8-prev bv 0 0))
|
||||
)
|
||||
|
||||
(test #u8(0 1 2)
|
||||
(let ((in (bytevectors->input-port (list #u8(0 1 2)))))
|
||||
(read-bytevector 3 in)))
|
||||
|
|
|
@ -14,7 +14,8 @@
|
|||
make-filtered-output-port make-filtered-input-port
|
||||
string-count-chars
|
||||
open-input-bytevector open-output-bytevector get-output-bytevector
|
||||
string->utf8 utf8->string
|
||||
string->utf8 string->utf8! string-offset utf8->string utf8->string!
|
||||
utf8-ref utf8-next utf8-prev
|
||||
write-string write-u8 read-u8 peek-u8 send-file
|
||||
is-a-socket?
|
||||
call-with-input-file call-with-output-file)
|
||||
|
|
|
@ -9,25 +9,10 @@
|
|||
(call-with-input-string " "
|
||||
(lambda (in) (read-char in) (read-char in))))
|
||||
|
||||
;; Copy whole characters from the given cursor positions.
|
||||
;; Return the src cursor position of the next unwritten char,
|
||||
;; which may be before `to' if the char would overflow.
|
||||
;; Now provided as a primitive from (chibi ast).
|
||||
;; (define (string-cursor-copy! dst start src from to)
|
||||
;; (let lp ((i from)
|
||||
;; (j (string-cursor->index dst start)))
|
||||
;; (let ((i2 (string-cursor-next src i)))
|
||||
;; (cond ((> i2 to) i)
|
||||
;; (else
|
||||
;; (string-set! dst j (string-cursor-ref src i))
|
||||
;; (lp i2 (+ j 1)))))))
|
||||
|
||||
(define (utf8->string vec . o)
|
||||
(if (pair? o)
|
||||
(let ((start (car o))
|
||||
(end (if (pair? (cdr o)) (cadr o) (bytevector-length vec))))
|
||||
(utf8->string (subbytes vec start end)))
|
||||
(string-copy (utf8->string! vec))))
|
||||
(let ((start (if (pair? o) (car o) 0))
|
||||
(end (if (and (pair? o) (pair? (cdr o))) (cadr o) (bytevector-length vec))))
|
||||
(string-copy (utf8->string! vec start end))))
|
||||
|
||||
(define (string->utf8 str . o)
|
||||
(if (pair? o)
|
||||
|
|
|
@ -50,8 +50,19 @@
|
|||
((value ctx sexp) (value self sexp) sexp sexp sexp (default NULL sexp)))
|
||||
(define-c sexp (%string->utf8 "sexp_string_to_utf8")
|
||||
((value ctx sexp) (value self sexp) sexp))
|
||||
(define-c sexp (utf8->string! "sexp_utf8_to_string_x")
|
||||
(define-c sexp (string->utf8! "sexp_string_to_utf8_x")
|
||||
((value ctx sexp) (value self sexp) sexp))
|
||||
(define-c sexp (string-offset "sexp_string_offset_op")
|
||||
((value ctx sexp) (value self sexp) sexp))
|
||||
(define-c sexp (utf8->string! "sexp_utf8_to_string_x")
|
||||
((value ctx sexp) (value self sexp) sexp sexp sexp))
|
||||
|
||||
(define-c sexp (utf8-ref "sexp_utf8_ref")
|
||||
((value ctx sexp) (value self sexp) sexp sexp))
|
||||
(define-c sexp (utf8-next "sexp_utf8_next")
|
||||
((value ctx sexp) (value self sexp) sexp sexp sexp))
|
||||
(define-c sexp (utf8-prev "sexp_utf8_prev")
|
||||
((value ctx sexp) (value self sexp) sexp sexp sexp))
|
||||
|
||||
(define-c sexp (write-u8 "sexp_write_u8")
|
||||
((value ctx sexp) (value self sexp) sexp (default (current-output-port) sexp)))
|
||||
|
|
|
@ -258,15 +258,15 @@ sexp sexp_make_custom_binary_output_port (sexp ctx, sexp self,
|
|||
return res;
|
||||
}
|
||||
|
||||
sexp sexp_bytes_to_string (sexp ctx, sexp vec) {
|
||||
sexp sexp_bytes_to_string (sexp ctx, sexp vec, sexp_uint_t offset, sexp_uint_t size) {
|
||||
sexp res;
|
||||
#if SEXP_USE_PACKED_STRINGS
|
||||
res = sexp_c_string(ctx, sexp_bytes_data(vec), sexp_bytes_length(vec));
|
||||
res = sexp_c_string(ctx, sexp_bytes_data(vec) + offset, size);
|
||||
#else
|
||||
res = sexp_alloc_type(ctx, string, SEXP_STRING);
|
||||
sexp_string_bytes(res) = vec;
|
||||
sexp_string_offset(res) = 0;
|
||||
sexp_string_size(res) = sexp_bytes_length(vec);
|
||||
sexp_string_offset(res) = offset;
|
||||
sexp_string_size(res) = size - offset;
|
||||
#endif
|
||||
return res;
|
||||
}
|
||||
|
@ -275,7 +275,7 @@ sexp sexp_open_input_bytevector (sexp ctx, sexp self, sexp vec) {
|
|||
sexp_gc_var2(str, res);
|
||||
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, vec);
|
||||
sexp_gc_preserve2(ctx, str, res);
|
||||
str = sexp_bytes_to_string(ctx, vec);
|
||||
str = sexp_bytes_to_string(ctx, vec, 0, sexp_bytes_length(vec));
|
||||
res = sexp_open_input_string(ctx, str);
|
||||
sexp_port_binaryp(res) = 1;
|
||||
sexp_gc_release2(ctx);
|
||||
|
@ -341,10 +341,72 @@ sexp sexp_string_to_utf8 (sexp ctx, sexp self, sexp str) {
|
|||
return sexp_string_to_bytes(ctx, res);
|
||||
}
|
||||
|
||||
/* TODO: add validation */
|
||||
sexp sexp_utf8_to_string_x (sexp ctx, sexp self, sexp vec) {
|
||||
sexp sexp_string_to_utf8_x (sexp ctx, sexp self, sexp str) {
|
||||
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
|
||||
#if SEXP_USE_PACKED_STRINGS
|
||||
return sexp_string_to_utf8(ctx, self, str);
|
||||
#else
|
||||
return sexp_string_bytes(str);
|
||||
#endif
|
||||
}
|
||||
|
||||
sexp sexp_string_offset_op (sexp ctx, sexp self, sexp str) {
|
||||
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
|
||||
#if SEXP_USE_PACKED_STRINGS
|
||||
return SEXP_ZERO;
|
||||
#else
|
||||
return sexp_make_fixnum(sexp_string_offset(str));
|
||||
#endif
|
||||
}
|
||||
|
||||
sexp sexp_utf8_ref (sexp ctx, sexp self, sexp bv, sexp offset) {
|
||||
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, bv);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, offset);
|
||||
unsigned char *p=(unsigned char*)sexp_bytes_data(bv) + sexp_unbox_fixnum(offset);
|
||||
if (*p < 0x80)
|
||||
return sexp_make_character(*p);
|
||||
else if ((*p < 0xC0) || (*p > 0xF7))
|
||||
return sexp_user_exception(ctx, NULL, "utf8-ref: invalid utf8 byte", offset);
|
||||
else if (*p < 0xE0)
|
||||
return sexp_make_character(((p[0]&0x3F)<<6) + (p[1]&0x3F));
|
||||
else if (*p < 0xF0)
|
||||
return sexp_make_character(((p[0]&0x1F)<<12) + ((p[1]&0x3F)<<6) + (p[2]&0x3F));
|
||||
else
|
||||
return sexp_make_character(((p[0]&0x0F)<<18) + ((p[1]&0x3F)<<12) + ((p[2]&0x3F)<<6) + (p[3]&0x3F));
|
||||
}
|
||||
|
||||
/* computes length, consider scanning permissively */
|
||||
sexp sexp_utf8_next (sexp ctx, sexp self, sexp bv, sexp offset, sexp end) {
|
||||
sexp_sint_t initial, res;
|
||||
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, bv);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, offset);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, end);
|
||||
if (sexp_unbox_fixnum(offset) >= sexp_unbox_fixnum(end)) return SEXP_FALSE;
|
||||
initial = ((unsigned char*)sexp_bytes_data(bv) + sexp_unbox_fixnum(offset))[0];
|
||||
res = sexp_unbox_fixnum(offset) + (initial < 0xC0 ? 1 : initial < 0xE0 ? 2 : 3 + ((initial>>4)&1));
|
||||
return res > sexp_unbox_fixnum(end) ? SEXP_FALSE : sexp_make_fixnum(res);
|
||||
}
|
||||
|
||||
/* scans backwards permissively */
|
||||
sexp sexp_utf8_prev (sexp ctx, sexp self, sexp bv, sexp offset, sexp start) {
|
||||
sexp_sint_t i, limit;
|
||||
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, bv);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, offset);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, start);
|
||||
unsigned char *p=(unsigned char*)sexp_bytes_data(bv);
|
||||
i = sexp_unbox_fixnum(offset) - 1;
|
||||
limit = sexp_unbox_fixnum(start);
|
||||
while (i >= limit && ((p[i]>>6) == 2))
|
||||
--i;
|
||||
return i < limit ? SEXP_FALSE : sexp_make_fixnum(i);
|
||||
}
|
||||
|
||||
/* TODO: add optional encoding validation */
|
||||
sexp sexp_utf8_to_string_x (sexp ctx, sexp self, sexp vec, sexp offset, sexp size) {
|
||||
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, vec);
|
||||
return sexp_bytes_to_string(ctx, vec);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, offset);
|
||||
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, size);
|
||||
return sexp_bytes_to_string(ctx, vec, sexp_unbox_fixnum(offset), sexp_unbox_fixnum(size));
|
||||
}
|
||||
|
||||
sexp sexp_write_u8 (sexp ctx, sexp self, sexp u8, sexp out) {
|
||||
|
|
Loading…
Add table
Reference in a new issue