Add some extended utf8 IO utilities.

This commit is contained in:
Alex Shinn 2024-03-18 22:47:40 +09:00
parent 5b19aab107
commit c4611cc33f
5 changed files with 101 additions and 28 deletions

View file

@ -134,6 +134,20 @@
(read-string 4096 in) (read-string 4096 in)
(read-line in))) (read-line in)))
(let ((bv (string->utf8 "日本語")))
(test #\日 (utf8-ref bv 0))
(test #\本 (utf8-ref bv 3))
(test #\語 (utf8-ref bv 6))
(test 3 (utf8-next bv 0 9))
(test 6 (utf8-next bv 3 9))
(test 9 (utf8-next bv 6 9))
(test #f (utf8-next bv 9 9))
(test 6 (utf8-prev bv 9 0))
(test 3 (utf8-prev bv 6 0))
(test 0 (utf8-prev bv 3 0))
(test #f (utf8-prev bv 0 0))
)
(test #u8(0 1 2) (test #u8(0 1 2)
(let ((in (bytevectors->input-port (list #u8(0 1 2))))) (let ((in (bytevectors->input-port (list #u8(0 1 2)))))
(read-bytevector 3 in))) (read-bytevector 3 in)))

View file

@ -14,7 +14,8 @@
make-filtered-output-port make-filtered-input-port make-filtered-output-port make-filtered-input-port
string-count-chars string-count-chars
open-input-bytevector open-output-bytevector get-output-bytevector open-input-bytevector open-output-bytevector get-output-bytevector
string->utf8 utf8->string string->utf8 string->utf8! string-offset utf8->string utf8->string!
utf8-ref utf8-next utf8-prev
write-string write-u8 read-u8 peek-u8 send-file write-string write-u8 read-u8 peek-u8 send-file
is-a-socket? is-a-socket?
call-with-input-file call-with-output-file) call-with-input-file call-with-output-file)

View file

@ -9,25 +9,10 @@
(call-with-input-string " " (call-with-input-string " "
(lambda (in) (read-char in) (read-char in)))) (lambda (in) (read-char in) (read-char in))))
;; Copy whole characters from the given cursor positions.
;; Return the src cursor position of the next unwritten char,
;; which may be before `to' if the char would overflow.
;; Now provided as a primitive from (chibi ast).
;; (define (string-cursor-copy! dst start src from to)
;; (let lp ((i from)
;; (j (string-cursor->index dst start)))
;; (let ((i2 (string-cursor-next src i)))
;; (cond ((> i2 to) i)
;; (else
;; (string-set! dst j (string-cursor-ref src i))
;; (lp i2 (+ j 1)))))))
(define (utf8->string vec . o) (define (utf8->string vec . o)
(if (pair? o) (let ((start (if (pair? o) (car o) 0))
(let ((start (car o)) (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (bytevector-length vec))))
(end (if (pair? (cdr o)) (cadr o) (bytevector-length vec)))) (string-copy (utf8->string! vec start end))))
(utf8->string (subbytes vec start end)))
(string-copy (utf8->string! vec))))
(define (string->utf8 str . o) (define (string->utf8 str . o)
(if (pair? o) (if (pair? o)

View file

@ -50,8 +50,19 @@
((value ctx sexp) (value self sexp) sexp sexp sexp (default NULL sexp))) ((value ctx sexp) (value self sexp) sexp sexp sexp (default NULL sexp)))
(define-c sexp (%string->utf8 "sexp_string_to_utf8") (define-c sexp (%string->utf8 "sexp_string_to_utf8")
((value ctx sexp) (value self sexp) sexp)) ((value ctx sexp) (value self sexp) sexp))
(define-c sexp (utf8->string! "sexp_utf8_to_string_x") (define-c sexp (string->utf8! "sexp_string_to_utf8_x")
((value ctx sexp) (value self sexp) sexp)) ((value ctx sexp) (value self sexp) sexp))
(define-c sexp (string-offset "sexp_string_offset_op")
((value ctx sexp) (value self sexp) sexp))
(define-c sexp (utf8->string! "sexp_utf8_to_string_x")
((value ctx sexp) (value self sexp) sexp sexp sexp))
(define-c sexp (utf8-ref "sexp_utf8_ref")
((value ctx sexp) (value self sexp) sexp sexp))
(define-c sexp (utf8-next "sexp_utf8_next")
((value ctx sexp) (value self sexp) sexp sexp sexp))
(define-c sexp (utf8-prev "sexp_utf8_prev")
((value ctx sexp) (value self sexp) sexp sexp sexp))
(define-c sexp (write-u8 "sexp_write_u8") (define-c sexp (write-u8 "sexp_write_u8")
((value ctx sexp) (value self sexp) sexp (default (current-output-port) sexp))) ((value ctx sexp) (value self sexp) sexp (default (current-output-port) sexp)))

View file

@ -258,15 +258,15 @@ sexp sexp_make_custom_binary_output_port (sexp ctx, sexp self,
return res; return res;
} }
sexp sexp_bytes_to_string (sexp ctx, sexp vec) { sexp sexp_bytes_to_string (sexp ctx, sexp vec, sexp_uint_t offset, sexp_uint_t size) {
sexp res; sexp res;
#if SEXP_USE_PACKED_STRINGS #if SEXP_USE_PACKED_STRINGS
res = sexp_c_string(ctx, sexp_bytes_data(vec), sexp_bytes_length(vec)); res = sexp_c_string(ctx, sexp_bytes_data(vec) + offset, size);
#else #else
res = sexp_alloc_type(ctx, string, SEXP_STRING); res = sexp_alloc_type(ctx, string, SEXP_STRING);
sexp_string_bytes(res) = vec; sexp_string_bytes(res) = vec;
sexp_string_offset(res) = 0; sexp_string_offset(res) = offset;
sexp_string_size(res) = sexp_bytes_length(vec); sexp_string_size(res) = size - offset;
#endif #endif
return res; return res;
} }
@ -275,7 +275,7 @@ sexp sexp_open_input_bytevector (sexp ctx, sexp self, sexp vec) {
sexp_gc_var2(str, res); sexp_gc_var2(str, res);
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, vec); sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, vec);
sexp_gc_preserve2(ctx, str, res); sexp_gc_preserve2(ctx, str, res);
str = sexp_bytes_to_string(ctx, vec); str = sexp_bytes_to_string(ctx, vec, 0, sexp_bytes_length(vec));
res = sexp_open_input_string(ctx, str); res = sexp_open_input_string(ctx, str);
sexp_port_binaryp(res) = 1; sexp_port_binaryp(res) = 1;
sexp_gc_release2(ctx); sexp_gc_release2(ctx);
@ -341,10 +341,72 @@ sexp sexp_string_to_utf8 (sexp ctx, sexp self, sexp str) {
return sexp_string_to_bytes(ctx, res); return sexp_string_to_bytes(ctx, res);
} }
/* TODO: add validation */ sexp sexp_string_to_utf8_x (sexp ctx, sexp self, sexp str) {
sexp sexp_utf8_to_string_x (sexp ctx, sexp self, sexp vec) { sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
#if SEXP_USE_PACKED_STRINGS
return sexp_string_to_utf8(ctx, self, str);
#else
return sexp_string_bytes(str);
#endif
}
sexp sexp_string_offset_op (sexp ctx, sexp self, sexp str) {
sexp_assert_type(ctx, sexp_stringp, SEXP_STRING, str);
#if SEXP_USE_PACKED_STRINGS
return SEXP_ZERO;
#else
return sexp_make_fixnum(sexp_string_offset(str));
#endif
}
sexp sexp_utf8_ref (sexp ctx, sexp self, sexp bv, sexp offset) {
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, bv);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, offset);
unsigned char *p=(unsigned char*)sexp_bytes_data(bv) + sexp_unbox_fixnum(offset);
if (*p < 0x80)
return sexp_make_character(*p);
else if ((*p < 0xC0) || (*p > 0xF7))
return sexp_user_exception(ctx, NULL, "utf8-ref: invalid utf8 byte", offset);
else if (*p < 0xE0)
return sexp_make_character(((p[0]&0x3F)<<6) + (p[1]&0x3F));
else if (*p < 0xF0)
return sexp_make_character(((p[0]&0x1F)<<12) + ((p[1]&0x3F)<<6) + (p[2]&0x3F));
else
return sexp_make_character(((p[0]&0x0F)<<18) + ((p[1]&0x3F)<<12) + ((p[2]&0x3F)<<6) + (p[3]&0x3F));
}
/* computes length, consider scanning permissively */
sexp sexp_utf8_next (sexp ctx, sexp self, sexp bv, sexp offset, sexp end) {
sexp_sint_t initial, res;
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, bv);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, offset);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, end);
if (sexp_unbox_fixnum(offset) >= sexp_unbox_fixnum(end)) return SEXP_FALSE;
initial = ((unsigned char*)sexp_bytes_data(bv) + sexp_unbox_fixnum(offset))[0];
res = sexp_unbox_fixnum(offset) + (initial < 0xC0 ? 1 : initial < 0xE0 ? 2 : 3 + ((initial>>4)&1));
return res > sexp_unbox_fixnum(end) ? SEXP_FALSE : sexp_make_fixnum(res);
}
/* scans backwards permissively */
sexp sexp_utf8_prev (sexp ctx, sexp self, sexp bv, sexp offset, sexp start) {
sexp_sint_t i, limit;
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, bv);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, offset);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, start);
unsigned char *p=(unsigned char*)sexp_bytes_data(bv);
i = sexp_unbox_fixnum(offset) - 1;
limit = sexp_unbox_fixnum(start);
while (i >= limit && ((p[i]>>6) == 2))
--i;
return i < limit ? SEXP_FALSE : sexp_make_fixnum(i);
}
/* TODO: add optional encoding validation */
sexp sexp_utf8_to_string_x (sexp ctx, sexp self, sexp vec, sexp offset, sexp size) {
sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, vec); sexp_assert_type(ctx, sexp_bytesp, SEXP_BYTES, vec);
return sexp_bytes_to_string(ctx, vec); sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, offset);
sexp_assert_type(ctx, sexp_fixnump, SEXP_FIXNUM, size);
return sexp_bytes_to_string(ctx, vec, sexp_unbox_fixnum(offset), sexp_unbox_fixnum(size));
} }
sexp sexp_write_u8 (sexp ctx, sexp self, sexp u8, sexp out) { sexp sexp_write_u8 (sexp ctx, sexp self, sexp u8, sexp out) {