mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-19 05:39:18 +02:00
518 lines
20 KiB
Scheme
518 lines
20 KiB
Scheme
;;; Copyright (C) William D Clinger (2016).
|
|
;;;
|
|
;;; Permission is hereby granted, free of charge, to any person
|
|
;;; obtaining a copy of this software and associated documentation
|
|
;;; files (the "Software"), to deal in the Software without
|
|
;;; restriction, including without limitation the rights to use,
|
|
;;; copy, modify, merge, publish, distribute, sublicense, and/or
|
|
;;; sell copies of the Software, and to permit persons to whom the
|
|
;;; Software is furnished to do so, subject to the following
|
|
;;; conditions:
|
|
;;;
|
|
;;; The above copyright notice and this permission notice shall be
|
|
;;; included in all copies or substantial portions of the Software.
|
|
;;;
|
|
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
;;; OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
;;; FIXME: these utilities should be in a separate file
|
|
|
|
(define (complain name . args)
|
|
(apply error
|
|
(string-append (symbol->string name) ": illegal arguments")
|
|
args))
|
|
|
|
(define (list->bytevector bytes)
|
|
(let* ((n (length bytes))
|
|
(bv (make-bytevector n)))
|
|
(do ((i 0 (+ i 1))
|
|
(bytes bytes (cdr bytes)))
|
|
((= i n))
|
|
(bytevector-u8-set! bv i (car bytes)))
|
|
bv))
|
|
|
|
;;; 1-argument version for internal use
|
|
|
|
(define (%string->text s)
|
|
(if (string? s)
|
|
(text-tabulate (lambda (i) (string-ref s i))
|
|
(string-length s))
|
|
(complain 'string->text s)))
|
|
|
|
;;; A portable implementation can't rely on inlining,
|
|
;;; but it can rely on macros.
|
|
|
|
(define N 128)
|
|
|
|
(define (length&i0 len i0)
|
|
(+ (* N len) i0))
|
|
|
|
#;
|
|
(define (length&i0.length k)
|
|
(quotient k N))
|
|
|
|
#;
|
|
(define (length&i0.i0 k)
|
|
(remainder k N))
|
|
|
|
(define-syntax length&i0.length
|
|
(syntax-rules ()
|
|
((_ k)
|
|
(quotient k N))))
|
|
|
|
(define-syntax length&i0.i0
|
|
(syntax-rules ()
|
|
((_ k)
|
|
(remainder k N))))
|
|
|
|
(define-record-type text-rtd
|
|
(new-text0 k chunks)
|
|
text?
|
|
(k text.k)
|
|
(chunks text.chunks))
|
|
|
|
(define (%new-text len i0 chunks)
|
|
(new-text0 (length&i0 len i0) chunks))
|
|
|
|
(define the-empty-text
|
|
(%new-text 0 0 (vector (make-bytevector 0))))
|
|
|
|
;;; text? is defined by the record definition above.
|
|
|
|
(define (text-length txt)
|
|
(if (text? txt)
|
|
(length&i0.length (text.k txt))
|
|
(error "text-length: not a text" txt)))
|
|
|
|
(define (text-ref txt i)
|
|
(if (and (text? txt)
|
|
(exact-integer? i)
|
|
(<= 0 i))
|
|
(let* ((k (text.k txt))
|
|
(chunks (text.chunks txt))
|
|
(len (length&i0.length k))
|
|
(i0 (length&i0.i0 k))
|
|
(i+i0 (+ i i0))
|
|
(j (quotient i+i0 N))
|
|
(ii (remainder i+i0 N)))
|
|
(if (< i len)
|
|
(let* ((sj (vector-ref chunks j))
|
|
(sjn (bytevector-length sj)))
|
|
(if (if (< j (- (vector-length chunks) 1))
|
|
(= sjn N)
|
|
(= sjn (remainder (+ i0 len) N)))
|
|
(integer->char (bytevector-u8-ref sj ii))
|
|
(%utf8-ref sj ii)))
|
|
(error "text-ref: index out of range" txt i)))
|
|
(error "text-ref: illegal arguments" txt i)))
|
|
|
|
;;; Non-checking versions for internal use.
|
|
|
|
(define (%text-length txt)
|
|
(length&i0.length (text.k txt)))
|
|
|
|
(define (%text-ref txt i)
|
|
(let* ((k (text.k txt))
|
|
(chunks (text.chunks txt))
|
|
(len (length&i0.length k))
|
|
(i0 (length&i0.i0 k))
|
|
(i+i0 (+ i i0))
|
|
(j (quotient i+i0 N))
|
|
(ii (remainder i+i0 N))
|
|
(sj (vector-ref chunks j))
|
|
(sjn (bytevector-length sj)))
|
|
(if (if (< j (- (vector-length chunks) 1))
|
|
(= sjn N)
|
|
(= sjn (remainder (+ i0 len) N)))
|
|
(integer->char (bytevector-u8-ref sj ii))
|
|
(%utf8-ref sj ii))))
|
|
|
|
;;; Returns character i of the UTF-8.
|
|
|
|
(define (%utf8-ref bv i)
|
|
(let loop ((j 0) ; character index of (bytevector-u8-ref bv k)
|
|
(k 0)) ; byte index into bv
|
|
(if (= i j)
|
|
(%utf8-char-at bv k)
|
|
(let ((byte (bytevector-u8-ref bv k)))
|
|
(cond ((< byte 128)
|
|
(loop (+ j 1) (+ k 1)))
|
|
((< byte #b11100000)
|
|
(loop (+ j 1) (+ k 2)))
|
|
((< byte #b11110000)
|
|
(loop (+ j 1) (+ k 3)))
|
|
(else
|
|
(loop (+ j 1) (+ k 4))))))))
|
|
|
|
(define (%utf8-char-at bv k)
|
|
(let ((byte (bytevector-u8-ref bv k)))
|
|
(cond ((< byte 128)
|
|
(integer->char byte))
|
|
((< byte #b11100000)
|
|
(let* ((byte2 (bytevector-u8-ref bv (+ k 1)))
|
|
(bits1 (- byte #b11000000))
|
|
(bits2 (- byte2 #b10000000))
|
|
(cp (+ (* 64 bits1) bits2)))
|
|
(integer->char cp)))
|
|
((< byte #b11110000)
|
|
(let* ((byte2 (bytevector-u8-ref bv (+ k 1)))
|
|
(byte3 (bytevector-u8-ref bv (+ k 2)))
|
|
(bits1 (- byte #b11100000))
|
|
(bits2 (- byte2 #b10000000))
|
|
(bits3 (- byte3 #b10000000))
|
|
(cp (+ (* 64 64 bits1) (* 64 bits2) bits3)))
|
|
(integer->char cp)))
|
|
(else
|
|
(let* ((byte2 (bytevector-u8-ref bv (+ k 1)))
|
|
(byte3 (bytevector-u8-ref bv (+ k 2)))
|
|
(byte4 (bytevector-u8-ref bv (+ k 3)))
|
|
(bits1 (- byte #b11110000))
|
|
(bits2 (- byte2 #b10000000))
|
|
(bits3 (- byte3 #b10000000))
|
|
(bits4 (- byte4 #b10000000))
|
|
(cp (+ (* 64 64 64 bits1)
|
|
(* 64 64 bits2)
|
|
(* 64 bits3)
|
|
bits4)))
|
|
(integer->char cp))))))
|
|
|
|
;;; text-tabulate avoids side effects (in case proc returns more than once)
|
|
|
|
(define (text-tabulate proc len)
|
|
(if (= 0 len)
|
|
the-empty-text
|
|
(let loop ((i len) ; highest index that's been tabulated
|
|
(chunks '())
|
|
(bytes '()))
|
|
(cond ((= 0 i)
|
|
(%new-text len
|
|
0
|
|
(list->vector
|
|
(cons (list->bytevector bytes)
|
|
chunks))))
|
|
((and (= 0 (remainder i N))
|
|
(not (null? bytes)))
|
|
(loop i
|
|
(cons (list->bytevector bytes) chunks)
|
|
'()))
|
|
(else
|
|
(let* ((i-1 (- i 1))
|
|
(c (proc i-1)))
|
|
(if (char? c)
|
|
(let ((cp (char->integer c)))
|
|
(loop i-1
|
|
chunks
|
|
(cond ((< cp #x0080)
|
|
(cons cp bytes))
|
|
((< cp #x0800)
|
|
(let* ((bits1 (quotient cp 64))
|
|
(bits2 (remainder cp 64))
|
|
(byte1 (+ bits1 #b11000000))
|
|
(byte2 (+ bits2 #b10000000)))
|
|
(cons byte1 (cons byte2 bytes))))
|
|
((< cp #x10000)
|
|
(let* ((bits1 (quotient cp (* 64 64)))
|
|
(bits2 (quotient
|
|
(remainder cp (* 64 64))
|
|
64))
|
|
(bits3 (remainder cp 64))
|
|
(byte1 (+ bits1 #b11100000))
|
|
(byte2 (+ bits2 #b10000000))
|
|
(byte3 (+ bits3 #b10000000)))
|
|
(cons byte1
|
|
(cons byte2
|
|
(cons byte3 bytes)))))
|
|
(else
|
|
(let* ((bits1 (quotient cp (* 64 64 64)))
|
|
(bits2 (quotient
|
|
(remainder cp (* 64 64 64))
|
|
(* 64 64)))
|
|
(bits3 (quotient
|
|
(remainder cp (* 64 64))
|
|
64))
|
|
(bits4 (remainder cp 64))
|
|
(byte1 (+ bits1 #b11110000))
|
|
(byte2 (+ bits2 #b10000000))
|
|
(byte3 (+ bits3 #b10000000))
|
|
(byte4 (+ bits4 #b10000000)))
|
|
(cons byte1
|
|
(cons byte2
|
|
(cons byte3
|
|
(cons byte4
|
|
bytes)))))))))
|
|
(error "text-tabulate: proc returned a non-character"
|
|
proc len c))))))))
|
|
|
|
;;; FIXME: should the fast case do something different
|
|
;;; if the length of the result is sufficiently small?
|
|
;;; Probably not: splitting a 100-character text into
|
|
;;; 100 1-character texts should be fast, and programmers
|
|
;;; can now use text-copy instead if they're worried about it.
|
|
;;;
|
|
;;; subtext is now defined only for texts; use subtextual
|
|
;;; if the first argument might be a string.
|
|
|
|
(define (subtext txt start end)
|
|
(cond ((and (text? txt)
|
|
(exact-integer? start)
|
|
(exact-integer? end)
|
|
(<= 0 start end))
|
|
(%subtext txt start end))
|
|
#; ((string? txt)
|
|
(%string->text (substring txt start end)))
|
|
(else
|
|
(complain 'subtext txt start end))))
|
|
|
|
(define (%subtext txt start end)
|
|
(let* ((k (text.k txt))
|
|
(chunks (text.chunks txt))
|
|
(len (length&i0.length k))
|
|
(i0 (length&i0.i0 k))
|
|
(i+i0 (+ start i0))
|
|
(end+i0 (+ end i0))
|
|
(len+i0 (+ len i0))
|
|
(jstart (quotient i+i0 N))
|
|
(jend (quotient end+i0 N))
|
|
(jlen (quotient len N)))
|
|
(if (<= end len)
|
|
(cond ((= start end)
|
|
the-empty-text)
|
|
((and (= 0 jstart)
|
|
(= jlen jend))
|
|
;; the fast case
|
|
(%new-text (- end start) i+i0 chunks))
|
|
(else
|
|
(let* ((v (make-vector (+ 1 (- jend jstart)))))
|
|
(do ((j jstart (+ j 1)))
|
|
((> j jend))
|
|
(vector-set! v (- j jstart) (vector-ref chunks j)))
|
|
(%new-text (- end start)
|
|
(remainder i+i0 N)
|
|
v))))
|
|
(error "subtext: end out of range" txt start end))))
|
|
|
|
;;; There are a lot of special cases that could be exploited here:
|
|
;;; share the characters of the longest text
|
|
;;; share the characters of the longest run of texts
|
|
;;; whose characters don't have to be copied
|
|
;;; if (text-length txt1) is a multiple of N,
|
|
;;; and txt2 starts at offset 0,
|
|
;;; then txt1 and txt2 can be concatenated
|
|
;;; without copying any of their characters
|
|
;;;
|
|
;;; That's a partial list.
|
|
;;; It would be easy to spend more time detecting special cases
|
|
;;; than would be saved on average.
|
|
;;; In the interest of simplicity and reliability, this code
|
|
;;; currently implements only two special cases:
|
|
;;; share the full chunks of the longest text
|
|
;;; provided
|
|
;;; it contains at least N characters
|
|
;;; it contains at least half the characters of the result
|
|
;;; its characters start at offset zero
|
|
;;; share the full chunks of the first text
|
|
;;; provided
|
|
;;; it contains at least N characters
|
|
;;; its characters start at offset zero
|
|
|
|
(define (textual-concatenate texts)
|
|
(cond ((not (list? texts))
|
|
(complain 'textual-concatenate texts))
|
|
((null? texts) the-empty-text)
|
|
((null? (cdr texts))
|
|
(let ((txt (car texts)))
|
|
(cond ((text? txt) txt)
|
|
((string? txt)
|
|
(%string->text txt))
|
|
(else (complain 'textual-concatenate texts)))))
|
|
(else
|
|
(let loop ((items (reverse texts))
|
|
(real-texts '())
|
|
(n 0)
|
|
(longest #f)
|
|
(longest-length 0))
|
|
(cond ((null? items)
|
|
(%text-concatenate-n real-texts n longest longest-length))
|
|
((text? (car items))
|
|
(let* ((txt (car items))
|
|
(k (%text-length txt)))
|
|
(loop (cdr items)
|
|
(cons txt real-texts)
|
|
(+ n k)
|
|
(if (> k longest-length) txt longest)
|
|
(max k longest-length))))
|
|
((string? (car items))
|
|
(loop (cons (%string->text (car items)) (cdr items))
|
|
real-texts n longest longest-length))
|
|
(else
|
|
(complain 'textual-concatenate texts)))))))
|
|
|
|
;;; All of the texts are really texts. No strings.
|
|
;;; n is the length of the result.
|
|
;;; longest is #f or the longest of the texts, and
|
|
;;; longest-length is its length (or zero).
|
|
|
|
(define (%text-concatenate-n texts n longest longest-length)
|
|
(if (and longest
|
|
(> longest-length N)
|
|
(< n (+ longest-length longest-length))
|
|
(= 0 (length&i0.i0 (text.k longest))))
|
|
(if (eq? longest (car texts))
|
|
(%%text-concatenate-n texts n)
|
|
(let loop ((texts texts)
|
|
(front '())
|
|
(front-length 0))
|
|
(cond ((eq? longest (car texts))
|
|
(%%text-concatenate-front
|
|
(reverse front)
|
|
(%%text-concatenate-n texts (- n front-length))
|
|
front-length
|
|
n))
|
|
(else
|
|
(let ((txt (car texts)))
|
|
(loop (cdr texts)
|
|
(cons txt front)
|
|
(+ front-length (%text-length txt))))))))
|
|
(%%text-concatenate-n texts n)))
|
|
|
|
;;; texts is a non-empty list of texts, with no strings.
|
|
;;; n is the length of the result.
|
|
;;;
|
|
;;; The text returned has a start index of zero.
|
|
;;;
|
|
;;; Special case:
|
|
;;; If the first text has a start index of zero,
|
|
;;; then its full chunks don't have to be copied.
|
|
|
|
(define (%%text-concatenate-n texts n)
|
|
(if (= 0 n)
|
|
the-empty-text
|
|
(let* ((n/N (quotient n N))
|
|
(m (remainder n N))
|
|
(nchunks (+ n/N (if (= 0 m) 0 1)))
|
|
(chunks (make-vector nchunks 'bug-in-text-concatenate))
|
|
(txt (car texts))
|
|
(k (text.k txt))
|
|
(len (length&i0.length k))
|
|
(i0 (length&i0.i0 k)))
|
|
(if (and (> len N)
|
|
(= 0 i0))
|
|
(let* ((j (quotient len N))
|
|
(ti (* j N))
|
|
(chunks0 (text.chunks txt)))
|
|
(do ((i 0 (+ i 1)))
|
|
((= i j))
|
|
(vector-set! chunks i (vector-ref chunks0 i)))
|
|
(%%text-concatenate-finish n 0 chunks j texts ti))
|
|
(%%text-concatenate-finish n 0 chunks 0 texts 0)))))
|
|
|
|
;;; All of the texts are really texts. No strings.
|
|
;;; The second argument is a text that starts at offset zero.
|
|
;;; k is the total length of the texts passed as first argument.
|
|
;;; n is the length of the result.
|
|
;;;
|
|
;;; Returns the texts concatenated with the second argument,
|
|
;;; without copying any chunks of the second argument.
|
|
|
|
(define (%%text-concatenate-front texts txt k n)
|
|
(let* ((k/N (quotient k N))
|
|
(mk (remainder k N))
|
|
(i0 (if (= 0 mk) 0 (- N mk))) ; start offset for result
|
|
(kchunks (+ k/N (if (= 0 mk) 0 1))) ; number of new chunks
|
|
(n-k (- n k))
|
|
(n-k/N (quotient n-k N))
|
|
(m (remainder n-k N))
|
|
(nchunks (+ kchunks
|
|
n-k/N
|
|
(if (= 0 m) 0 1)))
|
|
(chunks (make-vector nchunks 'bug-in-text-concatenate))
|
|
(chunks2 (text.chunks txt)))
|
|
|
|
;; copy extant chunks
|
|
|
|
(do ((i kchunks (+ i 1)))
|
|
((= i nchunks))
|
|
(vector-set! chunks i (vector-ref chunks2 (- i kchunks))))
|
|
|
|
(%%text-concatenate-finish n i0 chunks 0 texts 0)))
|
|
|
|
;;; Given:
|
|
;;;
|
|
;;; n : the length of a text to be created
|
|
;;; i0 : start offset for the text to be created
|
|
;;; chunks : the vector of chunks for that new text
|
|
;;; j : vector index of first uninitialized chunk
|
|
;;; texts : a non-empty list of texts to be copied into the chunks
|
|
;;; ti : index of first uncopied character in the first text
|
|
;;;
|
|
;;; Creates new chunks as necessary, copies the texts into those chunks
|
|
;;; and returns a new text.
|
|
;;; The given texts may not fill the chunks because the chunks
|
|
;;; of some shared text may already have been copied into some
|
|
;;; tail of the chunks vector.
|
|
|
|
(define (%%text-concatenate-finish n i0 chunks j texts ti)
|
|
(let loop ((texts (cdr texts))
|
|
(txt (car texts))
|
|
(j j) ; index into chunks
|
|
(k i0) ; index into (vector-ref chunks j)
|
|
(ti ti) ; index into txt
|
|
(bytes (make-list i0 0))) ; bytes being collected for next chunk
|
|
(cond ((= k N)
|
|
(let ((bv (list->bytevector (reverse bytes))))
|
|
(vector-set! chunks j bv))
|
|
(loop texts txt (+ j 1) 0 ti '()))
|
|
((= ti (%text-length txt))
|
|
(if (null? texts)
|
|
(begin (if (not (null? bytes))
|
|
(let ((bv (list->bytevector (reverse bytes))))
|
|
(vector-set! chunks j bv)))
|
|
(%new-text n i0 chunks))
|
|
(loop (cdr texts) (car texts) j k 0 bytes)))
|
|
(else
|
|
(let* ((cp (char->integer (%text-ref txt ti)))
|
|
(bytes (cond ((< cp #x0080)
|
|
(cons cp bytes))
|
|
((< cp #x0800)
|
|
(let* ((bits1 (quotient cp 64))
|
|
(bits2 (remainder cp 64))
|
|
(byte1 (+ bits1 #b11000000))
|
|
(byte2 (+ bits2 #b10000000)))
|
|
(cons byte2 (cons byte1 bytes))))
|
|
((< cp #x10000)
|
|
(let* ((bits1 (quotient cp (* 64 64)))
|
|
(bits2 (quotient
|
|
(remainder cp (* 64 64))
|
|
64))
|
|
(bits3 (remainder cp 64))
|
|
(byte1 (+ bits1 #b11100000))
|
|
(byte2 (+ bits2 #b10000000))
|
|
(byte3 (+ bits3 #b10000000)))
|
|
(cons byte3
|
|
(cons byte2
|
|
(cons byte1 bytes)))))
|
|
(else
|
|
(let* ((bits1 (quotient cp (* 64 64 64)))
|
|
(bits2 (quotient
|
|
(remainder cp (* 64 64 64))
|
|
(* 64 64)))
|
|
(bits3 (quotient
|
|
(remainder cp (* 64 64))
|
|
64))
|
|
(bits4 (remainder cp 64))
|
|
(byte1 (+ bits1 #b11110000))
|
|
(byte2 (+ bits2 #b10000000))
|
|
(byte3 (+ bits3 #b10000000))
|
|
(byte4 (+ bits4 #b10000000)))
|
|
(cons byte4
|
|
(cons byte3
|
|
(cons byte2
|
|
(cons byte1
|
|
bytes)))))))))
|
|
(loop texts txt j (+ k 1) (+ ti 1) bytes))))))
|