mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-19 21:59:17 +02:00
adding unicode-string-width/wide
This commit is contained in:
parent
6f1cf6588f
commit
1164ecf9b7
13 changed files with 166 additions and 152 deletions
7
Makefile
7
Makefile
|
@ -184,12 +184,19 @@ data/%.txt:
|
||||||
build-lib/chibi/char-set/derived.scm: data/UnicodeData.txt data/DerivedCoreProperties.txt chibi-scheme$(EXE)
|
build-lib/chibi/char-set/derived.scm: data/UnicodeData.txt data/DerivedCoreProperties.txt chibi-scheme$(EXE)
|
||||||
$(CHIBI) tools/extract-unicode-props.scm --default > $@
|
$(CHIBI) tools/extract-unicode-props.scm --default > $@
|
||||||
|
|
||||||
|
build-lib/chibi/char-set/width.scm: data/UnicodeData.txt data/EastAsianWidth.txt chibi-scheme$(EXE)
|
||||||
|
$(CHIBI) tools/extract-unicode-props.scm Zero-Width=Mn > $@
|
||||||
|
$(CHIBI) tools/extract-unicode-props.scm -d data/EastAsianWidth.txt Full-Width=F@1,W@1 Ambiguous-Width=A@1 >> $@
|
||||||
|
|
||||||
lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
||||||
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --ascii chibi.char-set.compute > $@
|
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --ascii chibi.char-set.compute > $@
|
||||||
|
|
||||||
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
||||||
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
|
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
|
||||||
|
|
||||||
|
lib/chibi/show/width.scm: build-lib/chibi/char-set/width.scm chibi-scheme$(EXE)
|
||||||
|
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --predicate chibi.char-set.width > $@
|
||||||
|
|
||||||
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
|
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
|
||||||
$(CHIBI) tools/extract-case-offsets.scm $< > $@
|
$(CHIBI) tools/extract-case-offsets.scm $< > $@
|
||||||
|
|
||||||
|
|
|
@ -168,3 +168,23 @@
|
||||||
,(iset-bits iset)
|
,(iset-bits iset)
|
||||||
,(iset->code (iset-left iset))
|
,(iset->code (iset-left iset))
|
||||||
,(iset->code (iset-right iset)))))
|
,(iset->code (iset-right iset)))))
|
||||||
|
|
||||||
|
;; uses only if, <, <=, >, and SRFI 151 bit-set?
|
||||||
|
(define (iset->code/lambda iset)
|
||||||
|
(define (code iset)
|
||||||
|
(and iset
|
||||||
|
(if (and (not (iset-left iset))
|
||||||
|
(not (iset-right iset))
|
||||||
|
(not (iset-bits iset)))
|
||||||
|
`(<= ,(iset-start iset) n ,(iset-end iset))
|
||||||
|
`(if (< n ,(iset-start iset))
|
||||||
|
,(code (iset-left iset))
|
||||||
|
,(if (and (not (iset-right iset)) (not (iset-bits iset)))
|
||||||
|
`(<= n ,(iset-end iset))
|
||||||
|
`(if (> n ,(iset-end iset))
|
||||||
|
,(code (iset-right iset))
|
||||||
|
,(if (iset-bits iset)
|
||||||
|
`(bit-set? (- n ,(iset-start iset))
|
||||||
|
,(iset-bits iset))
|
||||||
|
#t)))))))
|
||||||
|
`(lambda (n) ,(code iset)))
|
||||||
|
|
|
@ -17,4 +17,5 @@
|
||||||
(bitwise-and (%mask size) (arithmetic-shift n (- position)))))))
|
(bitwise-and (%mask size) (arithmetic-shift n (- position)))))))
|
||||||
(include "optimize.scm")
|
(include "optimize.scm")
|
||||||
(export
|
(export
|
||||||
iset-balance iset-balance! iset-optimize iset-optimize! iset->code))
|
iset-balance iset-balance! iset-optimize iset-optimize!
|
||||||
|
iset->code iset->code/lambda))
|
||||||
|
|
|
@ -742,7 +742,13 @@ def | 6
|
||||||
(test "日本語"
|
(test "日本語"
|
||||||
(show #f (as-unicode (with ((pad-char #\〜)) (padded/both 5 "日本語")))))
|
(show #f (as-unicode (with ((pad-char #\〜)) (padded/both 5 "日本語")))))
|
||||||
(test "日本語 col: 6"
|
(test "日本語 col: 6"
|
||||||
(show #f (as-unicode "日本語" (fn ((col)) (each " col: " col)))))
|
(show #f (as-unicode "日本語" (fn (col) (each " col: " col)))))
|
||||||
|
(test "日本語ΠΜΕ col: 9"
|
||||||
|
(show #f (as-unicode "日本語ΠΜΕ" (fn (col) (each " col: " col)))))
|
||||||
|
(test "日本語ΠΜΕ col: 12"
|
||||||
|
(show #f (with ((ambiguous-is-wide? #t))
|
||||||
|
(as-unicode "日本語ΠΜΕ"
|
||||||
|
(fn (col) (each " col: " col))))))
|
||||||
|
|
||||||
;; from-file
|
;; from-file
|
||||||
;; for reference, filesystem-test relies on creating files under /tmp
|
;; for reference, filesystem-test relies on creating files under /tmp
|
||||||
|
|
|
@ -1,134 +1,70 @@
|
||||||
;; unicode.scm -- Unicode character width and ANSI escape support
|
;; unicode.scm -- Unicode character width and ANSI escape support
|
||||||
;; Copyright (c) 2006-2017 Alex Shinn. All rights reserved.
|
;; Copyright (c) 2006-2020 Alex Shinn. All rights reserved.
|
||||||
;; BSD-style license: http://synthcode.com/license.txt
|
;; BSD-style license: http://synthcode.com/license.txt
|
||||||
|
|
||||||
;; a condensed non-spacing mark range from UnicodeData.txt (chars with
|
(define (unicode-char-width ch ambiguous-is-wide?)
|
||||||
;; the Mn property) - generated partially by hand, should automate
|
(let ((ci (char->integer ch)))
|
||||||
;; this better
|
|
||||||
|
|
||||||
(define low-non-spacing-chars
|
|
||||||
(bytevector
|
|
||||||
#xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
#x78 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 #xfe #xff #xff #xff #xff #xff #x1f 0 0 0 0 0 0 0
|
|
||||||
0 0 #x3f 0 0 0 0 0 0 #xf8 #xff #x01 0 0 #x01 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 #xc0 #xff #xff #x3f 0 0
|
|
||||||
0 0 #x02 0 0 0 #xff #xff #xff #x07 0 0 0 0 0 0
|
|
||||||
0 0 0 0 #xc0 #xff #x01 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
#x06 0 0 0 0 0 0 #x10 #xfe #x21 #x1e 0 #x0c 0 0 0
|
|
||||||
#x02 0 0 0 0 0 0 #x10 #x1e #x20 0 0 #x0c 0 0 0
|
|
||||||
#x06 0 0 0 0 0 0 #x10 #xfe #x3f 0 0 0 0 #x03 0
|
|
||||||
#x06 0 0 0 0 0 0 #x30 #xfe #x21 0 0 #x0c 0 0 0
|
|
||||||
#x02 0 0 0 0 0 0 #x90 #x0e #x20 #x40 0 0 0 0 0
|
|
||||||
#x04 0 0 0 0 0 0 0 0 #x20 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 #xc0 #xc1 #xff #x7f 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 #x10 #x40 #x30 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 #x0e #x20 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 #x04 #x7c 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 #xf2 #x07 #x80 #x7f 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 #xf2 #x1f 0 #x3f 0 0 0 0 0 0
|
|
||||||
0 0 0 #x03 0 0 #xa0 #x02 0 0 0 0 0 0 #xfe #x7f
|
|
||||||
#xdf 0 #xff #xff #xff #xff #xff #x1f #x40 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 #xe0 #xfd #x02 0 0 0 #x03 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 #x1c 0 0 0 #x1c 0 0 0 #x0c 0 0 0 #x0c 0
|
|
||||||
0 0 0 0 0 0 #x80 #x3f #x40 #xfe #x0f #x20 0 0 0 0
|
|
||||||
0 #x38 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 #x02 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 #x87 #x01 #x04 #x0e 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
||||||
0 0 0 0 0 0 0 0 0 0 #xff #x1f #xe2 #x07
|
|
||||||
))
|
|
||||||
|
|
||||||
(define (unicode-char-width c)
|
|
||||||
(let ((ci (char->integer c)))
|
|
||||||
(cond
|
(cond
|
||||||
;; hand-checked ranges from EastAsianWidth.txt
|
((char-set:zero-width? ci)
|
||||||
((<= #x1100 ci #x115F) 2) ; Hangul
|
0)
|
||||||
((<= #x2E80 ci #x4DB5) 2) ; CJK
|
((char-set:full-width? ci)
|
||||||
((<= #x4E00 ci #xA4C6) 2)
|
2)
|
||||||
((<= #xAC00 ci #xD7A3) 2) ; Hangul
|
((and ambiguous-is-wide? (char-set:ambiguous-width? ci))
|
||||||
((<= #xF900 ci #xFAD9) 2) ; CJK compat
|
2)
|
||||||
((<= #xFE10 ci #xFE6B) 2)
|
(else
|
||||||
((<= #xFF01 ci #xFF60) 2)
|
1))))
|
||||||
((<= #xFFE0 ci #xFFE6) 2)
|
|
||||||
((<= #x20000 ci #x30000) 2)
|
(define (unicode-terminal-width/aux str start end ambiguous-is-wide?)
|
||||||
;; non-spacing mark (Mn) ranges from UnicodeData.txt
|
(let lp1 ((sc start) (width 0))
|
||||||
((<= #x0300 ci #x3029)
|
(if (string-cursor>=? sc end)
|
||||||
;; inlined bit-vector-ref for portability
|
width
|
||||||
(let* ((i (- ci #x0300))
|
(let ((c (string-ref/cursor str sc)))
|
||||||
(byte (quotient i 8))
|
(cond
|
||||||
(off (remainder i 8)))
|
;; ANSI escapes
|
||||||
(if (zero? (bitwise-and (bytevector-u8-ref low-non-spacing-chars byte)
|
;; TODO: maintain a state machine so the escape can be
|
||||||
(arithmetic-shift 1 off)))
|
;; spread across multiple strings
|
||||||
1
|
((and (= 27 (char->integer c)) ; esc
|
||||||
0)))
|
(string-cursor<? (string-cursor-next str sc) end)
|
||||||
((<= #x302A ci #x302F) 0)
|
(eqv? #\[ (string-ref/cursor str (string-cursor-next str sc))))
|
||||||
((<= #x3099 ci #x309A) 0)
|
(let lp2 ((sc (string-cursor-forward str sc 2)))
|
||||||
((= #xFB1E ci) 0)
|
(cond ((string-cursor>=? sc end) width)
|
||||||
((<= #xFE00 ci #xFE23) 0)
|
((memv (string-ref/cursor str sc) '(#\m #\newline))
|
||||||
((<= #x1D167 ci #x1D169) 0)
|
(lp1 (string-cursor-next str sc) width))
|
||||||
((<= #x1D17B ci #x1D182) 0)
|
(else (lp2 (string-cursor-next str sc))))))
|
||||||
((<= #x1D185 ci #x1D18B) 0)
|
;; fast-path ASCII
|
||||||
((<= #x1D1AA ci #x1D1AD) 0)
|
((char<=? c #\~)
|
||||||
((<= #xE0100 ci #xE01EF) 0)
|
(lp1 (string-cursor-next str sc) (+ width 1)))
|
||||||
(else 1))))
|
;; unicode
|
||||||
|
(else
|
||||||
|
(lp1 (string-cursor-next str sc)
|
||||||
|
(+ width (unicode-char-width c ambiguous-is-wide?)))))))))
|
||||||
|
|
||||||
|
(define (cursor-arg str x)
|
||||||
|
(if (string-cursor? x) x (string-index->cursor str x)))
|
||||||
|
|
||||||
|
;; convert args to cursors internally for efficiency
|
||||||
|
|
||||||
(define (unicode-terminal-width str . o)
|
(define (unicode-terminal-width str . o)
|
||||||
(let ((start (if (pair? o) (car o) 0))
|
(let ((start (cursor-arg str (if (pair? o)
|
||||||
(end (if (and (pair? o) (pair? (cdr o)))
|
(car o)
|
||||||
(cadr o)
|
(string-cursor-start str))))
|
||||||
(string-length str))))
|
(end (cursor-arg str (if (and (pair? o) (pair? (cdr o)))
|
||||||
(let lp1 ((i start) (width 0))
|
(cadr o)
|
||||||
(if (>= i end)
|
(string-cursor-end str)))))
|
||||||
width
|
(unicode-terminal-width/aux str start end #f)))
|
||||||
(let ((c (string-ref str i)))
|
|
||||||
(cond
|
(define (unicode-terminal-width/wide str . o)
|
||||||
;; ANSI escapes
|
(let ((start (cursor-arg str (if (pair? o)
|
||||||
((and (= 27 (char->integer c)) ; esc
|
(car o)
|
||||||
(< (+ i 1) end)
|
(string-cursor-start str))))
|
||||||
(eqv? #\[ (string-ref str (+ i 1))))
|
(end (cursor-arg str (if (and (pair? o) (pair? (cdr o)))
|
||||||
(let lp2 ((i (+ i 2)))
|
(cadr o)
|
||||||
(cond ((>= i end) width)
|
(string-cursor-end str)))))
|
||||||
((memv (string-ref str i) '(#\m #\newline))
|
(unicode-terminal-width/aux str start end #t)))
|
||||||
(lp1 (+ i 1) width))
|
|
||||||
(else (lp2 (+ i 1))))))
|
|
||||||
;; unicode characters
|
|
||||||
((>= (char->integer c) #x80)
|
|
||||||
(lp1 (+ i 1) (+ width (unicode-char-width c))))
|
|
||||||
;; normal ASCII
|
|
||||||
(else (lp1 (+ i 1) (+ width 1)))))))))
|
|
||||||
|
|
||||||
(define (as-unicode . args)
|
(define (as-unicode . args)
|
||||||
(with ((string-width unicode-terminal-width))
|
(fn (ambiguous-is-wide?)
|
||||||
(each-in-list args)))
|
(with ((string-width (if ambiguous-is-wide?
|
||||||
|
unicode-terminal-width/wide
|
||||||
|
unicode-terminal-width)))
|
||||||
|
(each-in-list args))))
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
(define-library (chibi show unicode)
|
(define-library (chibi show unicode)
|
||||||
(import (scheme base) (chibi show base) (srfi 151))
|
(import (scheme base) (chibi show base) (srfi 130) (srfi 151))
|
||||||
(export as-unicode unicode-terminal-width)
|
(export as-unicode unicode-terminal-width unicode-terminal-width/wide)
|
||||||
(include "unicode.scm"))
|
(include "width.scm" "unicode.scm"))
|
||||||
|
|
|
@ -19,14 +19,14 @@
|
||||||
;; state variables
|
;; state variables
|
||||||
port row col width output writer string-width pad-char ellipsis
|
port row col width output writer string-width pad-char ellipsis
|
||||||
radix precision decimal-sep decimal-align sign-rule
|
radix precision decimal-sep decimal-align sign-rule
|
||||||
comma-sep comma-rule word-separator?
|
comma-sep comma-rule word-separator? ambiguous-is-wide?
|
||||||
;; pretty
|
;; pretty
|
||||||
pretty pretty-shared pretty-simply pretty-color
|
pretty pretty-shared pretty-simply pretty-color
|
||||||
;; columnar
|
;; columnar
|
||||||
columnar tabular wrapped wrapped/list wrapped/char
|
columnar tabular wrapped wrapped/list wrapped/char
|
||||||
justified from-file line-numbers show-columns
|
justified from-file line-numbers show-columns
|
||||||
;; unicode
|
;; unicode
|
||||||
as-unicode unicode-terminal-width
|
as-unicode unicode-terminal-width unicode-terminal-width/wide
|
||||||
upcased downcased
|
upcased downcased
|
||||||
;; color
|
;; color
|
||||||
as-red as-blue as-green as-cyan as-yellow
|
as-red as-blue as-green as-cyan as-yellow
|
||||||
|
|
|
@ -38,6 +38,7 @@
|
||||||
(output output-default)
|
(output output-default)
|
||||||
(string-width substring-length)
|
(string-width substring-length)
|
||||||
(word-separator? char-whitespace?)
|
(word-separator? char-whitespace?)
|
||||||
|
(ambiguous-is-wide? #f)
|
||||||
(ellipsis "")
|
(ellipsis "")
|
||||||
(decimal-align #f)
|
(decimal-align #f)
|
||||||
(decimal-sep #f)
|
(decimal-sep #f)
|
||||||
|
|
|
@ -48,7 +48,7 @@
|
||||||
;; state variables
|
;; state variables
|
||||||
port row col width output writer string-width pad-char ellipsis
|
port row col width output writer string-width pad-char ellipsis
|
||||||
radix precision decimal-sep decimal-align sign-rule
|
radix precision decimal-sep decimal-align sign-rule
|
||||||
comma-sep comma-rule word-separator?
|
comma-sep comma-rule word-separator? ambiguous-is-wide?
|
||||||
)
|
)
|
||||||
(include "base.scm")
|
(include "base.scm")
|
||||||
(include "../../chibi/show/write.scm")
|
(include "../../chibi/show/write.scm")
|
||||||
|
|
|
@ -741,6 +741,12 @@ def | 6
|
||||||
(show #f (as-unicode (with ((pad-char #\〜)) (padded/both 5 "日本語")))))
|
(show #f (as-unicode (with ((pad-char #\〜)) (padded/both 5 "日本語")))))
|
||||||
(test "日本語 col: 6"
|
(test "日本語 col: 6"
|
||||||
(show #f (as-unicode "日本語" (fn (col) (each " col: " col)))))
|
(show #f (as-unicode "日本語" (fn (col) (each " col: " col)))))
|
||||||
|
(test "日本語ΠΜΕ col: 9"
|
||||||
|
(show #f (as-unicode "日本語ΠΜΕ" (fn (col) (each " col: " col)))))
|
||||||
|
(test "日本語ΠΜΕ col: 12"
|
||||||
|
(show #f (with ((ambiguous-is-wide? #t))
|
||||||
|
(as-unicode "日本語ΠΜΕ"
|
||||||
|
(fn (col) (each " col: " col))))))
|
||||||
|
|
||||||
;; from-file
|
;; from-file
|
||||||
;; for reference, filesystem-test relies on creating files under /tmp
|
;; for reference, filesystem-test relies on creating files under /tmp
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
|
|
||||||
(define-library (srfi 166 unicode)
|
(define-library (srfi 166 unicode)
|
||||||
(import (scheme base) (srfi 151) (srfi 166 base))
|
(import (scheme base) (srfi 130) (srfi 151) (srfi 166 base))
|
||||||
(export-all)
|
(export as-unicode unicode-terminal-width unicode-terminal-width/wide)
|
||||||
(include "../../chibi/show/unicode.scm"))
|
(include "../../chibi/show/width.scm"
|
||||||
|
"../../chibi/show/unicode.scm"))
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
;; A value can be any of:
|
;; A value can be any of:
|
||||||
;;
|
;;
|
||||||
;; Property_Name: all unicode characters with the given derived property
|
;; Property_Name: all unicode characters with the given derived property
|
||||||
|
;; Prop@N: a property matched at a given field (instead of default category)
|
||||||
;; Xx: all unicode characters with the given general category
|
;; Xx: all unicode characters with the given general category
|
||||||
;; X: all unicode characters with any general category X*
|
;; X: all unicode characters with any general category X*
|
||||||
;; NNNN: a single unicode value in hex format
|
;; NNNN: a single unicode value in hex format
|
||||||
|
@ -25,7 +26,7 @@
|
||||||
;; Unicode property name.
|
;; Unicode property name.
|
||||||
;;
|
;;
|
||||||
;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are
|
;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are
|
||||||
;; in the data/ current directory, unless overridden with the --data or
|
;; in the ./data/ directory, unless overridden with the --data or
|
||||||
;; --derived options.
|
;; --derived options.
|
||||||
|
|
||||||
(import (chibi) (chibi io) (chibi string))
|
(import (chibi) (chibi io) (chibi string))
|
||||||
|
@ -35,8 +36,9 @@
|
||||||
(for-each (lambda (x) (display x err)) args)
|
(for-each (lambda (x) (display x err)) args)
|
||||||
(newline err)))
|
(newline err)))
|
||||||
|
|
||||||
;; Parse UnicodeData.txt for characters matching a given class.
|
;; Parse UnicodeData.txt or other semi-colon-delimited TSV file for
|
||||||
(define (extract-char-set-category cat data)
|
;; characters matching a given class in a given field.
|
||||||
|
(define (extract-char-set-category cat field data)
|
||||||
(define (join-to-range n ls)
|
(define (join-to-range n ls)
|
||||||
(cond
|
(cond
|
||||||
((null? ls)
|
((null? ls)
|
||||||
|
@ -62,22 +64,43 @@
|
||||||
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
||||||
(lp ranges))
|
(lp ranges))
|
||||||
(else
|
(else
|
||||||
(let ((ls (string-split line #\; 4)))
|
(let* ((line (substring-cursor line
|
||||||
|
(string-cursor-start line)
|
||||||
|
(string-find line #\#)))
|
||||||
|
(ls (map string-trim (string-split line #\;))))
|
||||||
(cond
|
(cond
|
||||||
((< (length ls) 3)
|
((<= (length ls) field)
|
||||||
(warn "invalid UnicodeData line: " line)
|
(warn "invalid UnicodeData line: " line)
|
||||||
(lp ranges))
|
(lp ranges))
|
||||||
(else
|
(else
|
||||||
(let ((ch (string->number (car ls) 16))
|
(let ((ch (if (string-contains (car ls) "..")
|
||||||
|
(let* ((sc (string-contains (car ls) "..")))
|
||||||
|
(cons (string->number
|
||||||
|
(substring-cursor
|
||||||
|
(car ls)
|
||||||
|
(string-cursor-start (car ls))
|
||||||
|
sc)
|
||||||
|
16)
|
||||||
|
(string->number
|
||||||
|
(substring-cursor
|
||||||
|
(car ls)
|
||||||
|
(string-cursor-forward (car ls) sc 2))
|
||||||
|
16)))
|
||||||
|
(string->number (car ls) 16)))
|
||||||
(name (cadr ls))
|
(name (cadr ls))
|
||||||
(ch-cat (car (cddr ls))))
|
(ch-cat (list-ref ls field)))
|
||||||
(cond
|
(cond
|
||||||
((or (not ch) (not (= 2 (string-length ch-cat))))
|
((not (or (integer? ch)
|
||||||
|
(and (pair? ch)
|
||||||
|
(integer? (car ch))
|
||||||
|
(integer? (cdr ch)))))
|
||||||
(warn "invalid UnicodeData line: " line))
|
(warn "invalid UnicodeData line: " line))
|
||||||
((if (char? cat)
|
((if (char? cat)
|
||||||
(eqv? cat (string-ref ch-cat 0))
|
(eqv? cat (string-ref ch-cat 0))
|
||||||
(equal? cat ch-cat))
|
(equal? cat ch-cat))
|
||||||
(lp (join-to-range ch ranges)))
|
(lp (if (pair? ch)
|
||||||
|
(cons ch ranges)
|
||||||
|
(join-to-range ch ranges))))
|
||||||
(else
|
(else
|
||||||
(lp ranges))))))))))))))
|
(lp ranges))))))))))))))
|
||||||
|
|
||||||
|
@ -131,13 +154,19 @@
|
||||||
(error "invalid character range, expected NNNN-MMMM, got: " def))))
|
(error "invalid character range, expected NNNN-MMMM, got: " def))))
|
||||||
((string->number def 16)
|
((string->number def 16)
|
||||||
=> (lambda (start) `(char-set ,(integer->char start))))
|
=> (lambda (start) `(char-set ,(integer->char start))))
|
||||||
|
((string-find? def #\@)
|
||||||
|
(let* ((sc (string-find def #\@))
|
||||||
|
(cat (substring-cursor def (string-cursor-start def) sc))
|
||||||
|
(field (string->number
|
||||||
|
(substring-cursor def (string-cursor-next def sc)))))
|
||||||
|
(extract-char-set-category cat field data)))
|
||||||
((and (= 1 (string-length def))
|
((and (= 1 (string-length def))
|
||||||
(char-upper-case? (string-ref def 0)))
|
(char-upper-case? (string-ref def 0)))
|
||||||
(extract-char-set-category (string-ref def 0) data))
|
(extract-char-set-category (string-ref def 0) 2 data))
|
||||||
((and (= 2 (string-length def))
|
((and (= 2 (string-length def))
|
||||||
(char-upper-case? (string-ref def 0))
|
(char-upper-case? (string-ref def 0))
|
||||||
(char-lower-case? (string-ref def 1)))
|
(char-lower-case? (string-ref def 1)))
|
||||||
(extract-char-set-category def data))
|
(extract-char-set-category def 2 data))
|
||||||
;; derived properties
|
;; derived properties
|
||||||
((and (> (string-length def) 1)
|
((and (> (string-length def) 1)
|
||||||
(eqv? #\: (string-ref def 0)))
|
(eqv? #\: (string-ref def 0)))
|
||||||
|
|
|
@ -21,13 +21,15 @@
|
||||||
(remove (lambda (i) (hash-table-exists? ls2-tab i)) ls1)))
|
(remove (lambda (i) (hash-table-exists? ls2-tab i)) ls1)))
|
||||||
|
|
||||||
(let ((args (command-line)))
|
(let ((args (command-line)))
|
||||||
(let lp ((ls (cdr args)) (ascii? #f))
|
(let lp ((ls (cdr args)) (ascii? #f) (predicate? #f))
|
||||||
(cond
|
(cond
|
||||||
((and (pair? ls) (not (equal? "" (car ls)))
|
((and (pair? ls) (not (equal? "" (car ls)))
|
||||||
(eqv? #\- (string-ref (car ls) 0)))
|
(eqv? #\- (string-ref (car ls) 0)))
|
||||||
(cond
|
(cond
|
||||||
((member (car ls) '("-a" "--ascii"))
|
((member (car ls) '("-a" "--ascii"))
|
||||||
(lp (cdr ls) #t))
|
(lp (cdr ls) #t predicate?))
|
||||||
|
((member (car ls) '("-p" "--predicate"))
|
||||||
|
(lp (cdr ls) ascii? #t))
|
||||||
(else (error "unknown option" (car ls)))))
|
(else (error "unknown option" (car ls)))))
|
||||||
((or (null? ls) (pair? (cdr ls)))
|
((or (null? ls) (pair? (cdr ls)))
|
||||||
(error "usage: optimize-char-sets.scm [--ascii] module.name"))
|
(error "usage: optimize-char-sets.scm [--ascii] module.name"))
|
||||||
|
@ -78,8 +80,13 @@
|
||||||
(newline (current-error-port))
|
(newline (current-error-port))
|
||||||
(error "optimized iset is different"))))
|
(error "optimized iset is different"))))
|
||||||
(display " writing\n" (current-error-port))
|
(display " writing\n" (current-error-port))
|
||||||
(write `(define ,exp
|
(write
|
||||||
(immutable-char-set ,(iset->code iset2))))
|
(if predicate?
|
||||||
|
`(define ,(string->symbol
|
||||||
|
(string-append (symbol->string exp) "?"))
|
||||||
|
,(iset->code/lambda iset2))
|
||||||
|
`(define ,exp
|
||||||
|
(immutable-char-set ,(iset->code iset2)))))
|
||||||
(newline)
|
(newline)
|
||||||
(newline)
|
(newline)
|
||||||
(display " done\n" (current-error-port)))))))
|
(display " done\n" (current-error-port)))))))
|
||||||
|
|
Loading…
Add table
Reference in a new issue