adding unicode-string-width/wide

This commit is contained in:
Alex Shinn 2020-06-03 10:43:22 +09:00
parent 6f1cf6588f
commit 1164ecf9b7
13 changed files with 166 additions and 152 deletions

View file

@ -184,12 +184,19 @@ data/%.txt:
build-lib/chibi/char-set/derived.scm: data/UnicodeData.txt data/DerivedCoreProperties.txt chibi-scheme$(EXE)
$(CHIBI) tools/extract-unicode-props.scm --default > $@
build-lib/chibi/char-set/width.scm: data/UnicodeData.txt data/EastAsianWidth.txt chibi-scheme$(EXE)
$(CHIBI) tools/extract-unicode-props.scm Zero-Width=Mn > $@
$(CHIBI) tools/extract-unicode-props.scm -d data/EastAsianWidth.txt Full-Width=F@1,W@1 Ambiguous-Width=A@1 >> $@
lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --ascii chibi.char-set.compute > $@
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
lib/chibi/show/width.scm: build-lib/chibi/char-set/width.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --predicate chibi.char-set.width > $@
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
$(CHIBI) tools/extract-case-offsets.scm $< > $@

View file

@ -168,3 +168,23 @@
,(iset-bits iset)
,(iset->code (iset-left iset))
,(iset->code (iset-right iset)))))
;; uses only if, <, <=, >, and SRFI 151 bit-set?
(define (iset->code/lambda iset)
(define (code iset)
(and iset
(if (and (not (iset-left iset))
(not (iset-right iset))
(not (iset-bits iset)))
`(<= ,(iset-start iset) n ,(iset-end iset))
`(if (< n ,(iset-start iset))
,(code (iset-left iset))
,(if (and (not (iset-right iset)) (not (iset-bits iset)))
`(<= n ,(iset-end iset))
`(if (> n ,(iset-end iset))
,(code (iset-right iset))
,(if (iset-bits iset)
`(bit-set? (- n ,(iset-start iset))
,(iset-bits iset))
#t)))))))
`(lambda (n) ,(code iset)))

View file

@ -17,4 +17,5 @@
(bitwise-and (%mask size) (arithmetic-shift n (- position)))))))
(include "optimize.scm")
(export
iset-balance iset-balance! iset-optimize iset-optimize! iset->code))
iset-balance iset-balance! iset-optimize iset-optimize!
iset->code iset->code/lambda))

View file

@ -742,7 +742,13 @@ def | 6
(test "日本語"
(show #f (as-unicode (with ((pad-char #\〜)) (padded/both 5 "日本語")))))
(test "日本語 col: 6"
(show #f (as-unicode "日本語" (fn ((col)) (each " col: " col)))))
(show #f (as-unicode "日本語" (fn (col) (each " col: " col)))))
(test "日本語ΠΜΕ col: 9"
(show #f (as-unicode "日本語ΠΜΕ" (fn (col) (each " col: " col)))))
(test "日本語ΠΜΕ col: 12"
(show #f (with ((ambiguous-is-wide? #t))
(as-unicode "日本語ΠΜΕ"
(fn (col) (each " col: " col))))))
;; from-file
;; for reference, filesystem-test relies on creating files under /tmp

View file

@ -1,134 +1,70 @@
;; unicode.scm -- Unicode character width and ANSI escape support
;; Copyright (c) 2006-2017 Alex Shinn. All rights reserved.
;; Copyright (c) 2006-2020 Alex Shinn. All rights reserved.
;; BSD-style license: http://synthcode.com/license.txt
;; a condensed non-spacing mark range from UnicodeData.txt (chars with
;; the Mn property) - generated partially by hand, should automate
;; this better
(define low-non-spacing-chars
(bytevector
#xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#x78 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 #xfe #xff #xff #xff #xff #xff #x1f 0 0 0 0 0 0 0
0 0 #x3f 0 0 0 0 0 0 #xf8 #xff #x01 0 0 #x01 0
0 0 0 0 0 0 0 0 0 0 #xc0 #xff #xff #x3f 0 0
0 0 #x02 0 0 0 #xff #xff #xff #x07 0 0 0 0 0 0
0 0 0 0 #xc0 #xff #x01 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#x06 0 0 0 0 0 0 #x10 #xfe #x21 #x1e 0 #x0c 0 0 0
#x02 0 0 0 0 0 0 #x10 #x1e #x20 0 0 #x0c 0 0 0
#x06 0 0 0 0 0 0 #x10 #xfe #x3f 0 0 0 0 #x03 0
#x06 0 0 0 0 0 0 #x30 #xfe #x21 0 0 #x0c 0 0 0
#x02 0 0 0 0 0 0 #x90 #x0e #x20 #x40 0 0 0 0 0
#x04 0 0 0 0 0 0 0 0 #x20 0 0 0 0 0 0
0 0 0 0 0 0 0 #xc0 #xc1 #xff #x7f 0 0 0 0 0
0 0 0 0 0 0 0 #x10 #x40 #x30 0 0 0 0 0 0
0 0 0 0 0 0 0 0 #x0e #x20 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 #x04 #x7c 0 0 0 0 0
0 0 0 0 0 0 #xf2 #x07 #x80 #x7f 0 0 0 0 0 0
0 0 0 0 0 0 #xf2 #x1f 0 #x3f 0 0 0 0 0 0
0 0 0 #x03 0 0 #xa0 #x02 0 0 0 0 0 0 #xfe #x7f
#xdf 0 #xff #xff #xff #xff #xff #x1f #x40 0 0 0 0 0 0 0
0 0 0 0 0 #xe0 #xfd #x02 0 0 0 #x03 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 #x1c 0 0 0 #x1c 0 0 0 #x0c 0 0 0 #x0c 0
0 0 0 0 0 0 #x80 #x3f #x40 #xfe #x0f #x20 0 0 0 0
0 #x38 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 #x02 0 0 0 0 0 0 0 0 0 0
0 0 0 0 #x87 #x01 #x04 #x0e 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 #xff #x1f #xe2 #x07
))
(define (unicode-char-width c)
(let ((ci (char->integer c)))
(define (unicode-char-width ch ambiguous-is-wide?)
(let ((ci (char->integer ch)))
(cond
;; hand-checked ranges from EastAsianWidth.txt
((<= #x1100 ci #x115F) 2) ; Hangul
((<= #x2E80 ci #x4DB5) 2) ; CJK
((<= #x4E00 ci #xA4C6) 2)
((<= #xAC00 ci #xD7A3) 2) ; Hangul
((<= #xF900 ci #xFAD9) 2) ; CJK compat
((<= #xFE10 ci #xFE6B) 2)
((<= #xFF01 ci #xFF60) 2)
((<= #xFFE0 ci #xFFE6) 2)
((<= #x20000 ci #x30000) 2)
;; non-spacing mark (Mn) ranges from UnicodeData.txt
((<= #x0300 ci #x3029)
;; inlined bit-vector-ref for portability
(let* ((i (- ci #x0300))
(byte (quotient i 8))
(off (remainder i 8)))
(if (zero? (bitwise-and (bytevector-u8-ref low-non-spacing-chars byte)
(arithmetic-shift 1 off)))
1
0)))
((<= #x302A ci #x302F) 0)
((<= #x3099 ci #x309A) 0)
((= #xFB1E ci) 0)
((<= #xFE00 ci #xFE23) 0)
((<= #x1D167 ci #x1D169) 0)
((<= #x1D17B ci #x1D182) 0)
((<= #x1D185 ci #x1D18B) 0)
((<= #x1D1AA ci #x1D1AD) 0)
((<= #xE0100 ci #xE01EF) 0)
(else 1))))
((char-set:zero-width? ci)
0)
((char-set:full-width? ci)
2)
((and ambiguous-is-wide? (char-set:ambiguous-width? ci))
2)
(else
1))))
(define (unicode-terminal-width/aux str start end ambiguous-is-wide?)
(let lp1 ((sc start) (width 0))
(if (string-cursor>=? sc end)
width
(let ((c (string-ref/cursor str sc)))
(cond
;; ANSI escapes
;; TODO: maintain a state machine so the escape can be
;; spread across multiple strings
((and (= 27 (char->integer c)) ; esc
(string-cursor<? (string-cursor-next str sc) end)
(eqv? #\[ (string-ref/cursor str (string-cursor-next str sc))))
(let lp2 ((sc (string-cursor-forward str sc 2)))
(cond ((string-cursor>=? sc end) width)
((memv (string-ref/cursor str sc) '(#\m #\newline))
(lp1 (string-cursor-next str sc) width))
(else (lp2 (string-cursor-next str sc))))))
;; fast-path ASCII
((char<=? c #\~)
(lp1 (string-cursor-next str sc) (+ width 1)))
;; unicode
(else
(lp1 (string-cursor-next str sc)
(+ width (unicode-char-width c ambiguous-is-wide?)))))))))
(define (cursor-arg str x)
(if (string-cursor? x) x (string-index->cursor str x)))
;; convert args to cursors internally for efficiency
(define (unicode-terminal-width str . o)
(let ((start (if (pair? o) (car o) 0))
(end (if (and (pair? o) (pair? (cdr o)))
(cadr o)
(string-length str))))
(let lp1 ((i start) (width 0))
(if (>= i end)
width
(let ((c (string-ref str i)))
(cond
;; ANSI escapes
((and (= 27 (char->integer c)) ; esc
(< (+ i 1) end)
(eqv? #\[ (string-ref str (+ i 1))))
(let lp2 ((i (+ i 2)))
(cond ((>= i end) width)
((memv (string-ref str i) '(#\m #\newline))
(lp1 (+ i 1) width))
(else (lp2 (+ i 1))))))
;; unicode characters
((>= (char->integer c) #x80)
(lp1 (+ i 1) (+ width (unicode-char-width c))))
;; normal ASCII
(else (lp1 (+ i 1) (+ width 1)))))))))
(let ((start (cursor-arg str (if (pair? o)
(car o)
(string-cursor-start str))))
(end (cursor-arg str (if (and (pair? o) (pair? (cdr o)))
(cadr o)
(string-cursor-end str)))))
(unicode-terminal-width/aux str start end #f)))
(define (unicode-terminal-width/wide str . o)
(let ((start (cursor-arg str (if (pair? o)
(car o)
(string-cursor-start str))))
(end (cursor-arg str (if (and (pair? o) (pair? (cdr o)))
(cadr o)
(string-cursor-end str)))))
(unicode-terminal-width/aux str start end #t)))
(define (as-unicode . args)
(with ((string-width unicode-terminal-width))
(each-in-list args)))
(fn (ambiguous-is-wide?)
(with ((string-width (if ambiguous-is-wide?
unicode-terminal-width/wide
unicode-terminal-width)))
(each-in-list args))))

View file

@ -1,5 +1,5 @@
(define-library (chibi show unicode)
(import (scheme base) (chibi show base) (srfi 151))
(export as-unicode unicode-terminal-width)
(include "unicode.scm"))
(import (scheme base) (chibi show base) (srfi 130) (srfi 151))
(export as-unicode unicode-terminal-width unicode-terminal-width/wide)
(include "width.scm" "unicode.scm"))

View file

@ -19,14 +19,14 @@
;; state variables
port row col width output writer string-width pad-char ellipsis
radix precision decimal-sep decimal-align sign-rule
comma-sep comma-rule word-separator?
comma-sep comma-rule word-separator? ambiguous-is-wide?
;; pretty
pretty pretty-shared pretty-simply pretty-color
;; columnar
columnar tabular wrapped wrapped/list wrapped/char
justified from-file line-numbers show-columns
;; unicode
as-unicode unicode-terminal-width
as-unicode unicode-terminal-width unicode-terminal-width/wide
upcased downcased
;; color
as-red as-blue as-green as-cyan as-yellow

View file

@ -38,6 +38,7 @@
(output output-default)
(string-width substring-length)
(word-separator? char-whitespace?)
(ambiguous-is-wide? #f)
(ellipsis "")
(decimal-align #f)
(decimal-sep #f)

View file

@ -48,7 +48,7 @@
;; state variables
port row col width output writer string-width pad-char ellipsis
radix precision decimal-sep decimal-align sign-rule
comma-sep comma-rule word-separator?
comma-sep comma-rule word-separator? ambiguous-is-wide?
)
(include "base.scm")
(include "../../chibi/show/write.scm")

View file

@ -741,6 +741,12 @@ def | 6
(show #f (as-unicode (with ((pad-char #\〜)) (padded/both 5 "日本語")))))
(test "日本語 col: 6"
(show #f (as-unicode "日本語" (fn (col) (each " col: " col)))))
(test "日本語ΠΜΕ col: 9"
(show #f (as-unicode "日本語ΠΜΕ" (fn (col) (each " col: " col)))))
(test "日本語ΠΜΕ col: 12"
(show #f (with ((ambiguous-is-wide? #t))
(as-unicode "日本語ΠΜΕ"
(fn (col) (each " col: " col))))))
;; from-file
;; for reference, filesystem-test relies on creating files under /tmp

View file

@ -1,5 +1,6 @@
(define-library (srfi 166 unicode)
(import (scheme base) (srfi 151) (srfi 166 base))
(export-all)
(include "../../chibi/show/unicode.scm"))
(import (scheme base) (srfi 130) (srfi 151) (srfi 166 base))
(export as-unicode unicode-terminal-width unicode-terminal-width/wide)
(include "../../chibi/show/width.scm"
"../../chibi/show/unicode.scm"))

View file

@ -15,6 +15,7 @@
;; A value can be any of:
;;
;; Property_Name: all unicode characters with the given derived property
;; Prop@N: a property matched at a given field (instead of default category)
;; Xx: all unicode characters with the given general category
;; X: all unicode characters with any general category X*
;; NNNN: a single unicode value in hex format
@ -25,7 +26,7 @@
;; Unicode property name.
;;
;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are
;; in the data/ current directory, unless overridden with the --data or
;; in the ./data/ directory, unless overridden with the --data or
;; --derived options.
(import (chibi) (chibi io) (chibi string))
@ -35,8 +36,9 @@
(for-each (lambda (x) (display x err)) args)
(newline err)))
;; Parse UnicodeData.txt for characters matching a given class.
(define (extract-char-set-category cat data)
;; Parse UnicodeData.txt or other semi-colon-delimited TSV file for
;; characters matching a given class in a given field.
(define (extract-char-set-category cat field data)
(define (join-to-range n ls)
(cond
((null? ls)
@ -62,22 +64,43 @@
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp ranges))
(else
(let ((ls (string-split line #\; 4)))
(let* ((line (substring-cursor line
(string-cursor-start line)
(string-find line #\#)))
(ls (map string-trim (string-split line #\;))))
(cond
((< (length ls) 3)
((<= (length ls) field)
(warn "invalid UnicodeData line: " line)
(lp ranges))
(else
(let ((ch (string->number (car ls) 16))
(let ((ch (if (string-contains (car ls) "..")
(let* ((sc (string-contains (car ls) "..")))
(cons (string->number
(substring-cursor
(car ls)
(string-cursor-start (car ls))
sc)
16)
(string->number
(substring-cursor
(car ls)
(string-cursor-forward (car ls) sc 2))
16)))
(string->number (car ls) 16)))
(name (cadr ls))
(ch-cat (car (cddr ls))))
(ch-cat (list-ref ls field)))
(cond
((or (not ch) (not (= 2 (string-length ch-cat))))
((not (or (integer? ch)
(and (pair? ch)
(integer? (car ch))
(integer? (cdr ch)))))
(warn "invalid UnicodeData line: " line))
((if (char? cat)
(eqv? cat (string-ref ch-cat 0))
(equal? cat ch-cat))
(lp (join-to-range ch ranges)))
(lp (if (pair? ch)
(cons ch ranges)
(join-to-range ch ranges))))
(else
(lp ranges))))))))))))))
@ -131,13 +154,19 @@
(error "invalid character range, expected NNNN-MMMM, got: " def))))
((string->number def 16)
=> (lambda (start) `(char-set ,(integer->char start))))
((string-find? def #\@)
(let* ((sc (string-find def #\@))
(cat (substring-cursor def (string-cursor-start def) sc))
(field (string->number
(substring-cursor def (string-cursor-next def sc)))))
(extract-char-set-category cat field data)))
((and (= 1 (string-length def))
(char-upper-case? (string-ref def 0)))
(extract-char-set-category (string-ref def 0) data))
(extract-char-set-category (string-ref def 0) 2 data))
((and (= 2 (string-length def))
(char-upper-case? (string-ref def 0))
(char-lower-case? (string-ref def 1)))
(extract-char-set-category def data))
(extract-char-set-category def 2 data))
;; derived properties
((and (> (string-length def) 1)
(eqv? #\: (string-ref def 0)))

View file

@ -21,13 +21,15 @@
(remove (lambda (i) (hash-table-exists? ls2-tab i)) ls1)))
(let ((args (command-line)))
(let lp ((ls (cdr args)) (ascii? #f))
(let lp ((ls (cdr args)) (ascii? #f) (predicate? #f))
(cond
((and (pair? ls) (not (equal? "" (car ls)))
(eqv? #\- (string-ref (car ls) 0)))
(cond
((member (car ls) '("-a" "--ascii"))
(lp (cdr ls) #t))
(lp (cdr ls) #t predicate?))
((member (car ls) '("-p" "--predicate"))
(lp (cdr ls) ascii? #t))
(else (error "unknown option" (car ls)))))
((or (null? ls) (pair? (cdr ls)))
(error "usage: optimize-char-sets.scm [--ascii] module.name"))
@ -78,8 +80,13 @@
(newline (current-error-port))
(error "optimized iset is different"))))
(display " writing\n" (current-error-port))
(write `(define ,exp
(immutable-char-set ,(iset->code iset2))))
(write
(if predicate?
`(define ,(string->symbol
(string-append (symbol->string exp) "?"))
,(iset->code/lambda iset2))
`(define ,exp
(immutable-char-set ,(iset->code iset2)))))
(newline)
(newline)
(display " done\n" (current-error-port)))))))