diff --git a/Makefile b/Makefile index 3b161fd1..1210c94f 100644 --- a/Makefile +++ b/Makefile @@ -184,12 +184,19 @@ data/%.txt: build-lib/chibi/char-set/derived.scm: data/UnicodeData.txt data/DerivedCoreProperties.txt chibi-scheme$(EXE) $(CHIBI) tools/extract-unicode-props.scm --default > $@ +build-lib/chibi/char-set/width.scm: data/UnicodeData.txt data/EastAsianWidth.txt chibi-scheme$(EXE) + $(CHIBI) tools/extract-unicode-props.scm Zero-Width=Mn > $@ + $(CHIBI) tools/extract-unicode-props.scm -d data/EastAsianWidth.txt Full-Width=F@1,W@1 Ambiguous-Width=A@1 >> $@ + lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE) $(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --ascii chibi.char-set.compute > $@ lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE) $(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@ +lib/chibi/show/width.scm: build-lib/chibi/char-set/width.scm chibi-scheme$(EXE) + $(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --predicate chibi.char-set.width > $@ + lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs $(CHIBI) tools/extract-case-offsets.scm $< > $@ diff --git a/lib/chibi/iset/optimize.scm b/lib/chibi/iset/optimize.scm index 8657cdb5..887696c6 100644 --- a/lib/chibi/iset/optimize.scm +++ b/lib/chibi/iset/optimize.scm @@ -168,3 +168,23 @@ ,(iset-bits iset) ,(iset->code (iset-left iset)) ,(iset->code (iset-right iset))))) + +;; uses only if, <, <=, >, and SRFI 151 bit-set? +(define (iset->code/lambda iset) + (define (code iset) + (and iset + (if (and (not (iset-left iset)) + (not (iset-right iset)) + (not (iset-bits iset))) + `(<= ,(iset-start iset) n ,(iset-end iset)) + `(if (< n ,(iset-start iset)) + ,(code (iset-left iset)) + ,(if (and (not (iset-right iset)) (not (iset-bits iset))) + `(<= n ,(iset-end iset)) + `(if (> n ,(iset-end iset)) + ,(code (iset-right iset)) + ,(if (iset-bits iset) + `(bit-set? (- n ,(iset-start iset)) + ,(iset-bits iset)) + #t))))))) + `(lambda (n) ,(code iset))) diff --git a/lib/chibi/iset/optimize.sld b/lib/chibi/iset/optimize.sld index 4a2f52bc..7ad11c61 100644 --- a/lib/chibi/iset/optimize.sld +++ b/lib/chibi/iset/optimize.sld @@ -17,4 +17,5 @@ (bitwise-and (%mask size) (arithmetic-shift n (- position))))))) (include "optimize.scm") (export - iset-balance iset-balance! iset-optimize iset-optimize! iset->code)) + iset-balance iset-balance! iset-optimize iset-optimize! + iset->code iset->code/lambda)) diff --git a/lib/chibi/show-test.sld b/lib/chibi/show-test.sld index 8880ac14..f337bfc8 100644 --- a/lib/chibi/show-test.sld +++ b/lib/chibi/show-test.sld @@ -742,7 +742,13 @@ def | 6 (test "日本語" (show #f (as-unicode (with ((pad-char #\〜)) (padded/both 5 "日本語"))))) (test "日本語 col: 6" - (show #f (as-unicode "日本語" (fn ((col)) (each " col: " col))))) + (show #f (as-unicode "日本語" (fn (col) (each " col: " col))))) + (test "日本語ΠΜΕ col: 9" + (show #f (as-unicode "日本語ΠΜΕ" (fn (col) (each " col: " col))))) + (test "日本語ΠΜΕ col: 12" + (show #f (with ((ambiguous-is-wide? #t)) + (as-unicode "日本語ΠΜΕ" + (fn (col) (each " col: " col)))))) ;; from-file ;; for reference, filesystem-test relies on creating files under /tmp diff --git a/lib/chibi/show/unicode.scm b/lib/chibi/show/unicode.scm index 3cee213d..d4c665f9 100644 --- a/lib/chibi/show/unicode.scm +++ b/lib/chibi/show/unicode.scm @@ -1,134 +1,70 @@ ;; unicode.scm -- Unicode character width and ANSI escape support -;; Copyright (c) 2006-2017 Alex Shinn. All rights reserved. +;; Copyright (c) 2006-2020 Alex Shinn. All rights reserved. ;; BSD-style license: http://synthcode.com/license.txt -;; a condensed non-spacing mark range from UnicodeData.txt (chars with -;; the Mn property) - generated partially by hand, should automate -;; this better - -(define low-non-spacing-chars - (bytevector -#xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff #xff 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -#x78 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 #xfe #xff #xff #xff #xff #xff #x1f 0 0 0 0 0 0 0 - 0 0 #x3f 0 0 0 0 0 0 #xf8 #xff #x01 0 0 #x01 0 - 0 0 0 0 0 0 0 0 0 0 #xc0 #xff #xff #x3f 0 0 - 0 0 #x02 0 0 0 #xff #xff #xff #x07 0 0 0 0 0 0 - 0 0 0 0 #xc0 #xff #x01 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -#x06 0 0 0 0 0 0 #x10 #xfe #x21 #x1e 0 #x0c 0 0 0 -#x02 0 0 0 0 0 0 #x10 #x1e #x20 0 0 #x0c 0 0 0 -#x06 0 0 0 0 0 0 #x10 #xfe #x3f 0 0 0 0 #x03 0 -#x06 0 0 0 0 0 0 #x30 #xfe #x21 0 0 #x0c 0 0 0 -#x02 0 0 0 0 0 0 #x90 #x0e #x20 #x40 0 0 0 0 0 -#x04 0 0 0 0 0 0 0 0 #x20 0 0 0 0 0 0 - 0 0 0 0 0 0 0 #xc0 #xc1 #xff #x7f 0 0 0 0 0 - 0 0 0 0 0 0 0 #x10 #x40 #x30 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 #x0e #x20 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 #x04 #x7c 0 0 0 0 0 - 0 0 0 0 0 0 #xf2 #x07 #x80 #x7f 0 0 0 0 0 0 - 0 0 0 0 0 0 #xf2 #x1f 0 #x3f 0 0 0 0 0 0 - 0 0 0 #x03 0 0 #xa0 #x02 0 0 0 0 0 0 #xfe #x7f -#xdf 0 #xff #xff #xff #xff #xff #x1f #x40 0 0 0 0 0 0 0 - 0 0 0 0 0 #xe0 #xfd #x02 0 0 0 #x03 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 #x1c 0 0 0 #x1c 0 0 0 #x0c 0 0 0 #x0c 0 - 0 0 0 0 0 0 #x80 #x3f #x40 #xfe #x0f #x20 0 0 0 0 - 0 #x38 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 #x02 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 #x87 #x01 #x04 #x0e 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - 0 0 0 0 0 0 0 0 0 0 #xff #x1f #xe2 #x07 - )) - -(define (unicode-char-width c) - (let ((ci (char->integer c))) +(define (unicode-char-width ch ambiguous-is-wide?) + (let ((ci (char->integer ch))) (cond - ;; hand-checked ranges from EastAsianWidth.txt - ((<= #x1100 ci #x115F) 2) ; Hangul - ((<= #x2E80 ci #x4DB5) 2) ; CJK - ((<= #x4E00 ci #xA4C6) 2) - ((<= #xAC00 ci #xD7A3) 2) ; Hangul - ((<= #xF900 ci #xFAD9) 2) ; CJK compat - ((<= #xFE10 ci #xFE6B) 2) - ((<= #xFF01 ci #xFF60) 2) - ((<= #xFFE0 ci #xFFE6) 2) - ((<= #x20000 ci #x30000) 2) - ;; non-spacing mark (Mn) ranges from UnicodeData.txt - ((<= #x0300 ci #x3029) - ;; inlined bit-vector-ref for portability - (let* ((i (- ci #x0300)) - (byte (quotient i 8)) - (off (remainder i 8))) - (if (zero? (bitwise-and (bytevector-u8-ref low-non-spacing-chars byte) - (arithmetic-shift 1 off))) - 1 - 0))) - ((<= #x302A ci #x302F) 0) - ((<= #x3099 ci #x309A) 0) - ((= #xFB1E ci) 0) - ((<= #xFE00 ci #xFE23) 0) - ((<= #x1D167 ci #x1D169) 0) - ((<= #x1D17B ci #x1D182) 0) - ((<= #x1D185 ci #x1D18B) 0) - ((<= #x1D1AA ci #x1D1AD) 0) - ((<= #xE0100 ci #xE01EF) 0) - (else 1)))) + ((char-set:zero-width? ci) + 0) + ((char-set:full-width? ci) + 2) + ((and ambiguous-is-wide? (char-set:ambiguous-width? ci)) + 2) + (else + 1)))) + +(define (unicode-terminal-width/aux str start end ambiguous-is-wide?) + (let lp1 ((sc start) (width 0)) + (if (string-cursor>=? sc end) + width + (let ((c (string-ref/cursor str sc))) + (cond + ;; ANSI escapes + ;; TODO: maintain a state machine so the escape can be + ;; spread across multiple strings + ((and (= 27 (char->integer c)) ; esc + (string-cursor=? sc end) width) + ((memv (string-ref/cursor str sc) '(#\m #\newline)) + (lp1 (string-cursor-next str sc) width)) + (else (lp2 (string-cursor-next str sc)))))) + ;; fast-path ASCII + ((char<=? c #\~) + (lp1 (string-cursor-next str sc) (+ width 1))) + ;; unicode + (else + (lp1 (string-cursor-next str sc) + (+ width (unicode-char-width c ambiguous-is-wide?))))))))) + +(define (cursor-arg str x) + (if (string-cursor? x) x (string-index->cursor str x))) + +;; convert args to cursors internally for efficiency (define (unicode-terminal-width str . o) - (let ((start (if (pair? o) (car o) 0)) - (end (if (and (pair? o) (pair? (cdr o))) - (cadr o) - (string-length str)))) - (let lp1 ((i start) (width 0)) - (if (>= i end) - width - (let ((c (string-ref str i))) - (cond - ;; ANSI escapes - ((and (= 27 (char->integer c)) ; esc - (< (+ i 1) end) - (eqv? #\[ (string-ref str (+ i 1)))) - (let lp2 ((i (+ i 2))) - (cond ((>= i end) width) - ((memv (string-ref str i) '(#\m #\newline)) - (lp1 (+ i 1) width)) - (else (lp2 (+ i 1)))))) - ;; unicode characters - ((>= (char->integer c) #x80) - (lp1 (+ i 1) (+ width (unicode-char-width c)))) - ;; normal ASCII - (else (lp1 (+ i 1) (+ width 1))))))))) + (let ((start (cursor-arg str (if (pair? o) + (car o) + (string-cursor-start str)))) + (end (cursor-arg str (if (and (pair? o) (pair? (cdr o))) + (cadr o) + (string-cursor-end str))))) + (unicode-terminal-width/aux str start end #f))) + +(define (unicode-terminal-width/wide str . o) + (let ((start (cursor-arg str (if (pair? o) + (car o) + (string-cursor-start str)))) + (end (cursor-arg str (if (and (pair? o) (pair? (cdr o))) + (cadr o) + (string-cursor-end str))))) + (unicode-terminal-width/aux str start end #t))) (define (as-unicode . args) - (with ((string-width unicode-terminal-width)) - (each-in-list args))) + (fn (ambiguous-is-wide?) + (with ((string-width (if ambiguous-is-wide? + unicode-terminal-width/wide + unicode-terminal-width))) + (each-in-list args)))) diff --git a/lib/chibi/show/unicode.sld b/lib/chibi/show/unicode.sld index 88e84ab3..435e9ba9 100644 --- a/lib/chibi/show/unicode.sld +++ b/lib/chibi/show/unicode.sld @@ -1,5 +1,5 @@ (define-library (chibi show unicode) - (import (scheme base) (chibi show base) (srfi 151)) - (export as-unicode unicode-terminal-width) - (include "unicode.scm")) + (import (scheme base) (chibi show base) (srfi 130) (srfi 151)) + (export as-unicode unicode-terminal-width unicode-terminal-width/wide) + (include "width.scm" "unicode.scm")) diff --git a/lib/srfi/166.sld b/lib/srfi/166.sld index 2eb2df4f..a9c9e1a6 100644 --- a/lib/srfi/166.sld +++ b/lib/srfi/166.sld @@ -19,14 +19,14 @@ ;; state variables port row col width output writer string-width pad-char ellipsis radix precision decimal-sep decimal-align sign-rule - comma-sep comma-rule word-separator? + comma-sep comma-rule word-separator? ambiguous-is-wide? ;; pretty pretty pretty-shared pretty-simply pretty-color ;; columnar columnar tabular wrapped wrapped/list wrapped/char justified from-file line-numbers show-columns ;; unicode - as-unicode unicode-terminal-width + as-unicode unicode-terminal-width unicode-terminal-width/wide upcased downcased ;; color as-red as-blue as-green as-cyan as-yellow diff --git a/lib/srfi/166/base.scm b/lib/srfi/166/base.scm index e3a1f020..2ae9857b 100644 --- a/lib/srfi/166/base.scm +++ b/lib/srfi/166/base.scm @@ -38,6 +38,7 @@ (output output-default) (string-width substring-length) (word-separator? char-whitespace?) + (ambiguous-is-wide? #f) (ellipsis "") (decimal-align #f) (decimal-sep #f) diff --git a/lib/srfi/166/base.sld b/lib/srfi/166/base.sld index dc7da4f5..177ea01b 100644 --- a/lib/srfi/166/base.sld +++ b/lib/srfi/166/base.sld @@ -48,7 +48,7 @@ ;; state variables port row col width output writer string-width pad-char ellipsis radix precision decimal-sep decimal-align sign-rule - comma-sep comma-rule word-separator? + comma-sep comma-rule word-separator? ambiguous-is-wide? ) (include "base.scm") (include "../../chibi/show/write.scm") diff --git a/lib/srfi/166/test.sld b/lib/srfi/166/test.sld index 583b4018..98146e79 100644 --- a/lib/srfi/166/test.sld +++ b/lib/srfi/166/test.sld @@ -741,6 +741,12 @@ def | 6 (show #f (as-unicode (with ((pad-char #\〜)) (padded/both 5 "日本語"))))) (test "日本語 col: 6" (show #f (as-unicode "日本語" (fn (col) (each " col: " col))))) + (test "日本語ΠΜΕ col: 9" + (show #f (as-unicode "日本語ΠΜΕ" (fn (col) (each " col: " col))))) + (test "日本語ΠΜΕ col: 12" + (show #f (with ((ambiguous-is-wide? #t)) + (as-unicode "日本語ΠΜΕ" + (fn (col) (each " col: " col)))))) ;; from-file ;; for reference, filesystem-test relies on creating files under /tmp diff --git a/lib/srfi/166/unicode.sld b/lib/srfi/166/unicode.sld index 5284244e..56729106 100644 --- a/lib/srfi/166/unicode.sld +++ b/lib/srfi/166/unicode.sld @@ -1,5 +1,6 @@ (define-library (srfi 166 unicode) - (import (scheme base) (srfi 151) (srfi 166 base)) - (export-all) - (include "../../chibi/show/unicode.scm")) + (import (scheme base) (srfi 130) (srfi 151) (srfi 166 base)) + (export as-unicode unicode-terminal-width unicode-terminal-width/wide) + (include "../../chibi/show/width.scm" + "../../chibi/show/unicode.scm")) diff --git a/tools/extract-unicode-props.scm b/tools/extract-unicode-props.scm index 0ab57a8e..9f203bde 100755 --- a/tools/extract-unicode-props.scm +++ b/tools/extract-unicode-props.scm @@ -15,6 +15,7 @@ ;; A value can be any of: ;; ;; Property_Name: all unicode characters with the given derived property +;; Prop@N: a property matched at a given field (instead of default category) ;; Xx: all unicode characters with the given general category ;; X: all unicode characters with any general category X* ;; NNNN: a single unicode value in hex format @@ -25,7 +26,7 @@ ;; Unicode property name. ;; ;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are -;; in the data/ current directory, unless overridden with the --data or +;; in the ./data/ directory, unless overridden with the --data or ;; --derived options. (import (chibi) (chibi io) (chibi string)) @@ -35,8 +36,9 @@ (for-each (lambda (x) (display x err)) args) (newline err))) -;; Parse UnicodeData.txt for characters matching a given class. -(define (extract-char-set-category cat data) +;; Parse UnicodeData.txt or other semi-colon-delimited TSV file for +;; characters matching a given class in a given field. +(define (extract-char-set-category cat field data) (define (join-to-range n ls) (cond ((null? ls) @@ -62,22 +64,43 @@ ((or (equal? line "") (eqv? #\# (string-ref line 0))) (lp ranges)) (else - (let ((ls (string-split line #\; 4))) + (let* ((line (substring-cursor line + (string-cursor-start line) + (string-find line #\#))) + (ls (map string-trim (string-split line #\;)))) (cond - ((< (length ls) 3) + ((<= (length ls) field) (warn "invalid UnicodeData line: " line) (lp ranges)) (else - (let ((ch (string->number (car ls) 16)) + (let ((ch (if (string-contains (car ls) "..") + (let* ((sc (string-contains (car ls) ".."))) + (cons (string->number + (substring-cursor + (car ls) + (string-cursor-start (car ls)) + sc) + 16) + (string->number + (substring-cursor + (car ls) + (string-cursor-forward (car ls) sc 2)) + 16))) + (string->number (car ls) 16))) (name (cadr ls)) - (ch-cat (car (cddr ls)))) + (ch-cat (list-ref ls field))) (cond - ((or (not ch) (not (= 2 (string-length ch-cat)))) + ((not (or (integer? ch) + (and (pair? ch) + (integer? (car ch)) + (integer? (cdr ch))))) (warn "invalid UnicodeData line: " line)) ((if (char? cat) (eqv? cat (string-ref ch-cat 0)) (equal? cat ch-cat)) - (lp (join-to-range ch ranges))) + (lp (if (pair? ch) + (cons ch ranges) + (join-to-range ch ranges)))) (else (lp ranges)))))))))))))) @@ -131,13 +154,19 @@ (error "invalid character range, expected NNNN-MMMM, got: " def)))) ((string->number def 16) => (lambda (start) `(char-set ,(integer->char start)))) + ((string-find? def #\@) + (let* ((sc (string-find def #\@)) + (cat (substring-cursor def (string-cursor-start def) sc)) + (field (string->number + (substring-cursor def (string-cursor-next def sc))))) + (extract-char-set-category cat field data))) ((and (= 1 (string-length def)) (char-upper-case? (string-ref def 0))) - (extract-char-set-category (string-ref def 0) data)) + (extract-char-set-category (string-ref def 0) 2 data)) ((and (= 2 (string-length def)) (char-upper-case? (string-ref def 0)) (char-lower-case? (string-ref def 1))) - (extract-char-set-category def data)) + (extract-char-set-category def 2 data)) ;; derived properties ((and (> (string-length def) 1) (eqv? #\: (string-ref def 0))) diff --git a/tools/optimize-char-sets.scm b/tools/optimize-char-sets.scm index d020bf1b..8eebcfcc 100644 --- a/tools/optimize-char-sets.scm +++ b/tools/optimize-char-sets.scm @@ -21,13 +21,15 @@ (remove (lambda (i) (hash-table-exists? ls2-tab i)) ls1))) (let ((args (command-line))) - (let lp ((ls (cdr args)) (ascii? #f)) + (let lp ((ls (cdr args)) (ascii? #f) (predicate? #f)) (cond ((and (pair? ls) (not (equal? "" (car ls))) (eqv? #\- (string-ref (car ls) 0))) (cond ((member (car ls) '("-a" "--ascii")) - (lp (cdr ls) #t)) + (lp (cdr ls) #t predicate?)) + ((member (car ls) '("-p" "--predicate")) + (lp (cdr ls) ascii? #t)) (else (error "unknown option" (car ls))))) ((or (null? ls) (pair? (cdr ls))) (error "usage: optimize-char-sets.scm [--ascii] module.name")) @@ -78,8 +80,13 @@ (newline (current-error-port)) (error "optimized iset is different")))) (display " writing\n" (current-error-port)) - (write `(define ,exp - (immutable-char-set ,(iset->code iset2)))) + (write + (if predicate? + `(define ,(string->symbol + (string-append (symbol->string exp) "?")) + ,(iset->code/lambda iset2)) + `(define ,exp + (immutable-char-set ,(iset->code iset2))))) (newline) (newline) (display " done\n" (current-error-port)))))))