fix case folding, update to unicode 13

This commit is contained in:
Alex Shinn 2020-06-04 22:08:07 +09:00
parent 6fb0640721
commit c245d6cee8
15 changed files with 364 additions and 208 deletions

View file

@ -197,8 +197,11 @@ lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(
lib/chibi/show/width.scm: build-lib/chibi/char-set/width.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --predicate chibi.char-set.width > $@
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
$(CHIBI) tools/extract-case-offsets.scm $< > $@
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt data/CaseFolding.txt chibi-scheme$(EXE) all-libs
$(CHIBI) tools/extract-case-offsets.scm data/UnicodeData.txt data/CaseFolding.txt > $@
lib/scheme/char/special-casing.scm: data/CaseFolding.txt data/SpecialCasing.txt chibi-scheme$(EXE) all-libs
$(CHIBI) tools/extract-special-casing.scm data/CaseFolding.txt data/SpecialCasing.txt > $@
########################################################################
# Tests

View file

@ -1,42 +1,42 @@
;; char-set:lower-case
(define char-set:lower-case (immutable-char-set (%make-iset 97 122 #f #f #f)))
(define char-set:lower-case (immutable-char-set (%make-iset 97 127 67108863 #f #f)))
;; char-set:upper-case
(define char-set:upper-case (immutable-char-set (%make-iset 65 90 #f #f #f)))
(define char-set:upper-case (immutable-char-set (%make-iset 65 127 67108863 #f #f)))
;; char-set:title-case
(define char-set:title-case (immutable-char-set (%make-iset 0 0 0 #f #f)))
;; char-set:letter
(define char-set:letter (immutable-char-set (%make-iset 97 122 #f (%make-iset 65 90 #f #f #f) #f)))
(define char-set:letter (immutable-char-set (%make-iset 65 127 288230371923853311 #f #f)))
;; char-set:punctuation
(define char-set:punctuation (immutable-char-set (%make-iset 63 64 #f (%make-iset 44 47 #f (%make-iset 37 42 #f (%make-iset 33 35 #f #f #f) #f) (%make-iset 58 59 #f #f #f)) (%make-iset 123 123 #f (%make-iset 95 95 #f (%make-iset 91 93 #f #f #f) #f) (%make-iset 125 125 #f #f #f)))))
(define char-set:punctuation (immutable-char-set (%make-iset 33 127 6189700203056200029306911735 #f #f)))
;; char-set:symbol
(define char-set:symbol (immutable-char-set (%make-iset 94 94 #f (%make-iset 43 43 #f (%make-iset 36 36 #f #f #f) (%make-iset 60 62 #f #f #f)) (%make-iset 124 124 #f (%make-iset 96 96 #f #f #f) (%make-iset 126 126 #f #f #f)))))
(define char-set:symbol (immutable-char-set (%make-iset 36 127 1547425050547877224499904641 #f #f)))
;; char-set:blank
(define char-set:blank (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 9 #f #f #f) #f)))
(define char-set:blank (immutable-char-set (%make-iset 9 32 8388609 #f #f)))
;; char-set:whitespace
(define char-set:whitespace (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 13 #f #f #f) #f)))
(define char-set:whitespace (immutable-char-set (%make-iset 9 127 8388639 #f #f)))
;; char-set:digit
(define char-set:digit (immutable-char-set (%make-iset 48 57 #f #f #f)))
;; char-set:letter+digit
(define char-set:letter+digit (immutable-char-set (%make-iset 65 90 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 122 #f #f #f))))
(define char-set:letter+digit (immutable-char-set (%make-iset 48 127 37778931308803301180415 #f #f)))
;; char-set:hex-digit
(define char-set:hex-digit (immutable-char-set (%make-iset 65 70 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 102 #f #f #f))))
(define char-set:hex-digit (immutable-char-set (%make-iset 48 102 35465847073801215 #f #f)))
;; char-set:iso-control
(define char-set:iso-control (immutable-char-set (%make-iset 127 127 #f (%make-iset 0 31 #f #f #f) #f)))
(define char-set:iso-control (immutable-char-set (%make-iset 0 127 170141183460469231731687303720179073023 #f #f)))
;; char-set:graphic
(define char-set:graphic (immutable-char-set (%make-iset 33 126 #f #f #f)))
(define char-set:graphic (immutable-char-set (%make-iset 33 127 19807040628566084398385987583 #f #f)))
;; char-set:printing
(define char-set:printing (immutable-char-set (%make-iset 32 126 #f (%make-iset 9 13 #f #f #f) #f)))
(define char-set:printing (immutable-char-set (%make-iset 9 127 332306998946228968225951765061697567 #f #f)))

File diff suppressed because one or more lines are too long

View file

@ -8,8 +8,7 @@
padded padded/left padded/right padded/both
trimmed trimmed/left trimmed/right trimmed/both trimmed/lazy
fitted fitted/left fitted/right fitted/both
joined joined/prefix joined/suffix joined/last joined/dot joined/range
upcased downcased)
joined joined/prefix joined/suffix joined/last joined/dot joined/range)
(import (scheme base) (scheme char) (scheme write)
(chibi show base))
(include "show/show.scm"))

View file

@ -82,21 +82,6 @@
(fn (col pad-char)
(displayed (make-string (max 0 (- where col)) pad-char))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; String transformations
(define (with-string-transformer proc . ls)
(fn ((orig-output output))
(let ((output* (lambda (str) (orig-output (proc str)))))
(with ((output output*))
(each-in-list ls)))))
;;> Show each of \var{ls}, uppercasing all generated text.
(define (upcased . ls) (apply with-string-transformer string-upcase ls))
;;> Show each of \var{ls}, lowercasing all generated text.
(define (downcased . ls) (apply with-string-transformer string-downcase ls))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Padding and trimming

View file

@ -68,3 +68,18 @@
unicode-terminal-width/wide
unicode-terminal-width)))
(each-in-list args))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; String transformations
(define (with-string-transformer proc . ls)
(fn ((orig-output output))
(let ((output* (lambda (str) (orig-output (proc str)))))
(with ((output output*))
(each-in-list ls)))))
;;> Show each of \var{ls}, uppercasing all generated text.
(define (upcased . ls) (apply with-string-transformer string-upcase ls))
;;> Show each of \var{ls}, lowercasing all generated text.
(define (downcased . ls) (apply with-string-transformer string-downcase ls))

View file

@ -1,5 +1,11 @@
(define-library (chibi show unicode)
(import (scheme base) (chibi show base) (srfi 130) (srfi 151))
(export as-unicode unicode-terminal-width unicode-terminal-width/wide)
(import (scheme base)
(scheme char)
(chibi show base)
(srfi 130)
(srfi 151))
(export as-unicode
unicode-terminal-width unicode-terminal-width/wide
upcased downcased)
(include "width.scm" "unicode.scm"))

View file

@ -24,4 +24,5 @@
char-downcase char-foldcase char-lower-case? char-numeric?
char-upcase char-upper-case? char-whitespace? digit-value
string-ci<=? string-ci<? string-ci=? string-ci>=? string-ci>?
char-get-special-case
string-downcase string-foldcase string-upcase))

File diff suppressed because one or more lines are too long

View file

@ -44,7 +44,7 @@
(define (char-foldcase ch)
(or (bsearch-kv char-foldcase-map (char->integer ch) 0
(- (vector-length char-foldcase-map) 2))
(char-downcase ch)))
ch))
(define (char-cmp-ci op a ls)
(let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls))
@ -83,17 +83,20 @@
(let ((ch (read-char in)))
(cond
((not (eof-object? ch))
(write-string
(cond
((and (not fold?) (eqv? ch #\x03A3))
(let ((ch2 (peek-char in)))
(cond
((and (not fold?) (eqv? ch #\x03A3)) ;; sigma
(let ((ch2 (peek-char in)))
(write-char
(if (or (eof-object? ch2)
(not (char-set-contains? char-set:letter ch2)))
#\x03C2
#\x03C3)))
((char-get-special-case ch (if fold? 4 1)))
(else (if fold? (char-foldcase ch) (char-downcase ch))))
out)
#\x03C3)
out)))
((char-get-special-case ch (if fold? 4 1))
=> (lambda (s) (write-string s out)))
(else
(write-char (if fold? (char-foldcase ch) (char-downcase ch))
out)))
(lp)))))))))
(define (string-downcase str) (string-down-or-fold-case str #f))

View file

@ -1,109 +1,109 @@
;; Unconditional non-1-to-1 case mappings derived from Unicode data
;; file SpecialCasing.txt.
;; auto-generated by extract-special-casing.scm
(define special-cases
;; <code> <lower> <title> <upper> [<fold>]
#(#(223 "ß" "Ss" "SS" "ss")
#(304 "i̇" "İ" "İ")
#(329 "ʼn" "ʼN" "ʼN")
#(496 "ǰ" "J̌" "J̌")
#(912 "ΐ" "Ϊ́" "Ϊ́")
#(944 "ΰ" "Ϋ́" "Ϋ́")
#(1415 "և" "Եւ" "ԵՒ")
#(7830 "ẖ" "H̱" "H̱")
#(7831 "ẗ" "T̈" "T̈")
#(7832 "ẘ" "W̊" "W̊")
#(7833 "ẙ" "Y̊" "Y̊")
#(7834 "ẚ" "Aʾ" "Aʾ")
#(8016 "ὐ" "Υ̓" "Υ̓")
#(8018 "ὒ" "Υ̓̀" "Υ̓̀")
#(8020 "ὔ" "Υ̓́" "Υ̓́")
#(8022 "ὖ" "Υ̓͂" "Υ̓͂")
#(8064 "ᾀ" "ᾈ" "ἈΙ")
#(8065 "ᾁ" "ᾉ" "ἉΙ")
#(8066 "ᾂ" "ᾊ" "ἊΙ")
#(8067 "ᾃ" "ᾋ" "ἋΙ")
#(8068 "ᾄ" "ᾌ" "ἌΙ")
#(8069 "ᾅ" "ᾍ" "ἍΙ")
#(8070 "ᾆ" "ᾎ" "ἎΙ")
#(8071 "ᾇ" "ᾏ" "ἏΙ")
#(8072 "ᾀ" "ᾈ" "ἈΙ")
#(8073 "ᾁ" "ᾉ" "ἉΙ")
#(8074 "ᾂ" "ᾊ" "ἊΙ")
#(8075 "ᾃ" "ᾋ" "ἋΙ")
#(8076 "ᾄ" "ᾌ" "ἌΙ")
#(8077 "ᾅ" "ᾍ" "ἍΙ")
#(8078 "ᾆ" "ᾎ" "ἎΙ")
#(8079 "ᾇ" "ᾏ" "ἏΙ")
#(8080 "ᾐ" "ᾘ" "ἨΙ")
#(8081 "ᾑ" "ᾙ" "ἩΙ")
#(8082 "ᾒ" "ᾚ" "ἪΙ")
#(8083 "ᾓ" "ᾛ" "ἫΙ")
#(8084 "ᾔ" "ᾜ" "ἬΙ")
#(8085 "ᾕ" "ᾝ" "ἭΙ")
#(8086 "ᾖ" "ᾞ" "ἮΙ")
#(8087 "ᾗ" "ᾟ" "ἯΙ")
#(8088 "ᾐ" "ᾘ" "ἨΙ")
#(8089 "ᾑ" "ᾙ" "ἩΙ")
#(8090 "ᾒ" "ᾚ" "ἪΙ")
#(8091 "ᾓ" "ᾛ" "ἫΙ")
#(8092 "ᾔ" "ᾜ" "ἬΙ")
#(8093 "ᾕ" "ᾝ" "ἭΙ")
#(8094 "ᾖ" "ᾞ" "ἮΙ")
#(8095 "ᾗ" "ᾟ" "ἯΙ")
#(8096 "ᾠ" "ᾨ" "ὨΙ")
#(8097 "ᾡ" "ᾩ" "ὩΙ")
#(8098 "ᾢ" "ᾪ" "ὪΙ")
#(8099 "ᾣ" "ᾫ" "ὫΙ")
#(8100 "ᾤ" "ᾬ" "ὬΙ")
#(8101 "ᾥ" "ᾭ" "ὭΙ")
#(8102 "ᾦ" "ᾮ" "ὮΙ")
#(8103 "ᾧ" "ᾯ" "ὯΙ")
#(8104 "ᾠ" "ᾨ" "ὨΙ")
#(8105 "ᾡ" "ᾩ" "ὩΙ")
#(8106 "ᾢ" "ᾪ" "ὪΙ")
#(8107 "ᾣ" "ᾫ" "ὫΙ")
#(8108 "ᾤ" "ᾬ" "ὬΙ")
#(8109 "ᾥ" "ᾭ" "ὭΙ")
#(8110 "ᾦ" "ᾮ" "ὮΙ")
#(8111 "ᾧ" "ᾯ" "ὯΙ")
#(8114 "ᾲ" "Ὰͅ" "ᾺΙ")
#(8115 "ᾳ" "ᾼ" "ΑΙ")
#(8116 "ᾴ" "Άͅ" "ΆΙ")
#(8118 "ᾶ" "Α͂" "Α͂")
#(8119 "ᾷ" "ᾼ͂" "Α͂Ι")
#(8124 "ᾳ" "ᾼ" "ΑΙ")
#(8130 "ῂ" "Ὴͅ" "ῊΙ")
#(8131 "ῃ" "ῌ" "ΗΙ")
#(8132 "ῄ" "Ήͅ" "ΉΙ")
#(8134 "ῆ" "Η͂" "Η͂")
#(8135 "ῇ" "ῌ͂" "Η͂Ι")
#(8140 "ῃ" "ῌ" "ΗΙ")
#(8146 "ῒ" "Ϊ̀" "Ϊ̀")
#(8147 "ΐ" "Ϊ́" "Ϊ́")
#(8150 "ῖ" "Ι͂" "Ι͂")
#(8151 "ῗ" "Ϊ͂" "Ϊ͂")
#(8162 "ῢ" "Ϋ̀" "Ϋ̀")
#(8163 "ΰ" "Ϋ́" "Ϋ́")
#(8164 "ῤ" "Ρ̓" "Ρ̓")
#(8166 "ῦ" "Υ͂" "Υ͂")
#(8167 "ῧ" "Ϋ͂" "Ϋ͂")
#(8178 "ῲ" "Ὼͅ" "ῺΙ")
#(8179 "ῳ" "ῼ" "ΩΙ")
#(8180 "ῴ" "Ώͅ" "ΏΙ")
#(8182 "ῶ" "Ω͂" "Ω͂")
#(8183 "ῷ" "ῼ͂" "Ω͂Ι")
#(8188 "ῳ" "ῼ" "ΩΙ")
#(64256 "ff" "Ff" "FF")
#(64257 "fi" "Fi" "FI")
#(64258 "fl" "Fl" "FL")
#(64259 "ffi" "Ffi" "FFI")
#(64260 "ffl" "Ffl" "FFL")
#(64261 "ſt" "St" "ST")
#(64262 "st" "St" "ST")
#(64275 "ﬓ" "Մն" "ՄՆ")
#(64276 "ﬔ" "Մե" "ՄԵ")
#(64277 "ﬕ" "Մի" "ՄԻ")
#(64278 "ﬖ" "Վն" "ՎՆ")
#(64279 "ﬗ" "Մխ" "ՄԽ")))
'#(#(#xdf "ß" "Ss" "SS" "ss")
#(#x130 "i̇" "İ" "İ" "i̇")
#(#x149 "ʼn" "ʼN" "ʼN" "ʼn")
#(#x1f0 "ǰ" "J̌" "J̌" "ǰ")
#(#x390 "ΐ" "Ϊ́" "Ϊ́" "ΐ")
#(#x3b0 "ΰ" "Ϋ́" "Ϋ́" "ΰ")
#(#x587 "և" "Եւ" "ԵՒ" "եւ")
#(#x1e96 "ẖ" "H̱" "H̱" "ẖ")
#(#x1e97 "ẗ" "T̈" "T̈" "ẗ")
#(#x1e98 "ẘ" "W̊" "W̊" "ẘ")
#(#x1e99 "ẙ" "Y̊" "Y̊" "ẙ")
#(#x1e9a "ẚ" "Aʾ" "Aʾ" "aʾ")
#(#x1e9e "ß" "Ss" "SS" "ss") ;; TODO: this had to be added by hand
#(#x1f50 "ὐ" "Υ̓" "Υ̓" "ὐ")
#(#x1f52 "ὒ" "Υ̓̀" "Υ̓̀" "ὒ")
#(#x1f54 "ὔ" "Υ̓́" "Υ̓́" "ὔ")
#(#x1f56 "ὖ" "Υ̓͂" "Υ̓͂" "ὖ")
#(#x1f80 "ᾀ" "ᾈ" "ἈΙ" "ἀι")
#(#x1f81 "ᾁ" "ᾉ" "ἉΙ" "ἁι")
#(#x1f82 "ᾂ" "ᾊ" "ἊΙ" "ἂι")
#(#x1f83 "ᾃ" "ᾋ" "ἋΙ" "ἃι")
#(#x1f84 "ᾄ" "ᾌ" "ἌΙ" "ἄι")
#(#x1f85 "ᾅ" "ᾍ" "ἍΙ" "ἅι")
#(#x1f86 "ᾆ" "ᾎ" "ἎΙ" "ἆι")
#(#x1f87 "ᾇ" "ᾏ" "ἏΙ" "ἇι")
#(#x1f88 "ᾀ" "ᾈ" "ἈΙ" "ἀι")
#(#x1f89 "ᾁ" "ᾉ" "ἉΙ" "ἁι")
#(#x1f8a "ᾂ" "ᾊ" "ἊΙ" "ἂι")
#(#x1f8b "ᾃ" "ᾋ" "ἋΙ" "ἃι")
#(#x1f8c "ᾄ" "ᾌ" "ἌΙ" "ἄι")
#(#x1f8d "ᾅ" "ᾍ" "ἍΙ" "ἅι")
#(#x1f8e "ᾆ" "ᾎ" "ἎΙ" "ἆι")
#(#x1f8f "ᾇ" "ᾏ" "ἏΙ" "ἇι")
#(#x1f90 "ᾐ" "ᾘ" "ἨΙ" "ἠι")
#(#x1f91 "ᾑ" "ᾙ" "ἩΙ" "ἡι")
#(#x1f92 "ᾒ" "ᾚ" "ἪΙ" "ἢι")
#(#x1f93 "ᾓ" "ᾛ" "ἫΙ" "ἣι")
#(#x1f94 "ᾔ" "ᾜ" "ἬΙ" "ἤι")
#(#x1f95 "ᾕ" "ᾝ" "ἭΙ" "ἥι")
#(#x1f96 "ᾖ" "ᾞ" "ἮΙ" "ἦι")
#(#x1f97 "ᾗ" "ᾟ" "ἯΙ" "ἧι")
#(#x1f98 "ᾐ" "ᾘ" "ἨΙ" "ἠι")
#(#x1f99 "ᾑ" "ᾙ" "ἩΙ" "ἡι")
#(#x1f9a "ᾒ" "ᾚ" "ἪΙ" "ἢι")
#(#x1f9b "ᾓ" "ᾛ" "ἫΙ" "ἣι")
#(#x1f9c "ᾔ" "ᾜ" "ἬΙ" "ἤι")
#(#x1f9d "ᾕ" "ᾝ" "ἭΙ" "ἥι")
#(#x1f9e "ᾖ" "ᾞ" "ἮΙ" "ἦι")
#(#x1f9f "ᾗ" "ᾟ" "ἯΙ" "ἧι")
#(#x1fa0 "ᾠ" "ᾨ" "ὨΙ" "ὠι")
#(#x1fa1 "ᾡ" "ᾩ" "ὩΙ" "ὡι")
#(#x1fa2 "ᾢ" "ᾪ" "ὪΙ" "ὢι")
#(#x1fa3 "ᾣ" "ᾫ" "ὫΙ" "ὣι")
#(#x1fa4 "ᾤ" "ᾬ" "ὬΙ" "ὤι")
#(#x1fa5 "ᾥ" "ᾭ" "ὭΙ" "ὥι")
#(#x1fa6 "ᾦ" "ᾮ" "ὮΙ" "ὦι")
#(#x1fa7 "ᾧ" "ᾯ" "ὯΙ" "ὧι")
#(#x1fa8 "ᾠ" "ᾨ" "ὨΙ" "ὠι")
#(#x1fa9 "ᾡ" "ᾩ" "ὩΙ" "ὡι")
#(#x1faa "ᾢ" "ᾪ" "ὪΙ" "ὢι")
#(#x1fab "ᾣ" "ᾫ" "ὫΙ" "ὣι")
#(#x1fac "ᾤ" "ᾬ" "ὬΙ" "ὤι")
#(#x1fad "ᾥ" "ᾭ" "ὭΙ" "ὥι")
#(#x1fae "ᾦ" "ᾮ" "ὮΙ" "ὦι")
#(#x1faf "ᾧ" "ᾯ" "ὯΙ" "ὧι")
#(#x1fb2 "ᾲ" "Ὰͅ" "ᾺΙ" "ὰι")
#(#x1fb3 "ᾳ" "ᾼ" "ΑΙ" "αι")
#(#x1fb4 "ᾴ" "Άͅ" "ΆΙ" "άι")
#(#x1fb6 "ᾶ" "Α͂" "Α͂" "ᾶ")
#(#x1fb7 "ᾷ" "ᾼ͂" "Α͂Ι" "ᾶι")
#(#x1fbc "ᾳ" "ᾼ" "ΑΙ" "αι")
#(#x1fc2 "ῂ" "Ὴͅ" "ῊΙ" "ὴι")
#(#x1fc3 "ῃ" "ῌ" "ΗΙ" "ηι")
#(#x1fc4 "ῄ" "Ήͅ" "ΉΙ" "ήι")
#(#x1fc6 "ῆ" "Η͂" "Η͂" "ῆ")
#(#x1fc7 "ῇ" "ῌ͂" "Η͂Ι" "ῆι")
#(#x1fcc "ῃ" "ῌ" "ΗΙ" "ηι")
#(#x1fd2 "ῒ" "Ϊ̀" "Ϊ̀" "ῒ")
#(#x1fd3 "ΐ" "Ϊ́" "Ϊ́" "ΐ")
#(#x1fd6 "ῖ" "Ι͂" "Ι͂" "ῖ")
#(#x1fd7 "ῗ" "Ϊ͂" "Ϊ͂" "ῗ")
#(#x1fe2 "ῢ" "Ϋ̀" "Ϋ̀" "ῢ")
#(#x1fe3 "ΰ" "Ϋ́" "Ϋ́" "ΰ")
#(#x1fe4 "ῤ" "Ρ̓" "Ρ̓" "ῤ")
#(#x1fe6 "ῦ" "Υ͂" "Υ͂" "ῦ")
#(#x1fe7 "ῧ" "Ϋ͂" "Ϋ͂" "ῧ")
#(#x1ff2 "ῲ" "Ὼͅ" "ῺΙ" "ὼι")
#(#x1ff3 "ῳ" "ῼ" "ΩΙ" "ωι")
#(#x1ff4 "ῴ" "Ώͅ" "ΏΙ" "ώι")
#(#x1ff6 "ῶ" "Ω͂" "Ω͂" "ῶ")
#(#x1ff7 "ῷ" "ῼ͂" "Ω͂Ι" "ῶι")
#(#x1ffc "ῳ" "ῼ" "ΩΙ" "ωι")
#(#xfb00 "ff" "Ff" "FF" "ff")
#(#xfb01 "fi" "Fi" "FI" "fi")
#(#xfb02 "fl" "Fl" "FL" "fl")
#(#xfb03 "ffi" "Ffi" "FFI" "ffi")
#(#xfb04 "ffl" "Ffl" "FFL" "ffl")
#(#xfb05 "ſt" "St" "ST" "st")
#(#xfb06 "st" "St" "ST" "st")
#(#xfb13 "ﬓ" "Մն" "ՄՆ" "մն")
#(#xfb14 "ﬔ" "Մե" "ՄԵ" "մե")
#(#xfb15 "ﬕ" "Մի" "ՄԻ" "մի")
#(#xfb16 "ﬖ" "Վն" "ՎՆ" "վն")
#(#xfb17 "ﬗ" "Մխ" "ՄԽ" "մխ")
))

View file

@ -42,7 +42,6 @@
joined/range padded padded/right padded/both
trimmed trimmed/right trimmed/both trimmed/lazy
fitted fitted/right fitted/both output-default
upcased downcased
;; computations
fn with with! forked call-with-output
;; state variables

View file

@ -1,6 +1,12 @@
(define-library (srfi 166 unicode)
(import (scheme base) (srfi 130) (srfi 151) (srfi 166 base))
(export as-unicode unicode-terminal-width unicode-terminal-width/wide)
(import (scheme base)
(scheme char)
(srfi 130)
(srfi 151)
(srfi 166 base))
(export as-unicode
unicode-terminal-width unicode-terminal-width/wide
upcased downcased)
(include "../../chibi/show/width.scm"
"../../chibi/show/unicode.scm"))

View file

@ -3,7 +3,7 @@
;; Extract sets of char case offsets.
;;
;; Usage:
;; extract-case-offsets.scm options UnicodeData.txt > out
;; extract-case-offsets.scm [options] UnicodeData.txt CaseFolding.txt > out
;;
;; Recognized options are:
;;
@ -23,7 +23,17 @@
(for-each (lambda (x) (display x err)) args)
(newline err)))
(define (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name)
(define (write-hex-list hex-ls out)
(let lp ((ls hex-ls))
(cond
((pair? ls)
(if (not (eq? ls hex-ls))
(write-char #\space out))
(write-string "#x" out)
(write-string (number->string (car ls) 16) out)
(lp (cdr ls))))))
(define (write-offsets offset-map title-ups title-downs out min-count max-char-sets name)
(let lp ((ls (sort (hash-table->alist offset-map)
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
(i 0)
@ -50,50 +60,61 @@
(map (lambda (y) (list y (+ y (car x))))
(iset->list (cdr x))))
ls)))
(write `(define char-downcase-map
',(list->vector
(append-map (lambda (x) x)
(sort (append pairs title-downs) < car))))
out)
(newline out)
(newline out)
(write `(define char-upcase-map
',(list->vector
(append-map (lambda (x) (list (cadr x) (car x)))
(delete-duplicates
(sort (append pairs title-ups) < cadr)
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
out)
(newline out)
(newline out)
(write `(define char-foldcase-map
',(list->vector
(append-map (lambda (x) x)
(delete-duplicates
(sort folds < car)
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
out)
(newline out))))))
(write-string "(define char-downcase-map\n '#(" out)
(write-hex-list
(append-map (lambda (x) x) (sort (append pairs title-downs) < car))
out)
(write-string "))\n\n" out)
(write-string "(define char-upcase-map\n '#(" out)
(write-hex-list
(append-map (lambda (x) (list (cadr x) (car x)))
(delete-duplicates
(sort (append pairs title-ups) < cadr)
(lambda (a b) (eqv? (cadr a) (cadr b)))))
out)
(write-string "))\n\n" out))))))
(define (extract-case-folding in out)
(define (write-folds folds out)
(write-string "(define char-foldcase-map\n '#(" out)
(write-hex-list
(append-map (lambda (x) x) (sort folds < car))
out)
(write-string "))\n" out))
(let lp ((folds '()))
(let ((line (read-line in)))
(cond
((eof-object? line)
(write-folds folds out))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp folds))
(else
(let* ((line (substring-cursor line
(string-cursor-start line)
(string-find line #\#)))
(ls (map string-trim (string-split line #\;))))
(if (and (>= (length ls) 3)
(member (second ls) '("C" "S")))
(let ((base (string->number (first ls) 16))
(folded (string->number (third ls) 16)))
(if (and base folded)
(lp (cons (list base folded) folds))
(lp folds)))
(lp folds))))))))
;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping
(define (extract-case-mapping in out min-count max-char-sets name)
(define (string-trim-comment str comment-ch)
(car (string-split str comment-ch 2)))
(define (extract-single-decomposition str)
(and (not (equal? "" str))
(let ((s (string-trim (last (string-split str #\>)))))
(and (not (string-contains s " "))
(string->number s 16)))))
(let ((offset-map (make-hash-table eq?))
(title-ups '())
(title-downs '())
(folds '()))
(title-downs '()))
(let lp ()
(let ((line (read-line in)))
(cond
((eof-object? line)
(write-offsets offset-map title-ups title-downs folds out
(write-offsets offset-map title-ups title-downs out
min-count max-char-sets name))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp))
@ -106,8 +127,7 @@
(else
(let ((base (string->number (list-ref ls 0) 16))
(upper (string->number (list-ref ls 12) 16))
(lower (string->number (list-ref ls 13) 16))
(folded (extract-single-decomposition (list-ref ls 5))))
(lower (string->number (list-ref ls 13) 16)))
(cond
((or upper lower)
(cond
@ -121,12 +141,7 @@
offset-map
(- (or lower base) (or upper base))
(lambda (is) (iset-adjoin! is (or upper base)))
(lambda () (make-iset)))))))
(cond
((and folded (not (eqv? folded (or lower base))))
;; (write `(fold: ,line ,base ,folded) (current-error-port))
;; (newline (current-error-port))
(set! folds (cons (list base folded) folds)))))))
(lambda () (make-iset))))))))))
(lp))))))))
(let ((args (command-line)))
@ -150,8 +165,8 @@
(open-output-file (cadr ls))))
(else
(error "unknown option: " (car ls)))))
((null? ls)
(error "usage: extract-case-offsets <UnicodeData.txt>"))
((not (= 2 (length ls)))
(error "usage: extract-case-offsets <UnicodeData.txt> <CaseFolding.txt>"))
(else
(if (equal? "-" (car ls))
(extract-case-mapping
@ -159,4 +174,8 @@
(call-with-input-file (car ls)
(lambda (in)
(extract-case-mapping in out min-count max-char-sets name))))
(if (equal? "-" (cadr ls))
(extract-case-folding (current-input-port) out)
(call-with-input-file (cadr ls)
(lambda (in) (extract-case-folding in out))))
(close-output-port out)))))

View file

@ -0,0 +1,117 @@
#!/usr/bin/env chibi-scheme
;; Build a table of special case (non 1:1) case mappings.
;;
;; Usage:
;; extract-special-casing.scm [options] CaseFolding.txt SpecialCasing.txt > out
(import (chibi) (srfi 1) (srfi 125) (srfi 95) (chibi io) (chibi string))
(define (warn . args)
(let ((err (current-error-port)))
(for-each (lambda (x) (display x err)) args)
(newline err)))
(define (useq->string str)
(let ((ls (map (lambda (s) (string->number s 16))
(map string-trim (string-split str #\space)))))
(and ls
(every integer? ls)
(list->string (map integer->char ls)))))
(define (extract-full-folds in)
(let ((res (make-hash-table equal?)))
(let lp ()
(let ((line (read-line in)))
(cond
((eof-object? line)
res)
(else
(let* ((line (substring-cursor line
(string-cursor-start line)
(string-find line #\#)))
(ls (map string-trim (string-split line #\;))))
(if (and (>= (length ls) 3)
(member (second ls) '("F")))
(let ((base (string->number (first ls) 16))
(folded (useq->string (third ls))))
(if (and base folded)
(hash-table-set! res base folded))))
(lp))))))))
(define (extract-special-cases in out folds)
;; TODO: handle folds not in cases (currently only #\x1e9e)
(define (write-cases cases out)
(write-string "(define special-cases\n ;; <code> <lower> <title> <upper> [<fold>]\n '#(" out)
(for-each
(lambda (x)
(if (not (eq? x (car cases)))
(write-string " " out))
(write-string "#(#x" out)
(write-string (number->string (first x) 16) out)
(write-char #\space out)
(write (second x) out)
(write-char #\space out)
(write (third x) out)
(write-char #\space out)
(write (fourth x) out)
(cond
((hash-table-ref/default folds (first x) #f)
=> (lambda (fold)
(write-char #\space out)
(write fold out))))
(write-string ")\n" out))
cases)
(write-string " ))\n" out))
(let lp ((cases '()))
(let ((line (read-line in)))
(cond
((eof-object? line)
(write-cases (sort cases < car) out))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp cases))
(else
(let* ((line (substring-cursor line
(string-cursor-start line)
(string-find line #\#)))
(ls (map string-trim (string-split line #\;))))
(if (and (>= (length ls) 4)
(or (= 4 (length ls))
(string-null? (list-ref ls 4))))
(let ((base (string->number (first ls) 16))
(lower (useq->string (second ls)))
(title (useq->string (third ls)))
(upper (useq->string (fourth ls))))
(if (and base lower title upper)
(lp (cons (list base lower title upper) cases))
(lp cases)))
(lp cases))))))))
(let ((args (command-line)))
(let lp ((ls (cdr args))
(min-count 26)
(max-char-sets #f)
(name "char-downcase-offsets")
(out (current-output-port)))
(cond
((and (pair? ls) (not (equal? "" (car ls)))
(eqv? #\- (string-ref (car ls) 0)))
(cond
((member (car ls) '("-o" "--output"))
(lp (cddr ls) min-count max-char-sets name
(open-output-file (cadr ls))))
(else
(error "unknown option: " (car ls)))))
((not (= 2 (length ls)))
(error "usage: extract-special-casing <CaseFolding.txt> <SpecialCasing.txt>"))
(else
(display ";; auto-generated by extract-special-casing.scm\n\n" out)
(let ((folds
(if (equal? "-" (car ls))
(extract-full-folds (current-input-port))
(call-with-input-file (car ls) extract-full-folds))))
(if (equal? "-" (cadr ls))
(extract-special-cases (current-input-port) out folds)
(call-with-input-file (cadr ls)
(lambda (in) (extract-special-cases in out folds)))))
(close-output-port out)))))