mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-18 21:29:19 +02:00
fix case folding, update to unicode 13
This commit is contained in:
parent
6fb0640721
commit
c245d6cee8
15 changed files with 364 additions and 208 deletions
7
Makefile
7
Makefile
|
@ -197,8 +197,11 @@ lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(
|
|||
lib/chibi/show/width.scm: build-lib/chibi/char-set/width.scm chibi-scheme$(EXE)
|
||||
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --predicate chibi.char-set.width > $@
|
||||
|
||||
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
|
||||
$(CHIBI) tools/extract-case-offsets.scm $< > $@
|
||||
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt data/CaseFolding.txt chibi-scheme$(EXE) all-libs
|
||||
$(CHIBI) tools/extract-case-offsets.scm data/UnicodeData.txt data/CaseFolding.txt > $@
|
||||
|
||||
lib/scheme/char/special-casing.scm: data/CaseFolding.txt data/SpecialCasing.txt chibi-scheme$(EXE) all-libs
|
||||
$(CHIBI) tools/extract-special-casing.scm data/CaseFolding.txt data/SpecialCasing.txt > $@
|
||||
|
||||
########################################################################
|
||||
# Tests
|
||||
|
|
|
@ -1,42 +1,42 @@
|
|||
;; char-set:lower-case
|
||||
(define char-set:lower-case (immutable-char-set (%make-iset 97 122 #f #f #f)))
|
||||
(define char-set:lower-case (immutable-char-set (%make-iset 97 127 67108863 #f #f)))
|
||||
|
||||
;; char-set:upper-case
|
||||
(define char-set:upper-case (immutable-char-set (%make-iset 65 90 #f #f #f)))
|
||||
(define char-set:upper-case (immutable-char-set (%make-iset 65 127 67108863 #f #f)))
|
||||
|
||||
;; char-set:title-case
|
||||
(define char-set:title-case (immutable-char-set (%make-iset 0 0 0 #f #f)))
|
||||
|
||||
;; char-set:letter
|
||||
(define char-set:letter (immutable-char-set (%make-iset 97 122 #f (%make-iset 65 90 #f #f #f) #f)))
|
||||
(define char-set:letter (immutable-char-set (%make-iset 65 127 288230371923853311 #f #f)))
|
||||
|
||||
;; char-set:punctuation
|
||||
(define char-set:punctuation (immutable-char-set (%make-iset 63 64 #f (%make-iset 44 47 #f (%make-iset 37 42 #f (%make-iset 33 35 #f #f #f) #f) (%make-iset 58 59 #f #f #f)) (%make-iset 123 123 #f (%make-iset 95 95 #f (%make-iset 91 93 #f #f #f) #f) (%make-iset 125 125 #f #f #f)))))
|
||||
(define char-set:punctuation (immutable-char-set (%make-iset 33 127 6189700203056200029306911735 #f #f)))
|
||||
|
||||
;; char-set:symbol
|
||||
(define char-set:symbol (immutable-char-set (%make-iset 94 94 #f (%make-iset 43 43 #f (%make-iset 36 36 #f #f #f) (%make-iset 60 62 #f #f #f)) (%make-iset 124 124 #f (%make-iset 96 96 #f #f #f) (%make-iset 126 126 #f #f #f)))))
|
||||
(define char-set:symbol (immutable-char-set (%make-iset 36 127 1547425050547877224499904641 #f #f)))
|
||||
|
||||
;; char-set:blank
|
||||
(define char-set:blank (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 9 #f #f #f) #f)))
|
||||
(define char-set:blank (immutable-char-set (%make-iset 9 32 8388609 #f #f)))
|
||||
|
||||
;; char-set:whitespace
|
||||
(define char-set:whitespace (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 13 #f #f #f) #f)))
|
||||
(define char-set:whitespace (immutable-char-set (%make-iset 9 127 8388639 #f #f)))
|
||||
|
||||
;; char-set:digit
|
||||
(define char-set:digit (immutable-char-set (%make-iset 48 57 #f #f #f)))
|
||||
|
||||
;; char-set:letter+digit
|
||||
(define char-set:letter+digit (immutable-char-set (%make-iset 65 90 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 122 #f #f #f))))
|
||||
(define char-set:letter+digit (immutable-char-set (%make-iset 48 127 37778931308803301180415 #f #f)))
|
||||
|
||||
;; char-set:hex-digit
|
||||
(define char-set:hex-digit (immutable-char-set (%make-iset 65 70 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 102 #f #f #f))))
|
||||
(define char-set:hex-digit (immutable-char-set (%make-iset 48 102 35465847073801215 #f #f)))
|
||||
|
||||
;; char-set:iso-control
|
||||
(define char-set:iso-control (immutable-char-set (%make-iset 127 127 #f (%make-iset 0 31 #f #f #f) #f)))
|
||||
(define char-set:iso-control (immutable-char-set (%make-iset 0 127 170141183460469231731687303720179073023 #f #f)))
|
||||
|
||||
;; char-set:graphic
|
||||
(define char-set:graphic (immutable-char-set (%make-iset 33 126 #f #f #f)))
|
||||
(define char-set:graphic (immutable-char-set (%make-iset 33 127 19807040628566084398385987583 #f #f)))
|
||||
|
||||
;; char-set:printing
|
||||
(define char-set:printing (immutable-char-set (%make-iset 32 126 #f (%make-iset 9 13 #f #f #f) #f)))
|
||||
(define char-set:printing (immutable-char-set (%make-iset 9 127 332306998946228968225951765061697567 #f #f)))
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -8,8 +8,7 @@
|
|||
padded padded/left padded/right padded/both
|
||||
trimmed trimmed/left trimmed/right trimmed/both trimmed/lazy
|
||||
fitted fitted/left fitted/right fitted/both
|
||||
joined joined/prefix joined/suffix joined/last joined/dot joined/range
|
||||
upcased downcased)
|
||||
joined joined/prefix joined/suffix joined/last joined/dot joined/range)
|
||||
(import (scheme base) (scheme char) (scheme write)
|
||||
(chibi show base))
|
||||
(include "show/show.scm"))
|
||||
|
|
|
@ -82,21 +82,6 @@
|
|||
(fn (col pad-char)
|
||||
(displayed (make-string (max 0 (- where col)) pad-char))))
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; String transformations
|
||||
|
||||
(define (with-string-transformer proc . ls)
|
||||
(fn ((orig-output output))
|
||||
(let ((output* (lambda (str) (orig-output (proc str)))))
|
||||
(with ((output output*))
|
||||
(each-in-list ls)))))
|
||||
|
||||
;;> Show each of \var{ls}, uppercasing all generated text.
|
||||
(define (upcased . ls) (apply with-string-transformer string-upcase ls))
|
||||
|
||||
;;> Show each of \var{ls}, lowercasing all generated text.
|
||||
(define (downcased . ls) (apply with-string-transformer string-downcase ls))
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Padding and trimming
|
||||
|
||||
|
|
|
@ -68,3 +68,18 @@
|
|||
unicode-terminal-width/wide
|
||||
unicode-terminal-width)))
|
||||
(each-in-list args))))
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;; String transformations
|
||||
|
||||
(define (with-string-transformer proc . ls)
|
||||
(fn ((orig-output output))
|
||||
(let ((output* (lambda (str) (orig-output (proc str)))))
|
||||
(with ((output output*))
|
||||
(each-in-list ls)))))
|
||||
|
||||
;;> Show each of \var{ls}, uppercasing all generated text.
|
||||
(define (upcased . ls) (apply with-string-transformer string-upcase ls))
|
||||
|
||||
;;> Show each of \var{ls}, lowercasing all generated text.
|
||||
(define (downcased . ls) (apply with-string-transformer string-downcase ls))
|
||||
|
|
|
@ -1,5 +1,11 @@
|
|||
|
||||
(define-library (chibi show unicode)
|
||||
(import (scheme base) (chibi show base) (srfi 130) (srfi 151))
|
||||
(export as-unicode unicode-terminal-width unicode-terminal-width/wide)
|
||||
(import (scheme base)
|
||||
(scheme char)
|
||||
(chibi show base)
|
||||
(srfi 130)
|
||||
(srfi 151))
|
||||
(export as-unicode
|
||||
unicode-terminal-width unicode-terminal-width/wide
|
||||
upcased downcased)
|
||||
(include "width.scm" "unicode.scm"))
|
||||
|
|
|
@ -24,4 +24,5 @@
|
|||
char-downcase char-foldcase char-lower-case? char-numeric?
|
||||
char-upcase char-upper-case? char-whitespace? digit-value
|
||||
string-ci<=? string-ci<? string-ci=? string-ci>=? string-ci>?
|
||||
char-get-special-case
|
||||
string-downcase string-foldcase string-upcase))
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -44,7 +44,7 @@
|
|||
(define (char-foldcase ch)
|
||||
(or (bsearch-kv char-foldcase-map (char->integer ch) 0
|
||||
(- (vector-length char-foldcase-map) 2))
|
||||
(char-downcase ch)))
|
||||
ch))
|
||||
|
||||
(define (char-cmp-ci op a ls)
|
||||
(let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls))
|
||||
|
@ -83,17 +83,20 @@
|
|||
(let ((ch (read-char in)))
|
||||
(cond
|
||||
((not (eof-object? ch))
|
||||
(write-string
|
||||
(cond
|
||||
((and (not fold?) (eqv? ch #\x03A3))
|
||||
(let ((ch2 (peek-char in)))
|
||||
(cond
|
||||
((and (not fold?) (eqv? ch #\x03A3)) ;; sigma
|
||||
(let ((ch2 (peek-char in)))
|
||||
(write-char
|
||||
(if (or (eof-object? ch2)
|
||||
(not (char-set-contains? char-set:letter ch2)))
|
||||
#\x03C2
|
||||
#\x03C3)))
|
||||
((char-get-special-case ch (if fold? 4 1)))
|
||||
(else (if fold? (char-foldcase ch) (char-downcase ch))))
|
||||
out)
|
||||
#\x03C3)
|
||||
out)))
|
||||
((char-get-special-case ch (if fold? 4 1))
|
||||
=> (lambda (s) (write-string s out)))
|
||||
(else
|
||||
(write-char (if fold? (char-foldcase ch) (char-downcase ch))
|
||||
out)))
|
||||
(lp)))))))))
|
||||
|
||||
(define (string-downcase str) (string-down-or-fold-case str #f))
|
||||
|
|
|
@ -1,109 +1,109 @@
|
|||
|
||||
;; Unconditional non-1-to-1 case mappings derived from Unicode data
|
||||
;; file SpecialCasing.txt.
|
||||
;; auto-generated by extract-special-casing.scm
|
||||
|
||||
(define special-cases
|
||||
;; <code> <lower> <title> <upper> [<fold>]
|
||||
#(#(223 "ß" "Ss" "SS" "ss")
|
||||
#(304 "i̇" "İ" "İ")
|
||||
#(329 "ʼn" "ʼN" "ʼN")
|
||||
#(496 "ǰ" "J̌" "J̌")
|
||||
#(912 "ΐ" "Ϊ́" "Ϊ́")
|
||||
#(944 "ΰ" "Ϋ́" "Ϋ́")
|
||||
#(1415 "և" "Եւ" "ԵՒ")
|
||||
#(7830 "ẖ" "H̱" "H̱")
|
||||
#(7831 "ẗ" "T̈" "T̈")
|
||||
#(7832 "ẘ" "W̊" "W̊")
|
||||
#(7833 "ẙ" "Y̊" "Y̊")
|
||||
#(7834 "ẚ" "Aʾ" "Aʾ")
|
||||
#(8016 "ὐ" "Υ̓" "Υ̓")
|
||||
#(8018 "ὒ" "Υ̓̀" "Υ̓̀")
|
||||
#(8020 "ὔ" "Υ̓́" "Υ̓́")
|
||||
#(8022 "ὖ" "Υ̓͂" "Υ̓͂")
|
||||
#(8064 "ᾀ" "ᾈ" "ἈΙ")
|
||||
#(8065 "ᾁ" "ᾉ" "ἉΙ")
|
||||
#(8066 "ᾂ" "ᾊ" "ἊΙ")
|
||||
#(8067 "ᾃ" "ᾋ" "ἋΙ")
|
||||
#(8068 "ᾄ" "ᾌ" "ἌΙ")
|
||||
#(8069 "ᾅ" "ᾍ" "ἍΙ")
|
||||
#(8070 "ᾆ" "ᾎ" "ἎΙ")
|
||||
#(8071 "ᾇ" "ᾏ" "ἏΙ")
|
||||
#(8072 "ᾀ" "ᾈ" "ἈΙ")
|
||||
#(8073 "ᾁ" "ᾉ" "ἉΙ")
|
||||
#(8074 "ᾂ" "ᾊ" "ἊΙ")
|
||||
#(8075 "ᾃ" "ᾋ" "ἋΙ")
|
||||
#(8076 "ᾄ" "ᾌ" "ἌΙ")
|
||||
#(8077 "ᾅ" "ᾍ" "ἍΙ")
|
||||
#(8078 "ᾆ" "ᾎ" "ἎΙ")
|
||||
#(8079 "ᾇ" "ᾏ" "ἏΙ")
|
||||
#(8080 "ᾐ" "ᾘ" "ἨΙ")
|
||||
#(8081 "ᾑ" "ᾙ" "ἩΙ")
|
||||
#(8082 "ᾒ" "ᾚ" "ἪΙ")
|
||||
#(8083 "ᾓ" "ᾛ" "ἫΙ")
|
||||
#(8084 "ᾔ" "ᾜ" "ἬΙ")
|
||||
#(8085 "ᾕ" "ᾝ" "ἭΙ")
|
||||
#(8086 "ᾖ" "ᾞ" "ἮΙ")
|
||||
#(8087 "ᾗ" "ᾟ" "ἯΙ")
|
||||
#(8088 "ᾐ" "ᾘ" "ἨΙ")
|
||||
#(8089 "ᾑ" "ᾙ" "ἩΙ")
|
||||
#(8090 "ᾒ" "ᾚ" "ἪΙ")
|
||||
#(8091 "ᾓ" "ᾛ" "ἫΙ")
|
||||
#(8092 "ᾔ" "ᾜ" "ἬΙ")
|
||||
#(8093 "ᾕ" "ᾝ" "ἭΙ")
|
||||
#(8094 "ᾖ" "ᾞ" "ἮΙ")
|
||||
#(8095 "ᾗ" "ᾟ" "ἯΙ")
|
||||
#(8096 "ᾠ" "ᾨ" "ὨΙ")
|
||||
#(8097 "ᾡ" "ᾩ" "ὩΙ")
|
||||
#(8098 "ᾢ" "ᾪ" "ὪΙ")
|
||||
#(8099 "ᾣ" "ᾫ" "ὫΙ")
|
||||
#(8100 "ᾤ" "ᾬ" "ὬΙ")
|
||||
#(8101 "ᾥ" "ᾭ" "ὭΙ")
|
||||
#(8102 "ᾦ" "ᾮ" "ὮΙ")
|
||||
#(8103 "ᾧ" "ᾯ" "ὯΙ")
|
||||
#(8104 "ᾠ" "ᾨ" "ὨΙ")
|
||||
#(8105 "ᾡ" "ᾩ" "ὩΙ")
|
||||
#(8106 "ᾢ" "ᾪ" "ὪΙ")
|
||||
#(8107 "ᾣ" "ᾫ" "ὫΙ")
|
||||
#(8108 "ᾤ" "ᾬ" "ὬΙ")
|
||||
#(8109 "ᾥ" "ᾭ" "ὭΙ")
|
||||
#(8110 "ᾦ" "ᾮ" "ὮΙ")
|
||||
#(8111 "ᾧ" "ᾯ" "ὯΙ")
|
||||
#(8114 "ᾲ" "Ὰͅ" "ᾺΙ")
|
||||
#(8115 "ᾳ" "ᾼ" "ΑΙ")
|
||||
#(8116 "ᾴ" "Άͅ" "ΆΙ")
|
||||
#(8118 "ᾶ" "Α͂" "Α͂")
|
||||
#(8119 "ᾷ" "ᾼ͂" "Α͂Ι")
|
||||
#(8124 "ᾳ" "ᾼ" "ΑΙ")
|
||||
#(8130 "ῂ" "Ὴͅ" "ῊΙ")
|
||||
#(8131 "ῃ" "ῌ" "ΗΙ")
|
||||
#(8132 "ῄ" "Ήͅ" "ΉΙ")
|
||||
#(8134 "ῆ" "Η͂" "Η͂")
|
||||
#(8135 "ῇ" "ῌ͂" "Η͂Ι")
|
||||
#(8140 "ῃ" "ῌ" "ΗΙ")
|
||||
#(8146 "ῒ" "Ϊ̀" "Ϊ̀")
|
||||
#(8147 "ΐ" "Ϊ́" "Ϊ́")
|
||||
#(8150 "ῖ" "Ι͂" "Ι͂")
|
||||
#(8151 "ῗ" "Ϊ͂" "Ϊ͂")
|
||||
#(8162 "ῢ" "Ϋ̀" "Ϋ̀")
|
||||
#(8163 "ΰ" "Ϋ́" "Ϋ́")
|
||||
#(8164 "ῤ" "Ρ̓" "Ρ̓")
|
||||
#(8166 "ῦ" "Υ͂" "Υ͂")
|
||||
#(8167 "ῧ" "Ϋ͂" "Ϋ͂")
|
||||
#(8178 "ῲ" "Ὼͅ" "ῺΙ")
|
||||
#(8179 "ῳ" "ῼ" "ΩΙ")
|
||||
#(8180 "ῴ" "Ώͅ" "ΏΙ")
|
||||
#(8182 "ῶ" "Ω͂" "Ω͂")
|
||||
#(8183 "ῷ" "ῼ͂" "Ω͂Ι")
|
||||
#(8188 "ῳ" "ῼ" "ΩΙ")
|
||||
#(64256 "ff" "Ff" "FF")
|
||||
#(64257 "fi" "Fi" "FI")
|
||||
#(64258 "fl" "Fl" "FL")
|
||||
#(64259 "ffi" "Ffi" "FFI")
|
||||
#(64260 "ffl" "Ffl" "FFL")
|
||||
#(64261 "ſt" "St" "ST")
|
||||
#(64262 "st" "St" "ST")
|
||||
#(64275 "ﬓ" "Մն" "ՄՆ")
|
||||
#(64276 "ﬔ" "Մե" "ՄԵ")
|
||||
#(64277 "ﬕ" "Մի" "ՄԻ")
|
||||
#(64278 "ﬖ" "Վն" "ՎՆ")
|
||||
#(64279 "ﬗ" "Մխ" "ՄԽ")))
|
||||
'#(#(#xdf "ß" "Ss" "SS" "ss")
|
||||
#(#x130 "i̇" "İ" "İ" "i̇")
|
||||
#(#x149 "ʼn" "ʼN" "ʼN" "ʼn")
|
||||
#(#x1f0 "ǰ" "J̌" "J̌" "ǰ")
|
||||
#(#x390 "ΐ" "Ϊ́" "Ϊ́" "ΐ")
|
||||
#(#x3b0 "ΰ" "Ϋ́" "Ϋ́" "ΰ")
|
||||
#(#x587 "և" "Եւ" "ԵՒ" "եւ")
|
||||
#(#x1e96 "ẖ" "H̱" "H̱" "ẖ")
|
||||
#(#x1e97 "ẗ" "T̈" "T̈" "ẗ")
|
||||
#(#x1e98 "ẘ" "W̊" "W̊" "ẘ")
|
||||
#(#x1e99 "ẙ" "Y̊" "Y̊" "ẙ")
|
||||
#(#x1e9a "ẚ" "Aʾ" "Aʾ" "aʾ")
|
||||
#(#x1e9e "ß" "Ss" "SS" "ss") ;; TODO: this had to be added by hand
|
||||
#(#x1f50 "ὐ" "Υ̓" "Υ̓" "ὐ")
|
||||
#(#x1f52 "ὒ" "Υ̓̀" "Υ̓̀" "ὒ")
|
||||
#(#x1f54 "ὔ" "Υ̓́" "Υ̓́" "ὔ")
|
||||
#(#x1f56 "ὖ" "Υ̓͂" "Υ̓͂" "ὖ")
|
||||
#(#x1f80 "ᾀ" "ᾈ" "ἈΙ" "ἀι")
|
||||
#(#x1f81 "ᾁ" "ᾉ" "ἉΙ" "ἁι")
|
||||
#(#x1f82 "ᾂ" "ᾊ" "ἊΙ" "ἂι")
|
||||
#(#x1f83 "ᾃ" "ᾋ" "ἋΙ" "ἃι")
|
||||
#(#x1f84 "ᾄ" "ᾌ" "ἌΙ" "ἄι")
|
||||
#(#x1f85 "ᾅ" "ᾍ" "ἍΙ" "ἅι")
|
||||
#(#x1f86 "ᾆ" "ᾎ" "ἎΙ" "ἆι")
|
||||
#(#x1f87 "ᾇ" "ᾏ" "ἏΙ" "ἇι")
|
||||
#(#x1f88 "ᾀ" "ᾈ" "ἈΙ" "ἀι")
|
||||
#(#x1f89 "ᾁ" "ᾉ" "ἉΙ" "ἁι")
|
||||
#(#x1f8a "ᾂ" "ᾊ" "ἊΙ" "ἂι")
|
||||
#(#x1f8b "ᾃ" "ᾋ" "ἋΙ" "ἃι")
|
||||
#(#x1f8c "ᾄ" "ᾌ" "ἌΙ" "ἄι")
|
||||
#(#x1f8d "ᾅ" "ᾍ" "ἍΙ" "ἅι")
|
||||
#(#x1f8e "ᾆ" "ᾎ" "ἎΙ" "ἆι")
|
||||
#(#x1f8f "ᾇ" "ᾏ" "ἏΙ" "ἇι")
|
||||
#(#x1f90 "ᾐ" "ᾘ" "ἨΙ" "ἠι")
|
||||
#(#x1f91 "ᾑ" "ᾙ" "ἩΙ" "ἡι")
|
||||
#(#x1f92 "ᾒ" "ᾚ" "ἪΙ" "ἢι")
|
||||
#(#x1f93 "ᾓ" "ᾛ" "ἫΙ" "ἣι")
|
||||
#(#x1f94 "ᾔ" "ᾜ" "ἬΙ" "ἤι")
|
||||
#(#x1f95 "ᾕ" "ᾝ" "ἭΙ" "ἥι")
|
||||
#(#x1f96 "ᾖ" "ᾞ" "ἮΙ" "ἦι")
|
||||
#(#x1f97 "ᾗ" "ᾟ" "ἯΙ" "ἧι")
|
||||
#(#x1f98 "ᾐ" "ᾘ" "ἨΙ" "ἠι")
|
||||
#(#x1f99 "ᾑ" "ᾙ" "ἩΙ" "ἡι")
|
||||
#(#x1f9a "ᾒ" "ᾚ" "ἪΙ" "ἢι")
|
||||
#(#x1f9b "ᾓ" "ᾛ" "ἫΙ" "ἣι")
|
||||
#(#x1f9c "ᾔ" "ᾜ" "ἬΙ" "ἤι")
|
||||
#(#x1f9d "ᾕ" "ᾝ" "ἭΙ" "ἥι")
|
||||
#(#x1f9e "ᾖ" "ᾞ" "ἮΙ" "ἦι")
|
||||
#(#x1f9f "ᾗ" "ᾟ" "ἯΙ" "ἧι")
|
||||
#(#x1fa0 "ᾠ" "ᾨ" "ὨΙ" "ὠι")
|
||||
#(#x1fa1 "ᾡ" "ᾩ" "ὩΙ" "ὡι")
|
||||
#(#x1fa2 "ᾢ" "ᾪ" "ὪΙ" "ὢι")
|
||||
#(#x1fa3 "ᾣ" "ᾫ" "ὫΙ" "ὣι")
|
||||
#(#x1fa4 "ᾤ" "ᾬ" "ὬΙ" "ὤι")
|
||||
#(#x1fa5 "ᾥ" "ᾭ" "ὭΙ" "ὥι")
|
||||
#(#x1fa6 "ᾦ" "ᾮ" "ὮΙ" "ὦι")
|
||||
#(#x1fa7 "ᾧ" "ᾯ" "ὯΙ" "ὧι")
|
||||
#(#x1fa8 "ᾠ" "ᾨ" "ὨΙ" "ὠι")
|
||||
#(#x1fa9 "ᾡ" "ᾩ" "ὩΙ" "ὡι")
|
||||
#(#x1faa "ᾢ" "ᾪ" "ὪΙ" "ὢι")
|
||||
#(#x1fab "ᾣ" "ᾫ" "ὫΙ" "ὣι")
|
||||
#(#x1fac "ᾤ" "ᾬ" "ὬΙ" "ὤι")
|
||||
#(#x1fad "ᾥ" "ᾭ" "ὭΙ" "ὥι")
|
||||
#(#x1fae "ᾦ" "ᾮ" "ὮΙ" "ὦι")
|
||||
#(#x1faf "ᾧ" "ᾯ" "ὯΙ" "ὧι")
|
||||
#(#x1fb2 "ᾲ" "Ὰͅ" "ᾺΙ" "ὰι")
|
||||
#(#x1fb3 "ᾳ" "ᾼ" "ΑΙ" "αι")
|
||||
#(#x1fb4 "ᾴ" "Άͅ" "ΆΙ" "άι")
|
||||
#(#x1fb6 "ᾶ" "Α͂" "Α͂" "ᾶ")
|
||||
#(#x1fb7 "ᾷ" "ᾼ͂" "Α͂Ι" "ᾶι")
|
||||
#(#x1fbc "ᾳ" "ᾼ" "ΑΙ" "αι")
|
||||
#(#x1fc2 "ῂ" "Ὴͅ" "ῊΙ" "ὴι")
|
||||
#(#x1fc3 "ῃ" "ῌ" "ΗΙ" "ηι")
|
||||
#(#x1fc4 "ῄ" "Ήͅ" "ΉΙ" "ήι")
|
||||
#(#x1fc6 "ῆ" "Η͂" "Η͂" "ῆ")
|
||||
#(#x1fc7 "ῇ" "ῌ͂" "Η͂Ι" "ῆι")
|
||||
#(#x1fcc "ῃ" "ῌ" "ΗΙ" "ηι")
|
||||
#(#x1fd2 "ῒ" "Ϊ̀" "Ϊ̀" "ῒ")
|
||||
#(#x1fd3 "ΐ" "Ϊ́" "Ϊ́" "ΐ")
|
||||
#(#x1fd6 "ῖ" "Ι͂" "Ι͂" "ῖ")
|
||||
#(#x1fd7 "ῗ" "Ϊ͂" "Ϊ͂" "ῗ")
|
||||
#(#x1fe2 "ῢ" "Ϋ̀" "Ϋ̀" "ῢ")
|
||||
#(#x1fe3 "ΰ" "Ϋ́" "Ϋ́" "ΰ")
|
||||
#(#x1fe4 "ῤ" "Ρ̓" "Ρ̓" "ῤ")
|
||||
#(#x1fe6 "ῦ" "Υ͂" "Υ͂" "ῦ")
|
||||
#(#x1fe7 "ῧ" "Ϋ͂" "Ϋ͂" "ῧ")
|
||||
#(#x1ff2 "ῲ" "Ὼͅ" "ῺΙ" "ὼι")
|
||||
#(#x1ff3 "ῳ" "ῼ" "ΩΙ" "ωι")
|
||||
#(#x1ff4 "ῴ" "Ώͅ" "ΏΙ" "ώι")
|
||||
#(#x1ff6 "ῶ" "Ω͂" "Ω͂" "ῶ")
|
||||
#(#x1ff7 "ῷ" "ῼ͂" "Ω͂Ι" "ῶι")
|
||||
#(#x1ffc "ῳ" "ῼ" "ΩΙ" "ωι")
|
||||
#(#xfb00 "ff" "Ff" "FF" "ff")
|
||||
#(#xfb01 "fi" "Fi" "FI" "fi")
|
||||
#(#xfb02 "fl" "Fl" "FL" "fl")
|
||||
#(#xfb03 "ffi" "Ffi" "FFI" "ffi")
|
||||
#(#xfb04 "ffl" "Ffl" "FFL" "ffl")
|
||||
#(#xfb05 "ſt" "St" "ST" "st")
|
||||
#(#xfb06 "st" "St" "ST" "st")
|
||||
#(#xfb13 "ﬓ" "Մն" "ՄՆ" "մն")
|
||||
#(#xfb14 "ﬔ" "Մե" "ՄԵ" "մե")
|
||||
#(#xfb15 "ﬕ" "Մի" "ՄԻ" "մի")
|
||||
#(#xfb16 "ﬖ" "Վն" "ՎՆ" "վն")
|
||||
#(#xfb17 "ﬗ" "Մխ" "ՄԽ" "մխ")
|
||||
))
|
||||
|
|
|
@ -42,7 +42,6 @@
|
|||
joined/range padded padded/right padded/both
|
||||
trimmed trimmed/right trimmed/both trimmed/lazy
|
||||
fitted fitted/right fitted/both output-default
|
||||
upcased downcased
|
||||
;; computations
|
||||
fn with with! forked call-with-output
|
||||
;; state variables
|
||||
|
|
|
@ -1,6 +1,12 @@
|
|||
|
||||
(define-library (srfi 166 unicode)
|
||||
(import (scheme base) (srfi 130) (srfi 151) (srfi 166 base))
|
||||
(export as-unicode unicode-terminal-width unicode-terminal-width/wide)
|
||||
(import (scheme base)
|
||||
(scheme char)
|
||||
(srfi 130)
|
||||
(srfi 151)
|
||||
(srfi 166 base))
|
||||
(export as-unicode
|
||||
unicode-terminal-width unicode-terminal-width/wide
|
||||
upcased downcased)
|
||||
(include "../../chibi/show/width.scm"
|
||||
"../../chibi/show/unicode.scm"))
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
;; Extract sets of char case offsets.
|
||||
;;
|
||||
;; Usage:
|
||||
;; extract-case-offsets.scm options UnicodeData.txt > out
|
||||
;; extract-case-offsets.scm [options] UnicodeData.txt CaseFolding.txt > out
|
||||
;;
|
||||
;; Recognized options are:
|
||||
;;
|
||||
|
@ -23,7 +23,17 @@
|
|||
(for-each (lambda (x) (display x err)) args)
|
||||
(newline err)))
|
||||
|
||||
(define (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name)
|
||||
(define (write-hex-list hex-ls out)
|
||||
(let lp ((ls hex-ls))
|
||||
(cond
|
||||
((pair? ls)
|
||||
(if (not (eq? ls hex-ls))
|
||||
(write-char #\space out))
|
||||
(write-string "#x" out)
|
||||
(write-string (number->string (car ls) 16) out)
|
||||
(lp (cdr ls))))))
|
||||
|
||||
(define (write-offsets offset-map title-ups title-downs out min-count max-char-sets name)
|
||||
(let lp ((ls (sort (hash-table->alist offset-map)
|
||||
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
|
||||
(i 0)
|
||||
|
@ -50,50 +60,61 @@
|
|||
(map (lambda (y) (list y (+ y (car x))))
|
||||
(iset->list (cdr x))))
|
||||
ls)))
|
||||
(write `(define char-downcase-map
|
||||
',(list->vector
|
||||
(append-map (lambda (x) x)
|
||||
(sort (append pairs title-downs) < car))))
|
||||
out)
|
||||
(newline out)
|
||||
(newline out)
|
||||
(write `(define char-upcase-map
|
||||
',(list->vector
|
||||
(append-map (lambda (x) (list (cadr x) (car x)))
|
||||
(delete-duplicates
|
||||
(sort (append pairs title-ups) < cadr)
|
||||
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
|
||||
out)
|
||||
(newline out)
|
||||
(newline out)
|
||||
(write `(define char-foldcase-map
|
||||
',(list->vector
|
||||
(append-map (lambda (x) x)
|
||||
(delete-duplicates
|
||||
(sort folds < car)
|
||||
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
|
||||
out)
|
||||
(newline out))))))
|
||||
(write-string "(define char-downcase-map\n '#(" out)
|
||||
(write-hex-list
|
||||
(append-map (lambda (x) x) (sort (append pairs title-downs) < car))
|
||||
out)
|
||||
(write-string "))\n\n" out)
|
||||
(write-string "(define char-upcase-map\n '#(" out)
|
||||
(write-hex-list
|
||||
(append-map (lambda (x) (list (cadr x) (car x)))
|
||||
(delete-duplicates
|
||||
(sort (append pairs title-ups) < cadr)
|
||||
(lambda (a b) (eqv? (cadr a) (cadr b)))))
|
||||
out)
|
||||
(write-string "))\n\n" out))))))
|
||||
|
||||
(define (extract-case-folding in out)
|
||||
(define (write-folds folds out)
|
||||
(write-string "(define char-foldcase-map\n '#(" out)
|
||||
(write-hex-list
|
||||
(append-map (lambda (x) x) (sort folds < car))
|
||||
out)
|
||||
(write-string "))\n" out))
|
||||
(let lp ((folds '()))
|
||||
(let ((line (read-line in)))
|
||||
(cond
|
||||
((eof-object? line)
|
||||
(write-folds folds out))
|
||||
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
||||
(lp folds))
|
||||
(else
|
||||
(let* ((line (substring-cursor line
|
||||
(string-cursor-start line)
|
||||
(string-find line #\#)))
|
||||
(ls (map string-trim (string-split line #\;))))
|
||||
(if (and (>= (length ls) 3)
|
||||
(member (second ls) '("C" "S")))
|
||||
(let ((base (string->number (first ls) 16))
|
||||
(folded (string->number (third ls) 16)))
|
||||
(if (and base folded)
|
||||
(lp (cons (list base folded) folds))
|
||||
(lp folds)))
|
||||
(lp folds))))))))
|
||||
|
||||
;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping
|
||||
|
||||
(define (extract-case-mapping in out min-count max-char-sets name)
|
||||
(define (string-trim-comment str comment-ch)
|
||||
(car (string-split str comment-ch 2)))
|
||||
(define (extract-single-decomposition str)
|
||||
(and (not (equal? "" str))
|
||||
(let ((s (string-trim (last (string-split str #\>)))))
|
||||
(and (not (string-contains s " "))
|
||||
(string->number s 16)))))
|
||||
(let ((offset-map (make-hash-table eq?))
|
||||
(title-ups '())
|
||||
(title-downs '())
|
||||
(folds '()))
|
||||
(title-downs '()))
|
||||
(let lp ()
|
||||
(let ((line (read-line in)))
|
||||
(cond
|
||||
((eof-object? line)
|
||||
(write-offsets offset-map title-ups title-downs folds out
|
||||
(write-offsets offset-map title-ups title-downs out
|
||||
min-count max-char-sets name))
|
||||
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
||||
(lp))
|
||||
|
@ -106,8 +127,7 @@
|
|||
(else
|
||||
(let ((base (string->number (list-ref ls 0) 16))
|
||||
(upper (string->number (list-ref ls 12) 16))
|
||||
(lower (string->number (list-ref ls 13) 16))
|
||||
(folded (extract-single-decomposition (list-ref ls 5))))
|
||||
(lower (string->number (list-ref ls 13) 16)))
|
||||
(cond
|
||||
((or upper lower)
|
||||
(cond
|
||||
|
@ -121,12 +141,7 @@
|
|||
offset-map
|
||||
(- (or lower base) (or upper base))
|
||||
(lambda (is) (iset-adjoin! is (or upper base)))
|
||||
(lambda () (make-iset)))))))
|
||||
(cond
|
||||
((and folded (not (eqv? folded (or lower base))))
|
||||
;; (write `(fold: ,line ,base ,folded) (current-error-port))
|
||||
;; (newline (current-error-port))
|
||||
(set! folds (cons (list base folded) folds)))))))
|
||||
(lambda () (make-iset))))))))))
|
||||
(lp))))))))
|
||||
|
||||
(let ((args (command-line)))
|
||||
|
@ -150,8 +165,8 @@
|
|||
(open-output-file (cadr ls))))
|
||||
(else
|
||||
(error "unknown option: " (car ls)))))
|
||||
((null? ls)
|
||||
(error "usage: extract-case-offsets <UnicodeData.txt>"))
|
||||
((not (= 2 (length ls)))
|
||||
(error "usage: extract-case-offsets <UnicodeData.txt> <CaseFolding.txt>"))
|
||||
(else
|
||||
(if (equal? "-" (car ls))
|
||||
(extract-case-mapping
|
||||
|
@ -159,4 +174,8 @@
|
|||
(call-with-input-file (car ls)
|
||||
(lambda (in)
|
||||
(extract-case-mapping in out min-count max-char-sets name))))
|
||||
(if (equal? "-" (cadr ls))
|
||||
(extract-case-folding (current-input-port) out)
|
||||
(call-with-input-file (cadr ls)
|
||||
(lambda (in) (extract-case-folding in out))))
|
||||
(close-output-port out)))))
|
||||
|
|
117
tools/extract-special-casing.scm
Normal file
117
tools/extract-special-casing.scm
Normal file
|
@ -0,0 +1,117 @@
|
|||
#!/usr/bin/env chibi-scheme
|
||||
|
||||
;; Build a table of special case (non 1:1) case mappings.
|
||||
;;
|
||||
;; Usage:
|
||||
;; extract-special-casing.scm [options] CaseFolding.txt SpecialCasing.txt > out
|
||||
|
||||
(import (chibi) (srfi 1) (srfi 125) (srfi 95) (chibi io) (chibi string))
|
||||
|
||||
(define (warn . args)
|
||||
(let ((err (current-error-port)))
|
||||
(for-each (lambda (x) (display x err)) args)
|
||||
(newline err)))
|
||||
|
||||
(define (useq->string str)
|
||||
(let ((ls (map (lambda (s) (string->number s 16))
|
||||
(map string-trim (string-split str #\space)))))
|
||||
(and ls
|
||||
(every integer? ls)
|
||||
(list->string (map integer->char ls)))))
|
||||
|
||||
(define (extract-full-folds in)
|
||||
(let ((res (make-hash-table equal?)))
|
||||
(let lp ()
|
||||
(let ((line (read-line in)))
|
||||
(cond
|
||||
((eof-object? line)
|
||||
res)
|
||||
(else
|
||||
(let* ((line (substring-cursor line
|
||||
(string-cursor-start line)
|
||||
(string-find line #\#)))
|
||||
(ls (map string-trim (string-split line #\;))))
|
||||
(if (and (>= (length ls) 3)
|
||||
(member (second ls) '("F")))
|
||||
(let ((base (string->number (first ls) 16))
|
||||
(folded (useq->string (third ls))))
|
||||
(if (and base folded)
|
||||
(hash-table-set! res base folded))))
|
||||
(lp))))))))
|
||||
|
||||
(define (extract-special-cases in out folds)
|
||||
;; TODO: handle folds not in cases (currently only #\x1e9e)
|
||||
(define (write-cases cases out)
|
||||
(write-string "(define special-cases\n ;; <code> <lower> <title> <upper> [<fold>]\n '#(" out)
|
||||
(for-each
|
||||
(lambda (x)
|
||||
(if (not (eq? x (car cases)))
|
||||
(write-string " " out))
|
||||
(write-string "#(#x" out)
|
||||
(write-string (number->string (first x) 16) out)
|
||||
(write-char #\space out)
|
||||
(write (second x) out)
|
||||
(write-char #\space out)
|
||||
(write (third x) out)
|
||||
(write-char #\space out)
|
||||
(write (fourth x) out)
|
||||
(cond
|
||||
((hash-table-ref/default folds (first x) #f)
|
||||
=> (lambda (fold)
|
||||
(write-char #\space out)
|
||||
(write fold out))))
|
||||
(write-string ")\n" out))
|
||||
cases)
|
||||
(write-string " ))\n" out))
|
||||
(let lp ((cases '()))
|
||||
(let ((line (read-line in)))
|
||||
(cond
|
||||
((eof-object? line)
|
||||
(write-cases (sort cases < car) out))
|
||||
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
||||
(lp cases))
|
||||
(else
|
||||
(let* ((line (substring-cursor line
|
||||
(string-cursor-start line)
|
||||
(string-find line #\#)))
|
||||
(ls (map string-trim (string-split line #\;))))
|
||||
(if (and (>= (length ls) 4)
|
||||
(or (= 4 (length ls))
|
||||
(string-null? (list-ref ls 4))))
|
||||
(let ((base (string->number (first ls) 16))
|
||||
(lower (useq->string (second ls)))
|
||||
(title (useq->string (third ls)))
|
||||
(upper (useq->string (fourth ls))))
|
||||
(if (and base lower title upper)
|
||||
(lp (cons (list base lower title upper) cases))
|
||||
(lp cases)))
|
||||
(lp cases))))))))
|
||||
|
||||
(let ((args (command-line)))
|
||||
(let lp ((ls (cdr args))
|
||||
(min-count 26)
|
||||
(max-char-sets #f)
|
||||
(name "char-downcase-offsets")
|
||||
(out (current-output-port)))
|
||||
(cond
|
||||
((and (pair? ls) (not (equal? "" (car ls)))
|
||||
(eqv? #\- (string-ref (car ls) 0)))
|
||||
(cond
|
||||
((member (car ls) '("-o" "--output"))
|
||||
(lp (cddr ls) min-count max-char-sets name
|
||||
(open-output-file (cadr ls))))
|
||||
(else
|
||||
(error "unknown option: " (car ls)))))
|
||||
((not (= 2 (length ls)))
|
||||
(error "usage: extract-special-casing <CaseFolding.txt> <SpecialCasing.txt>"))
|
||||
(else
|
||||
(display ";; auto-generated by extract-special-casing.scm\n\n" out)
|
||||
(let ((folds
|
||||
(if (equal? "-" (car ls))
|
||||
(extract-full-folds (current-input-port))
|
||||
(call-with-input-file (car ls) extract-full-folds))))
|
||||
(if (equal? "-" (cadr ls))
|
||||
(extract-special-cases (current-input-port) out folds)
|
||||
(call-with-input-file (cadr ls)
|
||||
(lambda (in) (extract-special-cases in out folds)))))
|
||||
(close-output-port out)))))
|
Loading…
Add table
Reference in a new issue