fix case folding, update to unicode 13

This commit is contained in:
Alex Shinn 2020-06-04 22:08:07 +09:00
parent 6fb0640721
commit c245d6cee8
15 changed files with 364 additions and 208 deletions

View file

@ -197,8 +197,11 @@ lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(
lib/chibi/show/width.scm: build-lib/chibi/char-set/width.scm chibi-scheme$(EXE) lib/chibi/show/width.scm: build-lib/chibi/char-set/width.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --predicate chibi.char-set.width > $@ $(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --predicate chibi.char-set.width > $@
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs lib/scheme/char/case-offsets.scm: data/UnicodeData.txt data/CaseFolding.txt chibi-scheme$(EXE) all-libs
$(CHIBI) tools/extract-case-offsets.scm $< > $@ $(CHIBI) tools/extract-case-offsets.scm data/UnicodeData.txt data/CaseFolding.txt > $@
lib/scheme/char/special-casing.scm: data/CaseFolding.txt data/SpecialCasing.txt chibi-scheme$(EXE) all-libs
$(CHIBI) tools/extract-special-casing.scm data/CaseFolding.txt data/SpecialCasing.txt > $@
######################################################################## ########################################################################
# Tests # Tests

View file

@ -1,42 +1,42 @@
;; char-set:lower-case ;; char-set:lower-case
(define char-set:lower-case (immutable-char-set (%make-iset 97 122 #f #f #f))) (define char-set:lower-case (immutable-char-set (%make-iset 97 127 67108863 #f #f)))
;; char-set:upper-case ;; char-set:upper-case
(define char-set:upper-case (immutable-char-set (%make-iset 65 90 #f #f #f))) (define char-set:upper-case (immutable-char-set (%make-iset 65 127 67108863 #f #f)))
;; char-set:title-case ;; char-set:title-case
(define char-set:title-case (immutable-char-set (%make-iset 0 0 0 #f #f))) (define char-set:title-case (immutable-char-set (%make-iset 0 0 0 #f #f)))
;; char-set:letter ;; char-set:letter
(define char-set:letter (immutable-char-set (%make-iset 97 122 #f (%make-iset 65 90 #f #f #f) #f))) (define char-set:letter (immutable-char-set (%make-iset 65 127 288230371923853311 #f #f)))
;; char-set:punctuation ;; char-set:punctuation
(define char-set:punctuation (immutable-char-set (%make-iset 63 64 #f (%make-iset 44 47 #f (%make-iset 37 42 #f (%make-iset 33 35 #f #f #f) #f) (%make-iset 58 59 #f #f #f)) (%make-iset 123 123 #f (%make-iset 95 95 #f (%make-iset 91 93 #f #f #f) #f) (%make-iset 125 125 #f #f #f))))) (define char-set:punctuation (immutable-char-set (%make-iset 33 127 6189700203056200029306911735 #f #f)))
;; char-set:symbol ;; char-set:symbol
(define char-set:symbol (immutable-char-set (%make-iset 94 94 #f (%make-iset 43 43 #f (%make-iset 36 36 #f #f #f) (%make-iset 60 62 #f #f #f)) (%make-iset 124 124 #f (%make-iset 96 96 #f #f #f) (%make-iset 126 126 #f #f #f))))) (define char-set:symbol (immutable-char-set (%make-iset 36 127 1547425050547877224499904641 #f #f)))
;; char-set:blank ;; char-set:blank
(define char-set:blank (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 9 #f #f #f) #f))) (define char-set:blank (immutable-char-set (%make-iset 9 32 8388609 #f #f)))
;; char-set:whitespace ;; char-set:whitespace
(define char-set:whitespace (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 13 #f #f #f) #f))) (define char-set:whitespace (immutable-char-set (%make-iset 9 127 8388639 #f #f)))
;; char-set:digit ;; char-set:digit
(define char-set:digit (immutable-char-set (%make-iset 48 57 #f #f #f))) (define char-set:digit (immutable-char-set (%make-iset 48 57 #f #f #f)))
;; char-set:letter+digit ;; char-set:letter+digit
(define char-set:letter+digit (immutable-char-set (%make-iset 65 90 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 122 #f #f #f)))) (define char-set:letter+digit (immutable-char-set (%make-iset 48 127 37778931308803301180415 #f #f)))
;; char-set:hex-digit ;; char-set:hex-digit
(define char-set:hex-digit (immutable-char-set (%make-iset 65 70 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 102 #f #f #f)))) (define char-set:hex-digit (immutable-char-set (%make-iset 48 102 35465847073801215 #f #f)))
;; char-set:iso-control ;; char-set:iso-control
(define char-set:iso-control (immutable-char-set (%make-iset 127 127 #f (%make-iset 0 31 #f #f #f) #f))) (define char-set:iso-control (immutable-char-set (%make-iset 0 127 170141183460469231731687303720179073023 #f #f)))
;; char-set:graphic ;; char-set:graphic
(define char-set:graphic (immutable-char-set (%make-iset 33 126 #f #f #f))) (define char-set:graphic (immutable-char-set (%make-iset 33 127 19807040628566084398385987583 #f #f)))
;; char-set:printing ;; char-set:printing
(define char-set:printing (immutable-char-set (%make-iset 32 126 #f (%make-iset 9 13 #f #f #f) #f))) (define char-set:printing (immutable-char-set (%make-iset 9 127 332306998946228968225951765061697567 #f #f)))

File diff suppressed because one or more lines are too long

View file

@ -8,8 +8,7 @@
padded padded/left padded/right padded/both padded padded/left padded/right padded/both
trimmed trimmed/left trimmed/right trimmed/both trimmed/lazy trimmed trimmed/left trimmed/right trimmed/both trimmed/lazy
fitted fitted/left fitted/right fitted/both fitted fitted/left fitted/right fitted/both
joined joined/prefix joined/suffix joined/last joined/dot joined/range joined joined/prefix joined/suffix joined/last joined/dot joined/range)
upcased downcased)
(import (scheme base) (scheme char) (scheme write) (import (scheme base) (scheme char) (scheme write)
(chibi show base)) (chibi show base))
(include "show/show.scm")) (include "show/show.scm"))

View file

@ -82,21 +82,6 @@
(fn (col pad-char) (fn (col pad-char)
(displayed (make-string (max 0 (- where col)) pad-char)))) (displayed (make-string (max 0 (- where col)) pad-char))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; String transformations
(define (with-string-transformer proc . ls)
(fn ((orig-output output))
(let ((output* (lambda (str) (orig-output (proc str)))))
(with ((output output*))
(each-in-list ls)))))
;;> Show each of \var{ls}, uppercasing all generated text.
(define (upcased . ls) (apply with-string-transformer string-upcase ls))
;;> Show each of \var{ls}, lowercasing all generated text.
(define (downcased . ls) (apply with-string-transformer string-downcase ls))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Padding and trimming ;; Padding and trimming

View file

@ -68,3 +68,18 @@
unicode-terminal-width/wide unicode-terminal-width/wide
unicode-terminal-width))) unicode-terminal-width)))
(each-in-list args)))) (each-in-list args))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; String transformations
(define (with-string-transformer proc . ls)
(fn ((orig-output output))
(let ((output* (lambda (str) (orig-output (proc str)))))
(with ((output output*))
(each-in-list ls)))))
;;> Show each of \var{ls}, uppercasing all generated text.
(define (upcased . ls) (apply with-string-transformer string-upcase ls))
;;> Show each of \var{ls}, lowercasing all generated text.
(define (downcased . ls) (apply with-string-transformer string-downcase ls))

View file

@ -1,5 +1,11 @@
(define-library (chibi show unicode) (define-library (chibi show unicode)
(import (scheme base) (chibi show base) (srfi 130) (srfi 151)) (import (scheme base)
(export as-unicode unicode-terminal-width unicode-terminal-width/wide) (scheme char)
(chibi show base)
(srfi 130)
(srfi 151))
(export as-unicode
unicode-terminal-width unicode-terminal-width/wide
upcased downcased)
(include "width.scm" "unicode.scm")) (include "width.scm" "unicode.scm"))

View file

@ -24,4 +24,5 @@
char-downcase char-foldcase char-lower-case? char-numeric? char-downcase char-foldcase char-lower-case? char-numeric?
char-upcase char-upper-case? char-whitespace? digit-value char-upcase char-upper-case? char-whitespace? digit-value
string-ci<=? string-ci<? string-ci=? string-ci>=? string-ci>? string-ci<=? string-ci<? string-ci=? string-ci>=? string-ci>?
char-get-special-case
string-downcase string-foldcase string-upcase)) string-downcase string-foldcase string-upcase))

File diff suppressed because one or more lines are too long

View file

@ -44,7 +44,7 @@
(define (char-foldcase ch) (define (char-foldcase ch)
(or (bsearch-kv char-foldcase-map (char->integer ch) 0 (or (bsearch-kv char-foldcase-map (char->integer ch) 0
(- (vector-length char-foldcase-map) 2)) (- (vector-length char-foldcase-map) 2))
(char-downcase ch))) ch))
(define (char-cmp-ci op a ls) (define (char-cmp-ci op a ls)
(let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls)) (let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls))
@ -83,17 +83,20 @@
(let ((ch (read-char in))) (let ((ch (read-char in)))
(cond (cond
((not (eof-object? ch)) ((not (eof-object? ch))
(write-string (cond
(cond ((and (not fold?) (eqv? ch #\x03A3)) ;; sigma
((and (not fold?) (eqv? ch #\x03A3)) (let ((ch2 (peek-char in)))
(let ((ch2 (peek-char in))) (write-char
(if (or (eof-object? ch2) (if (or (eof-object? ch2)
(not (char-set-contains? char-set:letter ch2))) (not (char-set-contains? char-set:letter ch2)))
#\x03C2 #\x03C2
#\x03C3))) #\x03C3)
((char-get-special-case ch (if fold? 4 1))) out)))
(else (if fold? (char-foldcase ch) (char-downcase ch)))) ((char-get-special-case ch (if fold? 4 1))
out) => (lambda (s) (write-string s out)))
(else
(write-char (if fold? (char-foldcase ch) (char-downcase ch))
out)))
(lp))))))))) (lp)))))))))
(define (string-downcase str) (string-down-or-fold-case str #f)) (define (string-downcase str) (string-down-or-fold-case str #f))

View file

@ -1,109 +1,109 @@
;; auto-generated by extract-special-casing.scm
;; Unconditional non-1-to-1 case mappings derived from Unicode data
;; file SpecialCasing.txt.
(define special-cases (define special-cases
;; <code> <lower> <title> <upper> [<fold>] ;; <code> <lower> <title> <upper> [<fold>]
#(#(223 "ß" "Ss" "SS" "ss") '#(#(#xdf "ß" "Ss" "SS" "ss")
#(304 "i̇" "İ" "İ") #(#x130 "i̇" "İ" "İ" "i̇")
#(329 "ʼn" "ʼN" "ʼN") #(#x149 "ʼn" "ʼN" "ʼN" "ʼn")
#(496 "ǰ" "J̌" "J̌") #(#x1f0 "ǰ" "J̌" "J̌" "ǰ")
#(912 "ΐ" "Ϊ́" "Ϊ́") #(#x390 "ΐ" "Ϊ́" "Ϊ́" "ΐ")
#(944 "ΰ" "Ϋ́" "Ϋ́") #(#x3b0 "ΰ" "Ϋ́" "Ϋ́" "ΰ")
#(1415 "և" "Եւ" "ԵՒ") #(#x587 "և" "Եւ" "ԵՒ" "եւ")
#(7830 "ẖ" "H̱" "H̱") #(#x1e96 "ẖ" "H̱" "H̱" "ẖ")
#(7831 "ẗ" "T̈" "T̈") #(#x1e97 "ẗ" "T̈" "T̈" "ẗ")
#(7832 "ẘ" "W̊" "W̊") #(#x1e98 "ẘ" "W̊" "W̊" "ẘ")
#(7833 "ẙ" "Y̊" "Y̊") #(#x1e99 "ẙ" "Y̊" "Y̊" "ẙ")
#(7834 "ẚ" "Aʾ" "Aʾ") #(#x1e9a "ẚ" "Aʾ" "Aʾ" "aʾ")
#(8016 "ὐ" "Υ̓" "Υ̓") #(#x1e9e "ß" "Ss" "SS" "ss") ;; TODO: this had to be added by hand
#(8018 "ὒ" "Υ̓̀" "Υ̓̀") #(#x1f50 "ὐ" "Υ̓" "Υ̓" "ὐ")
#(8020 "ὔ" "Υ̓́" "Υ̓́") #(#x1f52 "ὒ" "Υ̓̀" "Υ̓̀" "ὒ")
#(8022 "ὖ" "Υ̓͂" "Υ̓͂") #(#x1f54 "ὔ" "Υ̓́" "Υ̓́" "ὔ")
#(8064 "ᾀ" "ᾈ" "ἈΙ") #(#x1f56 "ὖ" "Υ̓͂" "Υ̓͂" "ὖ")
#(8065 "ᾁ" "ᾉ" "ἉΙ") #(#x1f80 "ᾀ" "ᾈ" "ἈΙ" "ἀι")
#(8066 "ᾂ" "ᾊ" "ἊΙ") #(#x1f81 "ᾁ" "ᾉ" "ἉΙ" "ἁι")
#(8067 "ᾃ" "ᾋ" "ἋΙ") #(#x1f82 "ᾂ" "ᾊ" "ἊΙ" "ἂι")
#(8068 "ᾄ" "ᾌ" "ἌΙ") #(#x1f83 "ᾃ" "ᾋ" "ἋΙ" "ἃι")
#(8069 "ᾅ" "ᾍ" "ἍΙ") #(#x1f84 "ᾄ" "ᾌ" "ἌΙ" "ἄι")
#(8070 "ᾆ" "ᾎ" "ἎΙ") #(#x1f85 "ᾅ" "ᾍ" "ἍΙ" "ἅι")
#(8071 "ᾇ" "ᾏ" "ἏΙ") #(#x1f86 "ᾆ" "ᾎ" "ἎΙ" "ἆι")
#(8072 "ᾀ" "ᾈ" "ἈΙ") #(#x1f87 "ᾇ" "ᾏ" "ἏΙ" "ἇι")
#(8073 "ᾁ" "ᾉ" "ἉΙ") #(#x1f88 "ᾀ" "ᾈ" "ἈΙ" "ἀι")
#(8074 "ᾂ" "ᾊ" "ἊΙ") #(#x1f89 "ᾁ" "ᾉ" "ἉΙ" "ἁι")
#(8075 "ᾃ" "ᾋ" "ἋΙ") #(#x1f8a "ᾂ" "ᾊ" "ἊΙ" "ἂι")
#(8076 "ᾄ" "ᾌ" "ἌΙ") #(#x1f8b "ᾃ" "ᾋ" "ἋΙ" "ἃι")
#(8077 "ᾅ" "ᾍ" "ἍΙ") #(#x1f8c "ᾄ" "ᾌ" "ἌΙ" "ἄι")
#(8078 "ᾆ" "ᾎ" "ἎΙ") #(#x1f8d "ᾅ" "ᾍ" "ἍΙ" "ἅι")
#(8079 "ᾇ" "ᾏ" "ἏΙ") #(#x1f8e "ᾆ" "ᾎ" "ἎΙ" "ἆι")
#(8080 "ᾐ" "ᾘ" "ἨΙ") #(#x1f8f "ᾇ" "ᾏ" "ἏΙ" "ἇι")
#(8081 "ᾑ" "ᾙ" "ἩΙ") #(#x1f90 "ᾐ" "ᾘ" "ἨΙ" "ἠι")
#(8082 "ᾒ" "ᾚ" "ἪΙ") #(#x1f91 "ᾑ" "ᾙ" "ἩΙ" "ἡι")
#(8083 "ᾓ" "ᾛ" "ἫΙ") #(#x1f92 "ᾒ" "ᾚ" "ἪΙ" "ἢι")
#(8084 "ᾔ" "ᾜ" "ἬΙ") #(#x1f93 "ᾓ" "ᾛ" "ἫΙ" "ἣι")
#(8085 "ᾕ" "ᾝ" "ἭΙ") #(#x1f94 "ᾔ" "ᾜ" "ἬΙ" "ἤι")
#(8086 "ᾖ" "ᾞ" "ἮΙ") #(#x1f95 "ᾕ" "ᾝ" "ἭΙ" "ἥι")
#(8087 "ᾗ" "ᾟ" "ἯΙ") #(#x1f96 "ᾖ" "ᾞ" "ἮΙ" "ἦι")
#(8088 "ᾐ" "ᾘ" "ἨΙ") #(#x1f97 "ᾗ" "ᾟ" "ἯΙ" "ἧι")
#(8089 "ᾑ" "ᾙ" "ἩΙ") #(#x1f98 "ᾐ" "ᾘ" "ἨΙ" "ἠι")
#(8090 "ᾒ" "ᾚ" "ἪΙ") #(#x1f99 "ᾑ" "ᾙ" "ἩΙ" "ἡι")
#(8091 "ᾓ" "ᾛ" "ἫΙ") #(#x1f9a "ᾒ" "ᾚ" "ἪΙ" "ἢι")
#(8092 "ᾔ" "ᾜ" "ἬΙ") #(#x1f9b "ᾓ" "ᾛ" "ἫΙ" "ἣι")
#(8093 "ᾕ" "ᾝ" "ἭΙ") #(#x1f9c "ᾔ" "ᾜ" "ἬΙ" "ἤι")
#(8094 "ᾖ" "ᾞ" "ἮΙ") #(#x1f9d "ᾕ" "ᾝ" "ἭΙ" "ἥι")
#(8095 "ᾗ" "ᾟ" "ἯΙ") #(#x1f9e "ᾖ" "ᾞ" "ἮΙ" "ἦι")
#(8096 "ᾠ" "ᾨ" "ὨΙ") #(#x1f9f "ᾗ" "ᾟ" "ἯΙ" "ἧι")
#(8097 "ᾡ" "ᾩ" "ὩΙ") #(#x1fa0 "ᾠ" "ᾨ" "ὨΙ" "ὠι")
#(8098 "ᾢ" "ᾪ" "ὪΙ") #(#x1fa1 "ᾡ" "ᾩ" "ὩΙ" "ὡι")
#(8099 "ᾣ" "ᾫ" "ὫΙ") #(#x1fa2 "ᾢ" "ᾪ" "ὪΙ" "ὢι")
#(8100 "ᾤ" "ᾬ" "ὬΙ") #(#x1fa3 "ᾣ" "ᾫ" "ὫΙ" "ὣι")
#(8101 "ᾥ" "ᾭ" "ὭΙ") #(#x1fa4 "ᾤ" "ᾬ" "ὬΙ" "ὤι")
#(8102 "ᾦ" "ᾮ" "ὮΙ") #(#x1fa5 "ᾥ" "ᾭ" "ὭΙ" "ὥι")
#(8103 "ᾧ" "ᾯ" "ὯΙ") #(#x1fa6 "ᾦ" "ᾮ" "ὮΙ" "ὦι")
#(8104 "ᾠ" "ᾨ" "ὨΙ") #(#x1fa7 "ᾧ" "ᾯ" "ὯΙ" "ὧι")
#(8105 "ᾡ" "ᾩ" "ὩΙ") #(#x1fa8 "ᾠ" "ᾨ" "ὨΙ" "ὠι")
#(8106 "ᾢ" "ᾪ" "ὪΙ") #(#x1fa9 "ᾡ" "ᾩ" "ὩΙ" "ὡι")
#(8107 "ᾣ" "ᾫ" "ὫΙ") #(#x1faa "ᾢ" "ᾪ" "ὪΙ" "ὢι")
#(8108 "ᾤ" "ᾬ" "ὬΙ") #(#x1fab "ᾣ" "ᾫ" "ὫΙ" "ὣι")
#(8109 "ᾥ" "ᾭ" "ὭΙ") #(#x1fac "ᾤ" "ᾬ" "ὬΙ" "ὤι")
#(8110 "ᾦ" "ᾮ" "ὮΙ") #(#x1fad "ᾥ" "ᾭ" "ὭΙ" "ὥι")
#(8111 "ᾧ" "ᾯ" "ὯΙ") #(#x1fae "ᾦ" "ᾮ" "ὮΙ" "ὦι")
#(8114 "ᾲ" "Ὰͅ" "ᾺΙ") #(#x1faf "ᾧ" "ᾯ" "ὯΙ" "ὧι")
#(8115 "ᾳ" "ᾼ" "ΑΙ") #(#x1fb2 "ᾲ" "Ὰͅ" "ᾺΙ" "ὰι")
#(8116 "ᾴ" "Άͅ" "ΆΙ") #(#x1fb3 "ᾳ" "ᾼ" "ΑΙ" "αι")
#(8118 "ᾶ" "Α͂" "Α͂") #(#x1fb4 "ᾴ" "Άͅ" "ΆΙ" "άι")
#(8119 "ᾷ" "ᾼ͂" "Α͂Ι") #(#x1fb6 "ᾶ" "Α͂" "Α͂" "ᾶ")
#(8124 "ᾳ" "ᾼ" "ΑΙ") #(#x1fb7 "ᾷ" "ᾼ͂" "Α͂Ι" "ᾶι")
#(8130 "ῂ" "Ὴͅ" "ῊΙ") #(#x1fbc "ᾳ" "ᾼ" "ΑΙ" "αι")
#(8131 "ῃ" "ῌ" "ΗΙ") #(#x1fc2 "ῂ" "Ὴͅ" "ῊΙ" "ὴι")
#(8132 "ῄ" "Ήͅ" "ΉΙ") #(#x1fc3 "ῃ" "ῌ" "ΗΙ" "ηι")
#(8134 "ῆ" "Η͂" "Η͂") #(#x1fc4 "ῄ" "Ήͅ" "ΉΙ" "ήι")
#(8135 "ῇ" "ῌ͂" "Η͂Ι") #(#x1fc6 "ῆ" "Η͂" "Η͂" "ῆ")
#(8140 "ῃ" "ῌ" "ΗΙ") #(#x1fc7 "ῇ" "ῌ͂" "Η͂Ι" "ῆι")
#(8146 "ῒ" "Ϊ̀" "Ϊ̀") #(#x1fcc "ῃ" "ῌ" "ΗΙ" "ηι")
#(8147 "ΐ" "Ϊ́" "Ϊ́") #(#x1fd2 "ῒ" "Ϊ̀" "Ϊ̀" "ῒ")
#(8150 "ῖ" "Ι͂" "Ι͂") #(#x1fd3 "ΐ" "Ϊ́" "Ϊ́" "ΐ")
#(8151 "ῗ" "Ϊ͂" "Ϊ͂") #(#x1fd6 "ῖ" "Ι͂" "Ι͂" "ῖ")
#(8162 "ῢ" "Ϋ̀" "Ϋ̀") #(#x1fd7 "ῗ" "Ϊ͂" "Ϊ͂" "ῗ")
#(8163 "ΰ" "Ϋ́" "Ϋ́") #(#x1fe2 "ῢ" "Ϋ̀" "Ϋ̀" "ῢ")
#(8164 "ῤ" "Ρ̓" "Ρ̓") #(#x1fe3 "ΰ" "Ϋ́" "Ϋ́" "ΰ")
#(8166 "ῦ" "Υ͂" "Υ͂") #(#x1fe4 "ῤ" "Ρ̓" "Ρ̓" "ῤ")
#(8167 "ῧ" "Ϋ͂" "Ϋ͂") #(#x1fe6 "ῦ" "Υ͂" "Υ͂" "ῦ")
#(8178 "ῲ" "Ὼͅ" "ῺΙ") #(#x1fe7 "ῧ" "Ϋ͂" "Ϋ͂" "ῧ")
#(8179 "ῳ" "ῼ" "ΩΙ") #(#x1ff2 "ῲ" "Ὼͅ" "ῺΙ" "ὼι")
#(8180 "ῴ" "Ώͅ" "ΏΙ") #(#x1ff3 "ῳ" "ῼ" "ΩΙ" "ωι")
#(8182 "ῶ" "Ω͂" "Ω͂") #(#x1ff4 "ῴ" "Ώͅ" "ΏΙ" "ώι")
#(8183 "ῷ" "ῼ͂" "Ω͂Ι") #(#x1ff6 "ῶ" "Ω͂" "Ω͂" "ῶ")
#(8188 "ῳ" "ῼ" "ΩΙ") #(#x1ff7 "ῷ" "ῼ͂" "Ω͂Ι" "ῶι")
#(64256 "ff" "Ff" "FF") #(#x1ffc "ῳ" "ῼ" "ΩΙ" "ωι")
#(64257 "fi" "Fi" "FI") #(#xfb00 "ff" "Ff" "FF" "ff")
#(64258 "fl" "Fl" "FL") #(#xfb01 "fi" "Fi" "FI" "fi")
#(64259 "ffi" "Ffi" "FFI") #(#xfb02 "fl" "Fl" "FL" "fl")
#(64260 "ffl" "Ffl" "FFL") #(#xfb03 "ffi" "Ffi" "FFI" "ffi")
#(64261 "ſt" "St" "ST") #(#xfb04 "ffl" "Ffl" "FFL" "ffl")
#(64262 "st" "St" "ST") #(#xfb05 "ſt" "St" "ST" "st")
#(64275 "ﬓ" "Մն" "ՄՆ") #(#xfb06 "st" "St" "ST" "st")
#(64276 "ﬔ" "Մե" "ՄԵ") #(#xfb13 "ﬓ" "Մն" "ՄՆ" "մն")
#(64277 "ﬕ" "Մի" "ՄԻ") #(#xfb14 "ﬔ" "Մե" "ՄԵ" "մե")
#(64278 "ﬖ" "Վն" "ՎՆ") #(#xfb15 "ﬕ" "Մի" "ՄԻ" "մի")
#(64279 "ﬗ" "Մխ" "ՄԽ"))) #(#xfb16 "ﬖ" "Վն" "ՎՆ" "վն")
#(#xfb17 "ﬗ" "Մխ" "ՄԽ" "մխ")
))

View file

@ -42,7 +42,6 @@
joined/range padded padded/right padded/both joined/range padded padded/right padded/both
trimmed trimmed/right trimmed/both trimmed/lazy trimmed trimmed/right trimmed/both trimmed/lazy
fitted fitted/right fitted/both output-default fitted fitted/right fitted/both output-default
upcased downcased
;; computations ;; computations
fn with with! forked call-with-output fn with with! forked call-with-output
;; state variables ;; state variables

View file

@ -1,6 +1,12 @@
(define-library (srfi 166 unicode) (define-library (srfi 166 unicode)
(import (scheme base) (srfi 130) (srfi 151) (srfi 166 base)) (import (scheme base)
(export as-unicode unicode-terminal-width unicode-terminal-width/wide) (scheme char)
(srfi 130)
(srfi 151)
(srfi 166 base))
(export as-unicode
unicode-terminal-width unicode-terminal-width/wide
upcased downcased)
(include "../../chibi/show/width.scm" (include "../../chibi/show/width.scm"
"../../chibi/show/unicode.scm")) "../../chibi/show/unicode.scm"))

View file

@ -3,7 +3,7 @@
;; Extract sets of char case offsets. ;; Extract sets of char case offsets.
;; ;;
;; Usage: ;; Usage:
;; extract-case-offsets.scm options UnicodeData.txt > out ;; extract-case-offsets.scm [options] UnicodeData.txt CaseFolding.txt > out
;; ;;
;; Recognized options are: ;; Recognized options are:
;; ;;
@ -23,7 +23,17 @@
(for-each (lambda (x) (display x err)) args) (for-each (lambda (x) (display x err)) args)
(newline err))) (newline err)))
(define (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name) (define (write-hex-list hex-ls out)
(let lp ((ls hex-ls))
(cond
((pair? ls)
(if (not (eq? ls hex-ls))
(write-char #\space out))
(write-string "#x" out)
(write-string (number->string (car ls) 16) out)
(lp (cdr ls))))))
(define (write-offsets offset-map title-ups title-downs out min-count max-char-sets name)
(let lp ((ls (sort (hash-table->alist offset-map) (let lp ((ls (sort (hash-table->alist offset-map)
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b)))))) (lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
(i 0) (i 0)
@ -50,50 +60,61 @@
(map (lambda (y) (list y (+ y (car x)))) (map (lambda (y) (list y (+ y (car x))))
(iset->list (cdr x)))) (iset->list (cdr x))))
ls))) ls)))
(write `(define char-downcase-map (write-string "(define char-downcase-map\n '#(" out)
',(list->vector (write-hex-list
(append-map (lambda (x) x) (append-map (lambda (x) x) (sort (append pairs title-downs) < car))
(sort (append pairs title-downs) < car)))) out)
out) (write-string "))\n\n" out)
(newline out) (write-string "(define char-upcase-map\n '#(" out)
(newline out) (write-hex-list
(write `(define char-upcase-map (append-map (lambda (x) (list (cadr x) (car x)))
',(list->vector (delete-duplicates
(append-map (lambda (x) (list (cadr x) (car x))) (sort (append pairs title-ups) < cadr)
(delete-duplicates (lambda (a b) (eqv? (cadr a) (cadr b)))))
(sort (append pairs title-ups) < cadr) out)
(lambda (a b) (eqv? (cadr a) (cadr b))))))) (write-string "))\n\n" out))))))
out)
(newline out) (define (extract-case-folding in out)
(newline out) (define (write-folds folds out)
(write `(define char-foldcase-map (write-string "(define char-foldcase-map\n '#(" out)
',(list->vector (write-hex-list
(append-map (lambda (x) x) (append-map (lambda (x) x) (sort folds < car))
(delete-duplicates out)
(sort folds < car) (write-string "))\n" out))
(lambda (a b) (eqv? (cadr a) (cadr b))))))) (let lp ((folds '()))
out) (let ((line (read-line in)))
(newline out)))))) (cond
((eof-object? line)
(write-folds folds out))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp folds))
(else
(let* ((line (substring-cursor line
(string-cursor-start line)
(string-find line #\#)))
(ls (map string-trim (string-split line #\;))))
(if (and (>= (length ls) 3)
(member (second ls) '("C" "S")))
(let ((base (string->number (first ls) 16))
(folded (string->number (third ls) 16)))
(if (and base folded)
(lp (cons (list base folded) folds))
(lp folds)))
(lp folds))))))))
;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping ;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping
(define (extract-case-mapping in out min-count max-char-sets name) (define (extract-case-mapping in out min-count max-char-sets name)
(define (string-trim-comment str comment-ch) (define (string-trim-comment str comment-ch)
(car (string-split str comment-ch 2))) (car (string-split str comment-ch 2)))
(define (extract-single-decomposition str)
(and (not (equal? "" str))
(let ((s (string-trim (last (string-split str #\>)))))
(and (not (string-contains s " "))
(string->number s 16)))))
(let ((offset-map (make-hash-table eq?)) (let ((offset-map (make-hash-table eq?))
(title-ups '()) (title-ups '())
(title-downs '()) (title-downs '()))
(folds '()))
(let lp () (let lp ()
(let ((line (read-line in))) (let ((line (read-line in)))
(cond (cond
((eof-object? line) ((eof-object? line)
(write-offsets offset-map title-ups title-downs folds out (write-offsets offset-map title-ups title-downs out
min-count max-char-sets name)) min-count max-char-sets name))
((or (equal? line "") (eqv? #\# (string-ref line 0))) ((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp)) (lp))
@ -106,8 +127,7 @@
(else (else
(let ((base (string->number (list-ref ls 0) 16)) (let ((base (string->number (list-ref ls 0) 16))
(upper (string->number (list-ref ls 12) 16)) (upper (string->number (list-ref ls 12) 16))
(lower (string->number (list-ref ls 13) 16)) (lower (string->number (list-ref ls 13) 16)))
(folded (extract-single-decomposition (list-ref ls 5))))
(cond (cond
((or upper lower) ((or upper lower)
(cond (cond
@ -121,12 +141,7 @@
offset-map offset-map
(- (or lower base) (or upper base)) (- (or lower base) (or upper base))
(lambda (is) (iset-adjoin! is (or upper base))) (lambda (is) (iset-adjoin! is (or upper base)))
(lambda () (make-iset))))))) (lambda () (make-iset))))))))))
(cond
((and folded (not (eqv? folded (or lower base))))
;; (write `(fold: ,line ,base ,folded) (current-error-port))
;; (newline (current-error-port))
(set! folds (cons (list base folded) folds)))))))
(lp)))))))) (lp))))))))
(let ((args (command-line))) (let ((args (command-line)))
@ -150,8 +165,8 @@
(open-output-file (cadr ls)))) (open-output-file (cadr ls))))
(else (else
(error "unknown option: " (car ls))))) (error "unknown option: " (car ls)))))
((null? ls) ((not (= 2 (length ls)))
(error "usage: extract-case-offsets <UnicodeData.txt>")) (error "usage: extract-case-offsets <UnicodeData.txt> <CaseFolding.txt>"))
(else (else
(if (equal? "-" (car ls)) (if (equal? "-" (car ls))
(extract-case-mapping (extract-case-mapping
@ -159,4 +174,8 @@
(call-with-input-file (car ls) (call-with-input-file (car ls)
(lambda (in) (lambda (in)
(extract-case-mapping in out min-count max-char-sets name)))) (extract-case-mapping in out min-count max-char-sets name))))
(if (equal? "-" (cadr ls))
(extract-case-folding (current-input-port) out)
(call-with-input-file (cadr ls)
(lambda (in) (extract-case-folding in out))))
(close-output-port out))))) (close-output-port out)))))

View file

@ -0,0 +1,117 @@
#!/usr/bin/env chibi-scheme
;; Build a table of special case (non 1:1) case mappings.
;;
;; Usage:
;; extract-special-casing.scm [options] CaseFolding.txt SpecialCasing.txt > out
(import (chibi) (srfi 1) (srfi 125) (srfi 95) (chibi io) (chibi string))
(define (warn . args)
(let ((err (current-error-port)))
(for-each (lambda (x) (display x err)) args)
(newline err)))
(define (useq->string str)
(let ((ls (map (lambda (s) (string->number s 16))
(map string-trim (string-split str #\space)))))
(and ls
(every integer? ls)
(list->string (map integer->char ls)))))
(define (extract-full-folds in)
(let ((res (make-hash-table equal?)))
(let lp ()
(let ((line (read-line in)))
(cond
((eof-object? line)
res)
(else
(let* ((line (substring-cursor line
(string-cursor-start line)
(string-find line #\#)))
(ls (map string-trim (string-split line #\;))))
(if (and (>= (length ls) 3)
(member (second ls) '("F")))
(let ((base (string->number (first ls) 16))
(folded (useq->string (third ls))))
(if (and base folded)
(hash-table-set! res base folded))))
(lp))))))))
(define (extract-special-cases in out folds)
;; TODO: handle folds not in cases (currently only #\x1e9e)
(define (write-cases cases out)
(write-string "(define special-cases\n ;; <code> <lower> <title> <upper> [<fold>]\n '#(" out)
(for-each
(lambda (x)
(if (not (eq? x (car cases)))
(write-string " " out))
(write-string "#(#x" out)
(write-string (number->string (first x) 16) out)
(write-char #\space out)
(write (second x) out)
(write-char #\space out)
(write (third x) out)
(write-char #\space out)
(write (fourth x) out)
(cond
((hash-table-ref/default folds (first x) #f)
=> (lambda (fold)
(write-char #\space out)
(write fold out))))
(write-string ")\n" out))
cases)
(write-string " ))\n" out))
(let lp ((cases '()))
(let ((line (read-line in)))
(cond
((eof-object? line)
(write-cases (sort cases < car) out))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp cases))
(else
(let* ((line (substring-cursor line
(string-cursor-start line)
(string-find line #\#)))
(ls (map string-trim (string-split line #\;))))
(if (and (>= (length ls) 4)
(or (= 4 (length ls))
(string-null? (list-ref ls 4))))
(let ((base (string->number (first ls) 16))
(lower (useq->string (second ls)))
(title (useq->string (third ls)))
(upper (useq->string (fourth ls))))
(if (and base lower title upper)
(lp (cons (list base lower title upper) cases))
(lp cases)))
(lp cases))))))))
(let ((args (command-line)))
(let lp ((ls (cdr args))
(min-count 26)
(max-char-sets #f)
(name "char-downcase-offsets")
(out (current-output-port)))
(cond
((and (pair? ls) (not (equal? "" (car ls)))
(eqv? #\- (string-ref (car ls) 0)))
(cond
((member (car ls) '("-o" "--output"))
(lp (cddr ls) min-count max-char-sets name
(open-output-file (cadr ls))))
(else
(error "unknown option: " (car ls)))))
((not (= 2 (length ls)))
(error "usage: extract-special-casing <CaseFolding.txt> <SpecialCasing.txt>"))
(else
(display ";; auto-generated by extract-special-casing.scm\n\n" out)
(let ((folds
(if (equal? "-" (car ls))
(extract-full-folds (current-input-port))
(call-with-input-file (car ls) extract-full-folds))))
(if (equal? "-" (cadr ls))
(extract-special-cases (current-input-port) out folds)
(call-with-input-file (cadr ls)
(lambda (in) (extract-special-cases in out folds)))))
(close-output-port out)))))