Adding special case (non-1-to-1) char case mappings.

This commit is contained in:
Alex Shinn 2013-09-29 14:45:51 +09:00
parent 6094e5fb12
commit 3683e8cbdc
4 changed files with 154 additions and 6 deletions

View file

@ -3,10 +3,12 @@
(import (scheme base))
(cond-expand
(full-unicode
(import (chibi char-set full)
(import (scheme write)
(chibi char-set full)
(chibi char-set base)
(chibi iset base))
(include "char/full.scm")
(include "char/special-casing.scm")
(include "char/case-offsets.scm"))
(else
(include "char/ascii.scm")

View file

@ -62,14 +62,40 @@
(define (char-ci<=? a . ls) (char-cmp-ci <= a ls))
(define (char-ci>=? a . ls) (char-cmp-ci >= a ls))
(define (char-get-special-case ch off)
(let ((i (char->integer ch)))
(let loop ((a 0) (b (vector-length special-cases)))
(if (= a b)
#f
(let* ((mid (+ a (quotient (- b a) 2)))
(vec (vector-ref special-cases mid))
(val (vector-ref vec 0)))
(cond ((< i val) (if (= mid b) #f (loop a mid)))
((> i val) (if (= mid a) #f (loop mid b)))
(else (vector-ref vec off))))))))
(define (call-with-output-string proc)
(let ((out (open-output-string)))
(proc out)
(get-output-string out)))
(define (string-downcase str)
(string-map char-downcase str))
(call-with-output-string
(lambda (out)
(string-for-each
(lambda (ch)
(display (or (char-get-special-case ch 1) (char-downcase ch)) out))
str))))
(define string-foldcase string-downcase)
(define (string-upcase str)
(string-map char-upcase str))
(define (string-foldcase str)
(string-map char-foldcase str))
(call-with-output-string
(lambda (out)
(string-for-each
(lambda (ch)
(display (or (char-get-special-case ch 3) (char-upcase ch)) out))
str))))
(define (string-cmp-ci op a ls)
(let lp ((op op) (a (string-foldcase a)) (ls ls))

View file

@ -0,0 +1,109 @@
;; Unconditoinal non-1-to-1 case mappings derived from Unicode data
;; file SpecialCasing.txt.
(define special-cases
;; <code> <lower> <title> <upper>
#(#(223 "ß" "Ss" "SS")
#(304 "i̇" "İ" "İ")
#(329 "ʼn" "ʼN" "ʼN")
#(496 "ǰ" "J̌" "J̌")
#(912 "ΐ" "Ϊ́" "Ϊ́")
#(944 "ΰ" "Ϋ́" "Ϋ́")
#(1415 "և" "Եւ" "ԵՒ")
#(7830 "ẖ" "H̱" "H̱")
#(7831 "ẗ" "T̈" "T̈")
#(7832 "ẘ" "W̊" "W̊")
#(7833 "ẙ" "Y̊" "Y̊")
#(7834 "ẚ" "Aʾ" "Aʾ")
#(8016 "ὐ" "Υ̓" "Υ̓")
#(8018 "ὒ" "Υ̓̀" "Υ̓̀")
#(8020 "ὔ" "Υ̓́" "Υ̓́")
#(8022 "ὖ" "Υ̓͂" "Υ̓͂")
#(8064 "ᾀ" "ᾈ" "ἈΙ")
#(8065 "ᾁ" "ᾉ" "ἉΙ")
#(8066 "ᾂ" "ᾊ" "ἊΙ")
#(8067 "ᾃ" "ᾋ" "ἋΙ")
#(8068 "ᾄ" "ᾌ" "ἌΙ")
#(8069 "ᾅ" "ᾍ" "ἍΙ")
#(8070 "ᾆ" "ᾎ" "ἎΙ")
#(8071 "ᾇ" "ᾏ" "ἏΙ")
#(8072 "ᾀ" "ᾈ" "ἈΙ")
#(8073 "ᾁ" "ᾉ" "ἉΙ")
#(8074 "ᾂ" "ᾊ" "ἊΙ")
#(8075 "ᾃ" "ᾋ" "ἋΙ")
#(8076 "ᾄ" "ᾌ" "ἌΙ")
#(8077 "ᾅ" "ᾍ" "ἍΙ")
#(8078 "ᾆ" "ᾎ" "ἎΙ")
#(8079 "ᾇ" "ᾏ" "ἏΙ")
#(8080 "ᾐ" "ᾘ" "ἨΙ")
#(8081 "ᾑ" "ᾙ" "ἩΙ")
#(8082 "ᾒ" "ᾚ" "ἪΙ")
#(8083 "ᾓ" "ᾛ" "ἫΙ")
#(8084 "ᾔ" "ᾜ" "ἬΙ")
#(8085 "ᾕ" "ᾝ" "ἭΙ")
#(8086 "ᾖ" "ᾞ" "ἮΙ")
#(8087 "ᾗ" "ᾟ" "ἯΙ")
#(8088 "ᾐ" "ᾘ" "ἨΙ")
#(8089 "ᾑ" "ᾙ" "ἩΙ")
#(8090 "ᾒ" "ᾚ" "ἪΙ")
#(8091 "ᾓ" "ᾛ" "ἫΙ")
#(8092 "ᾔ" "ᾜ" "ἬΙ")
#(8093 "ᾕ" "ᾝ" "ἭΙ")
#(8094 "ᾖ" "ᾞ" "ἮΙ")
#(8095 "ᾗ" "ᾟ" "ἯΙ")
#(8096 "ᾠ" "ᾨ" "ὨΙ")
#(8097 "ᾡ" "ᾩ" "ὩΙ")
#(8098 "ᾢ" "ᾪ" "ὪΙ")
#(8099 "ᾣ" "ᾫ" "ὫΙ")
#(8100 "ᾤ" "ᾬ" "ὬΙ")
#(8101 "ᾥ" "ᾭ" "ὭΙ")
#(8102 "ᾦ" "ᾮ" "ὮΙ")
#(8103 "ᾧ" "ᾯ" "ὯΙ")
#(8104 "ᾠ" "ᾨ" "ὨΙ")
#(8105 "ᾡ" "ᾩ" "ὩΙ")
#(8106 "ᾢ" "ᾪ" "ὪΙ")
#(8107 "ᾣ" "ᾫ" "ὫΙ")
#(8108 "ᾤ" "ᾬ" "ὬΙ")
#(8109 "ᾥ" "ᾭ" "ὭΙ")
#(8110 "ᾦ" "ᾮ" "ὮΙ")
#(8111 "ᾧ" "ᾯ" "ὯΙ")
#(8114 "ᾲ" "Ὰͅ" "ᾺΙ")
#(8115 "ᾳ" "ᾼ" "ΑΙ")
#(8116 "ᾴ" "Άͅ" "ΆΙ")
#(8118 "ᾶ" "Α͂" "Α͂")
#(8119 "ᾷ" "ᾼ͂" "Α͂Ι")
#(8124 "ᾳ" "ᾼ" "ΑΙ")
#(8130 "ῂ" "Ὴͅ" "ῊΙ")
#(8131 "ῃ" "ῌ" "ΗΙ")
#(8132 "ῄ" "Ήͅ" "ΉΙ")
#(8134 "ῆ" "Η͂" "Η͂")
#(8135 "ῇ" "ῌ͂" "Η͂Ι")
#(8140 "ῃ" "ῌ" "ΗΙ")
#(8146 "ῒ" "Ϊ̀" "Ϊ̀")
#(8147 "ΐ" "Ϊ́" "Ϊ́")
#(8150 "ῖ" "Ι͂" "Ι͂")
#(8151 "ῗ" "Ϊ͂" "Ϊ͂")
#(8162 "ῢ" "Ϋ̀" "Ϋ̀")
#(8163 "ΰ" "Ϋ́" "Ϋ́")
#(8164 "ῤ" "Ρ̓" "Ρ̓")
#(8166 "ῦ" "Υ͂" "Υ͂")
#(8167 "ῧ" "Ϋ͂" "Ϋ͂")
#(8178 "ῲ" "Ὼͅ" "ῺΙ")
#(8179 "ῳ" "ῼ" "ΩΙ")
#(8180 "ῴ" "Ώͅ" "ΏΙ")
#(8182 "ῶ" "Ω͂" "Ω͂")
#(8183 "ῷ" "ῼ͂" "Ω͂Ι")
#(8188 "ῳ" "ῼ" "ΩΙ")
#(64256 "ff" "Ff" "FF")
#(64257 "fi" "Fi" "FI")
#(64258 "fl" "Fl" "FL")
#(64259 "ffi" "Ffi" "FFI")
#(64260 "ffl" "Ffl" "FFL")
#(64261 "ſt" "St" "ST")
#(64262 "st" "St" "ST")
#(64275 "ﬓ" "Մն" "ՄՆ")
#(64276 "ﬔ" "Մե" "ՄԵ")
#(64277 "ﬕ" "Մի" "ՄԻ")
#(64278 "ﬖ" "Վն" "ՎՆ")
#(64279 "ﬗ" "Մխ" "ՄԽ")))

View file

@ -1103,6 +1103,7 @@
(test #t (string-ci<=? "ΑΒΓ" "αβγ"))
(test #t (string-ci>=? "ΑΒΓ" "αβγ"))
;; latin
(test "ABC" (string-upcase "abc"))
(test "ABC" (string-upcase "ABC"))
(test "abc" (string-downcase "abc"))
@ -1110,6 +1111,7 @@
(test "abc" (string-foldcase "abc"))
(test "abc" (string-foldcase "ABC"))
;; cyrillic
(test "ΑΒΓ" (string-upcase "αβγ"))
(test "ΑΒΓ" (string-upcase "ΑΒΓ"))
(test "αβγ" (string-downcase "αβγ"))
@ -1117,6 +1119,15 @@
(test "αβγ" (string-foldcase "αβγ"))
(test "αβγ" (string-foldcase "ΑΒΓ"))
;; special cases
(test "SSA" (string-upcase "ßa"))
(test "ßa" (string-downcase "ßa"))
(test "ssa" (string-downcase "SSA"))
(test "İ" (string-upcase "İ"))
(test "i̇" (string-downcase "İ"))
(test "i̇" (string-foldcase "İ"))
(test "J̌" (string-upcase "ǰ"))
(test "" (substring "" 0 0))
(test "" (substring "a" 0 0))
(test "" (substring "abc" 1 1))