diff --git a/lib/scheme/char.sld b/lib/scheme/char.sld index 604715a7..2da5c423 100644 --- a/lib/scheme/char.sld +++ b/lib/scheme/char.sld @@ -3,10 +3,12 @@ (import (scheme base)) (cond-expand (full-unicode - (import (chibi char-set full) + (import (scheme write) + (chibi char-set full) (chibi char-set base) (chibi iset base)) (include "char/full.scm") + (include "char/special-casing.scm") (include "char/case-offsets.scm")) (else (include "char/ascii.scm") diff --git a/lib/scheme/char/full.scm b/lib/scheme/char/full.scm index 9a663b37..27b075ca 100644 --- a/lib/scheme/char/full.scm +++ b/lib/scheme/char/full.scm @@ -62,14 +62,40 @@ (define (char-ci<=? a . ls) (char-cmp-ci <= a ls)) (define (char-ci>=? a . ls) (char-cmp-ci >= a ls)) +(define (char-get-special-case ch off) + (let ((i (char->integer ch))) + (let loop ((a 0) (b (vector-length special-cases))) + (if (= a b) + #f + (let* ((mid (+ a (quotient (- b a) 2))) + (vec (vector-ref special-cases mid)) + (val (vector-ref vec 0))) + (cond ((< i val) (if (= mid b) #f (loop a mid))) + ((> i val) (if (= mid a) #f (loop mid b))) + (else (vector-ref vec off)))))))) + +(define (call-with-output-string proc) + (let ((out (open-output-string))) + (proc out) + (get-output-string out))) + (define (string-downcase str) - (string-map char-downcase str)) + (call-with-output-string + (lambda (out) + (string-for-each + (lambda (ch) + (display (or (char-get-special-case ch 1) (char-downcase ch)) out)) + str)))) + +(define string-foldcase string-downcase) (define (string-upcase str) - (string-map char-upcase str)) - -(define (string-foldcase str) - (string-map char-foldcase str)) + (call-with-output-string + (lambda (out) + (string-for-each + (lambda (ch) + (display (or (char-get-special-case ch 3) (char-upcase ch)) out)) + str)))) (define (string-cmp-ci op a ls) (let lp ((op op) (a (string-foldcase a)) (ls ls)) diff --git a/lib/scheme/char/special-casing.scm b/lib/scheme/char/special-casing.scm new file mode 100644 index 00000000..83b32623 --- /dev/null +++ b/lib/scheme/char/special-casing.scm @@ -0,0 +1,109 @@ + +;; Unconditoinal non-1-to-1 case mappings derived from Unicode data +;; file SpecialCasing.txt. + +(define special-cases + ;; <upper> + #(#(223 "ß" "Ss" "SS") + #(304 "i̇" "İ" "İ") + #(329 "ʼn" "ʼN" "ʼN") + #(496 "ǰ" "J̌" "J̌") + #(912 "ΐ" "Ϊ́" "Ϊ́") + #(944 "ΰ" "Ϋ́" "Ϋ́") + #(1415 "և" "Եւ" "ԵՒ") + #(7830 "ẖ" "H̱" "H̱") + #(7831 "ẗ" "T̈" "T̈") + #(7832 "ẘ" "W̊" "W̊") + #(7833 "ẙ" "Y̊" "Y̊") + #(7834 "ẚ" "Aʾ" "Aʾ") + #(8016 "ὐ" "Υ̓" "Υ̓") + #(8018 "ὒ" "Υ̓̀" "Υ̓̀") + #(8020 "ὔ" "Υ̓́" "Υ̓́") + #(8022 "ὖ" "Υ̓͂" "Υ̓͂") + #(8064 "ᾀ" "ᾈ" "ἈΙ") + #(8065 "ᾁ" "ᾉ" "ἉΙ") + #(8066 "ᾂ" "ᾊ" "ἊΙ") + #(8067 "ᾃ" "ᾋ" "ἋΙ") + #(8068 "ᾄ" "ᾌ" "ἌΙ") + #(8069 "ᾅ" "ᾍ" "ἍΙ") + #(8070 "ᾆ" "ᾎ" "ἎΙ") + #(8071 "ᾇ" "ᾏ" "ἏΙ") + #(8072 "ᾀ" "ᾈ" "ἈΙ") + #(8073 "ᾁ" "ᾉ" "ἉΙ") + #(8074 "ᾂ" "ᾊ" "ἊΙ") + #(8075 "ᾃ" "ᾋ" "ἋΙ") + #(8076 "ᾄ" "ᾌ" "ἌΙ") + #(8077 "ᾅ" "ᾍ" "ἍΙ") + #(8078 "ᾆ" "ᾎ" "ἎΙ") + #(8079 "ᾇ" "ᾏ" "ἏΙ") + #(8080 "ᾐ" "ᾘ" "ἨΙ") + #(8081 "ᾑ" "ᾙ" "ἩΙ") + #(8082 "ᾒ" "ᾚ" "ἪΙ") + #(8083 "ᾓ" "ᾛ" "ἫΙ") + #(8084 "ᾔ" "ᾜ" "ἬΙ") + #(8085 "ᾕ" "ᾝ" "ἭΙ") + #(8086 "ᾖ" "ᾞ" "ἮΙ") + #(8087 "ᾗ" "ᾟ" "ἯΙ") + #(8088 "ᾐ" "ᾘ" "ἨΙ") + #(8089 "ᾑ" "ᾙ" "ἩΙ") + #(8090 "ᾒ" "ᾚ" "ἪΙ") + #(8091 "ᾓ" "ᾛ" "ἫΙ") + #(8092 "ᾔ" "ᾜ" "ἬΙ") + #(8093 "ᾕ" "ᾝ" "ἭΙ") + #(8094 "ᾖ" "ᾞ" "ἮΙ") + #(8095 "ᾗ" "ᾟ" "ἯΙ") + #(8096 "ᾠ" "ᾨ" "ὨΙ") + #(8097 "ᾡ" "ᾩ" "ὩΙ") + #(8098 "ᾢ" "ᾪ" "ὪΙ") + #(8099 "ᾣ" "ᾫ" "ὫΙ") + #(8100 "ᾤ" "ᾬ" "ὬΙ") + #(8101 "ᾥ" "ᾭ" "ὭΙ") + #(8102 "ᾦ" "ᾮ" "ὮΙ") + #(8103 "ᾧ" "ᾯ" "ὯΙ") + #(8104 "ᾠ" "ᾨ" "ὨΙ") + #(8105 "ᾡ" "ᾩ" "ὩΙ") + #(8106 "ᾢ" "ᾪ" "ὪΙ") + #(8107 "ᾣ" "ᾫ" "ὫΙ") + #(8108 "ᾤ" "ᾬ" "ὬΙ") + #(8109 "ᾥ" "ᾭ" "ὭΙ") + #(8110 "ᾦ" "ᾮ" "ὮΙ") + #(8111 "ᾧ" "ᾯ" "ὯΙ") + #(8114 "ᾲ" "Ὰͅ" "ᾺΙ") + #(8115 "ᾳ" "ᾼ" "ΑΙ") + #(8116 "ᾴ" "Άͅ" "ΆΙ") + #(8118 "ᾶ" "Α͂" "Α͂") + #(8119 "ᾷ" "ᾼ͂" "Α͂Ι") + #(8124 "ᾳ" "ᾼ" "ΑΙ") + #(8130 "ῂ" "Ὴͅ" "ῊΙ") + #(8131 "ῃ" "ῌ" "ΗΙ") + #(8132 "ῄ" "Ήͅ" "ΉΙ") + #(8134 "ῆ" "Η͂" "Η͂") + #(8135 "ῇ" "ῌ͂" "Η͂Ι") + #(8140 "ῃ" "ῌ" "ΗΙ") + #(8146 "ῒ" "Ϊ̀" "Ϊ̀") + #(8147 "ΐ" "Ϊ́" "Ϊ́") + #(8150 "ῖ" "Ι͂" "Ι͂") + #(8151 "ῗ" "Ϊ͂" "Ϊ͂") + #(8162 "ῢ" "Ϋ̀" "Ϋ̀") + #(8163 "ΰ" "Ϋ́" "Ϋ́") + #(8164 "ῤ" "Ρ̓" "Ρ̓") + #(8166 "ῦ" "Υ͂" "Υ͂") + #(8167 "ῧ" "Ϋ͂" "Ϋ͂") + #(8178 "ῲ" "Ὼͅ" "ῺΙ") + #(8179 "ῳ" "ῼ" "ΩΙ") + #(8180 "ῴ" "Ώͅ" "ΏΙ") + #(8182 "ῶ" "Ω͂" "Ω͂") + #(8183 "ῷ" "ῼ͂" "Ω͂Ι") + #(8188 "ῳ" "ῼ" "ΩΙ") + #(64256 "ff" "Ff" "FF") + #(64257 "fi" "Fi" "FI") + #(64258 "fl" "Fl" "FL") + #(64259 "ffi" "Ffi" "FFI") + #(64260 "ffl" "Ffl" "FFL") + #(64261 "ſt" "St" "ST") + #(64262 "st" "St" "ST") + #(64275 "ﬓ" "Մն" "ՄՆ") + #(64276 "ﬔ" "Մե" "ՄԵ") + #(64277 "ﬕ" "Մի" "ՄԻ") + #(64278 "ﬖ" "Վն" "ՎՆ") + #(64279 "ﬗ" "Մխ" "ՄԽ"))) diff --git a/tests/r7rs-tests.scm b/tests/r7rs-tests.scm index 3782dbcb..5050b2ca 100644 --- a/tests/r7rs-tests.scm +++ b/tests/r7rs-tests.scm @@ -1103,6 +1103,7 @@ (test #t (string-ci<=? "ΑΒΓ" "αβγ")) (test #t (string-ci>=? "ΑΒΓ" "αβγ")) +;; latin (test "ABC" (string-upcase "abc")) (test "ABC" (string-upcase "ABC")) (test "abc" (string-downcase "abc")) @@ -1110,6 +1111,7 @@ (test "abc" (string-foldcase "abc")) (test "abc" (string-foldcase "ABC")) +;; cyrillic (test "ΑΒΓ" (string-upcase "αβγ")) (test "ΑΒΓ" (string-upcase "ΑΒΓ")) (test "αβγ" (string-downcase "αβγ")) @@ -1117,6 +1119,15 @@ (test "αβγ" (string-foldcase "αβγ")) (test "αβγ" (string-foldcase "ΑΒΓ")) +;; special cases +(test "SSA" (string-upcase "ßa")) +(test "ßa" (string-downcase "ßa")) +(test "ssa" (string-downcase "SSA")) +(test "İ" (string-upcase "İ")) +(test "i̇" (string-downcase "İ")) +(test "i̇" (string-foldcase "İ")) +(test "J̌" (string-upcase "ǰ")) + (test "" (substring "" 0 0)) (test "" (substring "a" 0 0)) (test "" (substring "abc" 1 1))