mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-19 05:39:18 +02:00
distinguishing lowercase and foldcase (fixes issue #420)
This commit is contained in:
parent
ed0be227cc
commit
4a7a809c8d
8 changed files with 113 additions and 82 deletions
2
Makefile
2
Makefile
|
@ -198,7 +198,7 @@ lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$
|
||||||
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
||||||
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
|
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
|
||||||
|
|
||||||
lib/scheme/char/case-offsets.scm: data/CaseFolding.txt chibi-scheme$(EXE) all-libs
|
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
|
||||||
$(CHIBI) tools/extract-case-offsets.scm $< > $@
|
$(CHIBI) tools/extract-case-offsets.scm $< > $@
|
||||||
|
|
||||||
########################################################################
|
########################################################################
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -5,50 +5,47 @@
|
||||||
(define (char-numeric? ch) (char-set-contains? char-set:digit ch))
|
(define (char-numeric? ch) (char-set-contains? char-set:digit ch))
|
||||||
(define (char-whitespace? ch) (char-set-contains? char-set:whitespace ch))
|
(define (char-whitespace? ch) (char-set-contains? char-set:whitespace ch))
|
||||||
|
|
||||||
|
(define (bsearch-kv vec n lo hi)
|
||||||
|
(and (<= lo hi)
|
||||||
|
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
|
||||||
|
(m (vector-ref vec mid)))
|
||||||
|
(cond
|
||||||
|
((= n m)
|
||||||
|
(integer->char (vector-ref vec (+ mid 1))))
|
||||||
|
((< n m)
|
||||||
|
(bsearch-kv vec n lo (- mid 2)))
|
||||||
|
(else
|
||||||
|
(bsearch-kv vec n (+ mid 2) hi))))))
|
||||||
|
|
||||||
(define (char-downcase ch)
|
(define (char-downcase ch)
|
||||||
(let ((n (char->integer ch)))
|
(let ((n (char->integer ch)))
|
||||||
(let lp ((ls char-downcase-offsets))
|
(let lp ((ls char-downcase-offsets))
|
||||||
(cond
|
(cond
|
||||||
((null? ls)
|
((null? ls)
|
||||||
(let lp ((lo 0) (hi (- (vector-length char-downcase-map) 2)))
|
(or (bsearch-kv char-downcase-map n 0
|
||||||
(if (> lo hi)
|
(- (vector-length char-downcase-map) 2))
|
||||||
ch
|
ch))
|
||||||
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
|
|
||||||
(m (vector-ref char-downcase-map mid)))
|
|
||||||
(cond
|
|
||||||
((= n m)
|
|
||||||
(integer->char (vector-ref char-downcase-map (+ mid 1))))
|
|
||||||
((< n m)
|
|
||||||
(lp lo (- mid 2)))
|
|
||||||
(else
|
|
||||||
(lp (+ mid 2) hi)))))))
|
|
||||||
((iset-contains? (caar ls) n)
|
((iset-contains? (caar ls) n)
|
||||||
(integer->char (+ n (cdar ls))))
|
(integer->char (+ n (cdar ls))))
|
||||||
(else (lp (cdr ls)))))))
|
(else (lp (cdr ls)))))))
|
||||||
|
|
||||||
(define char-foldcase char-downcase)
|
|
||||||
|
|
||||||
(define (char-upcase ch)
|
(define (char-upcase ch)
|
||||||
(let ((n (char->integer ch)))
|
(let ((n (char->integer ch)))
|
||||||
(let lp ((ls char-downcase-offsets))
|
(let lp ((ls char-downcase-offsets))
|
||||||
(cond
|
(cond
|
||||||
((null? ls)
|
((null? ls)
|
||||||
(let lp ((lo 0) (hi (- (vector-length char-upcase-map) 2)))
|
(or (bsearch-kv char-upcase-map n 0
|
||||||
(if (> lo hi)
|
(- (vector-length char-upcase-map) 2))
|
||||||
ch
|
ch))
|
||||||
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
|
|
||||||
(m (vector-ref char-upcase-map mid)))
|
|
||||||
(cond
|
|
||||||
((= n m)
|
|
||||||
(integer->char (vector-ref char-upcase-map (+ mid 1))))
|
|
||||||
((< n m)
|
|
||||||
(lp lo (- mid 2)))
|
|
||||||
(else
|
|
||||||
(lp (+ mid 2) hi)))))))
|
|
||||||
((iset-contains? (caar ls) (- n (cdar ls)))
|
((iset-contains? (caar ls) (- n (cdar ls)))
|
||||||
(integer->char (- n (cdar ls))))
|
(integer->char (- n (cdar ls))))
|
||||||
(else (lp (cdr ls)))))))
|
(else (lp (cdr ls)))))))
|
||||||
|
|
||||||
|
(define (char-foldcase ch)
|
||||||
|
(or (bsearch-kv char-foldcase-map (char->integer ch) 0
|
||||||
|
(- (vector-length char-foldcase-map) 2))
|
||||||
|
(char-downcase ch)))
|
||||||
|
|
||||||
(define (char-cmp-ci op a ls)
|
(define (char-cmp-ci op a ls)
|
||||||
(let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls))
|
(let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls))
|
||||||
(if (null? ls)
|
(if (null? ls)
|
||||||
|
@ -70,7 +67,8 @@
|
||||||
(val (vector-ref vec 0)))
|
(val (vector-ref vec 0)))
|
||||||
(cond ((< i val) (and (< mid b) (lp a mid)))
|
(cond ((< i val) (and (< mid b) (lp a mid)))
|
||||||
((> i val) (and (> mid a) (lp mid b)))
|
((> i val) (and (> mid a) (lp mid b)))
|
||||||
(else (vector-ref vec off)))))))
|
(else
|
||||||
|
(vector-ref vec (if (>= off (vector-length vec)) 1 off))))))))
|
||||||
|
|
||||||
(define (call-with-output-string proc)
|
(define (call-with-output-string proc)
|
||||||
(let ((out (open-output-string)))
|
(let ((out (open-output-string)))
|
||||||
|
@ -86,13 +84,15 @@
|
||||||
(cond
|
(cond
|
||||||
((not (eof-object? ch))
|
((not (eof-object? ch))
|
||||||
(write-string
|
(write-string
|
||||||
(if (and (not fold?) (eqv? ch #\x03A3))
|
(cond
|
||||||
|
((and (not fold?) (eqv? ch #\x03A3))
|
||||||
(let ((ch2 (peek-char in)))
|
(let ((ch2 (peek-char in)))
|
||||||
(if (or (eof-object? ch2)
|
(if (or (eof-object? ch2)
|
||||||
(not (char-set-contains? char-set:letter ch2)))
|
(not (char-set-contains? char-set:letter ch2)))
|
||||||
#\x03C2
|
#\x03C2
|
||||||
#\x03C3))
|
#\x03C3)))
|
||||||
(or (char-get-special-case ch 1) (char-downcase ch)))
|
((char-get-special-case ch (if fold? 4 1)))
|
||||||
|
(else (if fold? (char-foldcase ch) (char-downcase ch))))
|
||||||
out)
|
out)
|
||||||
(lp)))))))))
|
(lp)))))))))
|
||||||
|
|
||||||
|
|
|
@ -3,8 +3,8 @@
|
||||||
;; file SpecialCasing.txt.
|
;; file SpecialCasing.txt.
|
||||||
|
|
||||||
(define special-cases
|
(define special-cases
|
||||||
;; <code> <lower> <title> <upper>
|
;; <code> <lower> <title> <upper> [<fold>]
|
||||||
#(#(223 "ß" "Ss" "SS")
|
#(#(223 "ß" "Ss" "SS" "ss")
|
||||||
#(304 "i̇" "İ" "İ")
|
#(304 "i̇" "İ" "İ")
|
||||||
#(329 "ʼn" "ʼN" "ʼN")
|
#(329 "ʼn" "ʼN" "ʼN")
|
||||||
#(496 "ǰ" "J̌" "J̌")
|
#(496 "ǰ" "J̌" "J̌")
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
|
|
||||||
(test-group "titlecase/char"
|
(test-group "titlecase/char"
|
||||||
(test #\x01C5 (char-titlecase #\x01C4))
|
(test #\x01C5 (char-titlecase #\x01C4))
|
||||||
|
(test #\x01C5 (char-titlecase #\x01C5))
|
||||||
(test #\x01C5 (char-titlecase #\x01C6))
|
(test #\x01C5 (char-titlecase #\x01C6))
|
||||||
(test #\x1F88 (char-titlecase #\x1F80))
|
(test #\x1F88 (char-titlecase #\x1F80))
|
||||||
(test #\x01C7 (char-upcase #\x01C7))
|
(test #\x01C7 (char-upcase #\x01C7))
|
||||||
|
|
|
@ -5,7 +5,8 @@
|
||||||
|
|
||||||
;;> Returns the single-character titlecase mapping of argument
|
;;> Returns the single-character titlecase mapping of argument
|
||||||
(define (char-titlecase ch)
|
(define (char-titlecase ch)
|
||||||
(cond ((assv ch title-single-map) => cadr)
|
(cond ((char-set-contains? char-set:title-case ch) ch)
|
||||||
|
((assv ch title-single-map) => cadr)
|
||||||
(else (char-upcase ch))))
|
(else (char-upcase ch))))
|
||||||
|
|
||||||
;; Returns #t if a character is caseless, otherwise #f
|
;; Returns #t if a character is caseless, otherwise #f
|
||||||
|
@ -29,7 +30,7 @@
|
||||||
;; ch has multiple- or single-character titlecase mapping
|
;; ch has multiple- or single-character titlecase mapping
|
||||||
(lp n2 #f (append-reverse (cdr multi-title) result))
|
(lp n2 #f (append-reverse (cdr multi-title) result))
|
||||||
;; ch has single-character uppercase mapping
|
;; ch has single-character uppercase mapping
|
||||||
(lp n2 (char-caseless? ch) (cons (char-upcase ch) result))))
|
(lp n2 (char-caseless? ch) (cons (char-titlecase ch) result))))
|
||||||
;; ch must be lowercased
|
;; ch must be lowercased
|
||||||
(let ((multi-downcase (assv ch lower-multiple-map)))
|
(let ((multi-downcase (assv ch lower-multiple-map)))
|
||||||
(if multi-downcase
|
(if multi-downcase
|
||||||
|
|
|
@ -1320,10 +1320,14 @@
|
||||||
(test "SSA" (string-upcase "ßa"))
|
(test "SSA" (string-upcase "ßa"))
|
||||||
(test "ßa" (string-downcase "ßa"))
|
(test "ßa" (string-downcase "ßa"))
|
||||||
(test "ssa" (string-downcase "SSA"))
|
(test "ssa" (string-downcase "SSA"))
|
||||||
|
(test "maß" (string-downcase "Maß"))
|
||||||
|
(test "mass" (string-foldcase "Maß"))
|
||||||
(test "İ" (string-upcase "İ"))
|
(test "İ" (string-upcase "İ"))
|
||||||
(test "i\x0307;" (string-downcase "İ"))
|
(test "i\x0307;" (string-downcase "İ"))
|
||||||
(test "i\x0307;" (string-foldcase "İ"))
|
(test "i\x0307;" (string-foldcase "İ"))
|
||||||
(test "J̌" (string-upcase "ǰ"))
|
(test "J̌" (string-upcase "ǰ"))
|
||||||
|
(test "ſ" (string-downcase "ſ"))
|
||||||
|
(test "s" (string-foldcase "ſ"))
|
||||||
|
|
||||||
;; context-sensitive (final sigma)
|
;; context-sensitive (final sigma)
|
||||||
(test "ΓΛΏΣΣΑ" (string-upcase "γλώσσα"))
|
(test "ΓΛΏΣΣΑ" (string-upcase "γλώσσα"))
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
;; Extract sets of char case offsets.
|
;; Extract sets of char case offsets.
|
||||||
;;
|
;;
|
||||||
;; Usage:
|
;; Usage:
|
||||||
;; extract-case-offsets.scm options CaseFolding.txt > out
|
;; extract-case-offsets.scm options UnicodeData.txt > out
|
||||||
;;
|
;;
|
||||||
;; Recognized options are:
|
;; Recognized options are:
|
||||||
;;
|
;;
|
||||||
|
@ -23,7 +23,7 @@
|
||||||
(for-each (lambda (x) (display x err)) args)
|
(for-each (lambda (x) (display x err)) args)
|
||||||
(newline err)))
|
(newline err)))
|
||||||
|
|
||||||
(define (write-offsets offset-map extras out min-count max-char-sets name)
|
(define (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name)
|
||||||
(let lp ((ls (sort (hash-table->alist offset-map)
|
(let lp ((ls (sort (hash-table->alist offset-map)
|
||||||
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
|
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
|
||||||
(i 0)
|
(i 0)
|
||||||
|
@ -38,23 +38,22 @@
|
||||||
,(caar ls))
|
,(caar ls))
|
||||||
res)))
|
res)))
|
||||||
(else
|
(else
|
||||||
|
(display ";; auto-generated by extract-case-offsets.scm\n\n")
|
||||||
(write `(define ,(string->symbol name)
|
(write `(define ,(string->symbol name)
|
||||||
(list ,@(reverse res)))
|
(list ,@(reverse res)))
|
||||||
out)
|
out)
|
||||||
(newline out)
|
(newline out)
|
||||||
(newline out)
|
(newline out)
|
||||||
(let ((pairs
|
(let ((pairs
|
||||||
(sort
|
|
||||||
(append
|
|
||||||
extras
|
|
||||||
(append-map
|
(append-map
|
||||||
(lambda (x)
|
(lambda (x)
|
||||||
(map (lambda (y) (list y (+ y (car x))))
|
(map (lambda (y) (list y (+ y (car x))))
|
||||||
(iset->list (cdr x))))
|
(iset->list (cdr x))))
|
||||||
ls))
|
ls)))
|
||||||
(lambda (a b) (< (car a) (car b))))))
|
|
||||||
(write `(define char-downcase-map
|
(write `(define char-downcase-map
|
||||||
',(list->vector (append-map (lambda (x) x) pairs)))
|
',(list->vector
|
||||||
|
(append-map (lambda (x) x)
|
||||||
|
(sort (append pairs title-downs) < car))))
|
||||||
out)
|
out)
|
||||||
(newline out)
|
(newline out)
|
||||||
(newline out)
|
(newline out)
|
||||||
|
@ -62,50 +61,72 @@
|
||||||
',(list->vector
|
',(list->vector
|
||||||
(append-map (lambda (x) (list (cadr x) (car x)))
|
(append-map (lambda (x) (list (cadr x) (car x)))
|
||||||
(delete-duplicates
|
(delete-duplicates
|
||||||
(sort pairs
|
(sort (append pairs title-ups) < cadr)
|
||||||
(lambda (a b) (< (cadr a) (cadr b))))
|
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
|
||||||
|
out)
|
||||||
|
(newline out)
|
||||||
|
(newline out)
|
||||||
|
(write `(define char-foldcase-map
|
||||||
|
',(list->vector
|
||||||
|
(append-map (lambda (x) x)
|
||||||
|
(delete-duplicates
|
||||||
|
(sort folds < car)
|
||||||
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
|
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
|
||||||
out)
|
out)
|
||||||
(newline out))))))
|
(newline out))))))
|
||||||
|
|
||||||
(define (extract-case-folding in out min-count max-char-sets name)
|
;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping
|
||||||
|
|
||||||
|
(define (extract-case-mapping in out min-count max-char-sets name)
|
||||||
(define (string-trim-comment str comment-ch)
|
(define (string-trim-comment str comment-ch)
|
||||||
(car (string-split str comment-ch 2)))
|
(car (string-split str comment-ch 2)))
|
||||||
|
(define (extract-single-decomposition str)
|
||||||
|
(and (not (equal? "" str))
|
||||||
|
(let ((s (string-trim (last (string-split str #\>)))))
|
||||||
|
(and (not (string-contains s " "))
|
||||||
|
(string->number s 16)))))
|
||||||
(let ((offset-map (make-hash-table eq?))
|
(let ((offset-map (make-hash-table eq?))
|
||||||
(extras '()))
|
(title-ups '())
|
||||||
|
(title-downs '())
|
||||||
|
(folds '()))
|
||||||
(let lp ()
|
(let lp ()
|
||||||
(let ((line (read-line in)))
|
(let ((line (read-line in)))
|
||||||
(cond
|
(cond
|
||||||
((eof-object? line)
|
((eof-object? line)
|
||||||
(write-offsets offset-map extras out min-count max-char-sets name))
|
(write-offsets offset-map title-ups title-downs folds out
|
||||||
|
min-count max-char-sets name))
|
||||||
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
||||||
(lp))
|
(lp))
|
||||||
(else
|
(else
|
||||||
(let ((ls (map string-trim
|
(let ((ls (map string-trim
|
||||||
(string-split (string-trim-comment line #\#) #\;))))
|
(string-split (string-trim-comment line #\#) #\;))))
|
||||||
(cond
|
(cond
|
||||||
((< (length ls) 3)
|
((< (length ls) 15)
|
||||||
(warn "invalid CaseFolding.txt line: " line))
|
(warn "invalid UnicodeData.txt line: " line))
|
||||||
(else
|
(else
|
||||||
(let ((upper (string->number (car ls) 16))
|
(let ((base (string->number (list-ref ls 0) 16))
|
||||||
(status (string->symbol (cadr ls))))
|
(upper (string->number (list-ref ls 12) 16))
|
||||||
|
(lower (string->number (list-ref ls 13) 16))
|
||||||
|
(folded (extract-single-decomposition (list-ref ls 5))))
|
||||||
(cond
|
(cond
|
||||||
((not upper)
|
((or upper lower)
|
||||||
(warn "invalid upper char in CaseFolding.txt: " line))
|
|
||||||
((memv status '(C S))
|
|
||||||
(let ((lower (string->number (car (cddr ls)) 16)))
|
|
||||||
;; don't store titlecase mappings
|
|
||||||
(cond
|
(cond
|
||||||
((not lower)
|
((iset-contains? char-set:title-case base)
|
||||||
(warn "invalid lower char in CaseFolding.txt: " line))
|
(if upper
|
||||||
((iset-contains? char-set:title-case upper)
|
(set! title-ups `((,upper ,base) ,@title-ups)))
|
||||||
(set! extras (cons (list upper lower) extras)))
|
(if lower
|
||||||
|
(set! title-downs `((,base ,lower) ,@title-downs))))
|
||||||
(else
|
(else
|
||||||
(hash-table-update!
|
(hash-table-update!
|
||||||
offset-map
|
offset-map
|
||||||
(- lower upper)
|
(- (or lower base) (or upper base))
|
||||||
(lambda (is) (iset-adjoin! is upper))
|
(lambda (is) (iset-adjoin! is (or upper base)))
|
||||||
(lambda () (make-iset)))))))))))
|
(lambda () (make-iset)))))))
|
||||||
|
(cond
|
||||||
|
((and folded (not (eqv? folded (or lower base))))
|
||||||
|
;; (write `(fold: ,line ,base ,folded) (current-error-port))
|
||||||
|
;; (newline (current-error-port))
|
||||||
|
(set! folds (cons (list base folded) folds)))))))
|
||||||
(lp))))))))
|
(lp))))))))
|
||||||
|
|
||||||
(let ((args (command-line)))
|
(let ((args (command-line)))
|
||||||
|
@ -130,12 +151,12 @@
|
||||||
(else
|
(else
|
||||||
(error "unknown option: " (car ls)))))
|
(error "unknown option: " (car ls)))))
|
||||||
((null? ls)
|
((null? ls)
|
||||||
(error "usage: extract-case-offsets <CaseFolding.txt>"))
|
(error "usage: extract-case-offsets <UnicodeData.txt>"))
|
||||||
(else
|
(else
|
||||||
(if (equal? "-" (car ls))
|
(if (equal? "-" (car ls))
|
||||||
(extract-case-folding
|
(extract-case-mapping
|
||||||
(current-input-port) out min-count max-char-sets name)
|
(current-input-port) out min-count max-char-sets name)
|
||||||
(call-with-input-file (car ls)
|
(call-with-input-file (car ls)
|
||||||
(lambda (in)
|
(lambda (in)
|
||||||
(extract-case-folding in out min-count max-char-sets name))))
|
(extract-case-mapping in out min-count max-char-sets name))))
|
||||||
(close-output-port out)))))
|
(close-output-port out)))))
|
||||||
|
|
Loading…
Add table
Reference in a new issue