distinguishing lowercase and foldcase (fixes issue #420)

This commit is contained in:
Alex Shinn 2017-08-27 14:10:05 +09:00
parent ed0be227cc
commit 4a7a809c8d
8 changed files with 113 additions and 82 deletions

View file

@ -198,7 +198,7 @@ lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
lib/scheme/char/case-offsets.scm: data/CaseFolding.txt chibi-scheme$(EXE) all-libs
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
$(CHIBI) tools/extract-case-offsets.scm $< > $@
########################################################################

File diff suppressed because one or more lines are too long

View file

@ -5,50 +5,47 @@
(define (char-numeric? ch) (char-set-contains? char-set:digit ch))
(define (char-whitespace? ch) (char-set-contains? char-set:whitespace ch))
(define (bsearch-kv vec n lo hi)
(and (<= lo hi)
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
(m (vector-ref vec mid)))
(cond
((= n m)
(integer->char (vector-ref vec (+ mid 1))))
((< n m)
(bsearch-kv vec n lo (- mid 2)))
(else
(bsearch-kv vec n (+ mid 2) hi))))))
(define (char-downcase ch)
(let ((n (char->integer ch)))
(let lp ((ls char-downcase-offsets))
(cond
((null? ls)
(let lp ((lo 0) (hi (- (vector-length char-downcase-map) 2)))
(if (> lo hi)
ch
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
(m (vector-ref char-downcase-map mid)))
(cond
((= n m)
(integer->char (vector-ref char-downcase-map (+ mid 1))))
((< n m)
(lp lo (- mid 2)))
(else
(lp (+ mid 2) hi)))))))
(or (bsearch-kv char-downcase-map n 0
(- (vector-length char-downcase-map) 2))
ch))
((iset-contains? (caar ls) n)
(integer->char (+ n (cdar ls))))
(else (lp (cdr ls)))))))
(define char-foldcase char-downcase)
(define (char-upcase ch)
(let ((n (char->integer ch)))
(let lp ((ls char-downcase-offsets))
(cond
((null? ls)
(let lp ((lo 0) (hi (- (vector-length char-upcase-map) 2)))
(if (> lo hi)
ch
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
(m (vector-ref char-upcase-map mid)))
(cond
((= n m)
(integer->char (vector-ref char-upcase-map (+ mid 1))))
((< n m)
(lp lo (- mid 2)))
(else
(lp (+ mid 2) hi)))))))
(or (bsearch-kv char-upcase-map n 0
(- (vector-length char-upcase-map) 2))
ch))
((iset-contains? (caar ls) (- n (cdar ls)))
(integer->char (- n (cdar ls))))
(else (lp (cdr ls)))))))
(define (char-foldcase ch)
(or (bsearch-kv char-foldcase-map (char->integer ch) 0
(- (vector-length char-foldcase-map) 2))
(char-downcase ch)))
(define (char-cmp-ci op a ls)
(let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls))
(if (null? ls)
@ -70,7 +67,8 @@
(val (vector-ref vec 0)))
(cond ((< i val) (and (< mid b) (lp a mid)))
((> i val) (and (> mid a) (lp mid b)))
(else (vector-ref vec off)))))))
(else
(vector-ref vec (if (>= off (vector-length vec)) 1 off))))))))
(define (call-with-output-string proc)
(let ((out (open-output-string)))
@ -86,13 +84,15 @@
(cond
((not (eof-object? ch))
(write-string
(if (and (not fold?) (eqv? ch #\x03A3))
(cond
((and (not fold?) (eqv? ch #\x03A3))
(let ((ch2 (peek-char in)))
(if (or (eof-object? ch2)
(not (char-set-contains? char-set:letter ch2)))
#\x03C2
#\x03C3))
(or (char-get-special-case ch 1) (char-downcase ch)))
#\x03C3)))
((char-get-special-case ch (if fold? 4 1)))
(else (if fold? (char-foldcase ch) (char-downcase ch))))
out)
(lp)))))))))

View file

@ -3,8 +3,8 @@
;; file SpecialCasing.txt.
(define special-cases
;; <code> <lower> <title> <upper>
#(#(223 "ß" "Ss" "SS")
;; <code> <lower> <title> <upper> [<fold>]
#(#(223 "ß" "Ss" "SS" "ss")
#(304 "i̇" "İ" "İ")
#(329 "ʼn" "ʼN" "ʼN")
#(496 "ǰ" "J̌" "J̌")

View file

@ -22,6 +22,7 @@
(test-group "titlecase/char"
(test #\x01C5 (char-titlecase #\x01C4))
(test #\x01C5 (char-titlecase #\x01C5))
(test #\x01C5 (char-titlecase #\x01C6))
(test #\x1F88 (char-titlecase #\x1F80))
(test #\x01C7 (char-upcase #\x01C7))

View file

@ -5,7 +5,8 @@
;;> Returns the single-character titlecase mapping of argument
(define (char-titlecase ch)
(cond ((assv ch title-single-map) => cadr)
(cond ((char-set-contains? char-set:title-case ch) ch)
((assv ch title-single-map) => cadr)
(else (char-upcase ch))))
;; Returns #t if a character is caseless, otherwise #f
@ -29,7 +30,7 @@
;; ch has multiple- or single-character titlecase mapping
(lp n2 #f (append-reverse (cdr multi-title) result))
;; ch has single-character uppercase mapping
(lp n2 (char-caseless? ch) (cons (char-upcase ch) result))))
(lp n2 (char-caseless? ch) (cons (char-titlecase ch) result))))
;; ch must be lowercased
(let ((multi-downcase (assv ch lower-multiple-map)))
(if multi-downcase

View file

@ -1320,10 +1320,14 @@
(test "SSA" (string-upcase "ßa"))
(test "ßa" (string-downcase "ßa"))
(test "ssa" (string-downcase "SSA"))
(test "maß" (string-downcase "Maß"))
(test "mass" (string-foldcase "Maß"))
(test "İ" (string-upcase "İ"))
(test "i\x0307;" (string-downcase "İ"))
(test "i\x0307;" (string-foldcase "İ"))
(test "J̌" (string-upcase "ǰ"))
(test "ſ" (string-downcase "ſ"))
(test "s" (string-foldcase "ſ"))
;; context-sensitive (final sigma)
(test "ΓΛΏΣΣΑ" (string-upcase "γλώσσα"))

View file

@ -3,7 +3,7 @@
;; Extract sets of char case offsets.
;;
;; Usage:
;; extract-case-offsets.scm options CaseFolding.txt > out
;; extract-case-offsets.scm options UnicodeData.txt > out
;;
;; Recognized options are:
;;
@ -23,7 +23,7 @@
(for-each (lambda (x) (display x err)) args)
(newline err)))
(define (write-offsets offset-map extras out min-count max-char-sets name)
(define (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name)
(let lp ((ls (sort (hash-table->alist offset-map)
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
(i 0)
@ -38,23 +38,22 @@
,(caar ls))
res)))
(else
(display ";; auto-generated by extract-case-offsets.scm\n\n")
(write `(define ,(string->symbol name)
(list ,@(reverse res)))
out)
(newline out)
(newline out)
(let ((pairs
(sort
(append
extras
(append-map
(lambda (x)
(map (lambda (y) (list y (+ y (car x))))
(iset->list (cdr x))))
ls))
(lambda (a b) (< (car a) (car b))))))
ls)))
(write `(define char-downcase-map
',(list->vector (append-map (lambda (x) x) pairs)))
',(list->vector
(append-map (lambda (x) x)
(sort (append pairs title-downs) < car))))
out)
(newline out)
(newline out)
@ -62,50 +61,72 @@
',(list->vector
(append-map (lambda (x) (list (cadr x) (car x)))
(delete-duplicates
(sort pairs
(lambda (a b) (< (cadr a) (cadr b))))
(sort (append pairs title-ups) < cadr)
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
out)
(newline out)
(newline out)
(write `(define char-foldcase-map
',(list->vector
(append-map (lambda (x) x)
(delete-duplicates
(sort folds < car)
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
out)
(newline out))))))
(define (extract-case-folding in out min-count max-char-sets name)
;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping
(define (extract-case-mapping in out min-count max-char-sets name)
(define (string-trim-comment str comment-ch)
(car (string-split str comment-ch 2)))
(define (extract-single-decomposition str)
(and (not (equal? "" str))
(let ((s (string-trim (last (string-split str #\>)))))
(and (not (string-contains s " "))
(string->number s 16)))))
(let ((offset-map (make-hash-table eq?))
(extras '()))
(title-ups '())
(title-downs '())
(folds '()))
(let lp ()
(let ((line (read-line in)))
(cond
((eof-object? line)
(write-offsets offset-map extras out min-count max-char-sets name))
(write-offsets offset-map title-ups title-downs folds out
min-count max-char-sets name))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp))
(else
(let ((ls (map string-trim
(string-split (string-trim-comment line #\#) #\;))))
(cond
((< (length ls) 3)
(warn "invalid CaseFolding.txt line: " line))
((< (length ls) 15)
(warn "invalid UnicodeData.txt line: " line))
(else
(let ((upper (string->number (car ls) 16))
(status (string->symbol (cadr ls))))
(let ((base (string->number (list-ref ls 0) 16))
(upper (string->number (list-ref ls 12) 16))
(lower (string->number (list-ref ls 13) 16))
(folded (extract-single-decomposition (list-ref ls 5))))
(cond
((not upper)
(warn "invalid upper char in CaseFolding.txt: " line))
((memv status '(C S))
(let ((lower (string->number (car (cddr ls)) 16)))
;; don't store titlecase mappings
((or upper lower)
(cond
((not lower)
(warn "invalid lower char in CaseFolding.txt: " line))
((iset-contains? char-set:title-case upper)
(set! extras (cons (list upper lower) extras)))
((iset-contains? char-set:title-case base)
(if upper
(set! title-ups `((,upper ,base) ,@title-ups)))
(if lower
(set! title-downs `((,base ,lower) ,@title-downs))))
(else
(hash-table-update!
offset-map
(- lower upper)
(lambda (is) (iset-adjoin! is upper))
(lambda () (make-iset)))))))))))
(- (or lower base) (or upper base))
(lambda (is) (iset-adjoin! is (or upper base)))
(lambda () (make-iset)))))))
(cond
((and folded (not (eqv? folded (or lower base))))
;; (write `(fold: ,line ,base ,folded) (current-error-port))
;; (newline (current-error-port))
(set! folds (cons (list base folded) folds)))))))
(lp))))))))
(let ((args (command-line)))
@ -130,12 +151,12 @@
(else
(error "unknown option: " (car ls)))))
((null? ls)
(error "usage: extract-case-offsets <CaseFolding.txt>"))
(error "usage: extract-case-offsets <UnicodeData.txt>"))
(else
(if (equal? "-" (car ls))
(extract-case-folding
(extract-case-mapping
(current-input-port) out min-count max-char-sets name)
(call-with-input-file (car ls)
(lambda (in)
(extract-case-folding in out min-count max-char-sets name))))
(extract-case-mapping in out min-count max-char-sets name))))
(close-output-port out)))))