distinguishing lowercase and foldcase (fixes issue #420)

This commit is contained in:
Alex Shinn 2017-08-27 14:10:05 +09:00
parent ed0be227cc
commit 4a7a809c8d
8 changed files with 113 additions and 82 deletions

View file

@ -198,7 +198,7 @@ lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE) lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@ $(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
lib/scheme/char/case-offsets.scm: data/CaseFolding.txt chibi-scheme$(EXE) all-libs lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
$(CHIBI) tools/extract-case-offsets.scm $< > $@ $(CHIBI) tools/extract-case-offsets.scm $< > $@
######################################################################## ########################################################################

File diff suppressed because one or more lines are too long

View file

@ -5,50 +5,47 @@
(define (char-numeric? ch) (char-set-contains? char-set:digit ch)) (define (char-numeric? ch) (char-set-contains? char-set:digit ch))
(define (char-whitespace? ch) (char-set-contains? char-set:whitespace ch)) (define (char-whitespace? ch) (char-set-contains? char-set:whitespace ch))
(define (bsearch-kv vec n lo hi)
(and (<= lo hi)
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
(m (vector-ref vec mid)))
(cond
((= n m)
(integer->char (vector-ref vec (+ mid 1))))
((< n m)
(bsearch-kv vec n lo (- mid 2)))
(else
(bsearch-kv vec n (+ mid 2) hi))))))
(define (char-downcase ch) (define (char-downcase ch)
(let ((n (char->integer ch))) (let ((n (char->integer ch)))
(let lp ((ls char-downcase-offsets)) (let lp ((ls char-downcase-offsets))
(cond (cond
((null? ls) ((null? ls)
(let lp ((lo 0) (hi (- (vector-length char-downcase-map) 2))) (or (bsearch-kv char-downcase-map n 0
(if (> lo hi) (- (vector-length char-downcase-map) 2))
ch ch))
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
(m (vector-ref char-downcase-map mid)))
(cond
((= n m)
(integer->char (vector-ref char-downcase-map (+ mid 1))))
((< n m)
(lp lo (- mid 2)))
(else
(lp (+ mid 2) hi)))))))
((iset-contains? (caar ls) n) ((iset-contains? (caar ls) n)
(integer->char (+ n (cdar ls)))) (integer->char (+ n (cdar ls))))
(else (lp (cdr ls))))))) (else (lp (cdr ls)))))))
(define char-foldcase char-downcase)
(define (char-upcase ch) (define (char-upcase ch)
(let ((n (char->integer ch))) (let ((n (char->integer ch)))
(let lp ((ls char-downcase-offsets)) (let lp ((ls char-downcase-offsets))
(cond (cond
((null? ls) ((null? ls)
(let lp ((lo 0) (hi (- (vector-length char-upcase-map) 2))) (or (bsearch-kv char-upcase-map n 0
(if (> lo hi) (- (vector-length char-upcase-map) 2))
ch ch))
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
(m (vector-ref char-upcase-map mid)))
(cond
((= n m)
(integer->char (vector-ref char-upcase-map (+ mid 1))))
((< n m)
(lp lo (- mid 2)))
(else
(lp (+ mid 2) hi)))))))
((iset-contains? (caar ls) (- n (cdar ls))) ((iset-contains? (caar ls) (- n (cdar ls)))
(integer->char (- n (cdar ls)))) (integer->char (- n (cdar ls))))
(else (lp (cdr ls))))))) (else (lp (cdr ls)))))))
(define (char-foldcase ch)
(or (bsearch-kv char-foldcase-map (char->integer ch) 0
(- (vector-length char-foldcase-map) 2))
(char-downcase ch)))
(define (char-cmp-ci op a ls) (define (char-cmp-ci op a ls)
(let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls)) (let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls))
(if (null? ls) (if (null? ls)
@ -70,7 +67,8 @@
(val (vector-ref vec 0))) (val (vector-ref vec 0)))
(cond ((< i val) (and (< mid b) (lp a mid))) (cond ((< i val) (and (< mid b) (lp a mid)))
((> i val) (and (> mid a) (lp mid b))) ((> i val) (and (> mid a) (lp mid b)))
(else (vector-ref vec off))))))) (else
(vector-ref vec (if (>= off (vector-length vec)) 1 off))))))))
(define (call-with-output-string proc) (define (call-with-output-string proc)
(let ((out (open-output-string))) (let ((out (open-output-string)))
@ -86,13 +84,15 @@
(cond (cond
((not (eof-object? ch)) ((not (eof-object? ch))
(write-string (write-string
(if (and (not fold?) (eqv? ch #\x03A3)) (cond
((and (not fold?) (eqv? ch #\x03A3))
(let ((ch2 (peek-char in))) (let ((ch2 (peek-char in)))
(if (or (eof-object? ch2) (if (or (eof-object? ch2)
(not (char-set-contains? char-set:letter ch2))) (not (char-set-contains? char-set:letter ch2)))
#\x03C2 #\x03C2
#\x03C3)) #\x03C3)))
(or (char-get-special-case ch 1) (char-downcase ch))) ((char-get-special-case ch (if fold? 4 1)))
(else (if fold? (char-foldcase ch) (char-downcase ch))))
out) out)
(lp))))))))) (lp)))))))))

View file

@ -3,8 +3,8 @@
;; file SpecialCasing.txt. ;; file SpecialCasing.txt.
(define special-cases (define special-cases
;; <code> <lower> <title> <upper> ;; <code> <lower> <title> <upper> [<fold>]
#(#(223 "ß" "Ss" "SS") #(#(223 "ß" "Ss" "SS" "ss")
#(304 "i̇" "İ" "İ") #(304 "i̇" "İ" "İ")
#(329 "ʼn" "ʼN" "ʼN") #(329 "ʼn" "ʼN" "ʼN")
#(496 "ǰ" "J̌" "J̌") #(496 "ǰ" "J̌" "J̌")

View file

@ -22,6 +22,7 @@
(test-group "titlecase/char" (test-group "titlecase/char"
(test #\x01C5 (char-titlecase #\x01C4)) (test #\x01C5 (char-titlecase #\x01C4))
(test #\x01C5 (char-titlecase #\x01C5))
(test #\x01C5 (char-titlecase #\x01C6)) (test #\x01C5 (char-titlecase #\x01C6))
(test #\x1F88 (char-titlecase #\x1F80)) (test #\x1F88 (char-titlecase #\x1F80))
(test #\x01C7 (char-upcase #\x01C7)) (test #\x01C7 (char-upcase #\x01C7))

View file

@ -5,7 +5,8 @@
;;> Returns the single-character titlecase mapping of argument ;;> Returns the single-character titlecase mapping of argument
(define (char-titlecase ch) (define (char-titlecase ch)
(cond ((assv ch title-single-map) => cadr) (cond ((char-set-contains? char-set:title-case ch) ch)
((assv ch title-single-map) => cadr)
(else (char-upcase ch)))) (else (char-upcase ch))))
;; Returns #t if a character is caseless, otherwise #f ;; Returns #t if a character is caseless, otherwise #f
@ -29,7 +30,7 @@
;; ch has multiple- or single-character titlecase mapping ;; ch has multiple- or single-character titlecase mapping
(lp n2 #f (append-reverse (cdr multi-title) result)) (lp n2 #f (append-reverse (cdr multi-title) result))
;; ch has single-character uppercase mapping ;; ch has single-character uppercase mapping
(lp n2 (char-caseless? ch) (cons (char-upcase ch) result)))) (lp n2 (char-caseless? ch) (cons (char-titlecase ch) result))))
;; ch must be lowercased ;; ch must be lowercased
(let ((multi-downcase (assv ch lower-multiple-map))) (let ((multi-downcase (assv ch lower-multiple-map)))
(if multi-downcase (if multi-downcase

View file

@ -1320,10 +1320,14 @@
(test "SSA" (string-upcase "ßa")) (test "SSA" (string-upcase "ßa"))
(test "ßa" (string-downcase "ßa")) (test "ßa" (string-downcase "ßa"))
(test "ssa" (string-downcase "SSA")) (test "ssa" (string-downcase "SSA"))
(test "maß" (string-downcase "Maß"))
(test "mass" (string-foldcase "Maß"))
(test "İ" (string-upcase "İ")) (test "İ" (string-upcase "İ"))
(test "i\x0307;" (string-downcase "İ")) (test "i\x0307;" (string-downcase "İ"))
(test "i\x0307;" (string-foldcase "İ")) (test "i\x0307;" (string-foldcase "İ"))
(test "J̌" (string-upcase "ǰ")) (test "J̌" (string-upcase "ǰ"))
(test "ſ" (string-downcase "ſ"))
(test "s" (string-foldcase "ſ"))
;; context-sensitive (final sigma) ;; context-sensitive (final sigma)
(test "ΓΛΏΣΣΑ" (string-upcase "γλώσσα")) (test "ΓΛΏΣΣΑ" (string-upcase "γλώσσα"))

View file

@ -3,7 +3,7 @@
;; Extract sets of char case offsets. ;; Extract sets of char case offsets.
;; ;;
;; Usage: ;; Usage:
;; extract-case-offsets.scm options CaseFolding.txt > out ;; extract-case-offsets.scm options UnicodeData.txt > out
;; ;;
;; Recognized options are: ;; Recognized options are:
;; ;;
@ -23,7 +23,7 @@
(for-each (lambda (x) (display x err)) args) (for-each (lambda (x) (display x err)) args)
(newline err))) (newline err)))
(define (write-offsets offset-map extras out min-count max-char-sets name) (define (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name)
(let lp ((ls (sort (hash-table->alist offset-map) (let lp ((ls (sort (hash-table->alist offset-map)
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b)))))) (lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
(i 0) (i 0)
@ -38,23 +38,22 @@
,(caar ls)) ,(caar ls))
res))) res)))
(else (else
(display ";; auto-generated by extract-case-offsets.scm\n\n")
(write `(define ,(string->symbol name) (write `(define ,(string->symbol name)
(list ,@(reverse res))) (list ,@(reverse res)))
out) out)
(newline out) (newline out)
(newline out) (newline out)
(let ((pairs (let ((pairs
(sort
(append
extras
(append-map (append-map
(lambda (x) (lambda (x)
(map (lambda (y) (list y (+ y (car x)))) (map (lambda (y) (list y (+ y (car x))))
(iset->list (cdr x)))) (iset->list (cdr x))))
ls)) ls)))
(lambda (a b) (< (car a) (car b))))))
(write `(define char-downcase-map (write `(define char-downcase-map
',(list->vector (append-map (lambda (x) x) pairs))) ',(list->vector
(append-map (lambda (x) x)
(sort (append pairs title-downs) < car))))
out) out)
(newline out) (newline out)
(newline out) (newline out)
@ -62,50 +61,72 @@
',(list->vector ',(list->vector
(append-map (lambda (x) (list (cadr x) (car x))) (append-map (lambda (x) (list (cadr x) (car x)))
(delete-duplicates (delete-duplicates
(sort pairs (sort (append pairs title-ups) < cadr)
(lambda (a b) (< (cadr a) (cadr b)))) (lambda (a b) (eqv? (cadr a) (cadr b)))))))
out)
(newline out)
(newline out)
(write `(define char-foldcase-map
',(list->vector
(append-map (lambda (x) x)
(delete-duplicates
(sort folds < car)
(lambda (a b) (eqv? (cadr a) (cadr b))))))) (lambda (a b) (eqv? (cadr a) (cadr b)))))))
out) out)
(newline out)))))) (newline out))))))
(define (extract-case-folding in out min-count max-char-sets name) ;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping
(define (extract-case-mapping in out min-count max-char-sets name)
(define (string-trim-comment str comment-ch) (define (string-trim-comment str comment-ch)
(car (string-split str comment-ch 2))) (car (string-split str comment-ch 2)))
(define (extract-single-decomposition str)
(and (not (equal? "" str))
(let ((s (string-trim (last (string-split str #\>)))))
(and (not (string-contains s " "))
(string->number s 16)))))
(let ((offset-map (make-hash-table eq?)) (let ((offset-map (make-hash-table eq?))
(extras '())) (title-ups '())
(title-downs '())
(folds '()))
(let lp () (let lp ()
(let ((line (read-line in))) (let ((line (read-line in)))
(cond (cond
((eof-object? line) ((eof-object? line)
(write-offsets offset-map extras out min-count max-char-sets name)) (write-offsets offset-map title-ups title-downs folds out
min-count max-char-sets name))
((or (equal? line "") (eqv? #\# (string-ref line 0))) ((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp)) (lp))
(else (else
(let ((ls (map string-trim (let ((ls (map string-trim
(string-split (string-trim-comment line #\#) #\;)))) (string-split (string-trim-comment line #\#) #\;))))
(cond (cond
((< (length ls) 3) ((< (length ls) 15)
(warn "invalid CaseFolding.txt line: " line)) (warn "invalid UnicodeData.txt line: " line))
(else (else
(let ((upper (string->number (car ls) 16)) (let ((base (string->number (list-ref ls 0) 16))
(status (string->symbol (cadr ls)))) (upper (string->number (list-ref ls 12) 16))
(lower (string->number (list-ref ls 13) 16))
(folded (extract-single-decomposition (list-ref ls 5))))
(cond (cond
((not upper) ((or upper lower)
(warn "invalid upper char in CaseFolding.txt: " line))
((memv status '(C S))
(let ((lower (string->number (car (cddr ls)) 16)))
;; don't store titlecase mappings
(cond (cond
((not lower) ((iset-contains? char-set:title-case base)
(warn "invalid lower char in CaseFolding.txt: " line)) (if upper
((iset-contains? char-set:title-case upper) (set! title-ups `((,upper ,base) ,@title-ups)))
(set! extras (cons (list upper lower) extras))) (if lower
(set! title-downs `((,base ,lower) ,@title-downs))))
(else (else
(hash-table-update! (hash-table-update!
offset-map offset-map
(- lower upper) (- (or lower base) (or upper base))
(lambda (is) (iset-adjoin! is upper)) (lambda (is) (iset-adjoin! is (or upper base)))
(lambda () (make-iset))))))))))) (lambda () (make-iset)))))))
(cond
((and folded (not (eqv? folded (or lower base))))
;; (write `(fold: ,line ,base ,folded) (current-error-port))
;; (newline (current-error-port))
(set! folds (cons (list base folded) folds)))))))
(lp)))))))) (lp))))))))
(let ((args (command-line))) (let ((args (command-line)))
@ -130,12 +151,12 @@
(else (else
(error "unknown option: " (car ls))))) (error "unknown option: " (car ls)))))
((null? ls) ((null? ls)
(error "usage: extract-case-offsets <CaseFolding.txt>")) (error "usage: extract-case-offsets <UnicodeData.txt>"))
(else (else
(if (equal? "-" (car ls)) (if (equal? "-" (car ls))
(extract-case-folding (extract-case-mapping
(current-input-port) out min-count max-char-sets name) (current-input-port) out min-count max-char-sets name)
(call-with-input-file (car ls) (call-with-input-file (car ls)
(lambda (in) (lambda (in)
(extract-case-folding in out min-count max-char-sets name)))) (extract-case-mapping in out min-count max-char-sets name))))
(close-output-port out))))) (close-output-port out)))))