mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-18 21:29:19 +02:00
distinguishing lowercase and foldcase (fixes issue #420)
This commit is contained in:
parent
ed0be227cc
commit
4a7a809c8d
8 changed files with 113 additions and 82 deletions
2
Makefile
2
Makefile
|
@ -198,7 +198,7 @@ lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$
|
|||
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
||||
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
|
||||
|
||||
lib/scheme/char/case-offsets.scm: data/CaseFolding.txt chibi-scheme$(EXE) all-libs
|
||||
lib/scheme/char/case-offsets.scm: data/UnicodeData.txt chibi-scheme$(EXE) all-libs
|
||||
$(CHIBI) tools/extract-case-offsets.scm $< > $@
|
||||
|
||||
########################################################################
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -5,50 +5,47 @@
|
|||
(define (char-numeric? ch) (char-set-contains? char-set:digit ch))
|
||||
(define (char-whitespace? ch) (char-set-contains? char-set:whitespace ch))
|
||||
|
||||
(define (bsearch-kv vec n lo hi)
|
||||
(and (<= lo hi)
|
||||
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
|
||||
(m (vector-ref vec mid)))
|
||||
(cond
|
||||
((= n m)
|
||||
(integer->char (vector-ref vec (+ mid 1))))
|
||||
((< n m)
|
||||
(bsearch-kv vec n lo (- mid 2)))
|
||||
(else
|
||||
(bsearch-kv vec n (+ mid 2) hi))))))
|
||||
|
||||
(define (char-downcase ch)
|
||||
(let ((n (char->integer ch)))
|
||||
(let lp ((ls char-downcase-offsets))
|
||||
(cond
|
||||
((null? ls)
|
||||
(let lp ((lo 0) (hi (- (vector-length char-downcase-map) 2)))
|
||||
(if (> lo hi)
|
||||
ch
|
||||
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
|
||||
(m (vector-ref char-downcase-map mid)))
|
||||
(cond
|
||||
((= n m)
|
||||
(integer->char (vector-ref char-downcase-map (+ mid 1))))
|
||||
((< n m)
|
||||
(lp lo (- mid 2)))
|
||||
(else
|
||||
(lp (+ mid 2) hi)))))))
|
||||
(or (bsearch-kv char-downcase-map n 0
|
||||
(- (vector-length char-downcase-map) 2))
|
||||
ch))
|
||||
((iset-contains? (caar ls) n)
|
||||
(integer->char (+ n (cdar ls))))
|
||||
(else (lp (cdr ls)))))))
|
||||
|
||||
(define char-foldcase char-downcase)
|
||||
|
||||
(define (char-upcase ch)
|
||||
(let ((n (char->integer ch)))
|
||||
(let lp ((ls char-downcase-offsets))
|
||||
(cond
|
||||
((null? ls)
|
||||
(let lp ((lo 0) (hi (- (vector-length char-upcase-map) 2)))
|
||||
(if (> lo hi)
|
||||
ch
|
||||
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
|
||||
(m (vector-ref char-upcase-map mid)))
|
||||
(cond
|
||||
((= n m)
|
||||
(integer->char (vector-ref char-upcase-map (+ mid 1))))
|
||||
((< n m)
|
||||
(lp lo (- mid 2)))
|
||||
(else
|
||||
(lp (+ mid 2) hi)))))))
|
||||
(or (bsearch-kv char-upcase-map n 0
|
||||
(- (vector-length char-upcase-map) 2))
|
||||
ch))
|
||||
((iset-contains? (caar ls) (- n (cdar ls)))
|
||||
(integer->char (- n (cdar ls))))
|
||||
(else (lp (cdr ls)))))))
|
||||
|
||||
(define (char-foldcase ch)
|
||||
(or (bsearch-kv char-foldcase-map (char->integer ch) 0
|
||||
(- (vector-length char-foldcase-map) 2))
|
||||
(char-downcase ch)))
|
||||
|
||||
(define (char-cmp-ci op a ls)
|
||||
(let lp ((op op) (a (char->integer (char-foldcase a))) (ls ls))
|
||||
(if (null? ls)
|
||||
|
@ -70,7 +67,8 @@
|
|||
(val (vector-ref vec 0)))
|
||||
(cond ((< i val) (and (< mid b) (lp a mid)))
|
||||
((> i val) (and (> mid a) (lp mid b)))
|
||||
(else (vector-ref vec off)))))))
|
||||
(else
|
||||
(vector-ref vec (if (>= off (vector-length vec)) 1 off))))))))
|
||||
|
||||
(define (call-with-output-string proc)
|
||||
(let ((out (open-output-string)))
|
||||
|
@ -86,13 +84,15 @@
|
|||
(cond
|
||||
((not (eof-object? ch))
|
||||
(write-string
|
||||
(if (and (not fold?) (eqv? ch #\x03A3))
|
||||
(cond
|
||||
((and (not fold?) (eqv? ch #\x03A3))
|
||||
(let ((ch2 (peek-char in)))
|
||||
(if (or (eof-object? ch2)
|
||||
(not (char-set-contains? char-set:letter ch2)))
|
||||
#\x03C2
|
||||
#\x03C3))
|
||||
(or (char-get-special-case ch 1) (char-downcase ch)))
|
||||
#\x03C3)))
|
||||
((char-get-special-case ch (if fold? 4 1)))
|
||||
(else (if fold? (char-foldcase ch) (char-downcase ch))))
|
||||
out)
|
||||
(lp)))))))))
|
||||
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
;; file SpecialCasing.txt.
|
||||
|
||||
(define special-cases
|
||||
;; <code> <lower> <title> <upper>
|
||||
#(#(223 "ß" "Ss" "SS")
|
||||
;; <code> <lower> <title> <upper> [<fold>]
|
||||
#(#(223 "ß" "Ss" "SS" "ss")
|
||||
#(304 "i̇" "İ" "İ")
|
||||
#(329 "ʼn" "ʼN" "ʼN")
|
||||
#(496 "ǰ" "J̌" "J̌")
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
(test-group "titlecase/char"
|
||||
(test #\x01C5 (char-titlecase #\x01C4))
|
||||
(test #\x01C5 (char-titlecase #\x01C5))
|
||||
(test #\x01C5 (char-titlecase #\x01C6))
|
||||
(test #\x1F88 (char-titlecase #\x1F80))
|
||||
(test #\x01C7 (char-upcase #\x01C7))
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
|
||||
;;> Returns the single-character titlecase mapping of argument
|
||||
(define (char-titlecase ch)
|
||||
(cond ((assv ch title-single-map) => cadr)
|
||||
(cond ((char-set-contains? char-set:title-case ch) ch)
|
||||
((assv ch title-single-map) => cadr)
|
||||
(else (char-upcase ch))))
|
||||
|
||||
;; Returns #t if a character is caseless, otherwise #f
|
||||
|
@ -29,7 +30,7 @@
|
|||
;; ch has multiple- or single-character titlecase mapping
|
||||
(lp n2 #f (append-reverse (cdr multi-title) result))
|
||||
;; ch has single-character uppercase mapping
|
||||
(lp n2 (char-caseless? ch) (cons (char-upcase ch) result))))
|
||||
(lp n2 (char-caseless? ch) (cons (char-titlecase ch) result))))
|
||||
;; ch must be lowercased
|
||||
(let ((multi-downcase (assv ch lower-multiple-map)))
|
||||
(if multi-downcase
|
||||
|
|
|
@ -1320,10 +1320,14 @@
|
|||
(test "SSA" (string-upcase "ßa"))
|
||||
(test "ßa" (string-downcase "ßa"))
|
||||
(test "ssa" (string-downcase "SSA"))
|
||||
(test "maß" (string-downcase "Maß"))
|
||||
(test "mass" (string-foldcase "Maß"))
|
||||
(test "İ" (string-upcase "İ"))
|
||||
(test "i\x0307;" (string-downcase "İ"))
|
||||
(test "i\x0307;" (string-foldcase "İ"))
|
||||
(test "J̌" (string-upcase "ǰ"))
|
||||
(test "ſ" (string-downcase "ſ"))
|
||||
(test "s" (string-foldcase "ſ"))
|
||||
|
||||
;; context-sensitive (final sigma)
|
||||
(test "ΓΛΏΣΣΑ" (string-upcase "γλώσσα"))
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
;; Extract sets of char case offsets.
|
||||
;;
|
||||
;; Usage:
|
||||
;; extract-case-offsets.scm options CaseFolding.txt > out
|
||||
;; extract-case-offsets.scm options UnicodeData.txt > out
|
||||
;;
|
||||
;; Recognized options are:
|
||||
;;
|
||||
|
@ -23,7 +23,7 @@
|
|||
(for-each (lambda (x) (display x err)) args)
|
||||
(newline err)))
|
||||
|
||||
(define (write-offsets offset-map extras out min-count max-char-sets name)
|
||||
(define (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name)
|
||||
(let lp ((ls (sort (hash-table->alist offset-map)
|
||||
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
|
||||
(i 0)
|
||||
|
@ -38,23 +38,22 @@
|
|||
,(caar ls))
|
||||
res)))
|
||||
(else
|
||||
(display ";; auto-generated by extract-case-offsets.scm\n\n")
|
||||
(write `(define ,(string->symbol name)
|
||||
(list ,@(reverse res)))
|
||||
out)
|
||||
(newline out)
|
||||
(newline out)
|
||||
(let ((pairs
|
||||
(sort
|
||||
(append
|
||||
extras
|
||||
(append-map
|
||||
(lambda (x)
|
||||
(map (lambda (y) (list y (+ y (car x))))
|
||||
(iset->list (cdr x))))
|
||||
ls))
|
||||
(lambda (a b) (< (car a) (car b))))))
|
||||
ls)))
|
||||
(write `(define char-downcase-map
|
||||
',(list->vector (append-map (lambda (x) x) pairs)))
|
||||
',(list->vector
|
||||
(append-map (lambda (x) x)
|
||||
(sort (append pairs title-downs) < car))))
|
||||
out)
|
||||
(newline out)
|
||||
(newline out)
|
||||
|
@ -62,50 +61,72 @@
|
|||
',(list->vector
|
||||
(append-map (lambda (x) (list (cadr x) (car x)))
|
||||
(delete-duplicates
|
||||
(sort pairs
|
||||
(lambda (a b) (< (cadr a) (cadr b))))
|
||||
(sort (append pairs title-ups) < cadr)
|
||||
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
|
||||
out)
|
||||
(newline out)
|
||||
(newline out)
|
||||
(write `(define char-foldcase-map
|
||||
',(list->vector
|
||||
(append-map (lambda (x) x)
|
||||
(delete-duplicates
|
||||
(sort folds < car)
|
||||
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
|
||||
out)
|
||||
(newline out))))))
|
||||
|
||||
(define (extract-case-folding in out min-count max-char-sets name)
|
||||
;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping
|
||||
|
||||
(define (extract-case-mapping in out min-count max-char-sets name)
|
||||
(define (string-trim-comment str comment-ch)
|
||||
(car (string-split str comment-ch 2)))
|
||||
(define (extract-single-decomposition str)
|
||||
(and (not (equal? "" str))
|
||||
(let ((s (string-trim (last (string-split str #\>)))))
|
||||
(and (not (string-contains s " "))
|
||||
(string->number s 16)))))
|
||||
(let ((offset-map (make-hash-table eq?))
|
||||
(extras '()))
|
||||
(title-ups '())
|
||||
(title-downs '())
|
||||
(folds '()))
|
||||
(let lp ()
|
||||
(let ((line (read-line in)))
|
||||
(cond
|
||||
((eof-object? line)
|
||||
(write-offsets offset-map extras out min-count max-char-sets name))
|
||||
(write-offsets offset-map title-ups title-downs folds out
|
||||
min-count max-char-sets name))
|
||||
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
||||
(lp))
|
||||
(else
|
||||
(let ((ls (map string-trim
|
||||
(string-split (string-trim-comment line #\#) #\;))))
|
||||
(cond
|
||||
((< (length ls) 3)
|
||||
(warn "invalid CaseFolding.txt line: " line))
|
||||
((< (length ls) 15)
|
||||
(warn "invalid UnicodeData.txt line: " line))
|
||||
(else
|
||||
(let ((upper (string->number (car ls) 16))
|
||||
(status (string->symbol (cadr ls))))
|
||||
(let ((base (string->number (list-ref ls 0) 16))
|
||||
(upper (string->number (list-ref ls 12) 16))
|
||||
(lower (string->number (list-ref ls 13) 16))
|
||||
(folded (extract-single-decomposition (list-ref ls 5))))
|
||||
(cond
|
||||
((not upper)
|
||||
(warn "invalid upper char in CaseFolding.txt: " line))
|
||||
((memv status '(C S))
|
||||
(let ((lower (string->number (car (cddr ls)) 16)))
|
||||
;; don't store titlecase mappings
|
||||
((or upper lower)
|
||||
(cond
|
||||
((not lower)
|
||||
(warn "invalid lower char in CaseFolding.txt: " line))
|
||||
((iset-contains? char-set:title-case upper)
|
||||
(set! extras (cons (list upper lower) extras)))
|
||||
((iset-contains? char-set:title-case base)
|
||||
(if upper
|
||||
(set! title-ups `((,upper ,base) ,@title-ups)))
|
||||
(if lower
|
||||
(set! title-downs `((,base ,lower) ,@title-downs))))
|
||||
(else
|
||||
(hash-table-update!
|
||||
offset-map
|
||||
(- lower upper)
|
||||
(lambda (is) (iset-adjoin! is upper))
|
||||
(lambda () (make-iset)))))))))))
|
||||
(- (or lower base) (or upper base))
|
||||
(lambda (is) (iset-adjoin! is (or upper base)))
|
||||
(lambda () (make-iset)))))))
|
||||
(cond
|
||||
((and folded (not (eqv? folded (or lower base))))
|
||||
;; (write `(fold: ,line ,base ,folded) (current-error-port))
|
||||
;; (newline (current-error-port))
|
||||
(set! folds (cons (list base folded) folds)))))))
|
||||
(lp))))))))
|
||||
|
||||
(let ((args (command-line)))
|
||||
|
@ -130,12 +151,12 @@
|
|||
(else
|
||||
(error "unknown option: " (car ls)))))
|
||||
((null? ls)
|
||||
(error "usage: extract-case-offsets <CaseFolding.txt>"))
|
||||
(error "usage: extract-case-offsets <UnicodeData.txt>"))
|
||||
(else
|
||||
(if (equal? "-" (car ls))
|
||||
(extract-case-folding
|
||||
(extract-case-mapping
|
||||
(current-input-port) out min-count max-char-sets name)
|
||||
(call-with-input-file (car ls)
|
||||
(lambda (in)
|
||||
(extract-case-folding in out min-count max-char-sets name))))
|
||||
(extract-case-mapping in out min-count max-char-sets name))))
|
||||
(close-output-port out)))))
|
||||
|
|
Loading…
Add table
Reference in a new issue