updating to Unicode 9 and adding SRFI 129

This commit is contained in:
Alex Shinn 2017-03-28 23:42:47 +09:00
parent fad7662d83
commit 6e2013153a
12 changed files with 239 additions and 51 deletions

File diff suppressed because one or more lines are too long

View file

@ -166,12 +166,12 @@
(define (iset-adjoin-node-left! iset node) (define (iset-adjoin-node-left! iset node)
(if (iset-left iset) (if (iset-left iset)
(iset-adjoin-node! (iset-left iset) node) (iset-adjoin-node! (iset-left iset) node)
(iset-left-set! iset node))) (iset-left-set! iset (iset-copy-node node))))
(define (iset-adjoin-node-right! iset node) (define (iset-adjoin-node-right! iset node)
(if (iset-right iset) (if (iset-right iset)
(iset-adjoin-node! (iset-right iset) node) (iset-adjoin-node! (iset-right iset) node)
(iset-right-set! iset node))) (iset-right-set! iset (iset-copy-node node))))
;; start and/or end are inside the node, split into: ;; start and/or end are inside the node, split into:
;; 1. node before start, if any ;; 1. node before start, if any

View file

@ -1,5 +1,5 @@
(define char-downcase-offsets (list (cons (%make-iset 8579 8579 #f (%make-iset 880 1318 727785128884533462021393239927996327852405488163715491688443665188997571489595753887351274307852390522542297450010406753856251110621253 (%make-iset 256 590 46622255956467149554182616418411281498693699703261745055828801286035997868185593113689917615708132693 #f #f) (%make-iset 7680 7934 154389452316421593898094646677763852341147247234746543203694500835710989587797 #f #f)) (%make-iset 42560 42646 340282366921041625127166164553021347157 (%make-iset 11360 11506 6277101735386773453122860106355103562402757378851211577985 #f #f) (%make-iset 42786 42920 6277101735386680792844867735842886829046547597890503906645 #f #f))) 1) (cons (%make-iset 913 1071 6277101736117431582331099698849039600728809650576250765311 (%make-iset 65 222 6277101735750628925305668009417060106580289400525817905151 #f #f) (%make-iset 65313 65338 #f #f #f)) 32) (cons (%make-iset 11264 11310 #f (%make-iset 1329 1366 #f #f #f) #f) 48) (cons (%make-iset 7944 8169 80881074129870197631446573842447034610982086425645202976396910985471 #f #f) -8) (cons (%make-iset 9398 9423 #f #f #f) 26) (cons (%make-iset 4256 4301 36009005809663 #f #f) 7264) (cons (%make-iset 66560 66599 #f #f #f) 40))) (define char-downcase-offsets (list (cons (%make-iset 8579 8579 #f (%make-iset 880 1326 242279574765202296849774602627805799045871610517108506109792008705054922007599484790559595901393133839376706845824795712315560179007557 (%make-iset 256 590 46620119969431228644100221389637723687450896370752335113822859777025626952209321538335201863828591957 #f #f) (%make-iset 7680 7934 38597363079105398474523661669075944487877262569105979164236916827797859947861 #f #f)) (%make-iset 42560 42650 1650586712898282464877040981 (%make-iset 11360 11506 92689287070683147437146300401934387177065089 #f #f) (%make-iset 42786 42934 446043941378096758217146850907508628813845845 #f #f))) 1) (cons (%make-iset 65313 65338 #f (%make-iset 913 1071 730750818495310275641373184626454206112216252415 (%make-iset 65 222 363948161469878586209393690477933956061783392255 #f #f) #f) (%make-iset 71840 71871 #f #f #f)) 32) (cons (%make-iset 11264 11310 #f (%make-iset 1329 1366 #f #f #f) #f) 48) (cons (%make-iset 43888 43967 #f #f #f) -38864) (cons (%make-iset 66736 66771 #f (%make-iset 66560 66599 #f #f #f) #f) 40) (cons (%make-iset 7944 8169 80881074129870197631446573842106752244061147962181828368965142774015 (%make-iset 5112 5117 #f #f #f) #f) -8) (cons (%make-iset 68736 68786 #f (%make-iset 908 908 #f #f #f) #f) 64) (cons (%make-iset 4256 4301 36009005809663 #f #f) 7264) (cons (%make-iset 125184 125217 #f #f #f) 34) (cons (%make-iset 9398 9423 #f #f #f) 26)))
(define char-downcase-map (quote #(181 956 376 255 383 115 385 595 390 596 393 598 394 599 398 477 399 601 400 603 403 608 404 611 406 617 407 616 412 623 413 626 415 629 422 640 425 643 430 648 433 650 434 651 439 658 452 454 455 457 458 460 497 499 502 405 503 447 544 414 570 11365 573 410 574 11366 579 384 580 649 581 652 837 953 902 940 904 941 905 942 906 943 908 972 910 973 911 974 975 983 976 946 977 952 981 966 982 960 1008 954 1009 961 1012 952 1013 949 1017 1010 1021 891 1022 892 1023 893 1024 1104 1025 1105 1026 1106 1027 1107 1028 1108 1029 1109 1030 1110 1031 1111 1032 1112 1033 1113 1034 1114 1035 1115 1036 1116 1037 1117 1038 1118 1039 1119 1216 1231 7835 7777 8122 8048 8123 8049 8126 953 8136 8050 8137 8051 8138 8052 8139 8053 8154 8054 8155 8055 8170 8058 8171 8059 8172 8165 8184 8056 8185 8057 8186 8060 8187 8061 8486 969 8490 107 8491 229 8498 8526 8544 8560 8545 8561 8546 8562 8547 8563 8548 8564 8549 8565 8550 8566 8551 8567 8552 8568 8553 8569 8554 8570 8555 8571 8556 8572 8557 8573 8558 8574 8559 8575 11362 619 11363 7549 11364 637 11373 593 11374 625 11375 592 11376 594 11390 575 11391 576 42877 7545 42893 613 42922 614))) (define char-downcase-map (quote #(181 956 376 255 383 115 385 595 390 596 393 598 394 599 398 477 399 601 400 603 403 608 404 611 406 617 407 616 412 623 413 626 415 629 422 640 425 643 430 648 433 650 434 651 439 658 452 454 453 454 455 457 456 457 458 460 459 460 497 499 498 499 502 405 503 447 544 414 570 11365 573 410 574 11366 579 384 580 649 581 652 837 953 895 1011 902 940 904 941 905 942 906 943 910 973 911 974 975 983 976 946 977 952 981 966 982 960 1008 954 1009 961 1012 952 1013 949 1017 1010 1021 891 1022 892 1023 893 1024 1104 1025 1105 1026 1106 1027 1107 1028 1108 1029 1109 1030 1110 1031 1111 1032 1112 1033 1113 1034 1114 1035 1115 1036 1116 1037 1117 1038 1118 1039 1119 1216 1231 7296 1074 7297 1076 7298 1086 7299 1089 7300 1090 7301 1090 7302 1098 7303 1123 7304 42571 7835 7777 7838 223 8072 8064 8073 8065 8074 8066 8075 8067 8076 8068 8077 8069 8078 8070 8079 8071 8088 8080 8089 8081 8090 8082 8091 8083 8092 8084 8093 8085 8094 8086 8095 8087 8104 8096 8105 8097 8106 8098 8107 8099 8108 8100 8109 8101 8110 8102 8111 8103 8122 8048 8123 8049 8124 8115 8126 953 8136 8050 8137 8051 8138 8052 8139 8053 8140 8131 8154 8054 8155 8055 8170 8058 8171 8059 8172 8165 8184 8056 8185 8057 8186 8060 8187 8061 8188 8179 8486 969 8490 107 8491 229 8498 8526 8544 8560 8545 8561 8546 8562 8547 8563 8548 8564 8549 8565 8550 8566 8551 8567 8552 8568 8553 8569 8554 8570 8555 8571 8556 8572 8557 8573 8558 8574 8559 8575 11362 619 11363 7549 11364 637 11373 593 11374 625 11375 592 11376 594 11390 575 11391 576 42877 7545 42893 613 42922 614 42923 604 42924 609 42925 620 42926 618 42928 670 42929 647 42930 669 42931 43859)))
(define char-upcase-map (quote #(107 8490 115 383 229 8491 255 376 384 579 405 502 410 573 414 544 447 503 454 452 457 455 460 458 477 398 499 497 575 11390 576 11391 592 11375 593 11373 594 11376 595 385 596 390 598 393 599 394 601 399 603 400 608 403 611 404 613 42893 614 42922 616 407 617 406 619 11362 623 412 625 11374 626 413 629 415 637 11364 640 422 643 425 648 430 649 580 650 433 651 434 652 581 658 439 891 1021 892 1022 893 1023 940 902 941 904 942 905 943 906 946 976 949 1013 1010 1017 952 1012 952 977 953 837 953 8126 954 1008 956 181 960 982 961 1009 966 981 969 8486 972 908 973 910 974 911 983 975 1104 1024 1105 1025 1106 1026 1107 1027 1108 1028 1109 1029 1110 1030 1111 1031 1112 1032 1113 1033 1114 1034 1115 1035 1116 1036 1117 1037 1118 1038 1119 1039 1231 1216 7545 42877 7549 11363 7777 7835 8048 8122 8049 8123 8050 8136 8051 8137 8052 8138 8053 8139 8054 8154 8055 8155 8056 8184 8057 8185 8058 8170 8059 8171 8060 8186 8061 8187 8165 8172 8526 8498 8560 8544 8561 8545 8562 8546 8563 8547 8564 8548 8565 8549 8566 8550 8567 8551 8568 8552 8569 8553 8570 8554 8571 8555 8572 8556 8573 8557 8574 8558 8575 8559 11365 570 11366 574))) (define char-upcase-map (quote #(107 8490 115 383 223 7838 229 8491 255 376 384 579 405 502 410 573 414 544 447 503 454 452 456 455 457 455 460 458 477 398 499 497 575 11390 576 11391 592 11375 593 11373 594 11376 595 385 596 390 598 393 599 394 601 399 603 400 604 42923 608 403 609 42924 611 404 613 42893 614 42922 616 407 617 406 618 42926 619 11362 620 42925 623 412 625 11374 626 413 629 415 637 11364 640 422 643 425 647 42929 648 430 649 580 650 433 651 434 652 581 658 439 669 42930 670 42928 891 1021 892 1022 893 1023 940 902 941 904 942 905 943 906 946 976 949 1013 952 977 953 837 954 1008 956 181 960 982 961 1009 966 981 969 8486 973 910 974 911 983 975 1010 1017 1011 895 1074 7296 1076 7297 1086 7298 1089 7299 1090 7300 1098 7302 1104 1024 1105 1025 1106 1026 1107 1027 1108 1028 1109 1029 1110 1030 1111 1031 1112 1032 1113 1033 1114 1034 1115 1035 1116 1036 1117 1037 1118 1038 1119 1039 1123 7303 1231 1216 7545 42877 7549 11363 7777 7835 8048 8122 8049 8123 8050 8136 8051 8137 8052 8138 8053 8139 8054 8154 8055 8155 8056 8184 8057 8185 8058 8170 8059 8171 8060 8186 8061 8187 8064 8072 8065 8073 8066 8074 8067 8075 8068 8076 8069 8077 8070 8078 8071 8079 8080 8088 8081 8089 8082 8090 8083 8091 8084 8092 8085 8093 8086 8094 8087 8095 8096 8104 8097 8105 8098 8106 8099 8107 8100 8108 8101 8109 8102 8110 8103 8111 8115 8124 8131 8140 8165 8172 8179 8188 8526 8498 8560 8544 8561 8545 8562 8546 8563 8547 8564 8548 8565 8549 8566 8550 8567 8551 8568 8552 8569 8553 8570 8554 8571 8555 8572 8556 8573 8557 8574 8558 8575 8559 11365 570 11366 574 42571 7304 43859 42931)))

View file

@ -10,7 +10,7 @@
(let lp ((ls char-downcase-offsets)) (let lp ((ls char-downcase-offsets))
(cond (cond
((null? ls) ((null? ls)
(let lp ((lo 0) (hi (vector-length char-downcase-map))) (let lp ((lo 0) (hi (- (vector-length char-downcase-map) 2)))
(if (> lo hi) (if (> lo hi)
ch ch
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2))) (let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
@ -20,8 +20,6 @@
(integer->char (vector-ref char-downcase-map (+ mid 1)))) (integer->char (vector-ref char-downcase-map (+ mid 1))))
((< n m) ((< n m)
(lp lo (- mid 2))) (lp lo (- mid 2)))
((= (+ mid 2) hi)
ch)
(else (else
(lp (+ mid 2) hi))))))) (lp (+ mid 2) hi)))))))
((iset-contains? (caar ls) n) ((iset-contains? (caar ls) n)
@ -35,7 +33,7 @@
(let lp ((ls char-downcase-offsets)) (let lp ((ls char-downcase-offsets))
(cond (cond
((null? ls) ((null? ls)
(let lp ((lo 0) (hi (vector-length char-upcase-map))) (let lp ((lo 0) (hi (- (vector-length char-upcase-map) 2)))
(if (> lo hi) (if (> lo hi)
ch ch
(let* ((mid (+ lo (* (quotient (- hi lo) 4) 2))) (let* ((mid (+ lo (* (quotient (- hi lo) 4) 2)))
@ -45,8 +43,6 @@
(integer->char (vector-ref char-upcase-map (+ mid 1)))) (integer->char (vector-ref char-upcase-map (+ mid 1))))
((< n m) ((< n m)
(lp lo (- mid 2))) (lp lo (- mid 2)))
((= (+ mid 2) hi)
ch)
(else (else
(lp (+ mid 2) hi))))))) (lp (+ mid 2) hi)))))))
((iset-contains? (caar ls) (- n (cdar ls))) ((iset-contains? (caar ls) (- n (cdar ls)))

View file

@ -1,5 +1,5 @@
;; Unconditoinal non-1-to-1 case mappings derived from Unicode data ;; Unconditional non-1-to-1 case mappings derived from Unicode data
;; file SpecialCasing.txt. ;; file SpecialCasing.txt.
(define special-cases (define special-cases

10
lib/srfi/129.sld Normal file
View file

@ -0,0 +1,10 @@
;; Adaptation of John Cowan's reference impl for chibi, using the
;; our own char-set:title-case.
(define-library (srfi 129)
(import (scheme base) (scheme char)
(srfi 1)
(chibi char-set) (chibi char-set full) (chibi string))
(export char-title-case? char-titlecase string-titlecase)
(include "129/titlemaps.scm" "129/titlecase.scm"))

57
lib/srfi/129/test.sld Normal file
View file

@ -0,0 +1,57 @@
(define-library (srfi 129 test)
(export run-tests)
(import (scheme base) (scheme char) (srfi 129) (chibi test))
(begin
(define Floo "\xFB02;oo")
(define Floo-bar "\xFB02;oo bar")
(define Baffle "Ba\xFB04;e")
(define LJUBLJANA "\x01C7;ub\x01C7;ana")
(define Ljubljana "\x01C8;ub\x01C9;ana")
(define ljubljana "\x01C9;ub\x01C9;ana")
(define (run-tests)
(test-begin "titlecase")
(test-group "titlecase/predicate"
(test-assert (char-title-case? #\x01C5))
(test-assert (char-title-case? #\x1FFC))
(test-assert (char-upper-case? #\x01C7))
(test-assert (char-title-case? #\x01C8))
(test-assert (char-lower-case? #\x01C9))
(test-assert (not (char-title-case? #\Z)))
(test-assert (not (char-title-case? #\z))))
(test-group "titlecase/char"
(test #\x01C5 (char-titlecase #\x01C4))
(test #\x01C5 (char-titlecase #\x01C6))
(test #\x1F88 (char-titlecase #\x1F80))
(test #\x01C7 (char-upcase #\x01C7))
(test #\x01C7 (char-upcase #\x01C8))
(test #\x01C7 (char-upcase #\x01C9))
(test #\x01C8 (char-titlecase #\x01C7))
(test #\x01C8 (char-titlecase #\x01C8))
(test #\x01C8 (char-titlecase #\x01C9))
(test #\x01C9 (char-downcase #\x01C7))
(test #\x01C9 (char-downcase #\x01C8))
(test #\x01C9 (char-downcase #\x01C9))
(test #\Z (char-titlecase #\Z))
(test #\Z (char-titlecase #\z)))
(test-group "titlecase/string"
(test "\x01C5" (string-titlecase "\x01C5"))
(test "\x01C5" (string-titlecase "\x01C4"))
(test "Ss" (string-titlecase "\x00DF"))
(test "Xi\x0307" (string-titlecase "x\x0130"))
(test "\x1F88" (string-titlecase "\x1F80"))
(test "\x1F88" (string-titlecase "\x1F88"))
(test "Bar Baz" (string-titlecase "bAr baZ"))
(test "Floo" (string-titlecase "floo"))
(test "Floo" (string-titlecase "FLOO"))
(test "Floo" (string-titlecase Floo))
(test "Floo Bar" (string-titlecase "floo bar"))
(test "Floo Bar" (string-titlecase "FLOO BAR"))
(test "Floo Bar" (string-titlecase Floo-bar))
(test Baffle (string-titlecase Baffle))
(test Ljubljana (string-titlecase LJUBLJANA))
(test Ljubljana (string-titlecase Ljubljana))
(test Ljubljana (string-titlecase ljubljana)))
(test-end))))

View file

@ -0,0 +1,39 @@
;;> Returns #t if argument is a titlecase character, #f if not
(define (char-title-case? ch)
(char-set-contains? char-set:title-case ch))
;;> Returns the single-character titlecase mapping of argument
(define (char-titlecase ch)
(cond ((assv ch title-single-map) => cadr)
(else (char-upcase ch))))
;; Returns #t if a character is caseless, otherwise #f
(define (char-caseless? ch)
(not (or (char-lower-case? ch) (char-upper-case? ch) (char-title-case? ch))))
;;> Returns the string titlecase mapping of argument
(define (string-titlecase str)
(let ((end (string-cursor-end str)))
(let lp ((n (string-cursor-start str))
(prev-caseless? #t)
(result '()))
(if (string-cursor>=? str n end)
(list->string (reverse result))
(let ((ch (string-cursor-ref str n))
(n2 (string-cursor-next str n)))
(if prev-caseless?
;; ch must be titlecased
(let ((multi-title (assv ch title-multiple-map)))
(if multi-title
;; ch has multiple- or single-character titlecase mapping
(lp n2 #f (append-reverse (cdr multi-title) result))
;; ch has single-character uppercase mapping
(lp n2 (char-caseless? ch) (cons (char-upcase ch) result))))
;; ch must be lowercased
(let ((multi-downcase (assv ch lower-multiple-map)))
(if multi-downcase
;; ch has multiple-character lowercase mapping
(lp n2 #f (append-reverse (cdr multi-downcase) result))
;; ch has single-character lowercase mapping
(lp n2 (char-caseless? ch) (cons (char-downcase ch) result))))))))))

View file

@ -0,0 +1,70 @@
(define title-single-map
'((#\x01C4 #\x01C5) ; LATIN CAPITAL LETTER DZ WITH CARON
(#\x01C6 #\x01C5) ; LATIN SMALL LETTER DZ WITH CARON
(#\x01C7 #\x01C8) ; LATIN CAPITAL LETTER LJ
(#\x01C8 #\x01C8) ; LATIN CAPITAL LETTER L WITH SMALL LETTER J
(#\x01C9 #\x01C8) ; LATIN SMALL LETTER LJ
(#\x01CA #\x01CB) ; LATIN CAPITAL LETTER NJ
(#\x01CC #\x01CB) ; LATIN SMALL LETTER NJ
(#\x01F1 #\x01F2) ; LATIN CAPITAL LETTER DZ
(#\x01F3 #\x01F2) ; LATIN SMALL LETTER DZ
))
;; Alist mapping characters to their multiple-letter titlecase equivalents
(define title-multiple-map
(append
title-single-map
'((#\x00DF #\x0053 #\x0073) ; LATIN SMALL LETTER SHARP S
(#\xFB00 #\x0046 #\x0066) ; LATIN SMALL LIGATURE FF
(#\xFB01 #\x0046 #\x0069) ; LATIN SMALL LIGATURE FI
(#\xFB02 #\x0046 #\x006C) ; LATIN SMALL LIGATURE FL
(#\xFB03 #\x0046 #\x0066 #\x0069) ; LATIN SMALL LIGATURE FFI
(#\xFB04 #\x0046 #\x0066 #\x006C) ; LATIN SMALL LIGATURE FFL
(#\xFB05 #\x0053 #\x0074) ; LATIN SMALL LIGATURE LONG S T
(#\xFB06 #\x0053 #\x0074) ; LATIN SMALL LIGATURE ST
(#\x0587 #\x0535 #\x0582) ; ARMENIAN SMALL LIGATURE ECH YIWN
(#\xFB13 #\x0544 #\x0576) ; ARMENIAN SMALL LIGATURE MEN NOW
(#\xFB14 #\x0544 #\x0565) ; ARMENIAN SMALL LIGATURE MEN ECH
(#\xFB15 #\x0544 #\x056B) ; ARMENIAN SMALL LIGATURE MEN INI
(#\xFB16 #\x054E #\x0576) ; ARMENIAN SMALL LIGATURE VEW NOW
(#\xFB17 #\x0544 #\x056D) ; ARMENIAN SMALL LIGATURE MEN XEH
(#\x0149 #\x02BC #\x004E) ; LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
(#\x0390 #\x0399 #\x0308 #\x0301) ; GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
(#\x03B0 #\x03A5 #\x0308 #\x0301) ; GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
(#\x01F0 #\x004A #\x030C) ; LATIN SMALL LETTER J WITH CARON
(#\x1E96 #\x0048 #\x0331) ; LATIN SMALL LETTER H WITH LINE BELOW
(#\x1E97 #\x0054 #\x0308) ; LATIN SMALL LETTER T WITH DIAERESIS
(#\x1E98 #\x0057 #\x030A) ; LATIN SMALL LETTER W WITH RING ABOVE
(#\x1E99 #\x0059 #\x030A) ; LATIN SMALL LETTER Y WITH RING ABOVE
(#\x1E9A #\x0041 #\x02BE) ; LATIN SMALL LETTER A WITH RIGHT HALF RING
(#\x1F50 #\x03A5 #\x0313) ; GREEK SMALL LETTER UPSILON WITH PSILI
(#\x1F52 #\x03A5 #\x0313 #\x0300) ; GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
(#\x1F54 #\x03A5 #\x0313 #\x0301) ; GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
(#\x1F56 #\x03A5 #\x0313 #\x0342) ; GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
(#\x1FB6 #\x0391 #\x0342) ; GREEK SMALL LETTER ALPHA WITH PERISPOMENI
(#\x1FC6 #\x0397 #\x0342) ; GREEK SMALL LETTER ETA WITH PERISPOMENI
(#\x1FD2 #\x0399 #\x0308 #\x0300) ; GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
(#\x1FD3 #\x0399 #\x0308 #\x0301) ; GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
(#\x1FD6 #\x0399 #\x0342) ; GREEK SMALL LETTER IOTA WITH PERISPOMENI
(#\x1FD7 #\x0399 #\x0308 #\x0342) ; GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
(#\x1FE2 #\x03A5 #\x0308 #\x0300) ; GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
(#\x1FE3 #\x03A5 #\x0308 #\x0301) ; GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
(#\x1FE4 #\x03A1 #\x0313) ; GREEK SMALL LETTER RHO WITH PSILI
(#\x1FE6 #\x03A5 #\x0342) ; GREEK SMALL LETTER UPSILON WITH PERISPOMENI
(#\x1FE7 #\x03A5 #\x0308 #\x0342) ; GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
(#\x1FF6 #\x03A9 #\x0342) ; GREEK SMALL LETTER OMEGA WITH PERISPOMENI
(#\x1FB2 #\x1FBA #\x0345) ; GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
(#\x1FB4 #\x0386 #\x0345) ; GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
(#\x1FC2 #\x1FCA #\x0345) ; GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
(#\x1FC4 #\x0389 #\x0345) ; GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
(#\x1FF2 #\x1FFA #\x0345) ; GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
(#\x1FF4 #\x038F #\x0345) ; GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
(#\x1FB7 #\x0391 #\x0342 #\x0345) ; GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
(#\x1FC7 #\x0397 #\x0342 #\x0345) ; GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
(#\x1FF7 #\x03A9 #\x0342 #\x0345) ; GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
)))
;; Alist mapping characters to their multiple-character lowercase equivalents
(define lower-multiple-map
'((#\x0130 #\x0069 #\x0307))) ; LATIN CAPITAL LETTER I WITH DOT ABOVE

View file

@ -15,15 +15,15 @@
;; "char-downcase-offsets" ;; "char-downcase-offsets"
;; -o <output-file> - the output file, defaults to stdout ;; -o <output-file> - the output file, defaults to stdout
(import (chibi) (srfi 1) (srfi 69) (srfi 95) (chibi io) (chibi string) (import (chibi) (srfi 1) (srfi 69) (srfi 95) (chibi char-set full)
(chibi iset) (chibi iset optimize)) (chibi io) (chibi iset) (chibi iset optimize) (chibi string))
(define (warn . args) (define (warn . args)
(let ((err (current-error-port))) (let ((err (current-error-port)))
(for-each (lambda (x) (display x err)) args) (for-each (lambda (x) (display x err)) args)
(newline err))) (newline err)))
(define (write-offsets offset-map out min-count max-char-sets name) (define (write-offsets offset-map extras out min-count max-char-sets name)
(let lp ((ls (sort (hash-table->alist offset-map) (let lp ((ls (sort (hash-table->alist offset-map)
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b)))))) (lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
(i 0) (i 0)
@ -38,16 +38,20 @@
,(caar ls)) ,(caar ls))
res))) res)))
(else (else
(write `(define ,(string->symbol name) (list ,@(reverse res))) out) (write `(define ,(string->symbol name)
(list ,@(reverse res)))
out)
(newline out) (newline out)
(newline out) (newline out)
(let ((pairs (let ((pairs
(sort (sort
(append
extras
(append-map (append-map
(lambda (x) (lambda (x)
(map (lambda (y) (list y (+ y (car x)))) (map (lambda (y) (list y (+ y (car x))))
(iset->list (cdr x)))) (iset->list (cdr x))))
ls) ls))
(lambda (a b) (< (car a) (car b)))))) (lambda (a b) (< (car a) (car b))))))
(write `(define char-downcase-map (write `(define char-downcase-map
',(list->vector (append-map (lambda (x) x) pairs))) ',(list->vector (append-map (lambda (x) x) pairs)))
@ -57,20 +61,23 @@
(write `(define char-upcase-map (write `(define char-upcase-map
',(list->vector ',(list->vector
(append-map (lambda (x) (list (cadr x) (car x))) (append-map (lambda (x) (list (cadr x) (car x)))
(delete-duplicates
(sort pairs (sort pairs
(lambda (a b) (< (cadr a) (cadr b))))))) (lambda (a b) (< (cadr a) (cadr b))))
(lambda (a b) (eqv? (cadr a) (cadr b)))))))
out) out)
(newline out)))))) (newline out))))))
(define (extract-case-folding in out min-count max-char-sets name) (define (extract-case-folding in out min-count max-char-sets name)
(define (string-trim-comment str comment-ch) (define (string-trim-comment str comment-ch)
(car (string-split str comment-ch 2))) (car (string-split str comment-ch 2)))
(let ((offset-map (make-hash-table eq?))) (let ((offset-map (make-hash-table eq?))
(extras '()))
(let lp () (let lp ()
(let ((line (read-line in))) (let ((line (read-line in)))
(cond (cond
((eof-object? line) ((eof-object? line)
(write-offsets offset-map out min-count max-char-sets name)) (write-offsets offset-map extras out min-count max-char-sets name))
((or (equal? line "") (eqv? #\# (string-ref line 0))) ((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp)) (lp))
(else (else
@ -85,15 +92,20 @@
(cond (cond
((not upper) ((not upper)
(warn "invalid upper char in CaseFolding.txt: " line)) (warn "invalid upper char in CaseFolding.txt: " line))
((eqv? 'C status) ((memv status '(C S))
(let ((lower (string->number (car (cddr ls)) 16))) (let ((lower (string->number (car (cddr ls)) 16)))
(if (not lower) ;; don't store titlecase mappings
(warn "invalid lower char in CaseFolding.txt: " line) (cond
((not lower)
(warn "invalid lower char in CaseFolding.txt: " line))
((iset-contains? char-set:title-case upper)
(set! extras (cons (list upper lower) extras)))
(else
(hash-table-update! (hash-table-update!
offset-map offset-map
(- lower upper) (- lower upper)
(lambda (is) (iset-adjoin! is upper)) (lambda (is) (iset-adjoin! is upper))
(lambda () (make-iset)))))))))) (lambda () (make-iset)))))))))))
(lp)))))))) (lp))))))))
(let ((args (command-line))) (let ((args (command-line)))

View file

@ -106,13 +106,13 @@
((string-contains (car ls) "..") ((string-contains (car ls) "..")
=> (lambda (i) => (lambda (i)
(let* ((str (string-trim (car ls))) (let* ((str (string-trim (car ls)))
(start (string->number (substring str 0 i) 16)) (start (string->number (substring-cursor str (string-cursor-start str) i) 16))
(end (string->number (substring str (+ i 2)) 16))) (end (string->number (substring-cursor str (string-cursor-forward str i 2)) 16)))
(if (and start end (<= 0 start end #x110000)) (if (and start end (<= 0 start end #x110000))
(lp (cons `(ucs-range->char-set ,start ,(+ end 1)) (lp (cons `(ucs-range->char-set ,start ,(+ end 1))
ranges)) ranges))
(error "invalid char range: " line))))) (error "invalid char range: " line)))))
((string->number (cadr ls) 16) ((string->number (car ls) 16)
=> (lambda (n) => (lambda (n)
(lp (cons `(char-set ,(integer->char n)) ranges)))) (lp (cons `(char-set ,(integer->char n)) ranges))))
(else (else
@ -197,7 +197,7 @@
(lp (cddr ls) data (cadr ls) out)) (lp (cddr ls) data (cadr ls) out))
((member (car ls) '("-o" "--output")) ((member (car ls) '("-o" "--output"))
(lp (cddr ls) data derived (open-output-file (cadr ls)))) (lp (cddr ls) data derived (open-output-file (cadr ls))))
((member (car ls) '("f" "--default")) ((member (car ls) '("-f" "--default"))
(lp (append default-char-sets (cdr ls)) data derived out)) (lp (append default-char-sets (cdr ls)) data derived out))
(else (else
(error "unknown option: " (car ls))))) (error "unknown option: " (car ls)))))

View file

@ -37,12 +37,14 @@
(string-split (car ls) #\.))))) (string-split (car ls) #\.)))))
(for-each (for-each
(lambda (exp) (lambda (exp)
(display ";; ") (write exp) (newline)
(let ((value (module-ref mod exp))) (let ((value (module-ref mod exp)))
(cond (cond
((char-set? value) ((char-set? value)
(display ";; ") (write exp) (newline)
(write `(optimize ,exp) (current-error-port)) (newline (current-error-port)) (write `(optimize ,exp) (current-error-port)) (newline (current-error-port))
(if (not (equal? (iset->list value) ;; extremely slow conversion to lists as a sanity check
(display " verifying cursors\n" (current-error-port))
'(if (not (equal? (iset->list value)
(do ((cur (iset-cursor value) (do ((cur (iset-cursor value)
(iset-cursor-next value cur)) (iset-cursor-next value cur))
(res '() (cons (iset-ref value cur) res))) (res '() (cons (iset-ref value cur) res)))
@ -56,6 +58,7 @@
(iset-opt (iset-optimize iset1)) (iset-opt (iset-optimize iset1))
(_ (display " balancing\n" (current-error-port))) (_ (display " balancing\n" (current-error-port)))
(iset2 (iset-balance iset-opt))) (iset2 (iset-balance iset-opt)))
(display " comparing\n" (current-error-port))
(if (and (not ascii?) (not (iset= iset1 iset2))) (if (and (not ascii?) (not (iset= iset1 iset2)))
(begin (begin
(display " different!\n" (current-error-port)) (display " different!\n" (current-error-port))
@ -78,5 +81,6 @@
(write `(define ,exp (write `(define ,exp
(immutable-char-set ,(iset->code iset2)))) (immutable-char-set ,(iset->code iset2))))
(newline) (newline)
(newline)))))) (newline)
(display " done\n" (current-error-port)))))))
(module-exports mod))))))) (module-exports mod)))))))