mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-18 21:29:19 +02:00
adding ascii and full unicode char-set libraries based on isets
This commit is contained in:
parent
14a46feec9
commit
c903a73921
15 changed files with 572 additions and 25 deletions
37
Makefile
37
Makefile
|
@ -120,15 +120,25 @@ doc: doc/chibi.html doc-libs
|
|||
%.html: %.scrbl $(CHIBI_DOC_DEPENDENCIES)
|
||||
$(CHIBI_DOC) $< > $@
|
||||
|
||||
clean: clean-libs
|
||||
-$(RM) *.o *.i *.s *.8 tests/basic/*.out tests/basic/*.err
|
||||
########################################################################
|
||||
# Dist builds - rules to build generated files included in distribution
|
||||
# (currently just char-sets since it takes a long time and we don't want
|
||||
# to bundle the raw Unicode files or require a net connection to build).
|
||||
|
||||
cleaner: clean
|
||||
-$(RM) chibi-scheme$(EXE) chibi-scheme-static$(EXE) chibi-scheme-ulimit$(EXE) \
|
||||
libchibi-scheme$(SO) *.a include/chibi/install.h \
|
||||
$(shell $(FIND) lib -name \*.o)
|
||||
data/%.txt:
|
||||
curl --silent http://www.unicode.org/Public/UNIDATA/$*.txt > $@
|
||||
|
||||
dist-clean: dist-clean-libs cleaner
|
||||
build-lib/chibi/char-set/derived.scm: data/UnicodeData.txt data/DerivedCoreProperties.txt chibi-scheme$(EXE)
|
||||
$(CHIBI) tools/extract-unicode-props.scm --default > $@
|
||||
|
||||
lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
||||
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --ascii chibi.char-set.compute > $@
|
||||
|
||||
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
|
||||
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
|
||||
|
||||
########################################################################
|
||||
# Tests
|
||||
|
||||
checkdefs:
|
||||
@for d in $(D); do \
|
||||
|
@ -201,6 +211,19 @@ test-libs: chibi-scheme$(EXE)
|
|||
test: chibi-scheme$(EXE)
|
||||
$(CHIBI) -xscheme tests/r5rs-tests.scm
|
||||
|
||||
########################################################################
|
||||
# Packaging
|
||||
|
||||
clean: clean-libs
|
||||
-$(RM) *.o *.i *.s *.8 tests/basic/*.out tests/basic/*.err
|
||||
|
||||
cleaner: clean
|
||||
-$(RM) chibi-scheme$(EXE) chibi-scheme-static$(EXE) chibi-scheme-ulimit$(EXE) \
|
||||
libchibi-scheme$(SO) *.a include/chibi/install.h \
|
||||
$(shell $(FIND) lib -name \*.o)
|
||||
|
||||
dist-clean: dist-clean-libs cleaner
|
||||
|
||||
install: all
|
||||
$(MKDIR) $(DESTDIR)$(BINDIR)
|
||||
$(INSTALL) chibi-scheme$(EXE) $(DESTDIR)$(BINDIR)/
|
||||
|
|
20
build-lib/chibi/char-set/compute.scm
Normal file
20
build-lib/chibi/char-set/compute.scm
Normal file
|
@ -0,0 +1,20 @@
|
|||
|
||||
(define char-set:letter+digit
|
||||
(immutable-char-set (char-set-union char-set:letter char-set:digit)))
|
||||
|
||||
(define char-set:hex-digit
|
||||
(immutable-char-set
|
||||
(char-set-union (string->char-set "0123456789abcdefABCDEF"))))
|
||||
|
||||
(define char-set:iso-control
|
||||
(immutable-char-set
|
||||
(char-set-union (ucs-range->char-set 0 #x20)
|
||||
(ucs-range->char-set #x7F #xA0))))
|
||||
|
||||
(define char-set:graphic
|
||||
(immutable-char-set
|
||||
(char-set-union
|
||||
char-set:letter char-set:digit char-set:punctuation char-set:symbol)))
|
||||
|
||||
(define char-set:printing
|
||||
(immutable-char-set (char-set-union char-set:whitespace char-set:graphic)))
|
22
build-lib/chibi/char-set/compute.sld
Normal file
22
build-lib/chibi/char-set/compute.sld
Normal file
|
@ -0,0 +1,22 @@
|
|||
|
||||
;; Don't import this - it's temporarily used to compute optimized
|
||||
;; char-set representations.
|
||||
|
||||
(define-library (chibi char-set compute)
|
||||
(import (scheme) (chibi iset) (chibi char-set))
|
||||
(include "derived.scm" "compute.scm")
|
||||
(export
|
||||
char-set:lower-case
|
||||
char-set:upper-case
|
||||
char-set:title-case
|
||||
char-set:letter
|
||||
char-set:punctuation
|
||||
char-set:symbol
|
||||
char-set:blank
|
||||
char-set:whitespace
|
||||
char-set:digit
|
||||
char-set:letter+digit
|
||||
char-set:hex-digit
|
||||
char-set:iso-control
|
||||
char-set:graphic
|
||||
char-set:printing))
|
2
data/.hgignore
Normal file
2
data/.hgignore
Normal file
|
@ -0,0 +1,2 @@
|
|||
syntax: glob
|
||||
*.txt
|
12
lib/chibi/char-set.sld
Normal file
12
lib/chibi/char-set.sld
Normal file
|
@ -0,0 +1,12 @@
|
|||
|
||||
(define-library (chibi char-set)
|
||||
(import (scheme) (chibi char-set base) (chibi char-set extras))
|
||||
(export
|
||||
Char-Set char-set? char-set-contains?
|
||||
char-set ucs-range->char-set char-set-copy char-set-size
|
||||
list->char-set char-set->list string->char-set char-set->string
|
||||
char-set-adjoin! char-set-adjoin char-set-union char-set-union!
|
||||
char-set-intersection char-set-intersection!
|
||||
char-set-difference char-set-difference!
|
||||
immutable-char-set char-set-complement
|
||||
char-set:empty char-set:ascii char-set:full))
|
42
lib/chibi/char-set/ascii.scm
Normal file
42
lib/chibi/char-set/ascii.scm
Normal file
|
@ -0,0 +1,42 @@
|
|||
;; char-set:lower-case
|
||||
(define char-set:lower-case (immutable-char-set (%make-iset 97 122 #f #f #f)))
|
||||
|
||||
;; char-set:upper-case
|
||||
(define char-set:upper-case (immutable-char-set (%make-iset 65 90 #f #f #f)))
|
||||
|
||||
;; char-set:title-case
|
||||
(define char-set:title-case (immutable-char-set #f))
|
||||
|
||||
;; char-set:letter
|
||||
(define char-set:letter (immutable-char-set (%make-iset 97 122 #f (%make-iset 65 90 #f #f #f) #f)))
|
||||
|
||||
;; char-set:punctuation
|
||||
(define char-set:punctuation (immutable-char-set (%make-iset 63 64 #f (%make-iset 44 47 #f (%make-iset 37 42 #f (%make-iset 33 35 #f #f #f) #f) (%make-iset 58 59 #f #f #f)) (%make-iset 123 123 #f (%make-iset 95 95 #f (%make-iset 91 93 #f #f #f) #f) (%make-iset 125 125 #f #f #f)))))
|
||||
|
||||
;; char-set:symbol
|
||||
(define char-set:symbol (immutable-char-set (%make-iset 94 94 #f (%make-iset 43 43 #f (%make-iset 36 36 #f #f #f) (%make-iset 60 62 #f #f #f)) (%make-iset 124 124 #f (%make-iset 96 96 #f #f #f) (%make-iset 126 126 #f #f #f)))))
|
||||
|
||||
;; char-set:blank
|
||||
(define char-set:blank (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 9 #f #f #f) #f)))
|
||||
|
||||
;; char-set:whitespace
|
||||
(define char-set:whitespace (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 13 #f #f #f) #f)))
|
||||
|
||||
;; char-set:digit
|
||||
(define char-set:digit (immutable-char-set (%make-iset 48 57 #f #f #f)))
|
||||
|
||||
;; char-set:letter+digit
|
||||
(define char-set:letter+digit (immutable-char-set (%make-iset 65 90 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 122 #f #f #f))))
|
||||
|
||||
;; char-set:hex-digit
|
||||
(define char-set:hex-digit (immutable-char-set (%make-iset 65 70 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 102 #f #f #f))))
|
||||
|
||||
;; char-set:iso-control
|
||||
(define char-set:iso-control (immutable-char-set (%make-iset 127 127 #f (%make-iset 0 31 #f #f #f) #f)))
|
||||
|
||||
;; char-set:graphic
|
||||
(define char-set:graphic (immutable-char-set (%make-iset 33 126 #f #f #f)))
|
||||
|
||||
;; char-set:printing
|
||||
(define char-set:printing (immutable-char-set (%make-iset 32 126 #f (%make-iset 9 13 #f #f #f) #f)))
|
||||
|
11
lib/chibi/char-set/ascii.sld
Normal file
11
lib/chibi/char-set/ascii.sld
Normal file
|
@ -0,0 +1,11 @@
|
|||
|
||||
(define-library (chibi char-set ascii)
|
||||
(import (scheme) (chibi iset base))
|
||||
(export char-set-contains?
|
||||
char-set:lower-case char-set:upper-case char-set:title-case
|
||||
char-set:letter char-set:digit char-set:letter+digit
|
||||
char-set:graphic char-set:printing char-set:whitespace
|
||||
char-set:iso-control char-set:punctuation char-set:symbol
|
||||
char-set:hex-digit char-set:blank char-set:ascii
|
||||
char-set:empty char-set:full)
|
||||
(include "ascii.scm"))
|
14
lib/chibi/char-set/base.sld
Normal file
14
lib/chibi/char-set/base.sld
Normal file
|
@ -0,0 +1,14 @@
|
|||
|
||||
(define-library (chibi char-set base)
|
||||
(import (scheme) (chibi iset base))
|
||||
(export (rename Integer-Set Char-Set)
|
||||
(rename iset? char-set?)
|
||||
immutable-char-set
|
||||
char-set-contains?)
|
||||
(begin
|
||||
(define-syntax immutable-char-set
|
||||
(sc-macro-transformer
|
||||
(lambda (expr use-env)
|
||||
(eval (cadr expr) use-env))))
|
||||
(define (char-set-contains? cset ch)
|
||||
(iset-contains? cset (char->integer ch)))))
|
47
lib/chibi/char-set/extras.scm
Normal file
47
lib/chibi/char-set/extras.scm
Normal file
|
@ -0,0 +1,47 @@
|
|||
|
||||
(define (char-set . args)
|
||||
(list->char-set args))
|
||||
|
||||
;; This is a mistake in the SRFI-14 design - end should be inclusive.
|
||||
(define (ucs-range->char-set start end)
|
||||
(make-iset start (- end 1)))
|
||||
|
||||
(define char-set-copy iset-copy)
|
||||
|
||||
(define char-set-size iset-size)
|
||||
|
||||
(define (list->char-set ls)
|
||||
(list->iset (map char->integer ls)))
|
||||
(define (char-set->list cset)
|
||||
(map integer->char (iset->list cset)))
|
||||
|
||||
(define (string->char-set str)
|
||||
(list->char-set (string->list str)))
|
||||
(define (char-set->string cset)
|
||||
(list->string (char-set->list cset)))
|
||||
|
||||
(define (char-set-adjoin! cset ch)
|
||||
(iset-adjoin! cset (char->integer ch)))
|
||||
(define (char-set-adjoin cset ch)
|
||||
(iset-adjoin cset (char->integer ch)))
|
||||
|
||||
(define char-set-union iset-union)
|
||||
(define char-set-union! iset-union!)
|
||||
(define char-set-intersection iset-intersection)
|
||||
(define char-set-intersection! iset-intersection!)
|
||||
(define char-set-difference iset-difference)
|
||||
(define char-set-difference! iset-difference!)
|
||||
|
||||
(define char-set:empty (immutable-char-set (%make-iset 0 0 0 #f #f)))
|
||||
(define char-set:ascii (immutable-char-set (%make-iset 0 #x7F #f #f #f)))
|
||||
|
||||
(cond-expand
|
||||
(full-unicode
|
||||
(define char-set:full
|
||||
(immutable-char-set
|
||||
(%make-iset 0 #xD7FF #f #f (%make-iset #xE000 #x10FFFD #f #f #f)))))
|
||||
(else
|
||||
(define char-set:full (immutable-char-set (%make-iset 0 #xFF #f #f #f)))))
|
||||
|
||||
(define (char-set-complement cset)
|
||||
(char-set-difference char-set:full cset))
|
11
lib/chibi/char-set/extras.sld
Normal file
11
lib/chibi/char-set/extras.sld
Normal file
|
@ -0,0 +1,11 @@
|
|||
|
||||
(define-library (chibi char-set extras)
|
||||
(import (scheme) (chibi iset) (chibi char-set base))
|
||||
(include "extras.scm")
|
||||
(export
|
||||
char-set ucs-range->char-set char-set-copy char-set-size
|
||||
list->char-set char-set->list string->char-set char-set->string
|
||||
char-set-adjoin! char-set-adjoin char-set-union char-set-union!
|
||||
char-set-intersection char-set-intersection!
|
||||
char-set-difference char-set-difference!
|
||||
char-set-complement char-set:empty char-set:ascii char-set:full))
|
42
lib/chibi/char-set/full.scm
Normal file
42
lib/chibi/char-set/full.scm
Normal file
File diff suppressed because one or more lines are too long
9
lib/chibi/char-set/full.sld
Normal file
9
lib/chibi/char-set/full.sld
Normal file
|
@ -0,0 +1,9 @@
|
|||
|
||||
(define-library (chibi char-set full)
|
||||
(import (scheme) (chibi iset base) (chibi char-set base))
|
||||
(export char-set:lower-case char-set:upper-case char-set:title-case
|
||||
char-set:letter char-set:digit char-set:letter+digit
|
||||
char-set:graphic char-set:printing char-set:whitespace
|
||||
char-set:iso-control char-set:punctuation char-set:symbol
|
||||
char-set:hex-digit char-set:blank)
|
||||
(include "full.scm"))
|
|
@ -3,6 +3,7 @@
|
|||
;; Optimizing Iset Representation
|
||||
|
||||
(define (iset-balance iset)
|
||||
(and iset
|
||||
(let ((nodes '()))
|
||||
(iset-for-each-node
|
||||
(lambda (is) (set! nodes (cons (iset-copy-node is) nodes)))
|
||||
|
@ -20,7 +21,7 @@
|
|||
(iset-left-set! res (reduce (reverse left)))
|
||||
(iset-right-set! res (reduce (cdr ls)))
|
||||
res)
|
||||
(lp (+ i 1) (cdr ls) (cons (car ls) left)))))))))))
|
||||
(lp (+ i 1) (cdr ls) (cons (car ls) left))))))))))))
|
||||
|
||||
(define (iset-balance! iset)
|
||||
(iset-balance iset))
|
||||
|
|
209
tools/extract-unicode-props.scm
Executable file
209
tools/extract-unicode-props.scm
Executable file
|
@ -0,0 +1,209 @@
|
|||
#!/usr/bin/env chibi-scheme
|
||||
|
||||
;; Simple tool to extract Unicode properties as character-sets.
|
||||
;;
|
||||
;; Usage:
|
||||
;; extract-unicode-props.scm Lowercase Punctuation=P Blank=Zs,0009 >out
|
||||
;;
|
||||
;; Accepts a list of char-set names with optional definitions as
|
||||
;; arguments, and writes their Scheme definitions to stdout. A
|
||||
;; char-set can be of the form:
|
||||
;;
|
||||
;; Name: equivalent to Name=Name
|
||||
;; Name=value,...:
|
||||
;;
|
||||
;; A value can be any of:
|
||||
;;
|
||||
;; Property_Name: all unicode characters with the given derived property
|
||||
;; Xx: all unicode characters with the given general category
|
||||
;; X: all unicode characters with any general category X*
|
||||
;; NNNN: a single unicode value in hex format
|
||||
;; NNNN-NNNN: an inclusive range of unicode values in hex format
|
||||
;;
|
||||
;; The char-set names generated are always lowercased, replacing _
|
||||
;; with -, for convenicence when the char-set name coincides with a
|
||||
;; Unicode property name.
|
||||
;;
|
||||
;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are
|
||||
;; in the data/ current directory, unless overridden with the --data or
|
||||
;; --derived options.
|
||||
|
||||
(import (scheme) (chibi io) (chibi strings))
|
||||
|
||||
(define (warn . args)
|
||||
(let ((err (current-error-port)))
|
||||
(for-each (lambda (x) (display x err)) args)
|
||||
(newline err)))
|
||||
|
||||
;; Parse UnicodeData.txt for characters matching a given class.
|
||||
(define (extract-char-set-category cat data)
|
||||
(define (join-to-range n ls)
|
||||
(cond
|
||||
((null? ls)
|
||||
(list n))
|
||||
((eqv? (car ls) (- n 1))
|
||||
(cons (cons (car ls) n) (cdr ls)))
|
||||
((and (pair? (car ls)) (eqv? (- n 1) (cdar ls)))
|
||||
(cons (cons (caar ls) n) (cdr ls)))
|
||||
(else
|
||||
(cons n ls))))
|
||||
(call-with-input-file data
|
||||
(lambda (in)
|
||||
(let lp ((ranges '()))
|
||||
(let ((line (read-line in)))
|
||||
(cond
|
||||
((eof-object? line)
|
||||
`(char-set-union
|
||||
,@(map (lambda (x)
|
||||
(if (pair? x)
|
||||
`(ucs-range->char-set ,(car x) ,(+ 1 (cdr x)))
|
||||
`(char-set ,(integer->char x))))
|
||||
(reverse ranges))))
|
||||
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
||||
(lp ranges))
|
||||
(else
|
||||
(let ((ls (string-split line #\; 4)))
|
||||
(cond
|
||||
((< (length ls) 3)
|
||||
(warn "invalid UnicodeData line: " line)
|
||||
(lp ranges))
|
||||
(else
|
||||
(let ((ch (string->number (car ls) 16))
|
||||
(name (cadr ls))
|
||||
(ch-cat (car (cddr ls))))
|
||||
(cond
|
||||
((or (not ch) (not (= 2 (string-length ch-cat))))
|
||||
(warn "invalid UnicodeData line: " line))
|
||||
((if (char? cat)
|
||||
(eqv? cat (string-ref ch-cat 0))
|
||||
(equal? cat ch-cat))
|
||||
(lp (join-to-range ch ranges)))
|
||||
(else
|
||||
(lp ranges))))))))))))))
|
||||
|
||||
;; Parse DerivedCoreProperties.txt for characters matching a given
|
||||
;; property.
|
||||
(define (extract-char-set-property prop derived)
|
||||
(define (string-trim-comment str comment-ch)
|
||||
(car (string-split str comment-ch 2)))
|
||||
(call-with-input-file derived
|
||||
(lambda (in)
|
||||
(let lp ((ranges '()))
|
||||
(let ((line (read-line in)))
|
||||
(cond
|
||||
((eof-object? line)
|
||||
`(char-set-union ,@(reverse ranges)))
|
||||
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
||||
(lp ranges))
|
||||
(else
|
||||
(let ((ls (string-split (string-trim-comment line #\#) #\;)))
|
||||
(cond
|
||||
((< (length ls) 2)
|
||||
(warn "invalid DerivedCoreProperties line: " line)
|
||||
(lp ranges))
|
||||
((string-ci=? prop (string-trim (cadr ls)))
|
||||
(cond
|
||||
((string-contains (car ls) "..")
|
||||
=> (lambda (i)
|
||||
(let* ((str (string-trim (car ls)))
|
||||
(start (string->number (substring str 0 i) 16))
|
||||
(end (string->number (substring str (+ i 2)) 16)))
|
||||
(if (and start end (<= 0 start end #x110000))
|
||||
(lp (cons `(ucs-range->char-set ,start ,(+ end 1))
|
||||
ranges))
|
||||
(error "invalid char range: " line)))))
|
||||
((string->number (cadr ls) 16)
|
||||
=> (lambda (n)
|
||||
(lp (cons `(char-set ,(integer->char n)) ranges))))
|
||||
(else
|
||||
(lp ranges))))
|
||||
(else
|
||||
(lp ranges)))))))))))
|
||||
|
||||
(define (extract-char-set-simple def data derived)
|
||||
(let ((ls (string-split def #\- 2)))
|
||||
(cond
|
||||
((= 2 (length ls))
|
||||
(let ((start (string->number (car ls) 16))
|
||||
(end (string->number (cadr ls) 16)))
|
||||
(if (and start end (<= start end))
|
||||
`(ucs-range->char-set ,start ,(+ end 1))
|
||||
(error "invalid character range, expected NNNN-MMMM, got: " def))))
|
||||
((string->number def 16)
|
||||
=> (lambda (start) `(char-set ,(integer->char start))))
|
||||
((and (= 1 (string-length def))
|
||||
(char-upper-case? (string-ref def 0)))
|
||||
(extract-char-set-category (string-ref def 0) data))
|
||||
((and (= 2 (string-length def))
|
||||
(char-upper-case? (string-ref def 0))
|
||||
(char-lower-case? (string-ref def 1)))
|
||||
(extract-char-set-category def data))
|
||||
(else ;; derived property
|
||||
(extract-char-set-property def derived)))))
|
||||
|
||||
(define (extract-char-set def data derived)
|
||||
(let ((defs (string-split def #\,)))
|
||||
(cond
|
||||
((= 1 (length defs))
|
||||
(extract-char-set-simple (car defs) data derived))
|
||||
(else
|
||||
`(char-set-union
|
||||
,@(map (lambda (def) (extract-char-set-simple def data derived))
|
||||
defs))))))
|
||||
|
||||
(define (process-char-set name def data derived out)
|
||||
(define (normalize-char-set-name str)
|
||||
(string-append
|
||||
"char-set:"
|
||||
(string-map (lambda (ch) (if (eqv? ch #\_) #\- (char-downcase ch))) str)))
|
||||
(display ";; " out)
|
||||
(display def out)
|
||||
(newline out)
|
||||
(write
|
||||
`(define ,(string->symbol (normalize-char-set-name name))
|
||||
(immutable-char-set
|
||||
,(extract-char-set def data derived)))
|
||||
out)
|
||||
(newline out)
|
||||
(newline out))
|
||||
|
||||
(define default-char-sets
|
||||
'("Lower-Case=Lowercase"
|
||||
"Upper-Case=Uppercase"
|
||||
"Title-Case=Lt"
|
||||
"Letter=Alphabetic"
|
||||
"Punctuation=P"
|
||||
"Symbol=S"
|
||||
"Blank=Zs,0009"
|
||||
"Whitespace=Zs,Zl,Zp,0009,000A,000B,000C,000D"
|
||||
"Digit=Nd"))
|
||||
|
||||
(define (main args)
|
||||
(let lp ((ls (cdr args))
|
||||
(data "data/UnicodeData.txt")
|
||||
(derived "data/DerivedCoreProperties.txt")
|
||||
(out (current-output-port)))
|
||||
(cond
|
||||
((and (pair? ls) (not (equal? "" (car ls)))
|
||||
(eqv? #\- (string-ref (car ls) 0)))
|
||||
(cond
|
||||
((member (car ls) '("-d" "--data"))
|
||||
(lp (cddr ls) (cadr ls) derived out))
|
||||
((member (car ls) '("-e" "--derived"))
|
||||
(lp (cddr ls) data (cadr ls) out))
|
||||
((member (car ls) '("-o" "--output"))
|
||||
(lp (cddr ls) data derived (open-output-file (cadr ls))))
|
||||
((member (car ls) '("f" "--default"))
|
||||
(lp (append default-char-sets (cdr ls)) data derived out))
|
||||
(else
|
||||
(error "unknown option: " (car ls)))))
|
||||
((pair? ls)
|
||||
(let ((ls (string-split (car ls) #\= 2)))
|
||||
(cond
|
||||
((= 1 (length ls))
|
||||
(process-char-set (car ls) (car ls) data derived out))
|
||||
(else
|
||||
(process-char-set (car ls) (cadr ls) data derived out))))
|
||||
(lp (cdr ls) data derived out))
|
||||
(else
|
||||
(close-output-port out)))))
|
82
tools/optimize-char-sets.scm
Normal file
82
tools/optimize-char-sets.scm
Normal file
|
@ -0,0 +1,82 @@
|
|||
#!/usr/bin/env chibi-scheme
|
||||
|
||||
;; Simple tool to generate libraries of optimized char-sets.
|
||||
;;
|
||||
;; Usage:
|
||||
;; optimize-char-sets.scm [--ascii] module.name > out
|
||||
;;
|
||||
;; Imports (module name) and writes optimized versions of all exported
|
||||
;; char-sets to stdout.
|
||||
|
||||
(import (scheme) (srfi 1) (srfi 69)
|
||||
(chibi io) (chibi strings) (chibi modules)
|
||||
(chibi char-set) (chibi iset) (chibi iset optimize)
|
||||
(only (meta) load-module))
|
||||
|
||||
;; Use a hash table for speedup of huge sets instead of O(n^2)
|
||||
;; srfi-1 implementation.
|
||||
(define (lset-diff ls1 ls2)
|
||||
(let ((ls2-tab (make-hash-table eq?)))
|
||||
(for-each (lambda (i) (hash-table-set! ls2-tab i #t)) ls2)
|
||||
(remove (lambda (i) (hash-table-exists? ls2-tab i)) ls1)))
|
||||
|
||||
(define (main args)
|
||||
(let lp ((ls (cdr args)) (ascii? #f))
|
||||
(cond
|
||||
((and (pair? ls) (not (equal? "" (car ls)))
|
||||
(eqv? #\- (string-ref (car ls) 0)))
|
||||
(cond
|
||||
((member (car ls) '("-a" "--ascii"))
|
||||
(lp (cdr ls) #t))
|
||||
(else (error "unknown option" (car ls)))))
|
||||
((or (null? ls) (pair? (cdr ls)))
|
||||
(error "usage: optimize-char-sets.scm [--ascii] module.name"))
|
||||
(else
|
||||
(let ((mod (load-module
|
||||
(map (lambda (x) (or (string->number x) (string->symbol x)))
|
||||
(string-split (car ls) #\.)))))
|
||||
(for-each
|
||||
(lambda (exp)
|
||||
(display ";; ") (write exp) (newline)
|
||||
(let ((value (module-ref mod exp)))
|
||||
(cond
|
||||
((char-set? value)
|
||||
(write `(optimize ,exp) (current-error-port)) (newline (current-error-port))
|
||||
(if (not (equal? (iset->list value)
|
||||
(do ((cur (iset-cursor value)
|
||||
(iset-cursor-next value cur))
|
||||
(res '() (cons (iset-ref value cur) res)))
|
||||
((end-of-iset? cur) (reverse res)))))
|
||||
(error "error in iset cursors"))
|
||||
(display " computing intersection\n" (current-error-port))
|
||||
(let* ((iset1 (if ascii?
|
||||
(iset-intersection char-set:ascii value)
|
||||
value))
|
||||
(_ (display " optimizing\n" (current-error-port)))
|
||||
(iset-opt (iset-optimize iset1))
|
||||
(_ (display " balancing\n" (current-error-port)))
|
||||
(iset2 (iset-balance iset-opt)))
|
||||
(if (and (not ascii?) (not (iset= iset1 iset2)))
|
||||
(begin
|
||||
(display " different!\n" (current-error-port))
|
||||
(let* ((ls1 (iset->list iset1))
|
||||
(ls2 (iset->list iset2))
|
||||
(diff1 (lset-diff ls1 ls2))
|
||||
(diff2 (lset-diff ls2 ls1)))
|
||||
(display " original: " (current-error-port))
|
||||
(write (length ls1) (current-error-port))
|
||||
(display " elements, missing: " (current-error-port))
|
||||
(write diff1 (current-error-port))
|
||||
(newline (current-error-port))
|
||||
(display " optimized: " (current-error-port))
|
||||
(write (length ls2) (current-error-port))
|
||||
(display " elements, missing: " (current-error-port))
|
||||
(write diff2 (current-error-port))
|
||||
(newline (current-error-port))
|
||||
(error "optimized iset is different"))))
|
||||
(display " writing\n" (current-error-port))
|
||||
(write `(define ,exp
|
||||
(immutable-char-set ,(iset->code iset2))))
|
||||
(newline)
|
||||
(newline))))))
|
||||
(module-exports mod)))))))
|
Loading…
Add table
Reference in a new issue