adding ascii and full unicode char-set libraries based on isets

This commit is contained in:
Alex Shinn 2012-06-17 19:36:17 +09:00
parent 14a46feec9
commit c903a73921
15 changed files with 572 additions and 25 deletions

View file

@ -120,15 +120,25 @@ doc: doc/chibi.html doc-libs
%.html: %.scrbl $(CHIBI_DOC_DEPENDENCIES)
$(CHIBI_DOC) $< > $@
clean: clean-libs
-$(RM) *.o *.i *.s *.8 tests/basic/*.out tests/basic/*.err
########################################################################
# Dist builds - rules to build generated files included in distribution
# (currently just char-sets since it takes a long time and we don't want
# to bundle the raw Unicode files or require a net connection to build).
cleaner: clean
-$(RM) chibi-scheme$(EXE) chibi-scheme-static$(EXE) chibi-scheme-ulimit$(EXE) \
libchibi-scheme$(SO) *.a include/chibi/install.h \
$(shell $(FIND) lib -name \*.o)
data/%.txt:
curl --silent http://www.unicode.org/Public/UNIDATA/$*.txt > $@
dist-clean: dist-clean-libs cleaner
build-lib/chibi/char-set/derived.scm: data/UnicodeData.txt data/DerivedCoreProperties.txt chibi-scheme$(EXE)
$(CHIBI) tools/extract-unicode-props.scm --default > $@
lib/chibi/char-set/ascii.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm --ascii chibi.char-set.compute > $@
lib/chibi/char-set/full.scm: build-lib/chibi/char-set/derived.scm chibi-scheme$(EXE)
$(CHIBI) -Abuild-lib tools/optimize-char-sets.scm chibi.char-set.compute > $@
########################################################################
# Tests
checkdefs:
@for d in $(D); do \
@ -201,6 +211,19 @@ test-libs: chibi-scheme$(EXE)
test: chibi-scheme$(EXE)
$(CHIBI) -xscheme tests/r5rs-tests.scm
########################################################################
# Packaging
clean: clean-libs
-$(RM) *.o *.i *.s *.8 tests/basic/*.out tests/basic/*.err
cleaner: clean
-$(RM) chibi-scheme$(EXE) chibi-scheme-static$(EXE) chibi-scheme-ulimit$(EXE) \
libchibi-scheme$(SO) *.a include/chibi/install.h \
$(shell $(FIND) lib -name \*.o)
dist-clean: dist-clean-libs cleaner
install: all
$(MKDIR) $(DESTDIR)$(BINDIR)
$(INSTALL) chibi-scheme$(EXE) $(DESTDIR)$(BINDIR)/

View file

@ -0,0 +1,20 @@
(define char-set:letter+digit
(immutable-char-set (char-set-union char-set:letter char-set:digit)))
(define char-set:hex-digit
(immutable-char-set
(char-set-union (string->char-set "0123456789abcdefABCDEF"))))
(define char-set:iso-control
(immutable-char-set
(char-set-union (ucs-range->char-set 0 #x20)
(ucs-range->char-set #x7F #xA0))))
(define char-set:graphic
(immutable-char-set
(char-set-union
char-set:letter char-set:digit char-set:punctuation char-set:symbol)))
(define char-set:printing
(immutable-char-set (char-set-union char-set:whitespace char-set:graphic)))

View file

@ -0,0 +1,22 @@
;; Don't import this - it's temporarily used to compute optimized
;; char-set representations.
(define-library (chibi char-set compute)
(import (scheme) (chibi iset) (chibi char-set))
(include "derived.scm" "compute.scm")
(export
char-set:lower-case
char-set:upper-case
char-set:title-case
char-set:letter
char-set:punctuation
char-set:symbol
char-set:blank
char-set:whitespace
char-set:digit
char-set:letter+digit
char-set:hex-digit
char-set:iso-control
char-set:graphic
char-set:printing))

2
data/.hgignore Normal file
View file

@ -0,0 +1,2 @@
syntax: glob
*.txt

12
lib/chibi/char-set.sld Normal file
View file

@ -0,0 +1,12 @@
(define-library (chibi char-set)
(import (scheme) (chibi char-set base) (chibi char-set extras))
(export
Char-Set char-set? char-set-contains?
char-set ucs-range->char-set char-set-copy char-set-size
list->char-set char-set->list string->char-set char-set->string
char-set-adjoin! char-set-adjoin char-set-union char-set-union!
char-set-intersection char-set-intersection!
char-set-difference char-set-difference!
immutable-char-set char-set-complement
char-set:empty char-set:ascii char-set:full))

View file

@ -0,0 +1,42 @@
;; char-set:lower-case
(define char-set:lower-case (immutable-char-set (%make-iset 97 122 #f #f #f)))
;; char-set:upper-case
(define char-set:upper-case (immutable-char-set (%make-iset 65 90 #f #f #f)))
;; char-set:title-case
(define char-set:title-case (immutable-char-set #f))
;; char-set:letter
(define char-set:letter (immutable-char-set (%make-iset 97 122 #f (%make-iset 65 90 #f #f #f) #f)))
;; char-set:punctuation
(define char-set:punctuation (immutable-char-set (%make-iset 63 64 #f (%make-iset 44 47 #f (%make-iset 37 42 #f (%make-iset 33 35 #f #f #f) #f) (%make-iset 58 59 #f #f #f)) (%make-iset 123 123 #f (%make-iset 95 95 #f (%make-iset 91 93 #f #f #f) #f) (%make-iset 125 125 #f #f #f)))))
;; char-set:symbol
(define char-set:symbol (immutable-char-set (%make-iset 94 94 #f (%make-iset 43 43 #f (%make-iset 36 36 #f #f #f) (%make-iset 60 62 #f #f #f)) (%make-iset 124 124 #f (%make-iset 96 96 #f #f #f) (%make-iset 126 126 #f #f #f)))))
;; char-set:blank
(define char-set:blank (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 9 #f #f #f) #f)))
;; char-set:whitespace
(define char-set:whitespace (immutable-char-set (%make-iset 32 32 #f (%make-iset 9 13 #f #f #f) #f)))
;; char-set:digit
(define char-set:digit (immutable-char-set (%make-iset 48 57 #f #f #f)))
;; char-set:letter+digit
(define char-set:letter+digit (immutable-char-set (%make-iset 65 90 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 122 #f #f #f))))
;; char-set:hex-digit
(define char-set:hex-digit (immutable-char-set (%make-iset 65 70 #f (%make-iset 48 57 #f #f #f) (%make-iset 97 102 #f #f #f))))
;; char-set:iso-control
(define char-set:iso-control (immutable-char-set (%make-iset 127 127 #f (%make-iset 0 31 #f #f #f) #f)))
;; char-set:graphic
(define char-set:graphic (immutable-char-set (%make-iset 33 126 #f #f #f)))
;; char-set:printing
(define char-set:printing (immutable-char-set (%make-iset 32 126 #f (%make-iset 9 13 #f #f #f) #f)))

View file

@ -0,0 +1,11 @@
(define-library (chibi char-set ascii)
(import (scheme) (chibi iset base))
(export char-set-contains?
char-set:lower-case char-set:upper-case char-set:title-case
char-set:letter char-set:digit char-set:letter+digit
char-set:graphic char-set:printing char-set:whitespace
char-set:iso-control char-set:punctuation char-set:symbol
char-set:hex-digit char-set:blank char-set:ascii
char-set:empty char-set:full)
(include "ascii.scm"))

View file

@ -0,0 +1,14 @@
(define-library (chibi char-set base)
(import (scheme) (chibi iset base))
(export (rename Integer-Set Char-Set)
(rename iset? char-set?)
immutable-char-set
char-set-contains?)
(begin
(define-syntax immutable-char-set
(sc-macro-transformer
(lambda (expr use-env)
(eval (cadr expr) use-env))))
(define (char-set-contains? cset ch)
(iset-contains? cset (char->integer ch)))))

View file

@ -0,0 +1,47 @@
(define (char-set . args)
(list->char-set args))
;; This is a mistake in the SRFI-14 design - end should be inclusive.
(define (ucs-range->char-set start end)
(make-iset start (- end 1)))
(define char-set-copy iset-copy)
(define char-set-size iset-size)
(define (list->char-set ls)
(list->iset (map char->integer ls)))
(define (char-set->list cset)
(map integer->char (iset->list cset)))
(define (string->char-set str)
(list->char-set (string->list str)))
(define (char-set->string cset)
(list->string (char-set->list cset)))
(define (char-set-adjoin! cset ch)
(iset-adjoin! cset (char->integer ch)))
(define (char-set-adjoin cset ch)
(iset-adjoin cset (char->integer ch)))
(define char-set-union iset-union)
(define char-set-union! iset-union!)
(define char-set-intersection iset-intersection)
(define char-set-intersection! iset-intersection!)
(define char-set-difference iset-difference)
(define char-set-difference! iset-difference!)
(define char-set:empty (immutable-char-set (%make-iset 0 0 0 #f #f)))
(define char-set:ascii (immutable-char-set (%make-iset 0 #x7F #f #f #f)))
(cond-expand
(full-unicode
(define char-set:full
(immutable-char-set
(%make-iset 0 #xD7FF #f #f (%make-iset #xE000 #x10FFFD #f #f #f)))))
(else
(define char-set:full (immutable-char-set (%make-iset 0 #xFF #f #f #f)))))
(define (char-set-complement cset)
(char-set-difference char-set:full cset))

View file

@ -0,0 +1,11 @@
(define-library (chibi char-set extras)
(import (scheme) (chibi iset) (chibi char-set base))
(include "extras.scm")
(export
char-set ucs-range->char-set char-set-copy char-set-size
list->char-set char-set->list string->char-set char-set->string
char-set-adjoin! char-set-adjoin char-set-union char-set-union!
char-set-intersection char-set-intersection!
char-set-difference char-set-difference!
char-set-complement char-set:empty char-set:ascii char-set:full))

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,9 @@
(define-library (chibi char-set full)
(import (scheme) (chibi iset base) (chibi char-set base))
(export char-set:lower-case char-set:upper-case char-set:title-case
char-set:letter char-set:digit char-set:letter+digit
char-set:graphic char-set:printing char-set:whitespace
char-set:iso-control char-set:punctuation char-set:symbol
char-set:hex-digit char-set:blank)
(include "full.scm"))

View file

@ -3,6 +3,7 @@
;; Optimizing Iset Representation
(define (iset-balance iset)
(and iset
(let ((nodes '()))
(iset-for-each-node
(lambda (is) (set! nodes (cons (iset-copy-node is) nodes)))
@ -20,7 +21,7 @@
(iset-left-set! res (reduce (reverse left)))
(iset-right-set! res (reduce (cdr ls)))
res)
(lp (+ i 1) (cdr ls) (cons (car ls) left)))))))))))
(lp (+ i 1) (cdr ls) (cons (car ls) left))))))))))))
(define (iset-balance! iset)
(iset-balance iset))

209
tools/extract-unicode-props.scm Executable file
View file

@ -0,0 +1,209 @@
#!/usr/bin/env chibi-scheme
;; Simple tool to extract Unicode properties as character-sets.
;;
;; Usage:
;; extract-unicode-props.scm Lowercase Punctuation=P Blank=Zs,0009 >out
;;
;; Accepts a list of char-set names with optional definitions as
;; arguments, and writes their Scheme definitions to stdout. A
;; char-set can be of the form:
;;
;; Name: equivalent to Name=Name
;; Name=value,...:
;;
;; A value can be any of:
;;
;; Property_Name: all unicode characters with the given derived property
;; Xx: all unicode characters with the given general category
;; X: all unicode characters with any general category X*
;; NNNN: a single unicode value in hex format
;; NNNN-NNNN: an inclusive range of unicode values in hex format
;;
;; The char-set names generated are always lowercased, replacing _
;; with -, for convenicence when the char-set name coincides with a
;; Unicode property name.
;;
;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are
;; in the data/ current directory, unless overridden with the --data or
;; --derived options.
(import (scheme) (chibi io) (chibi strings))
(define (warn . args)
(let ((err (current-error-port)))
(for-each (lambda (x) (display x err)) args)
(newline err)))
;; Parse UnicodeData.txt for characters matching a given class.
(define (extract-char-set-category cat data)
(define (join-to-range n ls)
(cond
((null? ls)
(list n))
((eqv? (car ls) (- n 1))
(cons (cons (car ls) n) (cdr ls)))
((and (pair? (car ls)) (eqv? (- n 1) (cdar ls)))
(cons (cons (caar ls) n) (cdr ls)))
(else
(cons n ls))))
(call-with-input-file data
(lambda (in)
(let lp ((ranges '()))
(let ((line (read-line in)))
(cond
((eof-object? line)
`(char-set-union
,@(map (lambda (x)
(if (pair? x)
`(ucs-range->char-set ,(car x) ,(+ 1 (cdr x)))
`(char-set ,(integer->char x))))
(reverse ranges))))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp ranges))
(else
(let ((ls (string-split line #\; 4)))
(cond
((< (length ls) 3)
(warn "invalid UnicodeData line: " line)
(lp ranges))
(else
(let ((ch (string->number (car ls) 16))
(name (cadr ls))
(ch-cat (car (cddr ls))))
(cond
((or (not ch) (not (= 2 (string-length ch-cat))))
(warn "invalid UnicodeData line: " line))
((if (char? cat)
(eqv? cat (string-ref ch-cat 0))
(equal? cat ch-cat))
(lp (join-to-range ch ranges)))
(else
(lp ranges))))))))))))))
;; Parse DerivedCoreProperties.txt for characters matching a given
;; property.
(define (extract-char-set-property prop derived)
(define (string-trim-comment str comment-ch)
(car (string-split str comment-ch 2)))
(call-with-input-file derived
(lambda (in)
(let lp ((ranges '()))
(let ((line (read-line in)))
(cond
((eof-object? line)
`(char-set-union ,@(reverse ranges)))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp ranges))
(else
(let ((ls (string-split (string-trim-comment line #\#) #\;)))
(cond
((< (length ls) 2)
(warn "invalid DerivedCoreProperties line: " line)
(lp ranges))
((string-ci=? prop (string-trim (cadr ls)))
(cond
((string-contains (car ls) "..")
=> (lambda (i)
(let* ((str (string-trim (car ls)))
(start (string->number (substring str 0 i) 16))
(end (string->number (substring str (+ i 2)) 16)))
(if (and start end (<= 0 start end #x110000))
(lp (cons `(ucs-range->char-set ,start ,(+ end 1))
ranges))
(error "invalid char range: " line)))))
((string->number (cadr ls) 16)
=> (lambda (n)
(lp (cons `(char-set ,(integer->char n)) ranges))))
(else
(lp ranges))))
(else
(lp ranges)))))))))))
(define (extract-char-set-simple def data derived)
(let ((ls (string-split def #\- 2)))
(cond
((= 2 (length ls))
(let ((start (string->number (car ls) 16))
(end (string->number (cadr ls) 16)))
(if (and start end (<= start end))
`(ucs-range->char-set ,start ,(+ end 1))
(error "invalid character range, expected NNNN-MMMM, got: " def))))
((string->number def 16)
=> (lambda (start) `(char-set ,(integer->char start))))
((and (= 1 (string-length def))
(char-upper-case? (string-ref def 0)))
(extract-char-set-category (string-ref def 0) data))
((and (= 2 (string-length def))
(char-upper-case? (string-ref def 0))
(char-lower-case? (string-ref def 1)))
(extract-char-set-category def data))
(else ;; derived property
(extract-char-set-property def derived)))))
(define (extract-char-set def data derived)
(let ((defs (string-split def #\,)))
(cond
((= 1 (length defs))
(extract-char-set-simple (car defs) data derived))
(else
`(char-set-union
,@(map (lambda (def) (extract-char-set-simple def data derived))
defs))))))
(define (process-char-set name def data derived out)
(define (normalize-char-set-name str)
(string-append
"char-set:"
(string-map (lambda (ch) (if (eqv? ch #\_) #\- (char-downcase ch))) str)))
(display ";; " out)
(display def out)
(newline out)
(write
`(define ,(string->symbol (normalize-char-set-name name))
(immutable-char-set
,(extract-char-set def data derived)))
out)
(newline out)
(newline out))
(define default-char-sets
'("Lower-Case=Lowercase"
"Upper-Case=Uppercase"
"Title-Case=Lt"
"Letter=Alphabetic"
"Punctuation=P"
"Symbol=S"
"Blank=Zs,0009"
"Whitespace=Zs,Zl,Zp,0009,000A,000B,000C,000D"
"Digit=Nd"))
(define (main args)
(let lp ((ls (cdr args))
(data "data/UnicodeData.txt")
(derived "data/DerivedCoreProperties.txt")
(out (current-output-port)))
(cond
((and (pair? ls) (not (equal? "" (car ls)))
(eqv? #\- (string-ref (car ls) 0)))
(cond
((member (car ls) '("-d" "--data"))
(lp (cddr ls) (cadr ls) derived out))
((member (car ls) '("-e" "--derived"))
(lp (cddr ls) data (cadr ls) out))
((member (car ls) '("-o" "--output"))
(lp (cddr ls) data derived (open-output-file (cadr ls))))
((member (car ls) '("f" "--default"))
(lp (append default-char-sets (cdr ls)) data derived out))
(else
(error "unknown option: " (car ls)))))
((pair? ls)
(let ((ls (string-split (car ls) #\= 2)))
(cond
((= 1 (length ls))
(process-char-set (car ls) (car ls) data derived out))
(else
(process-char-set (car ls) (cadr ls) data derived out))))
(lp (cdr ls) data derived out))
(else
(close-output-port out)))))

View file

@ -0,0 +1,82 @@
#!/usr/bin/env chibi-scheme
;; Simple tool to generate libraries of optimized char-sets.
;;
;; Usage:
;; optimize-char-sets.scm [--ascii] module.name > out
;;
;; Imports (module name) and writes optimized versions of all exported
;; char-sets to stdout.
(import (scheme) (srfi 1) (srfi 69)
(chibi io) (chibi strings) (chibi modules)
(chibi char-set) (chibi iset) (chibi iset optimize)
(only (meta) load-module))
;; Use a hash table for speedup of huge sets instead of O(n^2)
;; srfi-1 implementation.
(define (lset-diff ls1 ls2)
(let ((ls2-tab (make-hash-table eq?)))
(for-each (lambda (i) (hash-table-set! ls2-tab i #t)) ls2)
(remove (lambda (i) (hash-table-exists? ls2-tab i)) ls1)))
(define (main args)
(let lp ((ls (cdr args)) (ascii? #f))
(cond
((and (pair? ls) (not (equal? "" (car ls)))
(eqv? #\- (string-ref (car ls) 0)))
(cond
((member (car ls) '("-a" "--ascii"))
(lp (cdr ls) #t))
(else (error "unknown option" (car ls)))))
((or (null? ls) (pair? (cdr ls)))
(error "usage: optimize-char-sets.scm [--ascii] module.name"))
(else
(let ((mod (load-module
(map (lambda (x) (or (string->number x) (string->symbol x)))
(string-split (car ls) #\.)))))
(for-each
(lambda (exp)
(display ";; ") (write exp) (newline)
(let ((value (module-ref mod exp)))
(cond
((char-set? value)
(write `(optimize ,exp) (current-error-port)) (newline (current-error-port))
(if (not (equal? (iset->list value)
(do ((cur (iset-cursor value)
(iset-cursor-next value cur))
(res '() (cons (iset-ref value cur) res)))
((end-of-iset? cur) (reverse res)))))
(error "error in iset cursors"))
(display " computing intersection\n" (current-error-port))
(let* ((iset1 (if ascii?
(iset-intersection char-set:ascii value)
value))
(_ (display " optimizing\n" (current-error-port)))
(iset-opt (iset-optimize iset1))
(_ (display " balancing\n" (current-error-port)))
(iset2 (iset-balance iset-opt)))
(if (and (not ascii?) (not (iset= iset1 iset2)))
(begin
(display " different!\n" (current-error-port))
(let* ((ls1 (iset->list iset1))
(ls2 (iset->list iset2))
(diff1 (lset-diff ls1 ls2))
(diff2 (lset-diff ls2 ls1)))
(display " original: " (current-error-port))
(write (length ls1) (current-error-port))
(display " elements, missing: " (current-error-port))
(write diff1 (current-error-port))
(newline (current-error-port))
(display " optimized: " (current-error-port))
(write (length ls2) (current-error-port))
(display " elements, missing: " (current-error-port))
(write diff2 (current-error-port))
(newline (current-error-port))
(error "optimized iset is different"))))
(display " writing\n" (current-error-port))
(write `(define ,exp
(immutable-char-set ,(iset->code iset2))))
(newline)
(newline))))))
(module-exports mod)))))))