mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-18 21:29:19 +02:00
243 lines
9.3 KiB
Scheme
Executable file
243 lines
9.3 KiB
Scheme
Executable file
#!/usr/bin/env chibi-scheme
|
|
|
|
;; Simple tool to extract Unicode properties as character-sets.
|
|
;;
|
|
;; Usage:
|
|
;; extract-unicode-props.scm Lowercase Punctuation=P Blank=Zs,0009 >out
|
|
;;
|
|
;; Accepts a list of char-set names with optional definitions as
|
|
;; arguments, and writes their Scheme definitions to stdout. A
|
|
;; char-set can be of the form:
|
|
;;
|
|
;; Name: equivalent to Name=Name
|
|
;; Name=value,...:
|
|
;;
|
|
;; A value can be any of:
|
|
;;
|
|
;; Property_Name: all unicode characters with the given derived property
|
|
;; Prop@N: a property matched at a given field (instead of default category)
|
|
;; Xx: all unicode characters with the given general category
|
|
;; X: all unicode characters with any general category X*
|
|
;; NNNN: a single unicode value in hex format
|
|
;; NNNN-NNNN: an inclusive range of unicode values in hex format
|
|
;;
|
|
;; The char-set names generated are always lowercased, replacing _
|
|
;; with -, for convenicence when the char-set name coincides with a
|
|
;; Unicode property name.
|
|
;;
|
|
;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are
|
|
;; in the ./data/ directory, unless overridden with the --data or
|
|
;; --derived options.
|
|
|
|
(import (chibi) (chibi io) (chibi string))
|
|
|
|
(define (warn . args)
|
|
(let ((err (current-error-port)))
|
|
(for-each (lambda (x) (display x err)) args)
|
|
(newline err)))
|
|
|
|
;; Parse UnicodeData.txt or other semi-colon-delimited TSV file for
|
|
;; characters matching a given class in a given field.
|
|
(define (extract-char-set-category cat field data)
|
|
(define (join-to-range n ls)
|
|
(cond
|
|
((null? ls)
|
|
(list n))
|
|
((eqv? (car ls) (- n 1))
|
|
(cons (cons (car ls) n) (cdr ls)))
|
|
((and (pair? (car ls)) (eqv? (- n 1) (cdar ls)))
|
|
(cons (cons (caar ls) n) (cdr ls)))
|
|
(else
|
|
(cons n ls))))
|
|
(call-with-input-file data
|
|
(lambda (in)
|
|
(let lp ((ranges '()))
|
|
(let ((line (read-line in)))
|
|
(cond
|
|
((eof-object? line)
|
|
`(char-set-union
|
|
,@(map (lambda (x)
|
|
(if (pair? x)
|
|
`(ucs-range->char-set ,(car x) ,(+ 1 (cdr x)))
|
|
`(char-set ,(integer->char x))))
|
|
(reverse ranges))))
|
|
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
|
(lp ranges))
|
|
(else
|
|
(let* ((line (substring-cursor line
|
|
(string-cursor-start line)
|
|
(string-find line #\#)))
|
|
(ls (map string-trim (string-split line #\;))))
|
|
(cond
|
|
((<= (length ls) field)
|
|
(warn "invalid UnicodeData line: " line)
|
|
(lp ranges))
|
|
(else
|
|
(let ((ch (if (string-contains (car ls) "..")
|
|
(let* ((sc (string-contains (car ls) "..")))
|
|
(cons (string->number
|
|
(substring-cursor
|
|
(car ls)
|
|
(string-cursor-start (car ls))
|
|
sc)
|
|
16)
|
|
(string->number
|
|
(substring-cursor
|
|
(car ls)
|
|
(string-cursor-forward (car ls) sc 2))
|
|
16)))
|
|
(string->number (car ls) 16)))
|
|
(name (cadr ls))
|
|
(ch-cat (list-ref ls field)))
|
|
(cond
|
|
((not (or (integer? ch)
|
|
(and (pair? ch)
|
|
(integer? (car ch))
|
|
(integer? (cdr ch)))))
|
|
(warn "invalid UnicodeData line: " line))
|
|
((if (char? cat)
|
|
(eqv? cat (string-ref ch-cat 0))
|
|
(equal? cat ch-cat))
|
|
(lp (if (pair? ch)
|
|
(cons ch ranges)
|
|
(join-to-range ch ranges))))
|
|
(else
|
|
(lp ranges))))))))))))))
|
|
|
|
;; Parse DerivedCoreProperties.txt for characters matching a given
|
|
;; property.
|
|
(define (extract-char-set-property prop derived)
|
|
(define (string-trim-comment str comment-ch)
|
|
(car (string-split str comment-ch 2)))
|
|
(call-with-input-file derived
|
|
(lambda (in)
|
|
(let lp ((ranges '()))
|
|
(let ((line (read-line in)))
|
|
(cond
|
|
((eof-object? line)
|
|
`(char-set-union ,@(reverse ranges)))
|
|
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
|
(lp ranges))
|
|
(else
|
|
(let ((ls (map string-trim
|
|
(string-split (string-trim-comment line #\#) #\;))))
|
|
(cond
|
|
((< (length ls) 2)
|
|
(warn "invalid DerivedCoreProperties line: " line)
|
|
(lp ranges))
|
|
((string-ci=? prop (cadr ls))
|
|
(cond
|
|
((string-contains (car ls) "..")
|
|
=> (lambda (i)
|
|
(let* ((str (car ls))
|
|
(start (string->number (substring-cursor str (string-cursor-start str) i) 16))
|
|
(end (string->number (substring-cursor str (string-cursor-forward str i 2)) 16)))
|
|
(if (and start end (<= 0 start end #x110000))
|
|
(lp (cons `(ucs-range->char-set ,start ,(+ end 1))
|
|
ranges))
|
|
(error "invalid char range: " line)))))
|
|
((string->number (car ls) 16)
|
|
=> (lambda (n)
|
|
(lp (cons `(char-set ,(integer->char n)) ranges))))
|
|
(else
|
|
(lp ranges))))
|
|
(else
|
|
(lp ranges)))))))))))
|
|
|
|
(define (extract-char-set-simple def data derived)
|
|
(let ((ls (string-split def #\- 2)))
|
|
(cond
|
|
((= 2 (length ls))
|
|
(let ((start (string->number (car ls) 16))
|
|
(end (string->number (cadr ls) 16)))
|
|
(if (and start end (<= start end))
|
|
`(ucs-range->char-set ,start ,(+ end 1))
|
|
(error "invalid character range, expected NNNN-MMMM, got: " def))))
|
|
((string->number def 16)
|
|
=> (lambda (start) `(char-set ,(integer->char start))))
|
|
((string-find? def #\@)
|
|
(let* ((sc (string-find def #\@))
|
|
(cat (substring-cursor def (string-cursor-start def) sc))
|
|
(field (string->number
|
|
(substring-cursor def (string-cursor-next def sc)))))
|
|
(extract-char-set-category cat field data)))
|
|
((and (= 1 (string-length def))
|
|
(char-upper-case? (string-ref def 0)))
|
|
(extract-char-set-category (string-ref def 0) 2 data))
|
|
((and (= 2 (string-length def))
|
|
(char-upper-case? (string-ref def 0))
|
|
(char-lower-case? (string-ref def 1)))
|
|
(extract-char-set-category def 2 data))
|
|
;; derived properties
|
|
((and (> (string-length def) 1)
|
|
(eqv? #\: (string-ref def 0)))
|
|
(extract-char-set-property (substring def 1) derived))
|
|
(else
|
|
(extract-char-set-property def derived)))))
|
|
|
|
(define (extract-char-set def data derived)
|
|
(let ((defs (string-split def #\,)))
|
|
(cond
|
|
((= 1 (length defs))
|
|
(extract-char-set-simple (car defs) data derived))
|
|
(else
|
|
`(char-set-union
|
|
,@(map (lambda (def) (extract-char-set-simple def data derived))
|
|
defs))))))
|
|
|
|
(define (process-char-set name def data derived out)
|
|
(define (normalize-char-set-name str)
|
|
(string-append
|
|
(if (eqv? #\: (string-ref str 0)) "char-set" "char-set:")
|
|
(string-map (lambda (ch) (if (eqv? ch #\_) #\- (char-downcase ch))) str)))
|
|
(display ";; " out)
|
|
(display def out)
|
|
(newline out)
|
|
(write
|
|
`(define ,(string->symbol (normalize-char-set-name name))
|
|
(immutable-char-set
|
|
,(extract-char-set def data derived)))
|
|
out)
|
|
(newline out)
|
|
(newline out))
|
|
|
|
(define default-char-sets
|
|
'("Lower-Case=Lowercase"
|
|
"Upper-Case=Uppercase"
|
|
"Title-Case=Lt"
|
|
"Letter=Alphabetic"
|
|
"Punctuation=P"
|
|
"Symbol=S"
|
|
"Blank=Zs,0009"
|
|
"Whitespace=Zs,Zl,Zp,0009,000A,000B,000C,000D,0085"
|
|
"Digit=Nd"))
|
|
|
|
(let ((args (command-line)))
|
|
(let lp ((ls (cdr args))
|
|
(data "data/UnicodeData.txt")
|
|
(derived "data/DerivedCoreProperties.txt")
|
|
(out (current-output-port)))
|
|
(cond
|
|
((and (pair? ls) (not (equal? "" (car ls)))
|
|
(eqv? #\- (string-ref (car ls) 0)))
|
|
(cond
|
|
((member (car ls) '("-d" "--data"))
|
|
(lp (cddr ls) (cadr ls) derived out))
|
|
((member (car ls) '("-e" "--derived"))
|
|
(lp (cddr ls) data (cadr ls) out))
|
|
((member (car ls) '("-o" "--output"))
|
|
(lp (cddr ls) data derived (open-output-file (cadr ls))))
|
|
((member (car ls) '("-f" "--default"))
|
|
(lp (append default-char-sets (cdr ls)) data derived out))
|
|
(else
|
|
(error "unknown option: " (car ls)))))
|
|
((pair? ls)
|
|
(let ((ls (string-split (car ls) #\= 2)))
|
|
(cond
|
|
((= 1 (length ls))
|
|
(process-char-set (car ls) (car ls) data derived out))
|
|
(else
|
|
(process-char-set (car ls) (cadr ls) data derived out))))
|
|
(lp (cdr ls) data derived out))
|
|
(else
|
|
(close-output-port out)))))
|