chibi-scheme/tools/extract-unicode-props.scm

243 lines
9.3 KiB
Scheme
Executable file

#!/usr/bin/env chibi-scheme
;; Simple tool to extract Unicode properties as character-sets.
;;
;; Usage:
;; extract-unicode-props.scm Lowercase Punctuation=P Blank=Zs,0009 >out
;;
;; Accepts a list of char-set names with optional definitions as
;; arguments, and writes their Scheme definitions to stdout. A
;; char-set can be of the form:
;;
;; Name: equivalent to Name=Name
;; Name=value,...:
;;
;; A value can be any of:
;;
;; Property_Name: all unicode characters with the given derived property
;; Prop@N: a property matched at a given field (instead of default category)
;; Xx: all unicode characters with the given general category
;; X: all unicode characters with any general category X*
;; NNNN: a single unicode value in hex format
;; NNNN-NNNN: an inclusive range of unicode values in hex format
;;
;; The char-set names generated are always lowercased, replacing _
;; with -, for convenicence when the char-set name coincides with a
;; Unicode property name.
;;
;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are
;; in the ./data/ directory, unless overridden with the --data or
;; --derived options.
(import (chibi) (chibi io) (chibi string))
(define (warn . args)
(let ((err (current-error-port)))
(for-each (lambda (x) (display x err)) args)
(newline err)))
;; Parse UnicodeData.txt or other semi-colon-delimited TSV file for
;; characters matching a given class in a given field.
(define (extract-char-set-category cat field data)
(define (join-to-range n ls)
(cond
((null? ls)
(list n))
((eqv? (car ls) (- n 1))
(cons (cons (car ls) n) (cdr ls)))
((and (pair? (car ls)) (eqv? (- n 1) (cdar ls)))
(cons (cons (caar ls) n) (cdr ls)))
(else
(cons n ls))))
(call-with-input-file data
(lambda (in)
(let lp ((ranges '()))
(let ((line (read-line in)))
(cond
((eof-object? line)
`(char-set-union
,@(map (lambda (x)
(if (pair? x)
`(ucs-range->char-set ,(car x) ,(+ 1 (cdr x)))
`(char-set ,(integer->char x))))
(reverse ranges))))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp ranges))
(else
(let* ((line (substring-cursor line
(string-cursor-start line)
(string-find line #\#)))
(ls (map string-trim (string-split line #\;))))
(cond
((<= (length ls) field)
(warn "invalid UnicodeData line: " line)
(lp ranges))
(else
(let ((ch (if (string-contains (car ls) "..")
(let* ((sc (string-contains (car ls) "..")))
(cons (string->number
(substring-cursor
(car ls)
(string-cursor-start (car ls))
sc)
16)
(string->number
(substring-cursor
(car ls)
(string-cursor-forward (car ls) sc 2))
16)))
(string->number (car ls) 16)))
(name (cadr ls))
(ch-cat (list-ref ls field)))
(cond
((not (or (integer? ch)
(and (pair? ch)
(integer? (car ch))
(integer? (cdr ch)))))
(warn "invalid UnicodeData line: " line))
((if (char? cat)
(eqv? cat (string-ref ch-cat 0))
(equal? cat ch-cat))
(lp (if (pair? ch)
(cons ch ranges)
(join-to-range ch ranges))))
(else
(lp ranges))))))))))))))
;; Parse DerivedCoreProperties.txt for characters matching a given
;; property.
(define (extract-char-set-property prop derived)
(define (string-trim-comment str comment-ch)
(car (string-split str comment-ch 2)))
(call-with-input-file derived
(lambda (in)
(let lp ((ranges '()))
(let ((line (read-line in)))
(cond
((eof-object? line)
`(char-set-union ,@(reverse ranges)))
((or (equal? line "") (eqv? #\# (string-ref line 0)))
(lp ranges))
(else
(let ((ls (map string-trim
(string-split (string-trim-comment line #\#) #\;))))
(cond
((< (length ls) 2)
(warn "invalid DerivedCoreProperties line: " line)
(lp ranges))
((string-ci=? prop (cadr ls))
(cond
((string-contains (car ls) "..")
=> (lambda (i)
(let* ((str (car ls))
(start (string->number (substring-cursor str (string-cursor-start str) i) 16))
(end (string->number (substring-cursor str (string-cursor-forward str i 2)) 16)))
(if (and start end (<= 0 start end #x110000))
(lp (cons `(ucs-range->char-set ,start ,(+ end 1))
ranges))
(error "invalid char range: " line)))))
((string->number (car ls) 16)
=> (lambda (n)
(lp (cons `(char-set ,(integer->char n)) ranges))))
(else
(lp ranges))))
(else
(lp ranges)))))))))))
(define (extract-char-set-simple def data derived)
(let ((ls (string-split def #\- 2)))
(cond
((= 2 (length ls))
(let ((start (string->number (car ls) 16))
(end (string->number (cadr ls) 16)))
(if (and start end (<= start end))
`(ucs-range->char-set ,start ,(+ end 1))
(error "invalid character range, expected NNNN-MMMM, got: " def))))
((string->number def 16)
=> (lambda (start) `(char-set ,(integer->char start))))
((string-find? def #\@)
(let* ((sc (string-find def #\@))
(cat (substring-cursor def (string-cursor-start def) sc))
(field (string->number
(substring-cursor def (string-cursor-next def sc)))))
(extract-char-set-category cat field data)))
((and (= 1 (string-length def))
(char-upper-case? (string-ref def 0)))
(extract-char-set-category (string-ref def 0) 2 data))
((and (= 2 (string-length def))
(char-upper-case? (string-ref def 0))
(char-lower-case? (string-ref def 1)))
(extract-char-set-category def 2 data))
;; derived properties
((and (> (string-length def) 1)
(eqv? #\: (string-ref def 0)))
(extract-char-set-property (substring def 1) derived))
(else
(extract-char-set-property def derived)))))
(define (extract-char-set def data derived)
(let ((defs (string-split def #\,)))
(cond
((= 1 (length defs))
(extract-char-set-simple (car defs) data derived))
(else
`(char-set-union
,@(map (lambda (def) (extract-char-set-simple def data derived))
defs))))))
(define (process-char-set name def data derived out)
(define (normalize-char-set-name str)
(string-append
(if (eqv? #\: (string-ref str 0)) "char-set" "char-set:")
(string-map (lambda (ch) (if (eqv? ch #\_) #\- (char-downcase ch))) str)))
(display ";; " out)
(display def out)
(newline out)
(write
`(define ,(string->symbol (normalize-char-set-name name))
(immutable-char-set
,(extract-char-set def data derived)))
out)
(newline out)
(newline out))
(define default-char-sets
'("Lower-Case=Lowercase"
"Upper-Case=Uppercase"
"Title-Case=Lt"
"Letter=Alphabetic"
"Punctuation=P"
"Symbol=S"
"Blank=Zs,0009"
"Whitespace=Zs,Zl,Zp,0009,000A,000B,000C,000D,0085"
"Digit=Nd"))
(let ((args (command-line)))
(let lp ((ls (cdr args))
(data "data/UnicodeData.txt")
(derived "data/DerivedCoreProperties.txt")
(out (current-output-port)))
(cond
((and (pair? ls) (not (equal? "" (car ls)))
(eqv? #\- (string-ref (car ls) 0)))
(cond
((member (car ls) '("-d" "--data"))
(lp (cddr ls) (cadr ls) derived out))
((member (car ls) '("-e" "--derived"))
(lp (cddr ls) data (cadr ls) out))
((member (car ls) '("-o" "--output"))
(lp (cddr ls) data derived (open-output-file (cadr ls))))
((member (car ls) '("-f" "--default"))
(lp (append default-char-sets (cdr ls)) data derived out))
(else
(error "unknown option: " (car ls)))))
((pair? ls)
(let ((ls (string-split (car ls) #\= 2)))
(cond
((= 1 (length ls))
(process-char-set (car ls) (car ls) data derived out))
(else
(process-char-set (car ls) (cadr ls) data derived out))))
(lp (cdr ls) data derived out))
(else
(close-output-port out)))))