mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-18 21:29:19 +02:00
181 lines
6.9 KiB
Scheme
181 lines
6.9 KiB
Scheme
#!/usr/bin/env chibi-scheme
|
|
|
|
;; Extract sets of char case offsets.
|
|
;;
|
|
;; Usage:
|
|
;; extract-case-offsets.scm [options] UnicodeData.txt CaseFolding.txt > out
|
|
;;
|
|
;; Recognized options are:
|
|
;;
|
|
;; -c <min-count> - the minimum required count to output a char-set
|
|
;; for an offset, default 32
|
|
;; -m <max-char-sets> - the maximum number of character sets to output
|
|
;; regardless of min-count, default umlimited
|
|
;; -n <name> - the name for char-sets, defaults to
|
|
;; "char-downcase-offsets"
|
|
;; -o <output-file> - the output file, defaults to stdout
|
|
|
|
(import (chibi) (srfi 1) (srfi 69) (srfi 95) (chibi char-set full)
|
|
(chibi io) (chibi iset) (chibi iset optimize) (chibi string))
|
|
|
|
(define (warn . args)
|
|
(let ((err (current-error-port)))
|
|
(for-each (lambda (x) (display x err)) args)
|
|
(newline err)))
|
|
|
|
(define (write-hex-list hex-ls out)
|
|
(let lp ((ls hex-ls))
|
|
(cond
|
|
((pair? ls)
|
|
(if (not (eq? ls hex-ls))
|
|
(write-char #\space out))
|
|
(write-string "#x" out)
|
|
(write-string (number->string (car ls) 16) out)
|
|
(lp (cdr ls))))))
|
|
|
|
(define (write-offsets offset-map title-ups title-downs out min-count max-char-sets name)
|
|
(let lp ((ls (sort (hash-table->alist offset-map)
|
|
(lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
|
|
(i 0)
|
|
(res '()))
|
|
(cond
|
|
((and (pair? ls)
|
|
(or (not max-char-sets) (< i max-char-sets))
|
|
(or (not min-count) (>= (iset-size (cdar ls)) min-count)))
|
|
(lp (cdr ls)
|
|
(+ i 1)
|
|
(cons `(cons ,(iset->code (iset-balance (iset-optimize (cdar ls))))
|
|
,(caar ls))
|
|
res)))
|
|
(else
|
|
(display ";; auto-generated by extract-case-offsets.scm\n\n")
|
|
(write `(define ,(string->symbol name)
|
|
(list ,@(reverse res)))
|
|
out)
|
|
(newline out)
|
|
(newline out)
|
|
(let ((pairs
|
|
(append-map
|
|
(lambda (x)
|
|
(map (lambda (y) (list y (+ y (car x))))
|
|
(iset->list (cdr x))))
|
|
ls)))
|
|
(write-string "(define char-downcase-map\n '#(" out)
|
|
(write-hex-list
|
|
(append-map (lambda (x) x) (sort (append pairs title-downs) < car))
|
|
out)
|
|
(write-string "))\n\n" out)
|
|
(write-string "(define char-upcase-map\n '#(" out)
|
|
(write-hex-list
|
|
(append-map (lambda (x) (list (cadr x) (car x)))
|
|
(delete-duplicates
|
|
(sort (append pairs title-ups) < cadr)
|
|
(lambda (a b) (eqv? (cadr a) (cadr b)))))
|
|
out)
|
|
(write-string "))\n\n" out))))))
|
|
|
|
(define (extract-case-folding in out)
|
|
(define (write-folds folds out)
|
|
(write-string "(define char-foldcase-map\n '#(" out)
|
|
(write-hex-list
|
|
(append-map (lambda (x) x) (sort folds < car))
|
|
out)
|
|
(write-string "))\n" out))
|
|
(let lp ((folds '()))
|
|
(let ((line (read-line in)))
|
|
(cond
|
|
((eof-object? line)
|
|
(write-folds folds out))
|
|
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
|
(lp folds))
|
|
(else
|
|
(let* ((line (substring-cursor line
|
|
(string-cursor-start line)
|
|
(string-find line #\#)))
|
|
(ls (map string-trim (string-split line #\;))))
|
|
(if (and (>= (length ls) 3)
|
|
(member (second ls) '("C" "S")))
|
|
(let ((base (string->number (first ls) 16))
|
|
(folded (string->number (third ls) 16)))
|
|
(if (and base folded)
|
|
(lp (cons (list base folded) folds))
|
|
(lp folds)))
|
|
(lp folds))))))))
|
|
|
|
;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping
|
|
|
|
(define (extract-case-mapping in out min-count max-char-sets name)
|
|
(define (string-trim-comment str comment-ch)
|
|
(car (string-split str comment-ch 2)))
|
|
(let ((offset-map (make-hash-table eq?))
|
|
(title-ups '())
|
|
(title-downs '()))
|
|
(let lp ()
|
|
(let ((line (read-line in)))
|
|
(cond
|
|
((eof-object? line)
|
|
(write-offsets offset-map title-ups title-downs out
|
|
min-count max-char-sets name))
|
|
((or (equal? line "") (eqv? #\# (string-ref line 0)))
|
|
(lp))
|
|
(else
|
|
(let ((ls (map string-trim
|
|
(string-split (string-trim-comment line #\#) #\;))))
|
|
(cond
|
|
((< (length ls) 15)
|
|
(warn "invalid UnicodeData.txt line: " line))
|
|
(else
|
|
(let ((base (string->number (list-ref ls 0) 16))
|
|
(upper (string->number (list-ref ls 12) 16))
|
|
(lower (string->number (list-ref ls 13) 16)))
|
|
(cond
|
|
((or upper lower)
|
|
(cond
|
|
((iset-contains? char-set:title-case base)
|
|
(if upper
|
|
(set! title-ups `((,upper ,base) ,@title-ups)))
|
|
(if lower
|
|
(set! title-downs `((,base ,lower) ,@title-downs))))
|
|
(else
|
|
(hash-table-update!
|
|
offset-map
|
|
(- (or lower base) (or upper base))
|
|
(lambda (is) (iset-adjoin! is (or upper base)))
|
|
(lambda () (make-iset))))))))))
|
|
(lp))))))))
|
|
|
|
(let ((args (command-line)))
|
|
(let lp ((ls (cdr args))
|
|
(min-count 26)
|
|
(max-char-sets #f)
|
|
(name "char-downcase-offsets")
|
|
(out (current-output-port)))
|
|
(cond
|
|
((and (pair? ls) (not (equal? "" (car ls)))
|
|
(eqv? #\- (string-ref (car ls) 0)))
|
|
(cond
|
|
((member (car ls) '("-c" "--min-count"))
|
|
(lp (cddr ls) (cadr ls) max-char-sets name out))
|
|
((member (car ls) '("-m" "--max-char-sets"))
|
|
(lp (cddr ls) min-count (cadr ls) name out))
|
|
((member (car ls) '("-n" "--name"))
|
|
(lp (cddr ls) min-count max-char-sets (cadr ls) out))
|
|
((member (car ls) '("-o" "--output"))
|
|
(lp (cddr ls) min-count max-char-sets name
|
|
(open-output-file (cadr ls))))
|
|
(else
|
|
(error "unknown option: " (car ls)))))
|
|
((not (= 2 (length ls)))
|
|
(error "usage: extract-case-offsets <UnicodeData.txt> <CaseFolding.txt>"))
|
|
(else
|
|
(if (equal? "-" (car ls))
|
|
(extract-case-mapping
|
|
(current-input-port) out min-count max-char-sets name)
|
|
(call-with-input-file (car ls)
|
|
(lambda (in)
|
|
(extract-case-mapping in out min-count max-char-sets name))))
|
|
(if (equal? "-" (cadr ls))
|
|
(extract-case-folding (current-input-port) out)
|
|
(call-with-input-file (cadr ls)
|
|
(lambda (in) (extract-case-folding in out))))
|
|
(close-output-port out)))))
|