#!/usr/bin/env chibi-scheme ;; Extract sets of char case offsets. ;; ;; Usage: ;; extract-case-offsets.scm options UnicodeData.txt > out ;; ;; Recognized options are: ;; ;; -c - the minimum required count to output a char-set ;; for an offset, default 32 ;; -m - the maximum number of character sets to output ;; regardless of min-count, default umlimited ;; -n - the name for char-sets, defaults to ;; "char-downcase-offsets" ;; -o - the output file, defaults to stdout (import (chibi) (srfi 1) (srfi 69) (srfi 95) (chibi char-set full) (chibi io) (chibi iset) (chibi iset optimize) (chibi string)) (define (warn . args) (let ((err (current-error-port))) (for-each (lambda (x) (display x err)) args) (newline err))) (define (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name) (let lp ((ls (sort (hash-table->alist offset-map) (lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b)))))) (i 0) (res '())) (cond ((and (pair? ls) (or (not max-char-sets) (< i max-char-sets)) (or (not min-count) (>= (iset-size (cdar ls)) min-count))) (lp (cdr ls) (+ i 1) (cons `(cons ,(iset->code (iset-balance (iset-optimize (cdar ls)))) ,(caar ls)) res))) (else (display ";; auto-generated by extract-case-offsets.scm\n\n") (write `(define ,(string->symbol name) (list ,@(reverse res))) out) (newline out) (newline out) (let ((pairs (append-map (lambda (x) (map (lambda (y) (list y (+ y (car x)))) (iset->list (cdr x)))) ls))) (write `(define char-downcase-map ',(list->vector (append-map (lambda (x) x) (sort (append pairs title-downs) < car)))) out) (newline out) (newline out) (write `(define char-upcase-map ',(list->vector (append-map (lambda (x) (list (cadr x) (car x))) (delete-duplicates (sort (append pairs title-ups) < cadr) (lambda (a b) (eqv? (cadr a) (cadr b))))))) out) (newline out) (newline out) (write `(define char-foldcase-map ',(list->vector (append-map (lambda (x) x) (delete-duplicates (sort folds < car) (lambda (a b) (eqv? (cadr a) (cadr b))))))) out) (newline out)))))) ;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping (define (extract-case-mapping in out min-count max-char-sets name) (define (string-trim-comment str comment-ch) (car (string-split str comment-ch 2))) (define (extract-single-decomposition str) (and (not (equal? "" str)) (let ((s (string-trim (last (string-split str #\>))))) (and (not (string-contains s " ")) (string->number s 16))))) (let ((offset-map (make-hash-table eq?)) (title-ups '()) (title-downs '()) (folds '())) (let lp () (let ((line (read-line in))) (cond ((eof-object? line) (write-offsets offset-map title-ups title-downs folds out min-count max-char-sets name)) ((or (equal? line "") (eqv? #\# (string-ref line 0))) (lp)) (else (let ((ls (map string-trim (string-split (string-trim-comment line #\#) #\;)))) (cond ((< (length ls) 15) (warn "invalid UnicodeData.txt line: " line)) (else (let ((base (string->number (list-ref ls 0) 16)) (upper (string->number (list-ref ls 12) 16)) (lower (string->number (list-ref ls 13) 16)) (folded (extract-single-decomposition (list-ref ls 5)))) (cond ((or upper lower) (cond ((iset-contains? char-set:title-case base) (if upper (set! title-ups `((,upper ,base) ,@title-ups))) (if lower (set! title-downs `((,base ,lower) ,@title-downs)))) (else (hash-table-update! offset-map (- (or lower base) (or upper base)) (lambda (is) (iset-adjoin! is (or upper base))) (lambda () (make-iset))))))) (cond ((and folded (not (eqv? folded (or lower base)))) ;; (write `(fold: ,line ,base ,folded) (current-error-port)) ;; (newline (current-error-port)) (set! folds (cons (list base folded) folds))))))) (lp)))))))) (let ((args (command-line))) (let lp ((ls (cdr args)) (min-count 26) (max-char-sets #f) (name "char-downcase-offsets") (out (current-output-port))) (cond ((and (pair? ls) (not (equal? "" (car ls))) (eqv? #\- (string-ref (car ls) 0))) (cond ((member (car ls) '("-c" "--min-count")) (lp (cddr ls) (cadr ls) max-char-sets name out)) ((member (car ls) '("-m" "--max-char-sets")) (lp (cddr ls) min-count (cadr ls) name out)) ((member (car ls) '("-n" "--name")) (lp (cddr ls) min-count max-char-sets (cadr ls) out)) ((member (car ls) '("-o" "--output")) (lp (cddr ls) min-count max-char-sets name (open-output-file (cadr ls)))) (else (error "unknown option: " (car ls))))) ((null? ls) (error "usage: extract-case-offsets ")) (else (if (equal? "-" (car ls)) (extract-case-mapping (current-input-port) out min-count max-char-sets name) (call-with-input-file (car ls) (lambda (in) (extract-case-mapping in out min-count max-char-sets name)))) (close-output-port out)))))