#!/usr/bin/env chibi-scheme

;; Extract sets of char case offsets.
;;
;; Usage:
;;   extract-case-offsets.scm [options] UnicodeData.txt CaseFolding.txt > out
;;
;; Recognized options are:
;;
;;   -c <min-count>     - the minimum required count to output a char-set
;;                        for an offset, default 32
;;   -m <max-char-sets> - the maximum number of character sets to output
;;                        regardless of min-count, default umlimited
;;   -n <name>          - the name for char-sets, defaults to
;;                        "char-downcase-offsets"
;;   -o <output-file>   - the output file, defaults to stdout

(import (chibi) (srfi 1) (srfi 69) (srfi 95) (chibi char-set full)
        (chibi io) (chibi iset) (chibi iset optimize) (chibi string))

(define (warn . args)
  (let ((err (current-error-port)))
    (for-each (lambda (x) (display x err)) args)
    (newline err)))

(define (write-hex-list hex-ls out)
  (let lp ((ls hex-ls))
    (cond
     ((pair? ls)
      (if (not (eq? ls hex-ls))
          (write-char #\space out))
      (write-string "#x" out)
      (write-string (number->string (car ls) 16) out)
      (lp (cdr ls))))))

(define (write-offsets offset-map title-ups title-downs out min-count max-char-sets name)
  (let lp ((ls (sort (hash-table->alist offset-map)
                     (lambda (a b) (> (iset-size (cdr a)) (iset-size (cdr b))))))
           (i 0)
           (res '()))
    (cond
     ((and (pair? ls)
           (or (not max-char-sets) (< i max-char-sets))
           (or (not min-count) (>= (iset-size (cdar ls)) min-count)))
      (lp (cdr ls)
          (+ i 1)
          (cons `(cons ,(iset->code (iset-balance (iset-optimize (cdar ls))))
                       ,(caar ls))
                res)))
     (else
      (display ";; auto-generated by extract-case-offsets.scm\n\n")
      (write `(define ,(string->symbol name)
                (list ,@(reverse res)))
             out)
      (newline out)
      (newline out)
      (let ((pairs
             (append-map
              (lambda (x)
                (map (lambda (y) (list y (+ y (car x))))
                     (iset->list (cdr x))))
              ls)))
        (write-string "(define char-downcase-map\n  '#(" out)
        (write-hex-list
         (append-map (lambda (x) x) (sort (append pairs title-downs) < car))
         out)
        (write-string "))\n\n" out)
        (write-string "(define char-upcase-map\n  '#(" out)
        (write-hex-list
         (append-map (lambda (x) (list (cadr x) (car x)))
                     (delete-duplicates
                      (sort (append pairs title-ups) < cadr)
                      (lambda (a b) (eqv? (cadr a) (cadr b)))))
         out)
        (write-string "))\n\n" out))))))

(define (extract-case-folding in out)
  (define (write-folds folds out)
    (write-string "(define char-foldcase-map\n  '#(" out)
    (write-hex-list
     (append-map (lambda (x) x) (sort folds < car))
     out)
    (write-string "))\n" out))
  (let lp ((folds '()))
    (let ((line (read-line in)))
      (cond
       ((eof-object? line)
        (write-folds folds out))
       ((or (equal? line "") (eqv? #\# (string-ref line 0)))
        (lp folds))
       (else
        (let* ((line (substring-cursor line
                                       (string-cursor-start line)
                                       (string-find line #\#)))
               (ls (map string-trim (string-split line #\;))))
          (if (and (>= (length ls) 3)
                   (member (second ls) '("C" "S")))
              (let ((base (string->number (first ls) 16))
                    (folded (string->number (third ls) 16)))
                (if (and base folded)
                    (lp (cons (list base folded) folds))
                    (lp folds)))
              (lp folds))))))))

;; value;char;name;category;combining_class;bidi_class;decomposition;numeric1;numeric2;numeric3;bidi_mirrored;unicode1_name;ISO_comment;uppercase_mapping;lowercase_mapping;titlecase_mapping

(define (extract-case-mapping in out min-count max-char-sets name)
  (define (string-trim-comment str comment-ch)
    (car (string-split str comment-ch 2)))
  (let ((offset-map (make-hash-table eq?))
        (title-ups '())
        (title-downs '()))
    (let lp ()
      (let ((line (read-line in)))
        (cond
         ((eof-object? line)
          (write-offsets offset-map title-ups title-downs out
                         min-count max-char-sets name))
         ((or (equal? line "") (eqv? #\# (string-ref line 0)))
          (lp))
         (else
          (let ((ls (map string-trim
                         (string-split (string-trim-comment line #\#) #\;))))
            (cond
             ((< (length ls) 15)
              (warn "invalid UnicodeData.txt line: " line))
             (else
              (let ((base (string->number (list-ref ls 0) 16))
                    (upper (string->number (list-ref ls 12) 16))
                    (lower (string->number (list-ref ls 13) 16)))
                (cond
                 ((or upper lower)
                  (cond
                   ((iset-contains? char-set:title-case base)
                    (if upper
                        (set! title-ups `((,upper ,base) ,@title-ups)))
                    (if lower
                        (set! title-downs `((,base ,lower) ,@title-downs))))
                   (else
                    (hash-table-update!
                     offset-map
                     (- (or lower base) (or upper base))
                     (lambda (is) (iset-adjoin! is (or upper base)))
                     (lambda () (make-iset))))))))))
            (lp))))))))

(let ((args (command-line)))
  (let lp ((ls (cdr args))
           (min-count 26)
           (max-char-sets #f)
           (name "char-downcase-offsets")
           (out (current-output-port)))
    (cond
     ((and (pair? ls) (not (equal? "" (car ls)))
           (eqv? #\- (string-ref (car ls) 0)))
      (cond
       ((member (car ls) '("-c" "--min-count"))
        (lp (cddr ls) (cadr ls) max-char-sets name out))
       ((member (car ls) '("-m" "--max-char-sets"))
        (lp (cddr ls) min-count (cadr ls) name out))
       ((member (car ls) '("-n" "--name"))
        (lp (cddr ls) min-count max-char-sets (cadr ls) out))
       ((member (car ls) '("-o" "--output"))
        (lp (cddr ls) min-count max-char-sets name
            (open-output-file (cadr ls))))
       (else
        (error "unknown option: " (car ls)))))
     ((not (= 2 (length ls)))
      (error "usage: extract-case-offsets <UnicodeData.txt> <CaseFolding.txt>"))
     (else
      (if (equal? "-" (car ls))
          (extract-case-mapping
           (current-input-port) out min-count max-char-sets name)
          (call-with-input-file (car ls)
            (lambda (in)
              (extract-case-mapping in out min-count max-char-sets name))))
      (if (equal? "-" (cadr ls))
          (extract-case-folding (current-input-port) out)
          (call-with-input-file (cadr ls)
            (lambda (in) (extract-case-folding in out))))
      (close-output-port out)))))