#!/usr/bin/env chibi-scheme

;; Simple tool to extract Unicode properties as character-sets.
;;
;; Usage:
;;   extract-unicode-props.scm Lowercase Punctuation=P Blank=Zs,0009 >out
;;
;; Accepts a list of char-set names with optional definitions as
;; arguments, and writes their Scheme definitions to stdout.  A
;; char-set can be of the form:
;;
;;   Name: equivalent to Name=Name
;;   Name=value,...:
;;
;; A value can be any of:
;;
;;   Property_Name: all unicode characters with the given derived property
;;   Prop@N: a property matched at a given field (instead of default category)
;;   Xx: all unicode characters with the given general category
;;   X: all unicode characters with any general category X*
;;   NNNN: a single unicode value in hex format
;;   NNNN-NNNN: an inclusive range of unicode values in hex format
;;
;; The char-set names generated are always lowercased, replacing _
;; with -, for convenicence when the char-set name coincides with a
;; Unicode property name.
;;
;; Assumes the files UnicodeData.txt and DerivedCoreProperties.txt are
;; in the ./data/ directory, unless overridden with the --data or
;; --derived options.

(import (chibi) (chibi io) (chibi string))

(define (warn . args)
  (let ((err (current-error-port)))
    (for-each (lambda (x) (display x err)) args)
    (newline err)))

;; Parse UnicodeData.txt or other semi-colon-delimited TSV file for
;; characters matching a given class in a given field.
(define (extract-char-set-category cat field data)
  (define (join-to-range n ls)
    (cond
     ((null? ls)
      (list n))
     ((eqv? (car ls) (- n 1))
      (cons (cons (car ls) n) (cdr ls)))
     ((and (pair? (car ls)) (eqv? (- n 1) (cdar ls)))
      (cons (cons (caar ls) n) (cdr ls)))
     (else
      (cons n ls))))
  (call-with-input-file data
    (lambda (in)
      (let lp ((ranges '()))
        (let ((line (read-line in)))
          (cond
           ((eof-object? line)
            `(char-set-union
              ,@(map (lambda (x)
                       (if (pair? x)
                           `(ucs-range->char-set ,(car x) ,(+ 1 (cdr x)))
                           `(char-set ,(integer->char x))))
                     (reverse ranges))))
           ((or (equal? line "") (eqv? #\# (string-ref line 0)))
            (lp ranges))
           (else
            (let* ((line (substring-cursor line
                                           (string-cursor-start line)
                                           (string-find line #\#)))
                   (ls (map string-trim (string-split line #\;))))
              (cond
               ((<= (length ls) field)
                (warn "invalid UnicodeData line: " line)
                (lp ranges))
               (else
                (let ((ch (if (string-contains (car ls) "..")
                              (let* ((sc (string-contains (car ls) "..")))
                                (cons (string->number
                                       (substring-cursor
                                        (car ls)
                                        (string-cursor-start (car ls))
                                        sc)
                                       16)
                                      (string->number
                                       (substring-cursor
                                        (car ls)
                                        (string-cursor-forward (car ls) sc 2))
                                       16)))
                              (string->number (car ls) 16)))
                      (name (cadr ls))
                      (ch-cat (list-ref ls field)))
                  (cond
                   ((not (or (integer? ch)
                             (and (pair? ch)
                                  (integer? (car ch))
                                  (integer? (cdr ch)))))
                    (warn "invalid UnicodeData line: " line))
                   ((if (char? cat)
                        (eqv? cat (string-ref ch-cat 0))
                        (equal? cat ch-cat))
                    (lp (if (pair? ch)
                            (cons ch ranges)
                            (join-to-range ch ranges))))
                   (else
                    (lp ranges))))))))))))))

;; Parse DerivedCoreProperties.txt for characters matching a given
;; property.
(define (extract-char-set-property prop derived)
  (define (string-trim-comment str comment-ch)
    (car (string-split str comment-ch 2)))
  (call-with-input-file derived
    (lambda (in)
      (let lp ((ranges '()))
        (let ((line (read-line in)))
          (cond
           ((eof-object? line)
            `(char-set-union ,@(reverse ranges)))
           ((or (equal? line "") (eqv? #\# (string-ref line 0)))
            (lp ranges))
           (else
            (let ((ls (map string-trim
                           (string-split (string-trim-comment line #\#) #\;))))
              (cond
               ((< (length ls) 2)
                (warn "invalid DerivedCoreProperties line: " line)
                (lp ranges))
               ((string-ci=? prop (cadr ls))
                (cond
                 ((string-contains (car ls) "..")
                  => (lambda (i)
                       (let* ((str (car ls))
                              (start (string->number (substring-cursor str (string-cursor-start str) i) 16))
                              (end (string->number (substring-cursor str (string-cursor-forward str i 2)) 16)))
                         (if (and start end (<= 0 start end #x110000))
                             (lp (cons `(ucs-range->char-set ,start ,(+ end 1))
                                       ranges))
                             (error "invalid char range: " line)))))
                 ((string->number (car ls) 16)
                  => (lambda (n)
                       (lp (cons `(char-set ,(integer->char n)) ranges))))
                 (else
                  (lp ranges))))
               (else
                (lp ranges)))))))))))

(define (extract-char-set-simple def data derived)
  (let ((ls (string-split def #\- 2)))
    (cond
     ((= 2 (length ls))
      (let ((start (string->number (car ls) 16))
            (end (string->number (cadr ls) 16)))
        (if (and start end (<= start end))
            `(ucs-range->char-set ,start ,(+ end 1))
            (error "invalid character range, expected NNNN-MMMM, got: " def))))
     ((string->number def 16)
      => (lambda (start) `(char-set ,(integer->char start))))
     ((string-find? def #\@)
      (let* ((sc (string-find def #\@))
             (cat (substring-cursor def (string-cursor-start def) sc))
             (field (string->number
                     (substring-cursor def (string-cursor-next def sc)))))
        (extract-char-set-category cat field data)))
     ((and (= 1 (string-length def))
           (char-upper-case? (string-ref def 0)))
      (extract-char-set-category (string-ref def 0) 2 data))
     ((and (= 2 (string-length def))
           (char-upper-case? (string-ref def 0))
           (char-lower-case? (string-ref def 1)))
      (extract-char-set-category def 2 data))
     ;; derived properties
     ((and (> (string-length def) 1)
           (eqv? #\: (string-ref def 0)))
      (extract-char-set-property (substring def 1) derived))
     (else
      (extract-char-set-property def derived)))))

(define (extract-char-set def data derived)
  (let ((defs (string-split def #\,)))
    (cond
     ((= 1 (length defs))
      (extract-char-set-simple (car defs) data derived))
     (else
      `(char-set-union
        ,@(map (lambda (def) (extract-char-set-simple def data derived))
               defs))))))

(define (process-char-set name def data derived out)
  (define (normalize-char-set-name str)
    (string-append
     (if (eqv? #\: (string-ref str 0)) "char-set" "char-set:")
     (string-map (lambda (ch) (if (eqv? ch #\_) #\- (char-downcase ch))) str)))
  (display ";; " out)
  (display def out)
  (newline out)
  (write
   `(define ,(string->symbol (normalize-char-set-name name))
      (immutable-char-set
       ,(extract-char-set def data derived)))
   out)
  (newline out)
  (newline out))

(define default-char-sets
  '("Lower-Case=Lowercase"
    "Upper-Case=Uppercase"
    "Title-Case=Lt"
    "Letter=Alphabetic"
    "Punctuation=P"
    "Symbol=S"
    "Blank=Zs,0009"
    "Whitespace=Zs,Zl,Zp,0009,000A,000B,000C,000D,0085"
    "Digit=Nd"))

(let ((args (command-line)))
  (let lp ((ls (cdr args))
           (data "data/UnicodeData.txt")
           (derived "data/DerivedCoreProperties.txt")
           (out (current-output-port)))
    (cond
     ((and (pair? ls) (not (equal? "" (car ls)))
           (eqv? #\- (string-ref (car ls) 0)))
      (cond
       ((member (car ls) '("-d" "--data"))
        (lp (cddr ls) (cadr ls) derived out))
       ((member (car ls) '("-e" "--derived"))
        (lp (cddr ls) data (cadr ls) out))
       ((member (car ls) '("-o" "--output"))
        (lp (cddr ls) data derived (open-output-file (cadr ls))))
       ((member (car ls) '("-f" "--default"))
        (lp (append default-char-sets (cdr ls)) data derived out))
       (else
        (error "unknown option: " (car ls)))))
     ((pair? ls)
      (let ((ls (string-split (car ls) #\= 2)))
        (cond
         ((= 1 (length ls))
          (process-char-set (car ls) (car ls) data derived out))
         (else
          (process-char-set (car ls) (cadr ls) data derived out))))
      (lp (cdr ls) data derived out))
     (else
      (close-output-port out)))))