chibi-scheme/lib/chibi/csv.scm


;;> \section{CSV Grammars}

;;> CSV is a simple and compact format for tabular data, which has
;;> made it popular for a variety of tasks since the early days of
;;> computing.  Unfortunately, there are many incompatible dialects
;;> requiring a grammar to specify all of the different options.

(define-record-type Csv-Grammar
  (make-csv-grammar separator-chars quote-char quote-doubling-escapes? escape-char record-separator comment-chars quote-non-numeric?)
  csv-grammar?
  (separator-chars csv-grammar-separator-chars csv-grammar-separator-chars-set!)
  (quote-char csv-grammar-quote-char csv-grammar-quote-char-set!)
  (quote-doubling-escapes? csv-grammar-quote-doubling-escapes? csv-grammar-quote-doubling-escapes?-set!)
  (escape-char csv-grammar-escape-char csv-grammar-escape-char-set!)
  (record-separator csv-grammar-record-separator csv-grammar-record-separator-set!)
  (comment-chars csv-grammar-comment-chars csv-grammar-comment-chars-set!)
  (quote-non-numeric? csv-grammar-quote-non-numeric? csv-grammar-quote-non-numeric?-set!))

;; TODO: Other options to consider:
;; - strip-leading/trailing-whitespace?
;; - newlines-in-quotes?

;;> Creates a new CSV grammar from the given spec, an alist of symbols
;;> to values.  The following options are supported:
;;>
;;> \itemlist[
;;> \item{\scheme{'separator-chars} - A non-empty list of characters used to delimit fields, by default \scheme{'(#\\,)} (comma-separated).}
;;> \item{\scheme{'quote-char} - A single character used to quote fields containing special characters, or \scheme{#f} to disable quoting, by default \scheme{#\\"} (a double-quote).}
;;> \item{\scheme{'quote-doubling-escapes?} - If true, two successive \scheme{quote-char}s within quotes are treated as a single escaped \scheme{quote-char} (default true).}
;;> \item{\scheme{'escape-char} - A single character used to escape characters within quoted fields, or \scheme{#f} to disable escapes, by default \scheme{#f} (no explicit escape, use quote doubling).}
;;> \item{\scheme{'record-separator} - A single character used to delimit the record (row), or one of the symbols \scheme{'cr}, \scheme{'crlf}, \scheme{'lf} or \scheme{'lax}.  These correspond to sequences of carriage return and line feed, or in the case of \scheme{'lax} any of the other three sequences.  Defaults to \scheme{'lax}.}
;;> \item{\scheme{'comment-chars} - A list of characters which if found at the start of a record indicate it is a comment, discarding all characters through to the next record-separator. Defaults to the empty list (no comments).}
;;> ]
;;>
;;> Example Gecos grammar:
;;>
;;> \example{
;;> (csv-grammar
;;>   '((separator-chars #\\:)
;;>     (quote-char . #f)))
;;> }
(define (csv-grammar spec)
  (let ((grammar (make-csv-grammar '(#\,) #\" #t #f 'lax '() #f)))
    (for-each
     (lambda (x)
       (case (car x)
         ((separator-chars delimiter)
          (csv-grammar-separator-chars-set! grammar (cdr x)))
         ((quote-char)
          (csv-grammar-quote-char-set! grammar (cdr x)))
         ((quote-doubling-escapes?)
          (csv-grammar-quote-doubling-escapes?-set! grammar (cdr x)))
         ((escape-char)
          (csv-grammar-escape-char-set! grammar (cdr x)))
         ((record-separator newline-type)
          (let ((rec-sep
                 (case (cdr x)
                   ((crlf lax) (cdr x))
                   ((cr) #\return)
                   ((lf) #\newline)
                   (else
                    (if (char? (cdr x))
                        (cdr x)
                        (error "invalid record-separator, expected a char or one of 'lax or 'crlf" (cdr x)))))))
            (csv-grammar-escape-char-set! grammar (cdr x))))
         ((comment-chars)
          (csv-grammar-comment-chars-set! grammar (cdr x)))
         ((quote-non-numeric?)
          (csv-grammar-quote-non-numeric?-set! grammar (cdr x)))
         (else
          (error "unknown csv-grammar spec" x))))
     spec)
    grammar))

;;> The default CSV grammar for convenience, with all of the defaults
;;> from \scheme{csv-grammar}, i.e. comma-delimited with \scheme{#\"}
;;> for quoting, doubled to escape.
(define default-csv-grammar
  (csv-grammar '()))

;;> The default TSV grammar for convenience, splitting fields only on
;;> tabs, with no quoting or escaping.
(define default-tsv-grammar
  (csv-grammar '((separator-chars #\tab) (quote-char . #f))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;> \section{CSV Parsers}

;;> Parsers are low-level utilities to perform operations on records a
;;> field at a time.  You generally want to work with readers, which
;;> build on this to build records into familiar data structures.

;;> Parsers follow the rules of a grammar to parse a single CSV
;;> record, possible comprised of multiple fields.  A parser is a
;;> procedure of three arguments which performs a fold operation over
;;> the fields of the record. The parser signature is:
;;> \scheme{(parser kons knil in)}, where \scheme{kons} itself is
;;> a procedure of three arguments: \scheme{(proc acc index field)}.
;;> \scheme{proc} is called on each field of the record, in order,
;;> along with its zero-based \scheme{index} and the accumulated
;;> result of the last call, starting with \scheme{knil}.

;;> Returns a new CSV parser for the given \var{grammar}.  The parser
;;> by itself can be used to parse a record at a time.
;;>
;;> \example{
;;> (let ((parse (csv-parser)))
;;>   (parse (lambda (vec i field) (vector-set! vec i (string->number field)) vec)
;;>          (make-vector 3)
;;>          (open-input-string "1,2,3")))
;;> }
(define csv-parser
  (opt-lambda ((grammar default-csv-grammar))
    (lambda (kons knil in)
      (when (pair? (csv-grammar-comment-chars grammar))
        (let lp ()
          (when (memv (peek-char in) (csv-grammar-comment-chars grammar))
            (csv-skip-line in grammar)
            (lp))))
      (let lp ((acc knil)
               (index 0)
               (quoted? #f)
               (out (open-output-string)))
        (define (get-field)
          (let ((field (get-output-string out)))
            (cond
             ((and (zero? index) (equal? field "")) field)
             ((and (csv-grammar-quote-non-numeric? grammar) (not quoted?))
              (or (string->number field)
                  (error "unquoted field is not numeric" field)))
             (else field))))
        (define (finish-row)
          (let ((field (get-field)))
            (if (and (zero? index) (equal? field ""))
                ;; empty row, read again
                (lp acc index #f out)
                (kons acc index field))))
        (let ((ch (read-char in)))
          (cond
           ((eof-object? ch)
            (let ((field (get-field)))
              (if (and (zero? index) (equal? field ""))
                  ;; no data
                  ch
                  (kons acc index field))))
           ((memv ch (csv-grammar-separator-chars grammar))
            (lp (kons acc index (get-field))
                (+ index 1)
                #f
                (open-output-string)))
           ((eqv? ch (csv-grammar-quote-char grammar))
            ;; TODO: Consider a strict mode to enforce no text
            ;; before/after the quoted text.
            (csv-read-quoted in out grammar)
            (lp acc index #t out))
           ((eqv? ch (csv-grammar-record-separator grammar))
            (finish-row))
           ((and (eqv? ch #\return)
                 (memq (csv-grammar-record-separator grammar) '(crlf lax)))
            (cond
             ((eqv? (peek-char in) #\newline)
              (read-char in)
              (finish-row))
             ((eq? (csv-grammar-record-separator grammar) 'lax)
              (finish-row))
             (else
              (write-char ch out)
              (lp acc (+ index 1) quoted? out))))
           ((and (eqv? ch #\newline)
                 (eq? (csv-grammar-record-separator grammar) 'lax))
            (finish-row))
           (else
            (write-char ch out)
            (lp acc index quoted? out))))))))

(define (csv-skip-line in grammar)
  (let lp ()
    (let ((ch (read-char in)))
      (cond
       ((eof-object? ch))
       ((eqv? ch (csv-grammar-record-separator grammar)))
       ((and (eqv? ch #\newline)
             (eq? (csv-grammar-record-separator grammar) 'lax)))
       ((and (eqv? ch #\return)
             (memq (csv-grammar-record-separator grammar) '(crlf lax)))
        (cond
         ((eqv? (peek-char in) #\newline) (read-char in))
         ((eq? (csv-grammar-record-separator grammar) 'lax))
         (else (lp))))
       (else (lp))))))

(define (csv-read-quoted in out grammar)
  (let lp ()
    (let ((ch (read-char in)))
      (cond
       ((eof-object? ch)
        (error "unterminated csv quote" (get-output-string out)))
       ((eqv? ch (csv-grammar-quote-char grammar))
        (when (and (csv-grammar-quote-doubling-escapes? grammar)
                   (eqv? ch (peek-char in)))
          (write-char (read-char in) out)
          (lp)))
       ((eqv? ch (csv-grammar-escape-char grammar))
        (write-char (read-char in) out)
        (lp))
       (else
        ;; TODO: Consider an option to disable newlines in quotes.
        (write-char ch out)
        (lp))))))

(define (csv-skip-quoted in grammar)
  (let lp ()
    (let ((ch (read-char in)))
      (cond
       ((eof-object? ch)
        (error "unterminated csv quote"))
       ((eqv? ch (csv-grammar-quote-char grammar))
        (when (and (csv-grammar-quote-doubling-escapes? grammar)
                   (eqv? ch (peek-char in)))
          (read-char in)
          (lp)))
       ((eqv? ch (csv-grammar-escape-char grammar))
        (read-char in)
        (lp))
       (else
        (lp))))))

;;> Returns the number of rows in the input.
(define csv-num-rows
  (opt-lambda ((grammar default-csv-grammar)
               (in (current-input-port)))
    (let lp ((num-rows 0) (start? #t))
      (let ((ch (read-char in)))
        (cond
         ((eof-object? ch) (if start? num-rows (+ num-rows 1)))
         ((eqv? ch (csv-grammar-quote-char grammar))
          (csv-skip-quoted in grammar)
          (lp num-rows #f))
         ((eqv? ch (csv-grammar-record-separator grammar))
          (lp (+ num-rows 1) #f))
         ((and (eqv? ch #\return)
               (memq (csv-grammar-record-separator grammar) '(crlf lax)))
          (cond
           ((eqv? (peek-char in) #\newline)
            (read-char in)
            (lp (+ num-rows 1) #t))
           ((eq? (csv-grammar-record-separator grammar) 'lax)
            (lp (+ num-rows 1) #t))
           (else
            (lp num-rows #f))))
         ((and (eqv? ch #\newline)
               (eq? (csv-grammar-record-separator grammar) 'lax))
          (lp (+ num-rows 1) #t))
         (else
          (lp num-rows #f)))))))

;;> \section{CSV Readers}

;;> A CSV reader reads a single record, returning some representation
;;> of it.  You can either loop manually with these or pass them to
;;> one of the high-level utilities to operate on a whole CSV file at
;;> a time.

;;> The simplest reader, simply returns the field string values in
;;> order as a list.
;;>
;;> \example{
;;> ((csv-read->list) (open-input-string "foo,bar,baz"))
;;> }
(define csv-read->list
  (opt-lambda ((parser (csv-parser)))
    (opt-lambda ((in (current-input-port)))
      (let ((res (parser (lambda (ls i field) (cons field ls)) '() in)))
        (if (pair? res)
            (reverse res)
            res)))))

;;> The equivalent of \scheme{csv-read->list} but returns a vector.
;;>
;;> \example{
;;> ((csv-read->vector) (open-input-string "foo,bar,baz"))
;;> }
(define csv-read->vector
  (opt-lambda ((parser (csv-parser)))
    (let ((reader (csv-read->list parser)))
      (opt-lambda ((in (current-input-port)))
        (let ((res (reader in)))
          (if (pair? res)
              (list->vector res)
              res))))))

;;> The same as \scheme{csv-read->vector} but requires the vector to
;;> be of a fixed size, and may be more efficient.
;;>
;;> \example{
;;> ((csv-read->fixed-vector 3) (open-input-string "foo,bar,baz"))
;;> }
(define csv-read->fixed-vector
  (opt-lambda (size (parser (csv-parser)))
    (opt-lambda ((in (current-input-port)))
      (let ((res (make-vector size)))
        (let ((len (parser (lambda (prev-i i field) (vector-set! res i field) i)
                           0
                           in)))
          (if (zero? len)
              (eof-object)
              res))))))

;;> Returns an SXML representation of the record, as a row with
;;> multiple named columns.
;;>
;;> \example{
;;> ((csv-read->sxml 'city '(name latitude longitude))
;;>  (open-input-string "Tokyo,35°41′23″N,139°41′32″E"))
;;> }
(define csv-read->sxml
  (opt-lambda ((row-name 'row)
               (column-names
                (lambda (i)
                  (string->symbol (string-append "col-" (number->string i)))))
               (parser (csv-parser)))
    (define (get-column-name i)
      (if (procedure? column-names)
          (column-names i)
          (list-ref column-names i)))
    (opt-lambda ((in (current-input-port)))
      (let ((res (parser (lambda (ls i field)
                           `((,(get-column-name i) ,field) ,@ls))
                         (list row-name)
                         in)))
        (if (pair? res)
            (reverse res)
            res)))))

;;> \section{CSV Utilities}

;;> A folding operation on records.  \var{proc} is called successively
;;> on each row and the accumulated result.
;;>
;;> \example{
;;> (csv-fold
;;>  (lambda (row acc) (cons (cadr (assq 'name (cdr row))) acc))
;;>  '()
;;>  (csv-read->sxml 'city '(name latitude longitude))
;;>  (open-input-string
;;>   "Tokyo,35°41′23″N,139°41′32″E
;;> Paris,48°51′24″N,2°21′03″E"))
;;> }
(define csv-fold
  (opt-lambda (proc
               knil
               (reader (csv-read->list))
               (in (current-input-port)))
    (let lp ((acc knil))
      (let ((row (reader in)))
        (cond
         ((eof-object? row) acc)
         (else (lp (proc row acc))))))))

;;> An iterator which simply calls \var{proc} on each record in the
;;> input in order.
;;>
;;> \example{
;;> (let ((count 0))
;;>   (csv-for-each
;;>    (lambda (row) (if (string->number (car row)) (set! count (+ 1 count))))
;;>    (csv-read->list)
;;>    (open-input-string
;;>      "1,foo\\n2,bar\\nthree,baz\\n4,qux"))
;;>    count)
;;> }
(define csv-for-each
  (opt-lambda (proc
               (reader (csv-read->list))
               (in (current-input-port)))
    (csv-fold (lambda (row acc) (proc row)) #f reader in)))

;;> Returns a list containing the result of calling \var{proc} on each
;;> element in the input.
;;>
;;> \example{
;;> (csv-map
;;>  (lambda (row) (string->symbol (cadr row)))
;;>  (csv-read->list)
;;>  (open-input-string
;;>    "1,foo\\n2,bar\\nthree,baz\\n4,qux"))
;;> }
(define csv-map
  (opt-lambda (proc
               (reader (csv-read->list))
               (in (current-input-port)))
    (reverse (csv-fold (lambda (row acc) (cons (proc row) acc)) '() reader in))))

;;> Returns a list of all of the read records in the input.
;;>
;;> \example{
;;> (csv->list
;;>  (csv-read->list)
;;>  (open-input-string
;;>    "1,foo\\n2,bar\\nthree,baz\\n4,qux"))
;;> }
(define csv->list
  (opt-lambda ((reader (csv-read->list))
               (in (current-input-port)))
    (csv-map (lambda (row) row) reader in)))

;;> Returns an SXML representation of the CSV.
;;>
;;> \example{
;;> ((csv->sxml 'city '(name latitude longitude))
;;>  (open-input-string
;;>   "Tokyo,35°41′23″N,139°41′32″E
;;> Paris,48°51′24″N,2°21′03″E"))
;;> }
(define csv->sxml
  (opt-lambda ((row-name 'row)
               (column-names
                (lambda (i)
                  (string->symbol (string-append "col-" (number->string i)))))
               (parser (csv-parser)))
    (opt-lambda ((in (current-input-port)))
      (cons '*TOP*
            (csv->list (csv-read->sxml row-name column-names parser) in)))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;> \section{CSV Writers}

(define (write->string obj)
  (let ((out (open-output-string)))
    (write obj out)
    (get-output-string out)))

(define (csv-grammar-char-needs-quoting? grammar ch)
  (or (eqv? ch (csv-grammar-quote-char grammar))
      (eqv? ch (csv-grammar-escape-char grammar))
      (memv ch (csv-grammar-separator-chars grammar))
      (eqv? ch (csv-grammar-record-separator grammar))
      (memv ch '(#\newline #\return))))

(define (csv-write-quoted obj out grammar)
  (let ((in (open-input-string (if (string? obj) obj (write->string obj)))))
    (write-char (csv-grammar-quote-char grammar) out)
    (let lp ()
      (let ((ch (read-char in)))
        (cond
         ((eof-object? ch))
         ((or (eqv? ch (csv-grammar-quote-char grammar))
              (eqv? ch (csv-grammar-escape-char grammar)))
          (cond
           ((and (csv-grammar-quote-doubling-escapes? grammar)
                 (eqv? ch (csv-grammar-quote-char grammar)))
            (write-char ch out))
           ((csv-grammar-escape-char grammar)
            => (lambda (esc) (write-char esc out)))
           (else (error "no quote defined for" ch grammar)))
          (write-char ch out)
          (lp))
         (else
          (write-char ch out)
          (lp)))))
    (write-char (csv-grammar-quote-char grammar) out)))

(define csv-writer
  (opt-lambda ((grammar default-csv-grammar))
    (opt-lambda (row (out (current-output-port)))
      (let lp ((ls row) (first? #t))
        (when (pair? ls)
          (unless first?
            (write-char (car (csv-grammar-separator-chars grammar)) out))
          (if (or (and (csv-grammar-quote-non-numeric? grammar)
                       (not (number? (car ls))))
                  (and (string? (car ls))
                       (string-any
                        (lambda (ch) (csv-grammar-char-needs-quoting? grammar ch))
                        (car ls)))
                  (and (not (string? (car ls)))
                       (not (number? (car ls)))
                       (not (symbol? (car ls)))))
              (csv-write-quoted (car ls) out grammar)
              (display (car ls) out))
          (lp (cdr ls) #f)))
      (write-string
       (case (csv-grammar-record-separator grammar)
         ((crlf) "\r\n")
         ((lf lax) "\n")
         ((cr) "\r")
         (else (string (csv-grammar-record-separator grammar))))
       out))))

(define csv-write
  (opt-lambda ((writer (csv-writer)))
    (opt-lambda (rows (out (current-output-port)))
      (for-each
       (lambda (row) (writer row out))
       rows))))