mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-19 05:39:18 +02:00
Add quote-doubling-escapes? and quote-non-numeric?.
This commit is contained in:
parent
679875d850
commit
8e67defd71
2 changed files with 38 additions and 19 deletions
|
@ -34,6 +34,10 @@ they are going fast\""))
|
||||||
"# this is a comment\n1997,Ford,E350"
|
"# this is a comment\n1997,Ford,E350"
|
||||||
(csv-read->list
|
(csv-read->list
|
||||||
(csv-parser (csv-grammar '((comment-chars #\#)))))))
|
(csv-parser (csv-grammar '((comment-chars #\#)))))))
|
||||||
|
(let ((parser (csv-parser (csv-grammar '((quote-non-numeric? . #t))))))
|
||||||
|
(test-error (string->csv "1997,\"Ford\",E350" (csv-read->list parser)))
|
||||||
|
(test '(1997 "Ford" "E350")
|
||||||
|
(string->csv "1997,\"Ford\",\"E350\"" (csv-read->list parser))))
|
||||||
(test '("1997" "Fo\"rd" "E3\"50")
|
(test '("1997" "Fo\"rd" "E3\"50")
|
||||||
(string->csv "1997\tFo\"rd\tE3\"50"
|
(string->csv "1997\tFo\"rd\tE3\"50"
|
||||||
(csv-read->list (csv-parser default-tsv-grammar))))
|
(csv-read->list (csv-parser default-tsv-grammar))))
|
||||||
|
|
|
@ -7,18 +7,19 @@
|
||||||
;;> requiring a grammar to specify all of the different options.
|
;;> requiring a grammar to specify all of the different options.
|
||||||
|
|
||||||
(define-record-type Csv-Grammar
|
(define-record-type Csv-Grammar
|
||||||
(make-csv-grammar separator-chars quote-char escape-char record-separator comment-chars)
|
(make-csv-grammar separator-chars quote-char quote-doubling-escapes? escape-char record-separator comment-chars quote-non-numeric?)
|
||||||
csv-grammar?
|
csv-grammar?
|
||||||
(separator-chars csv-grammar-separator-chars csv-grammar-separator-chars-set!)
|
(separator-chars csv-grammar-separator-chars csv-grammar-separator-chars-set!)
|
||||||
(quote-char csv-grammar-quote-char csv-grammar-quote-char-set!)
|
(quote-char csv-grammar-quote-char csv-grammar-quote-char-set!)
|
||||||
|
(quote-doubling-escapes? csv-grammar-quote-doubling-escapes? csv-grammar-quote-doubling-escapes?-set!)
|
||||||
(escape-char csv-grammar-escape-char csv-grammar-escape-char-set!)
|
(escape-char csv-grammar-escape-char csv-grammar-escape-char-set!)
|
||||||
(record-separator csv-grammar-record-separator csv-grammar-record-separator-set!)
|
(record-separator csv-grammar-record-separator csv-grammar-record-separator-set!)
|
||||||
(comment-chars csv-grammar-comment-chars csv-grammar-comment-chars-set!))
|
(comment-chars csv-grammar-comment-chars csv-grammar-comment-chars-set!)
|
||||||
|
(quote-non-numeric? csv-grammar-quote-non-numeric? csv-grammar-quote-non-numeric?-set!))
|
||||||
|
|
||||||
;; TODO: Consider some minimal low-level parsing options. In general
|
;; TODO: Other options to consider:
|
||||||
;; this is intended to be performed by the parser, but if we can skip
|
;; - strip-leading/trailing-whitespace?
|
||||||
;; intermediate string generation (e.g. parsing numbers directly) it
|
;; - newlines-in-quotes?
|
||||||
;; can save a considerable amount of garbage when parsing large files.
|
|
||||||
|
|
||||||
;;> Creates a new CSV grammar from the given spec, an alist of symbols
|
;;> Creates a new CSV grammar from the given spec, an alist of symbols
|
||||||
;;> to values. The following options are supported:
|
;;> to values. The following options are supported:
|
||||||
|
@ -26,7 +27,8 @@
|
||||||
;;> \itemlist[
|
;;> \itemlist[
|
||||||
;;> \item{\scheme{'separator-chars} - A non-empty list of characters used to delimit fields, by default \scheme{'(#\\,)} (comma-separated).}
|
;;> \item{\scheme{'separator-chars} - A non-empty list of characters used to delimit fields, by default \scheme{'(#\\,)} (comma-separated).}
|
||||||
;;> \item{\scheme{'quote-char} - A single character used to quote fields containing special characters, or \scheme{#f} to disable quoting, by default \scheme{#\\"} (a double-quote).}
|
;;> \item{\scheme{'quote-char} - A single character used to quote fields containing special characters, or \scheme{#f} to disable quoting, by default \scheme{#\\"} (a double-quote).}
|
||||||
;;> \item{\scheme{'escape-char} - A single character used to escape characters within quoted fields, or \scheme{#f} to disable escapes, by default \scheme{#\\"} (a double-quote). If this is the same character as the \scheme{quote-char}, then the quote char can be doubled to escape, but no other characters can be escaped.}
|
;;> \item{\scheme{'quote-doubling-escapes?} - If true, two successive \scheme{quote-char}s within quotes are treated as a single escaped \scheme{quote-char} (default true).}
|
||||||
|
;;> \item{\scheme{'escape-char} - A single character used to escape characters within quoted fields, or \scheme{#f} to disable escapes, by default \scheme{#f} (no explicit escape, use quote doubling).}
|
||||||
;;> \item{\scheme{'record-separator} - A single character used to delimit the record (row), or one of the symbols \scheme{'cr}, \scheme{'crlf}, \scheme{'lf} or \scheme{'lax}. These correspond to sequences of carriage return and line feed, or in the case of \scheme{'lax} any of the other three sequences. Defaults to \scheme{'lax}.}
|
;;> \item{\scheme{'record-separator} - A single character used to delimit the record (row), or one of the symbols \scheme{'cr}, \scheme{'crlf}, \scheme{'lf} or \scheme{'lax}. These correspond to sequences of carriage return and line feed, or in the case of \scheme{'lax} any of the other three sequences. Defaults to \scheme{'lax}.}
|
||||||
;;> \item{\scheme{'comment-chars} - A list of characters which if found at the start of a record indicate it is a comment, discarding all characters through to the next record-separator. Defaults to the empty list (no comments).}
|
;;> \item{\scheme{'comment-chars} - A list of characters which if found at the start of a record indicate it is a comment, discarding all characters through to the next record-separator. Defaults to the empty list (no comments).}
|
||||||
;;> ]
|
;;> ]
|
||||||
|
@ -36,11 +38,10 @@
|
||||||
;;> \example{
|
;;> \example{
|
||||||
;;> (csv-grammar
|
;;> (csv-grammar
|
||||||
;;> '((separator-chars #\\:)
|
;;> '((separator-chars #\\:)
|
||||||
;;> (quote-char . #f)
|
;;> (quote-char . #f)))
|
||||||
;;> (escape-char . #f)))
|
|
||||||
;;> }
|
;;> }
|
||||||
(define (csv-grammar spec)
|
(define (csv-grammar spec)
|
||||||
(let ((grammar (make-csv-grammar '(#\,) #\" #\" 'lax '())))
|
(let ((grammar (make-csv-grammar '(#\,) #\" #t #f 'lax '() #f)))
|
||||||
(for-each
|
(for-each
|
||||||
(lambda (x)
|
(lambda (x)
|
||||||
(case (car x)
|
(case (car x)
|
||||||
|
@ -48,6 +49,8 @@
|
||||||
(csv-grammar-separator-chars-set! grammar (cdr x)))
|
(csv-grammar-separator-chars-set! grammar (cdr x)))
|
||||||
((quote-char)
|
((quote-char)
|
||||||
(csv-grammar-quote-char-set! grammar (cdr x)))
|
(csv-grammar-quote-char-set! grammar (cdr x)))
|
||||||
|
((quote-doubling-escapes?)
|
||||||
|
(csv-grammar-quote-doubling-escapes?-set! grammar (cdr x)))
|
||||||
((escape-char)
|
((escape-char)
|
||||||
(csv-grammar-escape-char-set! grammar (cdr x)))
|
(csv-grammar-escape-char-set! grammar (cdr x)))
|
||||||
((record-separator newline-type)
|
((record-separator newline-type)
|
||||||
|
@ -63,6 +66,8 @@
|
||||||
(csv-grammar-escape-char-set! grammar (cdr x))))
|
(csv-grammar-escape-char-set! grammar (cdr x))))
|
||||||
((comment-chars)
|
((comment-chars)
|
||||||
(csv-grammar-comment-chars-set! grammar (cdr x)))
|
(csv-grammar-comment-chars-set! grammar (cdr x)))
|
||||||
|
((quote-non-numeric?)
|
||||||
|
(csv-grammar-quote-non-numeric?-set! grammar (cdr x)))
|
||||||
(else
|
(else
|
||||||
(error "unknown csv-grammar spec" x))))
|
(error "unknown csv-grammar spec" x))))
|
||||||
spec)
|
spec)
|
||||||
|
@ -77,7 +82,7 @@
|
||||||
;;> The default TSV grammar for convenience, splitting fields only on
|
;;> The default TSV grammar for convenience, splitting fields only on
|
||||||
;;> tabs, with no quoting or escaping.
|
;;> tabs, with no quoting or escaping.
|
||||||
(define default-tsv-grammar
|
(define default-tsv-grammar
|
||||||
(csv-grammar '((separator-chars #\tab) (quote-char . #f) (escape-char . #f))))
|
(csv-grammar '((separator-chars #\tab) (quote-char . #f))))
|
||||||
|
|
||||||
;;> \section{CSV Parsers}
|
;;> \section{CSV Parsers}
|
||||||
|
|
||||||
|
@ -114,30 +119,40 @@
|
||||||
(lp))))
|
(lp))))
|
||||||
(let lp ((acc knil)
|
(let lp ((acc knil)
|
||||||
(index 0)
|
(index 0)
|
||||||
|
(quoted? #f)
|
||||||
(out (open-output-string)))
|
(out (open-output-string)))
|
||||||
(define (finish-row)
|
(define (get-field)
|
||||||
(let ((field (get-output-string out)))
|
(let ((field (get-output-string out)))
|
||||||
|
(cond
|
||||||
|
((and (zero? index) (equal? field "")) field)
|
||||||
|
((and (csv-grammar-quote-non-numeric? grammar) (not quoted?))
|
||||||
|
(or (string->number field)
|
||||||
|
(error "unquoted field is not numeric" field)))
|
||||||
|
(else field))))
|
||||||
|
(define (finish-row)
|
||||||
|
(let ((field (get-field)))
|
||||||
(if (and (zero? index) (equal? field ""))
|
(if (and (zero? index) (equal? field ""))
|
||||||
;; empty row, read again
|
;; empty row, read again
|
||||||
(lp acc index out)
|
(lp acc index #f out)
|
||||||
(kons acc index field))))
|
(kons acc index field))))
|
||||||
(let ((ch (read-char in)))
|
(let ((ch (read-char in)))
|
||||||
(cond
|
(cond
|
||||||
((eof-object? ch)
|
((eof-object? ch)
|
||||||
(let ((field (get-output-string out)))
|
(let ((field (get-field)))
|
||||||
(if (and (zero? index) (equal? field ""))
|
(if (and (zero? index) (equal? field ""))
|
||||||
;; no data
|
;; no data
|
||||||
ch
|
ch
|
||||||
(kons acc index field))))
|
(kons acc index field))))
|
||||||
((memv ch (csv-grammar-separator-chars grammar))
|
((memv ch (csv-grammar-separator-chars grammar))
|
||||||
(lp (kons acc index (get-output-string out))
|
(lp (kons acc index (get-field))
|
||||||
(+ index 1)
|
(+ index 1)
|
||||||
|
#f
|
||||||
(open-output-string)))
|
(open-output-string)))
|
||||||
((eqv? ch (csv-grammar-quote-char grammar))
|
((eqv? ch (csv-grammar-quote-char grammar))
|
||||||
;; TODO: Consider a strict mode to enforce no text
|
;; TODO: Consider a strict mode to enforce no text
|
||||||
;; before/after the quoted text.
|
;; before/after the quoted text.
|
||||||
(csv-read-quoted in out grammar)
|
(csv-read-quoted in out grammar)
|
||||||
(lp acc index out))
|
(lp acc index #t out))
|
||||||
((eqv? ch (csv-grammar-record-separator grammar))
|
((eqv? ch (csv-grammar-record-separator grammar))
|
||||||
(finish-row))
|
(finish-row))
|
||||||
((and (eqv? ch #\return)
|
((and (eqv? ch #\return)
|
||||||
|
@ -150,13 +165,13 @@
|
||||||
(finish-row))
|
(finish-row))
|
||||||
(else
|
(else
|
||||||
(write-char ch out)
|
(write-char ch out)
|
||||||
(lp acc (+ index 1) out))))
|
(lp acc (+ index 1) quoted? out))))
|
||||||
((and (eqv? ch #\newline)
|
((and (eqv? ch #\newline)
|
||||||
(eq? (csv-grammar-record-separator grammar) 'lax))
|
(eq? (csv-grammar-record-separator grammar) 'lax))
|
||||||
(finish-row))
|
(finish-row))
|
||||||
(else
|
(else
|
||||||
(write-char ch out)
|
(write-char ch out)
|
||||||
(lp acc index out))))))))
|
(lp acc index quoted? out))))))))
|
||||||
|
|
||||||
(define (csv-skip-line in grammar)
|
(define (csv-skip-line in grammar)
|
||||||
(let lp ()
|
(let lp ()
|
||||||
|
@ -181,7 +196,7 @@
|
||||||
((eof-object? ch)
|
((eof-object? ch)
|
||||||
(error "unterminated csv quote" (get-output-string out)))
|
(error "unterminated csv quote" (get-output-string out)))
|
||||||
((eqv? ch (csv-grammar-quote-char grammar))
|
((eqv? ch (csv-grammar-quote-char grammar))
|
||||||
(when (and (eqv? ch (csv-grammar-escape-char grammar))
|
(when (and (csv-grammar-quote-doubling-escapes? grammar)
|
||||||
(eqv? ch (peek-char in)))
|
(eqv? ch (peek-char in)))
|
||||||
(write-char (read-char in) out)
|
(write-char (read-char in) out)
|
||||||
(lp)))
|
(lp)))
|
||||||
|
|
Loading…
Add table
Reference in a new issue