mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-19 13:49:17 +02:00
While testing a `(chibi parser)`-based parser I noticed that `parse-string` is the only provided parser combinator which uses a list, instead of a string, as a failure reason. It is not explicitly documented but since all other standard parser combinator use a string for the error reason I assume this to be a bug and have adjusted the `parse-string` combinator accordingly in this commit.
1016 lines
37 KiB
Scheme
1016 lines
37 KiB
Scheme
;; parse.scm -- Parser Combinators
|
|
;; Copyright (c) 2013 Alex Shinn. All rights reserved.
|
|
;; BSD-style license: http://synthcode.com/license.txt
|
|
|
|
;;> \section{Parse Streams}
|
|
|
|
;;> Parse streams are an abstraction to treat ports as proper streams
|
|
;;> so that we can backtrack from previous states. A single
|
|
;;> Parse-Stream record represents a single buffered chunk of text.
|
|
|
|
(define-record-type Parse-Stream
|
|
(%make-parse-stream
|
|
filename port buffer cache offset prev-char line column tail)
|
|
parse-stream?
|
|
;; The file the data came from, for debugging and error reporting.
|
|
(filename parse-stream-filename)
|
|
;; The underlying port.
|
|
(port parse-stream-port)
|
|
;; A vector of characters read from the port. We use a vector
|
|
;; rather than a string for guaranteed O(1) access.
|
|
(buffer parse-stream-buffer)
|
|
;; A vector of caches corresponding to parser successes or failures
|
|
;; starting from the corresponding char. Currently each cache is
|
|
;; just an alist, optimized under the assumption that the number of
|
|
;; possible memoized parsers is relatively small. Note that
|
|
;; memoization is only enabled explicitly.
|
|
(cache parse-stream-cache)
|
|
;; The current offset of filled characters in the buffer.
|
|
;; If offset is non-zero, (vector-ref buffer (- offset 1)) is
|
|
;; valid.
|
|
(offset parse-stream-offset parse-stream-offset-set!)
|
|
;; The previous char before the beginning of this Parse-Stream.
|
|
;; Used for line/word-boundary checks.
|
|
(prev-char parse-stream-prev-char)
|
|
;; The debug info for the start line and column of this chunk.
|
|
(line parse-stream-line)
|
|
(column parse-stream-column)
|
|
;; The successor Parse-Stream chunk, created on demand and filled
|
|
;; from the same port.
|
|
(tail %parse-stream-tail %parse-stream-tail-set!))
|
|
|
|
;; We want to balance avoiding reallocating buffers with avoiding
|
|
;; holding many memoized values in memory.
|
|
(define default-buffer-size 256)
|
|
|
|
;;> Create a parse stream open on the given \var{filename}, with a
|
|
;;> possibly already opened \var{port}.
|
|
|
|
(define (make-parse-stream filename . o)
|
|
(let ((port (if (pair? o) (car o) (open-input-file filename)))
|
|
(len (if (and (pair? o) (pair? (cdr o))) (cadr o) default-buffer-size)))
|
|
(%make-parse-stream
|
|
filename port (make-vector len #f) (make-vector len '()) 0 #f 0 0 #f)))
|
|
|
|
;;> Open \var{filename} and create a parse stream on it.
|
|
|
|
(define (file->parse-stream filename)
|
|
(make-parse-stream filename (open-input-file filename)))
|
|
|
|
;;> Create a parse stream on a string \var{str}.
|
|
|
|
(define (string->parse-stream str)
|
|
(make-parse-stream #f (open-input-string str)))
|
|
|
|
;;> Access the next buffered chunk of a parse stream.
|
|
|
|
(define (parse-stream-tail source)
|
|
(or (%parse-stream-tail source)
|
|
(let* ((len (vector-length (parse-stream-buffer source)))
|
|
(line-info (parse-stream-count-lines source))
|
|
(line (+ (parse-stream-line source) (car line-info)))
|
|
(col (if (zero? (car line-info))
|
|
(+ (parse-stream-column source) (cadr line-info))
|
|
(cadr line-info)))
|
|
(tail (%make-parse-stream (parse-stream-filename source)
|
|
(parse-stream-port source)
|
|
(make-vector len #f)
|
|
(make-vector len '())
|
|
0
|
|
(parse-stream-last-char source)
|
|
line
|
|
col
|
|
#f)))
|
|
(%parse-stream-tail-set! source tail)
|
|
tail)))
|
|
|
|
(define (parse-stream-fill! source i)
|
|
(let ((off (parse-stream-offset source))
|
|
(buf (parse-stream-buffer source)))
|
|
(if (<= off i)
|
|
(do ((off off (+ off 1)))
|
|
((> off i) (parse-stream-offset-set! source off))
|
|
(vector-set! buf off (read-char (parse-stream-port source))))
|
|
#f)))
|
|
|
|
;;> Returns true iff \var{i} is the first character position in the
|
|
;;> parse stream \var{source}.
|
|
|
|
(define (parse-stream-start? source i)
|
|
(and (zero? i) (not (parse-stream-prev-char source))))
|
|
|
|
;;> Returns true iff \var{i} is the last character position in the
|
|
;;> parse stream \var{source}.
|
|
|
|
(define (parse-stream-end? source i)
|
|
(eof-object? (parse-stream-ref source i)))
|
|
|
|
;;> Returns the character in parse stream \var{source} indexed by
|
|
;;> \var{i}.
|
|
|
|
(define (parse-stream-ref source i)
|
|
(parse-stream-fill! source i)
|
|
(vector-ref (parse-stream-buffer source) i))
|
|
|
|
(define (parse-stream-last-char source)
|
|
(let ((buf (parse-stream-buffer source)))
|
|
(let lp ((i (min (- (vector-length buf) 1) (parse-stream-offset source))))
|
|
(if (negative? i)
|
|
(parse-stream-prev-char source)
|
|
(let ((ch (vector-ref buf i)))
|
|
(if (eof-object? ch)
|
|
(lp (- i 1))
|
|
ch))))))
|
|
|
|
(define (parse-stream-char-before source i)
|
|
(if (> i (parse-stream-offset source))
|
|
(parse-stream-ref source (- i 1))
|
|
(parse-stream-prev-char source)))
|
|
|
|
(define (parse-stream-max-char source)
|
|
(let ((buf (parse-stream-buffer source)))
|
|
(let lp ((i (min (- (vector-length buf) 1)
|
|
(parse-stream-offset source))))
|
|
(if (or (negative? i)
|
|
(char? (vector-ref buf i)))
|
|
i
|
|
(lp (- i 1))))))
|
|
|
|
(define (parse-stream-count-lines source . o)
|
|
(let* ((buf (parse-stream-buffer source))
|
|
(end (if (pair? o) (car o) (vector-length buf))))
|
|
(let lp ((i 0) (from 0) (lines 0))
|
|
(if (>= i end)
|
|
(list lines (- i from) from)
|
|
(let ((ch (vector-ref buf i)))
|
|
(cond
|
|
((not (char? ch))
|
|
(list lines (- i from) from))
|
|
((eqv? ch #\newline)
|
|
(lp (+ i 1) i (+ lines 1)))
|
|
(else
|
|
(lp (+ i 1) from lines))))))))
|
|
|
|
(define (parse-stream-end-of-line source i)
|
|
(let* ((buf (parse-stream-buffer source))
|
|
(end (vector-length buf)))
|
|
(let lp ((i i))
|
|
(if (>= i end)
|
|
i
|
|
(let ((ch (vector-ref buf i)))
|
|
(if (or (not (char? ch)) (eqv? ch #\newline))
|
|
i
|
|
(lp (+ i 1))))))))
|
|
|
|
(define (parse-stream-debug-info s i)
|
|
;; i is the failed parse index, but we want the furthest reached
|
|
;; location
|
|
(if (%parse-stream-tail s)
|
|
(parse-stream-debug-info (%parse-stream-tail s) i)
|
|
(let* ((line-info
|
|
(parse-stream-count-lines s (parse-stream-max-char s)))
|
|
(line (+ (parse-stream-line s) (car line-info)))
|
|
(col (if (zero? (car line-info))
|
|
(+ (parse-stream-column s) (cadr line-info))
|
|
(cadr line-info)))
|
|
(from (car (cddr line-info)))
|
|
(to (parse-stream-end-of-line s (+ from 1)))
|
|
(str (parse-stream-substring s from s to)))
|
|
(list line col str))))
|
|
|
|
(define (parse-stream-next-source source i)
|
|
(if (>= (+ i 1) (vector-length (parse-stream-buffer source)))
|
|
(parse-stream-tail source)
|
|
source))
|
|
|
|
(define (parse-stream-next-index source i)
|
|
(if (>= (+ i 1) (vector-length (parse-stream-buffer source)))
|
|
0
|
|
(+ i 1)))
|
|
|
|
(define (parse-stream-close source)
|
|
(close-input-port (parse-stream-port source)))
|
|
|
|
(define (vector-substring vec start . o)
|
|
(let* ((end (if (pair? o) (car o) (vector-length vec)))
|
|
(res (make-string (- end start))))
|
|
(do ((i start (+ i 1)))
|
|
((= i end) res)
|
|
(string-set! res (- i start) (vector-ref vec i)))))
|
|
|
|
(define (parse-stream-in-tail? s0 s1)
|
|
(let ((s0^ (%parse-stream-tail s0)))
|
|
(or (eq? s0^ s1)
|
|
(and s0^ (parse-stream-in-tail? s0^ s1)))))
|
|
|
|
(define (parse-stream< s0 i0 s1 i1)
|
|
(if (eq? s0 s1)
|
|
(< i0 i1)
|
|
(parse-stream-in-tail? s0 s1)))
|
|
|
|
;;> Returns a string composed of the characters starting at parse
|
|
;;> stream \var{s0} index \var{i0} (inclusive), and ending at \var{s1}
|
|
;;> index \var{i1} (exclusive).
|
|
|
|
(define (parse-stream-substring s0 i0 s1 i1)
|
|
(cond
|
|
((eq? s0 s1)
|
|
(parse-stream-fill! s0 i1)
|
|
(vector-substring (parse-stream-buffer s0) i0 i1))
|
|
(else
|
|
(let lp ((s (parse-stream-tail s0))
|
|
(res (list (vector-substring (parse-stream-buffer s0) i0))))
|
|
(let ((buf (parse-stream-buffer s)))
|
|
(cond
|
|
((eq? s s1)
|
|
(apply string-append
|
|
(reverse (cons (vector-substring buf 0 i1) res))))
|
|
(else
|
|
(lp (parse-stream-tail s)
|
|
(cons (vector-substring buf 0) res)))))))))
|
|
|
|
(define (parse-stream-cache-cell s i f)
|
|
(assv f (vector-ref (parse-stream-cache s) i)))
|
|
|
|
(define (parse-stream-cache-set! s i f x)
|
|
(let ((cache (vector-ref (parse-stream-cache s) i)))
|
|
(cond
|
|
((assv f cache)
|
|
=> (lambda (cell)
|
|
;; prefer longer matches
|
|
(if (and (pair? (cdr cell))
|
|
(parse-stream< (car (cddr cell)) (cadr (cddr cell)) s i))
|
|
(set-cdr! cell x))))
|
|
(else
|
|
(vector-set! (parse-stream-cache s) i (cons (cons f x) cache))))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;> \section{Parser Interface}
|
|
|
|
;;> Combinator to indicate failure.
|
|
|
|
(define (parse-failure s i reason)
|
|
(let ((line+col (parse-stream-debug-info s i)))
|
|
(error "incomplete parse at" (append line+col (list reason)))))
|
|
|
|
;;> Call the parser combinator \var{f} on the parse stream
|
|
;;> \var{source}, starting at index \var{index}, passing the result to
|
|
;;> the given success continuation \var{sk}, which should be a
|
|
;;> procedure of the form \scheme{(result source index fail)}. The
|
|
;;> optional failure continuation should be a procedure of the form
|
|
;;> \scheme{(source index reason)}, and defaults to just returning
|
|
;;> \scheme{#f}.
|
|
|
|
(define (call-with-parse f source index sk . o)
|
|
(let ((s (if (string? source) (string->parse-stream source) source))
|
|
(fk (if (pair? o) (car o) (lambda (s i reason) #f))))
|
|
(f s index sk fk)))
|
|
|
|
;;> Call the parser combinator \var{f} on the parse stream
|
|
;;> \var{source}, at index \var{index}, and return the result, or
|
|
;;> \scheme{#f} if parsing fails.
|
|
|
|
(define (parse f source . o)
|
|
(let ((index (if (pair? o) (car o) 0)))
|
|
(call-with-parse f source index (lambda (r s i fk) r))))
|
|
|
|
;;> Call the parser combinator \var{f} on the parse stream
|
|
;;> \var{source}, at index \var{index}. If the entire source is not
|
|
;;> parsed, raises an error, otherwise returns the result.
|
|
|
|
(define (parse-fully f source . o)
|
|
(let ((s (if (string? source) (string->parse-stream source) source))
|
|
(index (if (pair? o) (car o) 0)))
|
|
(call-with-parse
|
|
f s index
|
|
(lambda (r s i fk)
|
|
(if (parse-stream-end? s i) r (fk s i "incomplete parse")))
|
|
parse-failure)))
|
|
|
|
;;> The fundamental parse iterator. Repeatedly applies the parser
|
|
;;> combinator \var{f} to \var{source}, starting at \var{index}, as
|
|
;;> long as a valid parse is found. On each successful parse applies
|
|
;;> the procedure \var{kons} to the parse result and the previous
|
|
;;> \var{kons} result, beginning with \var{knil}. If no parses
|
|
;;> succeed returns \var{knil}.
|
|
|
|
(define (parse-fold f kons knil source . o)
|
|
(let lp ((p (if (string? source) (string->parse-stream source) source))
|
|
(index (if (pair? o) (car o) 0))
|
|
(acc knil))
|
|
(f p index (lambda (r s i fk) (lp s i (kons r acc))) (lambda (s i r) acc))))
|
|
|
|
;;> Parse as many of the parser combinator \var{f} from the parse
|
|
;;> stream \var{source}, starting at \var{index}, as possible, and
|
|
;;> return the result as a list.
|
|
|
|
(define (parse->list f source . o)
|
|
(let ((index (if (pair? o) (car o) 0)))
|
|
(reverse (parse-fold f cons '() source index))))
|
|
|
|
;;> As \scheme{parse->list} but requires the entire source be parsed
|
|
;;> with no left over characters, signalling an error otherwise.
|
|
|
|
(define (parse-fully->list f source . o)
|
|
(let lp ((s (if (string? source) (string->parse-stream source) source))
|
|
(index (if (pair? o) (car o) 0))
|
|
(acc '()))
|
|
(f s index
|
|
(lambda (r s i fk)
|
|
(if (eof-object? r) (reverse acc) (lp s i (cons r acc))))
|
|
(lambda (s i reason) (error "incomplete parse")))))
|
|
|
|
;;> Return a new parser combinator with the same behavior as \var{f},
|
|
;;> but on failure replaces the reason with \var{reason}. This can be
|
|
;;> useful to provide more descriptive parse failure reasons when
|
|
;;> chaining combinators. For example, \scheme{parse-string} just
|
|
;;> expects to parse a single fixed string. If it were defined in
|
|
;;> terms of \scheme{parse-char}, failure would indicate some char
|
|
;;> failed to match, but it's more useful to describe the whole string
|
|
;;> we were expecting to see.
|
|
|
|
(define (parse-with-failure-reason f reason)
|
|
(lambda (r s i fk)
|
|
(f r s i (lambda (s i r) (fk s i reason)))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;> \section{Basic Parsing Combinators}
|
|
|
|
;;> Parse nothing successfully.
|
|
|
|
(define parse-epsilon
|
|
(lambda (source index sk fk)
|
|
(sk #t source index fk)))
|
|
|
|
;;> Parse any single character successfully. Fails at end of input.
|
|
|
|
(define parse-anything
|
|
(lambda (source index sk fk)
|
|
(if (parse-stream-end? source index)
|
|
(fk source index "end of input")
|
|
(sk (parse-stream-ref source index)
|
|
(parse-stream-next-source source index)
|
|
(parse-stream-next-index source index)
|
|
fk))))
|
|
|
|
;;> Always fail to parse.
|
|
|
|
(define parse-nothing
|
|
(lambda (source index sk fk)
|
|
(fk source index "nothing")))
|
|
|
|
;;> The disjunction combinator. Returns the first combinator that
|
|
;;> succeeds parsing from the same source and index.
|
|
|
|
(define (parse-or f . o)
|
|
(if (null? o)
|
|
f
|
|
(let ((g (apply parse-or o)))
|
|
(lambda (source index sk fk)
|
|
(let ((fk2 (lambda (s i r)
|
|
(g source index sk fk
|
|
;; (lambda (s2 i2 r2)
|
|
;; (fk s2 i2 `(or ,r ,r2)))
|
|
))))
|
|
(f source index sk fk2))))))
|
|
|
|
;;> The conjunction combinator. If both \var{f} and \var{g} parse
|
|
;;> successfully starting at the same source and index, returns the
|
|
;;> result of \var{g}. Otherwise fails.
|
|
|
|
(define (parse-and f g)
|
|
(lambda (source index sk fk)
|
|
(f source index (lambda (r s i fk) (g source index sk fk)) fk)))
|
|
|
|
;;> The negation combinator. If \var{f} succeeds, fails, otherwise
|
|
;;> succeeds with \var{#t}.
|
|
|
|
(define (parse-not f)
|
|
(lambda (source index sk fk)
|
|
(f source index (lambda (r s i fk) (fk s i "not"))
|
|
(lambda (s i r) (sk #t source index fk)))))
|
|
|
|
(define (parse-seq-list o)
|
|
(cond
|
|
((null? o)
|
|
parse-epsilon)
|
|
((null? (cdr o))
|
|
(let ((f (car o)))
|
|
(lambda (s i sk fk)
|
|
(f s i (lambda (r s i fk)
|
|
(sk (if (eq? r ignored-value) '() (list r)) s i fk))
|
|
fk))))
|
|
(else
|
|
(let* ((f (car o))
|
|
(o (cdr o))
|
|
(g (car o))
|
|
(o (cdr o))
|
|
(g (if (pair? o)
|
|
(apply parse-seq g o)
|
|
(lambda (s i sk fk)
|
|
(g s i (lambda (r s i fk)
|
|
(sk (if (eq? r ignored-value) '() (list r))
|
|
s i fk))
|
|
fk)))))
|
|
(lambda (source index sk fk)
|
|
(f source
|
|
index
|
|
(lambda (r s i fk)
|
|
(g s i (lambda (r2 s i fk)
|
|
(let ((r2 (if (eq? r ignored-value) r2 (cons r r2))))
|
|
(sk r2 s i fk)))
|
|
fk))
|
|
fk))))))
|
|
|
|
;;> The sequence combinator. Each combinator is applied in turn just
|
|
;;> past the position of the previous. If all succeed, returns a list
|
|
;;> of the results in order, skipping any ignored values.
|
|
|
|
(define (parse-seq . o)
|
|
(parse-seq-list o))
|
|
|
|
;;> Convert the list of parser combinators \var{ls} to a
|
|
;;> \scheme{parse-seq} sequence.
|
|
|
|
(define (list->parse-seq ls)
|
|
(if (null? (cdr ls)) (car ls) (parse-seq-list ls)))
|
|
|
|
;;> The optional combinator. Parse the combinator \var{f} (in
|
|
;;> sequence with any additional combinator args \var{o}), and return
|
|
;;> the result, or parse nothing successully on failure.
|
|
|
|
(define (parse-optional f . o)
|
|
(if (pair? o)
|
|
(parse-optional (apply parse-seq f o))
|
|
(lambda (source index sk fk)
|
|
(f source index sk (lambda (s i r) (sk #f source index fk))))))
|
|
|
|
(define ignored-value (list 'ignore))
|
|
|
|
;;> The repetition combinator. Parse \var{f} repeatedly and return a
|
|
;;> list of the results. \var{lo} is the minimum number of parses
|
|
;;> (deafult 0) to be considered a successful parse, and \var{hi} is
|
|
;;> the maximum number (default infinite) before stopping.
|
|
|
|
(define (parse-repeat f . o)
|
|
(let ((lo (if (pair? o) (car o) 0))
|
|
(hi (and (pair? o) (pair? (cdr o)) (cadr o))))
|
|
(lambda (source0 index0 sk fk)
|
|
(let repeat ((source source0) (index index0) (fk fk) (j 0) (res '()))
|
|
(let ((fk (if (>= j lo)
|
|
(lambda (s i r) (sk (reverse res) source index fk))
|
|
fk)))
|
|
(if (and hi (= j hi))
|
|
(sk (reverse res) source index fk)
|
|
(f source
|
|
index
|
|
(lambda (r s i fk) (repeat s i fk (+ j 1) (cons r res)))
|
|
fk)))))))
|
|
|
|
;;> Parse \var{f} one or more times.
|
|
|
|
(define (parse-repeat+ f)
|
|
(parse-repeat f 1))
|
|
|
|
;;> Parse \var{f} and apply the procedure \var{proc} to the result on success.
|
|
|
|
(define (parse-map f proc)
|
|
(lambda (source index sk fk)
|
|
(f source index (lambda (res s i fk) (sk (proc res) s i fk)) fk)))
|
|
|
|
;;> Parse \var{f} and apply the procedure \var{proc} to the substring
|
|
;;> of the parsed data. \var{proc} defaults to the identity.
|
|
|
|
(define (parse-map-substring f . o)
|
|
(let ((proc (if (pair? o) (car o) (lambda (res) res))))
|
|
(lambda (source index sk fk)
|
|
(f source
|
|
index
|
|
(lambda (res s i fk)
|
|
(sk (proc (parse-stream-substring source index s i)) s i fk))
|
|
fk))))
|
|
|
|
;;> Parses the same streams as \var{f} but ignores the result on
|
|
;;> success. Inside a \scheme{parse-seq} the result will not be
|
|
;;> included in the list of results. Useful for discarding
|
|
;;> boiler-plate without the need for post-processing results.
|
|
|
|
(define (parse-ignore f)
|
|
(parse-map f (lambda (res) ignored-value)))
|
|
|
|
;;> Parse with \var{f} and further require \var{check?} to return true
|
|
;;> when applied to the result.
|
|
|
|
(define (parse-assert f check?)
|
|
(lambda (source index sk fk)
|
|
(f source
|
|
index
|
|
(lambda (res s i fk)
|
|
(if (check? res) (sk res s i fk) (fk s i "assertion failed")))
|
|
fk)))
|
|
|
|
;;> Parse with \var{f} once and keep the first result, not allowing
|
|
;;> further backtracking within \var{f}.
|
|
|
|
(define (parse-atomic f)
|
|
(lambda (source index sk fk)
|
|
(f source index (lambda (res s i fk2) (sk res s i fk)) fk)))
|
|
|
|
;;> Parse with \var{f} once, keep the first result, and commit to the
|
|
;;> current parse path, discarding any prior backtracking options.
|
|
|
|
(define (parse-commit f)
|
|
(lambda (source index sk fk)
|
|
(f source index (lambda (res s i fk) (sk res s i (lambda (s i r) #f))) fk)))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;> \section{Boundary Checks}
|
|
|
|
;;> Returns true iff \var{index} is the first index of the first parse
|
|
;;> stream \var{source}.
|
|
|
|
(define parse-beginning
|
|
(lambda (source index sk fk)
|
|
(if (parse-stream-start? source index)
|
|
(sk #t source index fk)
|
|
(fk source index "expected beginning"))))
|
|
|
|
;;> Returns true iff \var{index} is the last index of the last parse
|
|
;;> stream \var{source}.
|
|
|
|
(define parse-end
|
|
(lambda (source index sk fk)
|
|
(if (parse-stream-end? source index)
|
|
(sk #t source index fk)
|
|
(fk source index "expected end"))))
|
|
|
|
;;> Returns true iff \var{source}, \var{index} indicate the beginning
|
|
;;> of a line (or the entire stream).
|
|
|
|
(define parse-beginning-of-line
|
|
(lambda (source index sk fk)
|
|
(let ((before (parse-stream-char-before source index)))
|
|
(if (or (not before) (eqv? #\newline before))
|
|
(sk #t source index fk)
|
|
(fk source index "expected beginning of line")))))
|
|
|
|
;;> Returns true iff \var{source}, \var{index} indicate the end of a
|
|
;;> line (or the entire stream).
|
|
|
|
(define parse-end-of-line
|
|
(lambda (source index sk fk)
|
|
(if (or (parse-stream-end? source index)
|
|
(eqv? #\newline (parse-stream-ref source index)))
|
|
(sk #t source index fk)
|
|
(fk source index "expected end of line"))))
|
|
|
|
(define (char-word? ch)
|
|
(or (char-alphabetic? ch) (eqv? ch #\_)))
|
|
|
|
;;> Returns true iff \var{source}, \var{index} indicate the beginning
|
|
;;> of a word (or the entire stream).
|
|
|
|
(define parse-beginning-of-word
|
|
(lambda (source index sk fk)
|
|
(let ((before (parse-stream-char-before source index)))
|
|
(if (and (or (not before) (not (char-word? before)))
|
|
(not (parse-stream-end? source index))
|
|
(char-word? (parse-stream-ref source index)))
|
|
(sk #t source index fk)
|
|
(fk source index "expected beginning of word")))))
|
|
|
|
;;> Returns true iff \var{source}, \var{index} indicate the end of a
|
|
;;> word (or the entire stream).
|
|
|
|
(define parse-end-of-word
|
|
(lambda (source index sk fk)
|
|
(let ((before (parse-stream-char-before source index)))
|
|
(if (and before
|
|
(char-word? before)
|
|
(or (parse-stream-end? source index)
|
|
(not (char-word? (parse-stream-ref source index)))))
|
|
(sk #t source index fk)
|
|
(fk source index "expected end of word")))))
|
|
|
|
;;> Parse the combinator \var{word} (default a \scheme{parse-token} of
|
|
;;> \scheme{char-alphabetic?} or underscores), ensuring it begins and
|
|
;;> ends on a word boundary.
|
|
|
|
(define (parse-word . o)
|
|
(let ((word (if (pair? o) (car o) (parse-token char-word?))))
|
|
(lambda (source index sk fk)
|
|
(parse-map
|
|
(parse-seq parse-beginning-of-word
|
|
word
|
|
parse-end-of-word)
|
|
cadr))))
|
|
|
|
;;> As \scheme{parse-word}, but instead of an arbitrary word
|
|
;;> combinator takes a character predicate \var{pred} (conjoined with
|
|
;;> \scheme{char-alphabetic?} or underscore), and parses a sequence of
|
|
;;> those characters with \scheme{parse-token}. Returns the parsed
|
|
;;> substring.
|
|
|
|
(define (parse-word+ . o)
|
|
(let ((pred (if (pair? o)
|
|
(lambda (ch) (and (char-word? ch) ((car o) ch)))
|
|
char-word?)))
|
|
(parse-word (parse-token pred))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;> \section{Constant Parsers}
|
|
|
|
(define (parse-char-pred pred)
|
|
(lambda (source index sk fk)
|
|
(let ((ch (parse-stream-ref source index)))
|
|
(if (and (char? ch) (pred ch))
|
|
(sk ch
|
|
(parse-stream-next-source source index)
|
|
(parse-stream-next-index source index)
|
|
fk)
|
|
(fk source index "failed char pred")))))
|
|
|
|
(define (x->char-predicate x)
|
|
(cond
|
|
((char? x)
|
|
(lambda (ch) (eqv? ch x)))
|
|
((char-set? x)
|
|
(lambda (ch) (and (char? ch) (char-set-contains? x ch))))
|
|
((procedure? x)
|
|
(lambda (ch) (and (char? ch) (x ch))))
|
|
(else
|
|
(error "don't know how to handle char predicate" x))))
|
|
|
|
;;> Parse a single char which matches \var{x}, which can be a
|
|
;;> character, character set, or arbitrary procedure.
|
|
|
|
(define (parse-char x)
|
|
(parse-char-pred (x->char-predicate x)))
|
|
|
|
;;> Parse a single char which does not match \var{x}, which can be a
|
|
;;> character, character set, or arbitrary procedure.
|
|
|
|
(define (parse-not-char x)
|
|
(let ((pred (x->char-predicate x)))
|
|
(parse-char-pred (lambda (ch) (not (pred ch))))))
|
|
|
|
;;> Parse the exact string \var{str}.
|
|
|
|
(define (parse-string str)
|
|
(parse-map (parse-with-failure-reason
|
|
(parse-seq-list (map parse-char (string->list str)))
|
|
(string-append "expected '" str "'"))
|
|
list->string))
|
|
|
|
;;> Parse a sequence of characters matching \var{x} as with
|
|
;;> \scheme{parse-char}, and return the resulting substring.
|
|
|
|
(define (parse-token x)
|
|
;; (parse-map (parse-repeat+ (parse-char x)) list->string)
|
|
;; Tokens are atomic - we don't want to split them at any point in
|
|
;; the middle - so the implementation is slightly more complex than
|
|
;; the above. With a sane grammar the result would be the same
|
|
;; either way, but this provides a useful optimization.
|
|
(let ((f (parse-char x)))
|
|
(lambda (source0 index0 sk fk)
|
|
(let lp ((source source0) (index index0))
|
|
(f source
|
|
index
|
|
(lambda (r s i fk) (lp s i))
|
|
(lambda (s i r)
|
|
(if (and (eq? source source0) (eqv? index index0))
|
|
(fk s i r)
|
|
(sk (parse-stream-substring source0 index0 source index)
|
|
source index fk))))))))
|
|
|
|
;;> We provide an extended subset of SRE syntax (see
|
|
;;> \hyperlink["http://srfi.schemers.org/srfi-115/srfi-115.html"]{SRFI 115}),
|
|
;;> taking advantage of more general parsing features. These are just
|
|
;;> translated directly into parser combinators, with characters and
|
|
;;> strings implicitly matching themselves. For example, \scheme{'(or
|
|
;;> "foo" "bar")} matches either of the strings \scheme{"foo"} or
|
|
;;> \scheme{"bar"}. Existing parser combinators may be embedded directly.
|
|
;;> This is of course more powerful than SREs since it is not
|
|
;;> restricted to regular languages (or in fact any languages), though
|
|
;;> it does not provide the same performance guarantees.
|
|
|
|
(define (parse-sre x)
|
|
(define (ranges->char-set ranges)
|
|
(let lp ((ls ranges) (res (char-set)))
|
|
(cond
|
|
((null? ls)
|
|
res)
|
|
((string? (car ls))
|
|
(lp (append (string->list (car ls)) (cdr ls)) res))
|
|
((null? (cdr ls))
|
|
(error "incomplete range in / char-set" ranges))
|
|
(else
|
|
(let ((cs (ucs-range->char-set (char->integer (car ls))
|
|
(+ 1 (char->integer (cadr ls))))))
|
|
(lp (cddr ls) (char-set-union cs res)))))))
|
|
(define (sre-list->char-set ls)
|
|
(apply char-set-union (map sre->char-set ls)))
|
|
(define (sre->char-set x)
|
|
(cond
|
|
((char? x) (char-set x))
|
|
((string? x) (if (= 1 (string-length x))
|
|
(string->char-set x)
|
|
(error "multi-element string in char-set" x)))
|
|
((pair? x)
|
|
(if (and (string? (car x)) (null? (cdr x)))
|
|
(string->char-set (car x))
|
|
(case (car x)
|
|
((/) (ranges->char-set (cdr x)))
|
|
((~) (char-set-complement (sre-list->char-set (cdr x))))
|
|
((-) (apply char-set-difference (map sre->char-set (cdr x))))
|
|
((&) (apply char-set-intersection (map sre->char-set (cdr x))))
|
|
((or) (sre-list->char-set (cdr x)))
|
|
(else (error "unknown SRE char-set operator" x)))))
|
|
(else (error "unknown SRE char-set" x))))
|
|
(cond
|
|
((procedure? x) ; an embedded parser
|
|
x)
|
|
((or (char? x) (char-set? x))
|
|
(parse-char x))
|
|
((string? x)
|
|
(parse-string x))
|
|
((null? x)
|
|
parse-epsilon)
|
|
((list? x)
|
|
(case (car x)
|
|
((: seq) (parse-seq-list (map parse-sre (cdr x))))
|
|
((or) (apply parse-or (map parse-sre (cdr x))))
|
|
((and) (apply parse-and (map parse-sre (cdr x))))
|
|
((not) (apply parse-not (map parse-sre (cdr x))))
|
|
((*) (parse-repeat (list->parse-seq (map parse-sre (cdr x)))))
|
|
((+) (parse-repeat+ (list->parse-seq (map parse-sre (cdr x)))))
|
|
((?) (parse-optional (parse-seq-list (map parse-sre (cdr x)))))
|
|
((=> ->) (list->parse-seq (map parse-sre (cddr x))))
|
|
((word) (apply parse-word (cdr x)))
|
|
((word+) (apply parse-word+ (cdr x)))
|
|
((/ ~ & -) (parse-char (sre->char-set x)))
|
|
(else
|
|
(if (string? (car x))
|
|
(parse-char (sre->char-set x))
|
|
(error "unknown SRE operator" x)))))
|
|
(else
|
|
(case x
|
|
((any) parse-anything)
|
|
((nonl) (parse-char (lambda (ch) (not (eqv? ch #\newline)))))
|
|
((space whitespace) (parse-char char-whitespace?))
|
|
((digit numeric) (parse-char char-numeric?))
|
|
((alpha alphabetic) (parse-char char-alphabetic?))
|
|
((alnum alphanumeric)
|
|
(parse-char-pred (lambda (ch) (or (char-alphabetic? ch) (char-numeric? ch)))))
|
|
((lower lower-case) (parse-char char-lower-case?))
|
|
((upper upper-case) (parse-char char-upper-case?))
|
|
((word) (parse-word))
|
|
((bow) parse-beginning-of-word)
|
|
((eow) parse-end-of-word)
|
|
((bol) parse-beginning-of-line)
|
|
((eol) parse-end-of-line)
|
|
((bos) parse-beginning)
|
|
((eos) parse-end)
|
|
(else (error "unknown SRE parser" x))))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;> \section{Laziness}
|
|
|
|
;;> A delayed combinator. This is equivalent to the parser combinator
|
|
;;> \var{f}, but is delayed so it can be more efficient if never used
|
|
;;> and \var{f} is expensive to compute. Moreover, it can allow
|
|
;;> self-referentiality as in:
|
|
;;>
|
|
;;> \schemeblock{
|
|
;;> (letrec* ((f (parse-lazy (parse-or (parse-seq g f) h))))
|
|
;;> ...)
|
|
;;> }
|
|
|
|
(define-syntax parse-lazy
|
|
(syntax-rules ()
|
|
((parse-lazy f)
|
|
(let ((g (delay f)))
|
|
(lambda (source index sk fk)
|
|
((force g) source index sk fk))))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;> \section{Memoization}
|
|
|
|
;; debugging
|
|
(define *procedures* '())
|
|
(define (procedure-name f)
|
|
(cond ((assq f *procedures*) => cdr) (else #f)))
|
|
(define (procedure-name-set! f name)
|
|
(set! *procedures* (cons (cons f name) *procedures*)))
|
|
|
|
(define memoized-failure (list 'failure))
|
|
|
|
;;> Parse the same strings as \var{f}, but memoize the result at each
|
|
;;> source and index to avoid exponential backtracking. \var{name} is
|
|
;;> provided for debugging only.
|
|
|
|
(define (parse-memoize name f)
|
|
;;(if (not (procedure-name f)) (procedure-name-set! f name))
|
|
(lambda (source index sk fk)
|
|
(cond
|
|
((parse-stream-cache-cell source index f)
|
|
=> (lambda (cell)
|
|
(if (and (pair? (cdr cell)) (eq? memoized-failure (cadr cell)))
|
|
(fk source index (cddr cell))
|
|
(apply sk (append (cdr cell) (list fk))))))
|
|
(else
|
|
(f source
|
|
index
|
|
(lambda (res s i fk)
|
|
(parse-stream-cache-set! source index f (list res s i))
|
|
(sk res s i fk))
|
|
(lambda (s i r)
|
|
(if (not (pair? (parse-stream-cache-cell source index f)))
|
|
(parse-stream-cache-set!
|
|
source index f (cons memoized-failure r)))
|
|
(fk s i r)))))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;> \section{Syntax}
|
|
|
|
;;> The four basic interfaces are \scheme{grammar},
|
|
;;> \scheme{define-grammar}, and their unmemoized variants
|
|
;;> \scheme{grammar/unmemoized} and
|
|
;;> \scheme{define-grammar/unmemoized}. This is optimized for the
|
|
;;> common case - generally you want to memoize grammars, and may or
|
|
;;> may not want to memoize the smaller lexical components.
|
|
|
|
;;> \macro{(grammar/unmemoized init (rule (clause [action]) ...) ...)}
|
|
;;>
|
|
;;> Describe an grammar for the given named \var{rules} and return the
|
|
;;> rule named \var{init}. The rules are parser combinators which
|
|
;;> match the first \var{clause} which succeeds, and returns the
|
|
;;> corresponding action. Each \var{clause} is an SRE parser as in
|
|
;;> \scheme{parse-sre}, which may include embdedded parser combinators
|
|
;;> with \scheme{unquote} (,). In particular, the rule itself and any
|
|
;;> other rules can be referenced in this way. The optional
|
|
;;> \var{action}, which defaults to the normal result of the clause
|
|
;;> parser, is a normal Scheme expression with all \scheme{->} named
|
|
;;> expressions in clause bound to the corresponding result.
|
|
;;> Alternately, \var{action} can be of the form \scheme{=> receiver}
|
|
;;> to send the results directly to a success continuation as in
|
|
;;> \scheme{call-with-parse}.
|
|
|
|
(define-syntax grammar/unmemoized
|
|
(syntax-rules ()
|
|
((grammar/unmemoized init (rule (clause . action) ...) ...)
|
|
(letrec ((rule (parse-or (grammar-clause clause . action) ...))
|
|
...)
|
|
init))))
|
|
|
|
;;> \macro{(grammar init (rule (clause [action]) ...) ...)}
|
|
;;>
|
|
;;> Equivalent to \scheme{grammar} but memoizes each clause. Parsers
|
|
;;> nested within each clause are not automatically memoized, so if
|
|
;;> necessary should be memoized explicitly or split out into separate
|
|
;;> rules.
|
|
|
|
(define-syntax grammar
|
|
(syntax-rules ()
|
|
((grammar init (rule (clause . action) ...) ...)
|
|
(letrec ((rule
|
|
(parse-memoize
|
|
'rule
|
|
(parse-or (grammar-clause clause . action) ...)))
|
|
...)
|
|
init))))
|
|
|
|
;;> \macro{(define-grammar/unmemoized name (rule (clause [action]) ...) ...)}
|
|
;;>
|
|
;;> Similar to \scheme{grammar/unmemoized}, instead of returning a
|
|
;;> single entry point parser defines each \var{rule} as its own
|
|
;;> parser. Also defines \var{name} as an alist mapping rule names to
|
|
;;> their values.
|
|
|
|
(define-syntax define-grammar/unmemoized
|
|
(syntax-rules ()
|
|
((define-grammar/unmemoized name (rule (clause . action) ...) ...)
|
|
(begin
|
|
(define rule (parse-or (grammar-clause clause . action) ...))
|
|
...
|
|
(define name (list (cons 'rule rule) ...))))))
|
|
|
|
;;> \macro{(define-grammar name (rule (clause [action]) ...) ...)}
|
|
;;>
|
|
;;> The memoized version of \scheme{define-grammar/unmemoized}.
|
|
;;>
|
|
;;> Example:
|
|
;;>
|
|
;;> \example{
|
|
;;> (define-grammar calc
|
|
;;> (space ((* ,(parse-char char-whitespace?))))
|
|
;;> (number ((-> n (+ ,(parse-char char-numeric?)))
|
|
;;> (string->number (list->string n))))
|
|
;;> (simple ((-> n ,number) n)
|
|
;;> ((: "(" (=> e1 ,term) ")") e1))
|
|
;;> (term-op ("*" *)
|
|
;;> ("/" /)
|
|
;;> ("%" modulo))
|
|
;;> (term ((: (-> e1 ,simple) ,space (-> op ,term-op) ,space (-> e2 ,term))
|
|
;;> (op e1 e2))
|
|
;;> ((-> e1 ,simple)
|
|
;;> e1)))
|
|
;;> (parse term "12 / (2*3)")
|
|
;;> }
|
|
|
|
(define-syntax define-grammar
|
|
(syntax-rules ()
|
|
((define-grammar name (rule (clause . action) ...) ...)
|
|
(begin
|
|
(define rule
|
|
(parse-memoize 'rule (parse-or (grammar-clause clause . action) ...)))
|
|
...
|
|
(define name (list (cons 'rule rule) ...))))))
|
|
|
|
;; Most of the implementation goes into how we parse a single grammar
|
|
;; clause. This is hard to read if you're not used to CPS macros.
|
|
|
|
(define-syntax grammar-clause
|
|
(syntax-rules ()
|
|
((grammar-clause clause . action)
|
|
(grammar-extract clause () (grammar-action action)))))
|
|
|
|
(define-syntax grammar-extract
|
|
(syntax-rules (unquote -> => : seq * + ? or and)
|
|
;; Named patterns
|
|
((grammar-extract (-> name pattern) bindings k)
|
|
(grammar-extract pattern bindings (grammar-bind name k)))
|
|
((grammar-extract (-> name pattern ...) bindings k)
|
|
(grammar-extract (: pattern ...) bindings (grammar-bind name k)))
|
|
;; Allow => as an alias for -> for SRE compatibility.
|
|
((grammar-extract (=> name pattern) bindings k)
|
|
(grammar-extract pattern bindings (grammar-bind name k)))
|
|
((grammar-extract (=> name pattern ...) bindings k)
|
|
(grammar-extract (: pattern ...) bindings (grammar-bind name k)))
|
|
((grammar-extract ,name bindings k)
|
|
(grammar-bind name k (parse-sre name) bindings))
|
|
;; Walk container patterns.
|
|
((grammar-extract (: x y ...) bindings k)
|
|
(grammar-extract x bindings (grammar-map parse-seq (y ...) () k)))
|
|
((grammar-extract (* x) bindings k)
|
|
(grammar-extract x bindings (grammar-map parse-repeat () () k)))
|
|
((grammar-extract (* x y ...) bindings k)
|
|
(grammar-extract (: x y ...) bindings (grammar-map parse-repeat () () k)))
|
|
((grammar-extract (+ x) bindings k)
|
|
(grammar-extract x bindings (grammar-map parse-repeat+ () () k)))
|
|
((grammar-extract (+ x y ...) bindings k)
|
|
(grammar-extract (: x y ...) bindings (grammar-map parse-repeat+ () () k)))
|
|
((grammar-extract (? x y ...) bindings k)
|
|
(grammar-extract x bindings (grammar-map parse-optional (y ...) () k)))
|
|
((grammar-extract (or x y ...) bindings k)
|
|
(grammar-extract x bindings (grammar-map parse-or (y ...) () k)))
|
|
((grammar-extract (and x y ...) bindings k)
|
|
(grammar-extract x bindings (grammar-map parse-and (y ...) () k)))
|
|
;; Anything else is an implicitly quasiquoted SRE
|
|
((grammar-extract pattern bindings (k ...))
|
|
(k ... (parse-sre `pattern) bindings))))
|
|
|
|
(define-syntax grammar-map
|
|
(syntax-rules ()
|
|
((grammar-map f () (args ...) (k ...) x bindings)
|
|
(k ... (f args ... x) bindings))
|
|
((grammar-map f (y . rest) (args ...) k x bindings)
|
|
(grammar-extract y bindings (grammar-map f rest (args ... x) k)))))
|
|
|
|
(define-syntax grammar-action
|
|
(syntax-rules (=>)
|
|
((grammar-action () parser bindings)
|
|
;; By default just return the result.
|
|
(grammar-action (=> (lambda (r s i fk) r)) parser bindings))
|
|
((grammar-action (=> receiver) parser ((var tmp) ...))
|
|
;; Explicit => handler.
|
|
(lambda (source index sk fk)
|
|
(let ((tmp #f) ...)
|
|
(parser source
|
|
index
|
|
(lambda (r s i fk)
|
|
(sk (receiver r s i fk) s i fk))
|
|
fk))))
|
|
((grammar-action (action-expr) parser ())
|
|
;; Fast path - no named variables.
|
|
(let ((f parser))
|
|
(lambda (source index sk fk)
|
|
(f source index (lambda (r s i fk) (sk action-expr s i fk)) fk))))
|
|
((grammar-action (action-expr) parser ((var tmp) ...))
|
|
(lambda (source index sk fk)
|
|
(let ((tmp #f) ...)
|
|
;; TODO: Precompute static components of the parser.
|
|
;; We need to bind fresh variables on each parse, so some
|
|
;; components must be reified in this scope.
|
|
(parser source
|
|
index
|
|
(lambda (r s i fk)
|
|
(sk (let ((var tmp) ...) action-expr) s i fk))
|
|
fk))))))
|