Adding initial parser combinator library. API still subject to change.

This commit is contained in:
Alex Shinn 2013-02-21 22:56:07 +09:00
parent 9f56df7de2
commit e18de40fe2
5 changed files with 1051 additions and 0 deletions

21
lib/chibi/parse.sld Normal file
View file

@ -0,0 +1,21 @@
(define-library (chibi parse)
(export grammar grammar/unmemoized define-grammar define-grammar/unmemoized
call-with-parse parse parse-fully parse-fold
parse->list parse-fully->list
file->parse-stream string->parse-stream parse-stream-substring
parse-stream-start? parse-stream-end? parse-stream-ref
parse-anything parse-nothing parse-epsilon
parse-seq parse-and parse-or parse-not
parse-repeat parse-repeat+ parse-optional
parse-map parse-map-substring parse-ignore parse-assert
parse-atomic parse-commit parse-memoize
parse-char parse-not-char parse-char-pred
parse-string parse-token parse-sre
parse-beginning parse-end
parse-beginning-of-line parse-end-of-line
parse-beginning-of-line parse-end-of-line
parse-beginning-of-word parse-end-of-word
parse-word parse-word+)
(import (chibi) (chibi char-set base) (srfi 9))
(include "parse/parse.scm"))

258
lib/chibi/parse/common.scm Normal file
View file

@ -0,0 +1,258 @@
(define (char-hex-digit? ch)
(or (char-numeric? ch)
(memv (char-downcase ch) '(#\a #\b #\c #\d #\e #\f))))
(define (char-octal-digit? ch)
(and (char? ch) (char<=? #\0 ch #\7)))
(define (parse-assert-range proc lo hi)
(if (or lo hi)
(parse-assert proc (lambda (n)
(and (or (not lo) (<= lo n))
(or (not hi) (<= n hi)))))
proc))
(define (parse-unsigned-integer . o)
(let ((lo (and (pair? o) (car o)))
(hi (and (pair? o) (pair? (cdr o)) (cadr o))))
(parse-assert-range
(parse-map (parse-token char-numeric?) string->number)
lo hi)))
(define (parse-sign+)
(parse-or (parse-char #\+) (parse-char #\-)))
(define (parse-sign)
(parse-or (parse-sign+) parse-epsilon))
(define (parse-integer . o)
(let ((lo (and (pair? o) (car o)))
(hi (and (pair? o) (pair? (cdr o)) (cadr o))))
(parse-assert-range
(parse-map-substring
(parse-seq (parse-sign) (parse-token char-numeric?))
string->number)
lo hi)))
(define (parse-c-integer)
(parse-or
(parse-map (parse-seq (parse-string "0x") (parse-token char-hex-digit?))
(lambda (x) (string->number (cadr x) 16)))
(parse-map (parse-string "0" (parse-token char-octal-digit?))
(lambda (x) (string->number (cadr x) 8)))
(parse-integer)))
(define (parse-real)
(parse-map-substring
(parse-seq
(parse-or
(parse-seq (parse-sign) (parse-repeat+ (parse-char char-numeric?))
(parse-optional
(parse-seq (parse-char #\.)
(parse-repeat (parse-char char-numeric?)))))
(parse-seq (parse-sign) (parse-char #\.)
(parse-repeat+ (parse-char char-numeric?))))
(parse-optional
(parse-seq (parse-char (lambda (ch) (eqv? #\e (char-downcase ch))))
(parse-sign)
(parse-repeat+ (parse-char char-numeric?)))))
string->number))
(define (parse-imag)
(parse-or (parse-char #\i) (parse-char #\I)))
(define (parse-complex)
(parse-map-substring
(parse-or
(parse-seq (parse-real) (parse-sign+) (parse-real) (parse-imag))
(parse-seq (parse-real) (parse-imag))
(parse-real))
string->number))
(define (parse-identifier . o)
;; Slightly more complicated than mapping parse-token because the
;; typical identifier syntax has different initial and subsequent
;; char-sets.
(let* ((init?
(if (pair? o)
(car o)
(lambda (ch) (or (eqv? #\_ ch) (char-alphabetic? ch)))))
(init (parse-char init?))
(subsequent
(parse-char
(if (and (pair? o) (pair? (cdr o)))
(cadr o)
(lambda (ch) (or (init? ch) (char-numeric? ch)))))))
(lambda (source0 index0 sk0 fk0)
(init
source0
index0
(lambda (res source index fk2)
(let lp ((s source) (i index))
(subsequent
s i (lambda (r s i fk) (lp s i))
(lambda ()
(sk0 (string->symbol (parse-stream-substring source0 index0 s i))
s i fk0)))))
fk0))))
(define (parse-delimited . o)
(let ((delim (if (pair? o) (car o) #\"))
(esc (if (and (pair? o) (pair? (cdr o))) (cadr o) #\\))
(parse-esc (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o)))
(car (cddr o))
parse-anything)))
(parse-map
(parse-seq
(parse-char delim)
(parse-repeat
(parse-or (parse-char
(lambda (ch)
(and (not (eqv? ch delim)) (not (eqv? ch esc)))))
(parse-map (parse-seq (parse-char esc)
(if (eqv? delim esc)
(parse-char esc)
parse-esc))
cadr)))
(parse-char delim))
(lambda (res) (list->string (cadr res))))))
(define (parse-separated . o)
(let* ((sep (if (pair? o) (car o) #\,))
(o1 (if (pair? o) (cdr o) '()))
(delim (if (pair? o1) (car o1) #\"))
(o2 (if (pair? o1) (cdr o1) '()))
(esc (if (pair? o2) (car o2) delim))
(o3 (if (pair? o2) (cdr o2) '()))
(ok?
(if (pair? o3)
(let ((pred (car o3)))
(lambda (ch)
(and (not (eqv? ch delim))
(not (eqv? ch sep))
(pred ch))))
(lambda (ch) (and (not (eqv? ch delim)) (not (eqv? ch sep))))))
(parse-field
(parse-or (parse-delimited delim esc)
(parse-map-substring
(parse-repeat+ (parse-char ok?))))))
(parse-map
(parse-seq parse-field
(parse-repeat
(parse-map (parse-seq (parse-char sep) parse-field) cadr)))
(lambda (res) (cons (car res) (cadr res))))))
(define (parse-records . o)
(let* ((terms (if (pair? o) (car o) '("\r\n" "\n")))
(terms (if (list? terms) terms (list terms)))
(term-chars (apply append (map string->list terms)))
(ok? (lambda (ch) (not (memv ch term-chars))))
(o (if (pair? o) (cdr o) '()))
(sep (if (pair? o) (car o) #\,))
(o (if (pair? o) (cdr o) '()))
(delim (if (pair? o) (car o) #\"))
(o (if (pair? o) (cdr o) '()))
(esc (if (pair? o) (car o) delim)))
(parse-repeat
(parse-map
(parse-seq (parse-separated sep delim esc ok?)
(apply parse-or parse-end (map parse-string terms)))
car))))
(define parse-ipv4-digit (parse-integer 0 255))
(define parse-ipv4-address
(parse-map-substring
(parse-seq parse-ipv4-digit
(parse-repeat (parse-seq (parse-char #\.) parse-ipv4-digit)
3 3))))
(define parse-ipv6-digit
(parse-repeat (parse-char char-hex-digit?) 0 4))
(define parse-ipv6-address
(parse-map-substring
(parse-seq
parse-ipv6-digit
(parse-repeat (parse-seq (parse-repeat (parse-char #\:) 1 2)
parse-ipv6-digit)
1 7))))
(define parse-ip-address
(parse-or parse-ipv4-address parse-ipv6-address))
(define parse-domain-atom
(parse-token
(lambda (ch)
(or (char-alphabetic? ch) (char-numeric? ch) (memv ch '(#\- #\_))))))
(define (parse-domain)
(parse-map-substring
(parse-or
parse-ip-address
(parse-seq (parse-repeat (parse-seq parse-domain-atom (parse-char #\.)))
parse-domain-atom))))
(define parse-top-level-domain
(apply parse-or
(parse-repeat (parse-char char-alphabetic?) 2 2)
(map parse-string
'("arpa" "com" "gov" "mil" "net" "org" "aero" "biz" "coop"
"info" "museum" "name" "pro"))))
(define (parse-common-domain)
(parse-map-substring
(parse-seq (parse-repeat+ (parse-seq parse-domain-atom (parse-char #\.)))
parse-top-level-domain)))
(define parse-email-local-part
(parse-token
(lambda (ch)
(or (char-alphabetic? ch)
(char-numeric? ch)
(memv ch '(#\- #\_ #\. #\+))))))
(define (parse-email)
;; no quoted local parts or bang paths
(parse-seq parse-email-local-part
(parse-ignore (parse-char #\@))
(parse-domain)))
(define (char-url-fragment? ch)
(or (char-alphabetic? ch) (char-numeric? ch)
(memv ch '(#\_ #\- #\+ #\\ #\= #\~ #\&))))
(define (char-url? ch)
(or (char-url-fragment? ch) (memv ch '(#\. #\, #\;))))
(define (parse-url-char pred)
(parse-or (parse-char pred)
(parse-seq (parse-char #\%)
(parse-repeat (parse-char char-hex-digit?) 2 2))))
(define (parse-uri)
(parse-seq
(parse-identifier)
(parse-ignore
(parse-seq (parse-char #\:) (parse-repeat (parse-char #\/))))
(parse-domain)
(parse-optional (parse-map (parse-seq (parse-char #\:)
(parse-integer 0 65536))
cadr))
(parse-optional
(parse-map-substring
(parse-seq (parse-char #\/)
(parse-repeat (parse-url-char char-url?)))))
(parse-optional
(parse-map
(parse-seq (parse-ignore (parse-char #\?))
(parse-map-substring
(parse-repeat (parse-url-char char-url?))))
car))
(parse-optional
(parse-map
(parse-seq (parse-ignore (parse-char #\#))
(parse-map-substring
(parse-repeat (parse-url-char char-url-fragment?))))
car))))

View file

@ -0,0 +1,10 @@
(define-library (chibi parse common)
(export parse-integer parse-unsigned-integer parse-c-integer
parse-real parse-complex
parse-identifier parse-delimited parse-separated parse-records
parse-ipv4-address parse-ipv6-address parse-ip-address
parse-domain parse-common-domain parse-email parse-uri
char-hex-digit? char-octal-digit?)
(import (chibi) (chibi parse))
(include "common.scm"))

647
lib/chibi/parse/parse.scm Normal file
View file

@ -0,0 +1,647 @@
;; parse.scm -- Parser Combinators
;; Copyright (c) 2013 Alex Shinn. All rights reserved.
;; BSD-style license: http://synthcode.com/license.txt
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; parse stream type
;;
;; Abstraction to treat ports as proper streams so that we can
;; backtrack from previous states. A single Parse-Stream record
;; represents a single buffered chunk of text.
(define-record-type Parse-Stream
(%make-parse-stream filename port buffer cache offset prev-char tail)
parse-stream?
;; The file the data came from, for debugging and error reporting.
(filename parse-stream-filename)
;; The underlying port.
(port parse-stream-port)
;; A vector of characters read from the port. We use a vector
;; rather than a string for guaranteed O(1) access.
(buffer parse-stream-buffer)
;; A vector of caches corresponding to parser successes or failures
;; starting from the corresponding char. Currently each cache is
;; just an alist, optimized under the assumption that the number of
;; possible memoized parsers is relatively small. Note that
;; memoization is only enabled explicitly.
(cache parse-stream-cache)
;; The current offset of filled characters in the buffer.
;; If offset is non-zero, (vector-ref buffer (- offset 1)) is
;; valid.
(offset parse-stream-offset parse-stream-offset-set!)
;; The previous char before the beginning of this Parse-Stream.
;; Used for line/word-boundary checks.
(prev-char parse-stream-prev-char)
;; The successor Parse-Stream chunk, created on demand and filled
;; from the same port.
(tail %parse-stream-tail %parse-stream-tail-set!))
;; We want to balance avoiding reallocating buffers with avoiding
;; holding many memoized values in memory.
(define default-buffer-size 256)
(define (make-parse-stream filename . o)
(let ((port (if (pair? o) (car o) (open-input-file filename)))
(len (if (and (pair? o) (pair? (cdr o))) (cadr o) default-buffer-size)))
(%make-parse-stream
filename port (make-vector len #f) (make-vector len '()) 0 #f #f)))
(define (file->parse-stream filename)
(make-parse-stream filename (open-input-file filename)))
(define (string->parse-stream str)
(make-parse-stream #f (open-input-string str)))
(define (parse-stream-tail source)
(or (%parse-stream-tail source)
(let* ((len (vector-length (parse-stream-buffer source)))
(tail (%make-parse-stream (parse-stream-filename source)
(parse-stream-port source)
(make-vector len #f)
(make-vector len '())
0
(parse-stream-last-char source)
#f)))
(%parse-stream-tail-set! source tail)
tail)))
(define (parse-stream-fill! source i)
(let ((off (parse-stream-offset source))
(buf (parse-stream-buffer source)))
(if (<= off i)
(do ((off off (+ off 1)))
((> off i) (parse-stream-offset-set! source off))
(vector-set! buf off (read-char (parse-stream-port source))))
#f)))
(define (parse-stream-start? source i)
(and (zero? i) (not (parse-stream-prev-char source))))
(define (parse-stream-end? source i)
(eof-object? (parse-stream-ref source i)))
(define (parse-stream-ref source i)
(parse-stream-fill! source i)
(vector-ref (parse-stream-buffer source) i))
(define (parse-stream-last-char source)
(let ((buf (parse-stream-buffer source)))
(let lp ((i (parse-stream-offset source)))
(if (negative? i)
(parse-stream-prev-char source)
(let ((ch (vector-ref buf i)))
(if (eof-object? ch)
(lp (- i 1))
ch))))))
(define (parse-stream-char-before source i)
(if (> i (parse-stream-offset source))
(parse-stream-ref source (- i 1))
(parse-stream-prev-char source)))
(define (parse-stream-next-source source i)
(if (>= (+ i 1) (vector-length (parse-stream-buffer source)))
(parse-stream-tail source)
source))
(define (parse-stream-next-index source i)
(if (>= (+ i 1) (vector-length (parse-stream-buffer source)))
0
(+ i 1)))
(define (parse-stream-close source)
(close-input-port (parse-stream-port source)))
(define (vector-substring vec start end)
(let ((res (make-string (- end start))))
(do ((i start (+ i 1)))
((= i end) res)
(string-set! res (- i start) (vector-ref vec i)))))
(define (parse-stream-substring s0 i0 s1 i1)
(cond
((eq? s0 s1)
(parse-stream-fill! s0 i1)
(vector-substring (parse-stream-buffer s0) i0 i1))
(else
(let lp ((s (parse-stream-next-source s0))
(res (list (vector-substring (parse-stream-buffer s0) i0 i1))))
(let ((buf (parse-stream-buffer s)))
(cond
((eq? s s1)
(apply string-append
(reverse (cons (vector-substring buf 0 i1) res))))
(else
(lp (parse-stream-next-source s)
(cons (vector-substring buf 0 (vector-length buf)) res)))))))))
(define (parse-stream-cache-cell s i f)
(assv f (vector-ref (parse-stream-cache s) i)))
(define (parse-stream-cache-set! s i f x)
(let ((cache (vector-ref (parse-stream-cache s) i)))
(cond
((assv f cache)
=> (lambda (cell)
;; prefer longer matches
(if (and (pair? (cdr cell)) (< (cadr (cddr cell)) i))
(set-cdr! cell x))))
(else
(vector-set! (parse-stream-cache s) i (cons (cons f x) cache))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; the parser interface
(define (call-with-parse f source index sk . o)
(let ((p (if (string? source) (string->parse-stream source) source))
(fk (if (pair? o) (car o) (lambda () #f))))
(f p index sk fk)))
(define (parse f source . o)
(let ((index (if (pair? o) (car o) 0)))
(call-with-parse f source index (lambda (r s i fk) r))))
(define (parse-fully f source . o)
(let ((index (if (pair? o) (car o) 0)))
(call-with-parse
f source index
(lambda (r s i fk) (if (parse-stream-end? s i) r (fk)))
(lambda () (error "incomplete parse")))))
(define (parse-fold f kons knil source . o)
(let lp ((p (if (string? source) (string->parse-stream source) source))
(index (if (pair? o) (car o) 0))
(acc knil))
(f p index (lambda (r s i fk) (lp s i (kons r acc))) (lambda () acc))))
(define (parse->list f source . o)
(reverse (apply parse-fold cons '() f source o)))
(define (parse-fully->list f source . o)
(let lp ((p (if (string? source) (string->parse-stream source) source))
(index (if (pair? o) (car o) 0))
(acc '()))
(f p index
(lambda (r s i fk)
(if (eof-object? r) acc (lp s i (cons r acc))))
(lambda () (error "incomplete parse")))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; basic parsing combinators
(define parse-epsilon
(lambda (source index sk fk)
(sk #t source index fk)))
(define parse-anything
(lambda (source index sk fk)
(if (parse-stream-end? source index)
(fk)
(sk (parse-stream-ref source index)
(parse-stream-next-source source index)
(parse-stream-next-index source index)
fk))))
(define parse-nothing
(lambda (source index sk fk)
(fk)))
(define (parse-or f . o)
(if (null? o)
f
(let ((g (apply parse-or o)))
(lambda (source index sk fk)
(let ((fk2 (lambda () (g source index sk fk))))
(f source index sk fk2))))))
(define (parse-and f g)
(lambda (source index sk fk)
(f source index (lambda (r s i fk) (g source index sk fk)) fk)))
(define (parse-not f)
(lambda (source index sk fk)
(f source index (lambda (r s i fk) (fk)) (lambda () (sk #t source index fk)))))
(define (parse-seq . o)
(cond
((null? o)
parse-epsilon)
((null? (cdr o))
(let ((f (car o)))
(lambda (s i sk fk)
(f s i (lambda (r s i fk) (sk (list r) s i fk)) fk))))
(else
(let* ((f (car o))
(o (cdr o))
(g (car o))
(o (cdr o))
(g (if (pair? o)
(apply parse-seq g o)
(lambda (s i sk fk)
(g s i (lambda (r s i fk) (sk (list r) s i fk)) fk)))))
(lambda (source index sk fk)
(f source
index
(lambda (r s i fk)
(g s i (lambda (r2 s i fk)
(let ((r2 (if (eq? r ignored-value) r2 (cons r r2))))
(sk r2 s i fk)))
fk))
fk))))))
(define (maybe-parse-seq f . o)
(if (null? o) f (apply parse-seq f o)))
(define (parse-optional f)
(lambda (source index sk fk)
(f source index sk (lambda () (sk #f source index fk)))))
(define ignored-value (list 'ignore))
(define (parse-repeat f . o)
(let ((lo (if (pair? o) (car o) 0))
(hi (and (pair? o) (pair? (cdr o)) (cadr o))))
(lambda (source0 index0 sk fk)
(let repeat ((source source0) (index index0) (fk fk) (j 0) (res '()))
(let ((fk (if (>= j lo)
(lambda () (sk (reverse res) source index fk))
fk)))
(if (and hi (= j hi))
(sk (reverse res) source index fk)
(f source
index
(lambda (r s i fk) (repeat s i fk (+ j 1) (cons r res)))
fk)))))))
(define (parse-repeat+ f)
(parse-repeat f 1))
(define (parse-map f proc)
(lambda (source index sk fk)
(f source index (lambda (res s i fk) (sk (proc res) s i fk)) fk)))
(define (parse-map-substring f . o)
(let ((proc (if (pair? o) (car o) (lambda (res) res))))
(lambda (source index sk fk)
(f source
index
(lambda (res s i fk)
(sk (proc (parse-stream-substring source index s i)) s i fk))
fk))))
(define (parse-ignore f)
(parse-map f (lambda (res) ignored-value)))
(define (parse-assert f check?)
(lambda (source index sk fk)
(f source
index
(lambda (res s i fk)
(if (check? res) (sk res s i fk) (fk)))
fk)))
(define (parse-atomic f)
(lambda (source index sk fk)
(f source index (lambda (res s i fk2) (sk res s i fk)) fk)))
(define (parse-commit f)
(lambda (source index sk fk)
(f source index (lambda (res s i fk) (sk res s i (lambda () #f))) fk)))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; boundary checks
(define parse-beginning
(lambda (source index sk fk)
(if (parse-stream-start? source index)
(sk #t source index fk)
(fk))))
(define parse-end
(lambda (source index sk fk)
(if (parse-stream-end? source index)
(sk #t source index fk)
(fk))))
(define parse-beginning-of-line
(lambda (source index sk fk)
(let ((before (parse-stream-char-before source index)))
(if (or (not before) (eqv? #\newline before))
(sk #t source index fk)
(fk)))))
(define parse-end-of-line
(lambda (source index sk fk)
(if (or (parse-stream-end? source index)
(eqv? #\newline (parse-stream-ref source index)))
(sk #t source index fk)
(fk))))
(define (char-word? ch)
(or (char-alphabetic? ch) (eqv? ch #\_)))
(define parse-beginning-of-word
(lambda (source index sk fk)
(let ((before (parse-stream-char-before source index)))
(if (and (or (not before) (not (char-word? before)))
(not (parse-stream-end? source index))
(char-word? (parse-stream-ref source index)))
(sk #t source index fk)
(fk)))))
(define parse-end-of-word
(lambda (source index sk fk)
(let ((before (parse-stream-char-before source index)))
(if (and before
(char-word? before)
(or (parse-stream-end? source index)
(not (char-word? (parse-stream-ref source index)))))
(sk #t source index fk)
(fk)))))
(define (parse-word . o)
(let ((word (if (pair? o) (car o) (parse-token char-word?))))
(lambda (source index sk fk)
(parse-seq parse-beginning-of-word
word
parse-end-of-word))))
(define (parse-word+ . o)
(let ((pred (if (pair? o)
(lambda (ch) (and (char-word? ch) ((car o) ch)))
char-word?)))
(parse-word (parse-token pred))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; constant parsers
(define (parse-char-pred pred)
(lambda (source index sk fk)
(let ((ch (parse-stream-ref source index)))
(if (and (char? ch) (pred ch))
(sk ch
(parse-stream-next-source source index)
(parse-stream-next-index source index)
fk)
(fk)))))
(define (x->char-predicate x)
(cond
((char? x)
(lambda (ch) (eqv? ch x)))
((char-set? x)
(lambda (ch) (and (char? ch) (char-set-contains? x ch))))
((procedure? x)
(lambda (ch) (and (char? ch) (x ch))))
(else
(error "don't know how to handle char predicate" x))))
(define (parse-char x)
(parse-char-pred (x->char-predicate x)))
(define (parse-not-char x)
(let ((pred (x->char-predicate x)))
(parse-char-pred (lambda (ch) (not (pred ch))))))
(define (parse-string x)
(parse-map (apply parse-seq (map parse-char (string->list x)))
list->string))
(define (parse-token x)
;; (parse-map (parse-repeat+ (parse-char x)) list->string)
;; Tokens are atomic - we don't want to split them at any point in
;; the middle - so the implementation is slightly more complex than
;; the above. With a sane grammar the result would be the same
;; either way, but this provides a useful optimization.
(let ((f (parse-char x)))
(lambda (source0 index0 sk fk)
(let lp ((source source0) (index index0))
(f source
index
(lambda (r s i fk) (lp s i))
(lambda ()
(if (and (eq? source source0) (eqv? index index0))
(fk)
(sk (parse-stream-substring source0 index0 source index)
source index fk))))))))
;; We provide a subset of SRE syntax, optionally interspersed with
;; existing parsers. These are just translated directly into parser
;; combinators. A future version may translate pieces into a
;; non-backtracking engine where possible.
(define (parse-sre x)
(cond
((procedure? x) ; an embedded parser
x)
((or (char? x) (char-set? x))
(parse-char x))
((string? x)
(parse-string x))
((null? x)
parse-epsilon)
((list? x)
(case (car x)
((: seq) (apply parse-seq (map parse-sre (cdr x))))
((or) (apply parse-or (map parse-sre (cdr x))))
((and) (apply parse-and (map parse-sre (cdr x))))
((not) (apply parse-not (map parse-sre (cdr x))))
((*) (parse-repeat (apply maybe-parse-seq (map parse-sre (cdr x)))))
((+) (parse-repeat+ (apply maybe-parse-seq (map parse-sre (cdr x)))))
((?) (parse-optional (apply maybe-parse-seq (map parse-sre (cdr x)))))
((=>) (apply maybe-parse-seq (map parse-sre (cddr x))))
((word) (apply parse-word (cdr x)))
((word+) (apply parse-word+ (cdr x)))
(else (error "unknown sre list parser" x))))
(else
(case x
((any) parse-anything)
((nonl) (parse-char (lambda (ch) (not (eqv? ch #\newline)))))
((space whitespace) (parse-char char-whitespace?))
((digit numeric) (parse-char char-numeric?))
((alpha alphabetic) (parse-char char-alphabetic?))
((alnum alphanumeric)
(parse-char-pred (lambda (ch) (or (char-alphabetic? ch) (char-numeric? ch)))))
((lower lower-case) (parse-char char-lower-case?))
((upper upper-case) (parse-char char-upper-case?))
((word) (parse-word))
((bow) parse-beginning-of-word)
((eow) parse-end-of-word)
((bol) parse-beginning-of-line)
((eol) parse-end-of-line)
((bos) parse-beginning)
((eos) parse-end)
(else (error "unknown sre parser" x))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; delayed combinators for self-referentiality
(define-syntax parse-lazy
(syntax-rules ()
((parse-lazy f)
(let ((g (delay f)))
(lambda (source index sk fk)
((force g) source index sk fk))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; memoization wrapper for packrat-like parsing
;; debugging
(define *procedures* '())
(define (procedure-name f)
(cond ((assq f *procedures*) => cdr) (else #f)))
(define (procedure-name-set! f name)
(set! *procedures* (cons (cons f name) *procedures*)))
(define (parse-memoize name f)
(if (not (procedure-name f)) (procedure-name-set! f name))
(lambda (source index sk fk)
(cond
((parse-stream-cache-cell source index f)
=> (lambda (cell)
(if (cdr cell)
(apply sk (append (cdr cell) (list fk)))
(fk))))
(else
(f source
index
(lambda (res s i fk)
(parse-stream-cache-set! source index f (list res s i))
(sk res s i fk))
(lambda ()
(if (not (pair? (parse-stream-cache-cell source index f)))
(parse-stream-cache-set! source index f #f))
(fk)))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; syntactic sugar
;; The four basic interfaces are grammar, define-grammar, and their
;; unmemoized variants grammar/unmemoized and
;; define-grammar/unmemoized. This is optimized for the common case -
;; generally you want to memoize grammars, and may or may not want to
;; memoize the smaller lexical components.
(define-syntax grammar/unmemoized
(syntax-rules ()
((grammar init (rule (clause . action) ...) ...)
(letrec ((rule (parse-or (grammar-clause clause . action) ...))
...)
init))))
(define-syntax grammar
(syntax-rules ()
((grammar/memoized init (rule (clause . action) ...) ...)
(letrec ((rule
(parse-memoize
'rule
(parse-or (grammar-clause clause . action) ...)))
...)
init))))
(define-syntax define-grammar/unmemoized
(syntax-rules ()
((define-grammar name (rule (clause . action) ...) ...)
(begin
(define rule (parse-or (grammar-clause clause . action) ...))
...
(define name (list (cons 'rule rule) ...))))))
(define-syntax define-grammar
(syntax-rules ()
((define-grammar/memoized name (rule (clause . action) ...) ...)
(begin
(define rule
(parse-memoize 'rule (parse-or (grammar-clause clause . action) ...)))
...
(define name (list (cons 'rule rule) ...))))))
;; Most of the implementation goes into how we parse a single grammar
;; clause. This is hard to read if you're not used to CPS macros.
(define-syntax grammar-clause
(syntax-rules ()
((grammar-clause clause . action)
(grammar-extract clause () (grammar-action action)))))
(define-syntax grammar-extract
(syntax-rules (unquote => : seq * + ? or and)
;; Named patterns
((grammar-extract (=> name pattern) bindings k)
(grammar-extract pattern bindings (grammar-bind name k)))
((grammar-extract (=> name pattern ...) bindings k)
(grammar-extract (: pattern ...) bindings (grammar-bind name k)))
((grammar-extract ,name bindings k)
(grammar-bind name k (parse-sre name) bindings))
;; Walk container patterns.
((grammar-extract (: x y ...) bindings k)
(grammar-extract x bindings (grammar-map parse-seq (y ...) () k)))
((grammar-extract (* x y ...) bindings k)
(grammar-extract x bindings (grammar-map parse-repeat (y ...) () k)))
((grammar-extract (+ x y ...) bindings k)
(grammar-extract x bindings (grammar-map parse-repeat+ (y ...) () k)))
((grammar-extract (? x y ...) bindings k)
(grammar-extract x bindings (grammar-map parse-optional (y ...) () k)))
((grammar-extract (or x y ...) bindings k)
(grammar-extract x bindings (grammar-map parse-or (y ...) () k)))
((grammar-extract (and x y ...) bindings k)
(grammar-extract x bindings (grammar-map parse-and (y ...) () k)))
;; Anything else is an implicitly quasiquoted SRE
((grammar-extract pattern bindings (k ...))
(k ... (parse-sre `pattern) bindings))))
(define-syntax grammar-bind
(syntax-rules ()
((grammar-bind name (k ...) f ((var tmp) ...))
(let-syntax ((new-symbol?
(syntax-rules (var ...)
((new-symbol? name sk fk) sk)
((new-symbol? _ sk fk) fk))))
;; Bind the name only to the first instance in the pattern.
(new-symbol?
random-symbol-to-match
(k ...
(lambda (s i sk fk)
(let ((save-tmp new-tmp))
(f s i
(lambda (r s i fk) (set! new-tmp r) (sk r s i fk))
(lambda () (set! new-tmp save-tmp) (fk)))))
((var tmp) ... (name new-tmp)))
(k ... f ((var tmp) ...)))))))
(define-syntax grammar-map
(syntax-rules ()
((grammar-map f () (args ...) (k ...) x bindings)
(k ... (f args ... x) bindings))
((grammar-map f (y . rest) (args ...) k x bindings)
(grammar-extract y bindings (grammar-map f rest (args ... x) k)))))
(define-syntax grammar-action
(syntax-rules (=>)
((grammar-action () parser bindings)
;; By default just return the result.
(grammar-action (=> (lambda (r s i fk) r)) parser bindings))
((grammar-action (=> receiver) parser ((var tmp) ...))
;; Explicit => handler.
(lambda (source index sk fk)
(let ((tmp #f) ...)
(parser source
index
(lambda (r s i fk)
(sk (receiver r s i fk) s i fk))
fk))))
((grammar-action (action-expr) parser ())
;; Fast path - no named variables.
(let ((f parser))
(lambda (source index sk fk)
(f source index (lambda (r s i fk) (sk action-expr s i fk)) fk))))
((grammar-action (action-expr) parser ((var tmp) ...))
(lambda (source index sk fk)
(let ((tmp #f) ...)
;; TODO: Precompute static components of the parser.
;; We need to bind fresh variables on each parse, so some
;; components must be reified in this scope.
(parser source
index
(lambda (r s i fk)
(sk (let ((var tmp) ...) action-expr) s i fk))
fk))))))

115
tests/parse-tests.scm Normal file
View file

@ -0,0 +1,115 @@
(import (chibi) (chibi test)
(chibi char-set) (chibi char-set ascii)
(chibi parse))
(test-begin "parse")
;; basic
(test-assert (parse parse-epsilon ""))
(test-assert (parse-fully parse-epsilon ""))
(test-error (parse-fully parse-epsilon "a"))
(test-not (parse parse-anything ""))
(test-assert (parse-fully parse-anything "a"))
(test-error (parse-fully parse-anything "ab"))
(test-not (parse parse-nothing ""))
(test-not (parse parse-nothing "a"))
(test-not (parse (parse-char #\a) ""))
(test-assert (parse-fully (parse-char #\a) "a"))
(test-not (parse (parse-char #\a) "b"))
(test-error (parse-fully (parse-char #\a) "ab"))
(let ((f (parse-seq (parse-char #\a) (parse-char #\b))))
(test-not (parse f "a"))
(test-not (parse f "b"))
(test-assert (parse f "ab"))
(test-error (parse-fully f "abc")))
(let ((f (parse-or (parse-char #\a) (parse-char #\b))))
(test-not (parse f ""))
(test-assert (parse f "a"))
(test-assert (parse f "b"))
(test-error (parse-fully f "ab")))
(let ((f (parse-not (parse-char #\a))))
(test-assert (parse f ""))
(test-error (parse-fully f "a"))
(test-assert (parse f "b")))
(let ((f (parse-repeat (parse-char #\a))))
(test-assert (parse-fully f ""))
(test-assert (parse-fully f "a"))
(test-assert (parse-fully f "aa"))
(test-assert (parse-fully f "aaa"))
(test-assert (parse f "b"))
(test-assert (parse f "aab"))
(test-error (parse-fully f "aab")))
;; grammars
(let ()
(define-grammar calc
(space ((* ,char-set:whitespace)))
(number ((=> n (+ ,char-set:digit))
(string->number (list->string n))))
(simple ((=> n ,number) n)
((: "(" (=> e1 ,term) ")") e1))
(term-op ("*" *)
("/" /)
("%" modulo))
(term ((: (=> e1 ,simple) ,space (=> op ,term-op) ,space (=> e2 ,term))
(op e1 e2))
((=> e1 ,simple)
e1)))
(test 88 (parse term "4*22"))
(test 42 (parse term "42"))
;; partial match (grammar isn't checking end)
(test 42 (parse term "42*")))
(define calculator
(grammar expr
(space ((: ,char-set:whitespace ,space))
(() #f))
(digit ((=> d ,char-set:digit) d))
(number ((=> n (+ ,digit))
(string->number (list->string n))))
(simple ((=> n ,number) n)
((: "(" (=> e1 ,expr) ")") e1))
(term-op ("*" *)
("/" /)
("%" modulo))
(term ((: (=> e1 ,simple) ,space (=> op ,term-op) ,space (=> e2 ,term))
(op e1 e2))
((=> e1 ,simple)
e1))
(expr-op ("+" +) ("-" -))
(expr ((: ,space (=> e1 ,term) ,space (=> op ,expr-op) ,space (=> e2 ,expr))
(op e1 e2))
((: ,space (=> e1 ,term))
e1))))
(test 42 (parse calculator "42"))
(test 4 (parse calculator "2 + 2"))
(test 23 (parse calculator "2 + 2*10 + 1"))
(test 25 (parse calculator "2+2 * 10+1 * 3"))
(test 41 (parse calculator "(2 + 2) * 10 + 1"))
;; this takes exponential time without memoization
(define explode
(grammar start
(start ((: ,S eos) #t))
(S ((+ ,A) #t))
(A ((: "a" ,S "b") #t)
((: "a" ,S "c") #t)
((: "a") #t))))
(test-assert (parse explode "aaabb"))
(test-not (parse explode "bbaa"))
(test-assert
(parse explode (string-append (make-string 10 #\a) (make-string 8 #\c))))
(test-end)