diff --git a/lib/chibi/parse.sld b/lib/chibi/parse.sld new file mode 100644 index 00000000..7e773ee0 --- /dev/null +++ b/lib/chibi/parse.sld @@ -0,0 +1,21 @@ + +(define-library (chibi parse) + (export grammar grammar/unmemoized define-grammar define-grammar/unmemoized + call-with-parse parse parse-fully parse-fold + parse->list parse-fully->list + file->parse-stream string->parse-stream parse-stream-substring + parse-stream-start? parse-stream-end? parse-stream-ref + parse-anything parse-nothing parse-epsilon + parse-seq parse-and parse-or parse-not + parse-repeat parse-repeat+ parse-optional + parse-map parse-map-substring parse-ignore parse-assert + parse-atomic parse-commit parse-memoize + parse-char parse-not-char parse-char-pred + parse-string parse-token parse-sre + parse-beginning parse-end + parse-beginning-of-line parse-end-of-line + parse-beginning-of-line parse-end-of-line + parse-beginning-of-word parse-end-of-word + parse-word parse-word+) + (import (chibi) (chibi char-set base) (srfi 9)) + (include "parse/parse.scm")) diff --git a/lib/chibi/parse/common.scm b/lib/chibi/parse/common.scm new file mode 100644 index 00000000..0ccdc1e0 --- /dev/null +++ b/lib/chibi/parse/common.scm @@ -0,0 +1,258 @@ + +(define (char-hex-digit? ch) + (or (char-numeric? ch) + (memv (char-downcase ch) '(#\a #\b #\c #\d #\e #\f)))) + +(define (char-octal-digit? ch) + (and (char? ch) (char<=? #\0 ch #\7))) + +(define (parse-assert-range proc lo hi) + (if (or lo hi) + (parse-assert proc (lambda (n) + (and (or (not lo) (<= lo n)) + (or (not hi) (<= n hi))))) + proc)) + +(define (parse-unsigned-integer . o) + (let ((lo (and (pair? o) (car o))) + (hi (and (pair? o) (pair? (cdr o)) (cadr o)))) + (parse-assert-range + (parse-map (parse-token char-numeric?) string->number) + lo hi))) + +(define (parse-sign+) + (parse-or (parse-char #\+) (parse-char #\-))) + +(define (parse-sign) + (parse-or (parse-sign+) parse-epsilon)) + +(define (parse-integer . o) + (let ((lo (and (pair? o) (car o))) + (hi (and (pair? o) (pair? (cdr o)) (cadr o)))) + (parse-assert-range + (parse-map-substring + (parse-seq (parse-sign) (parse-token char-numeric?)) + string->number) + lo hi))) + +(define (parse-c-integer) + (parse-or + (parse-map (parse-seq (parse-string "0x") (parse-token char-hex-digit?)) + (lambda (x) (string->number (cadr x) 16))) + (parse-map (parse-string "0" (parse-token char-octal-digit?)) + (lambda (x) (string->number (cadr x) 8))) + (parse-integer))) + +(define (parse-real) + (parse-map-substring + (parse-seq + (parse-or + (parse-seq (parse-sign) (parse-repeat+ (parse-char char-numeric?)) + (parse-optional + (parse-seq (parse-char #\.) + (parse-repeat (parse-char char-numeric?))))) + (parse-seq (parse-sign) (parse-char #\.) + (parse-repeat+ (parse-char char-numeric?)))) + (parse-optional + (parse-seq (parse-char (lambda (ch) (eqv? #\e (char-downcase ch)))) + (parse-sign) + (parse-repeat+ (parse-char char-numeric?))))) + string->number)) + +(define (parse-imag) + (parse-or (parse-char #\i) (parse-char #\I))) + +(define (parse-complex) + (parse-map-substring + (parse-or + (parse-seq (parse-real) (parse-sign+) (parse-real) (parse-imag)) + (parse-seq (parse-real) (parse-imag)) + (parse-real)) + string->number)) + +(define (parse-identifier . o) + ;; Slightly more complicated than mapping parse-token because the + ;; typical identifier syntax has different initial and subsequent + ;; char-sets. + (let* ((init? + (if (pair? o) + (car o) + (lambda (ch) (or (eqv? #\_ ch) (char-alphabetic? ch))))) + (init (parse-char init?)) + (subsequent + (parse-char + (if (and (pair? o) (pair? (cdr o))) + (cadr o) + (lambda (ch) (or (init? ch) (char-numeric? ch))))))) + (lambda (source0 index0 sk0 fk0) + (init + source0 + index0 + (lambda (res source index fk2) + (let lp ((s source) (i index)) + (subsequent + s i (lambda (r s i fk) (lp s i)) + (lambda () + (sk0 (string->symbol (parse-stream-substring source0 index0 s i)) + s i fk0))))) + fk0)))) + +(define (parse-delimited . o) + (let ((delim (if (pair? o) (car o) #\")) + (esc (if (and (pair? o) (pair? (cdr o))) (cadr o) #\\)) + (parse-esc (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o))) + (car (cddr o)) + parse-anything))) + (parse-map + (parse-seq + (parse-char delim) + (parse-repeat + (parse-or (parse-char + (lambda (ch) + (and (not (eqv? ch delim)) (not (eqv? ch esc))))) + (parse-map (parse-seq (parse-char esc) + (if (eqv? delim esc) + (parse-char esc) + parse-esc)) + cadr))) + (parse-char delim)) + (lambda (res) (list->string (cadr res)))))) + +(define (parse-separated . o) + (let* ((sep (if (pair? o) (car o) #\,)) + (o1 (if (pair? o) (cdr o) '())) + (delim (if (pair? o1) (car o1) #\")) + (o2 (if (pair? o1) (cdr o1) '())) + (esc (if (pair? o2) (car o2) delim)) + (o3 (if (pair? o2) (cdr o2) '())) + (ok? + (if (pair? o3) + (let ((pred (car o3))) + (lambda (ch) + (and (not (eqv? ch delim)) + (not (eqv? ch sep)) + (pred ch)))) + (lambda (ch) (and (not (eqv? ch delim)) (not (eqv? ch sep)))))) + (parse-field + (parse-or (parse-delimited delim esc) + (parse-map-substring + (parse-repeat+ (parse-char ok?)))))) + (parse-map + (parse-seq parse-field + (parse-repeat + (parse-map (parse-seq (parse-char sep) parse-field) cadr))) + (lambda (res) (cons (car res) (cadr res)))))) + +(define (parse-records . o) + (let* ((terms (if (pair? o) (car o) '("\r\n" "\n"))) + (terms (if (list? terms) terms (list terms))) + (term-chars (apply append (map string->list terms))) + (ok? (lambda (ch) (not (memv ch term-chars)))) + (o (if (pair? o) (cdr o) '())) + (sep (if (pair? o) (car o) #\,)) + (o (if (pair? o) (cdr o) '())) + (delim (if (pair? o) (car o) #\")) + (o (if (pair? o) (cdr o) '())) + (esc (if (pair? o) (car o) delim))) + (parse-repeat + (parse-map + (parse-seq (parse-separated sep delim esc ok?) + (apply parse-or parse-end (map parse-string terms))) + car)))) + +(define parse-ipv4-digit (parse-integer 0 255)) + +(define parse-ipv4-address + (parse-map-substring + (parse-seq parse-ipv4-digit + (parse-repeat (parse-seq (parse-char #\.) parse-ipv4-digit) + 3 3)))) + +(define parse-ipv6-digit + (parse-repeat (parse-char char-hex-digit?) 0 4)) + +(define parse-ipv6-address + (parse-map-substring + (parse-seq + parse-ipv6-digit + (parse-repeat (parse-seq (parse-repeat (parse-char #\:) 1 2) + parse-ipv6-digit) + 1 7)))) + +(define parse-ip-address + (parse-or parse-ipv4-address parse-ipv6-address)) + +(define parse-domain-atom + (parse-token + (lambda (ch) + (or (char-alphabetic? ch) (char-numeric? ch) (memv ch '(#\- #\_)))))) + +(define (parse-domain) + (parse-map-substring + (parse-or + parse-ip-address + (parse-seq (parse-repeat (parse-seq parse-domain-atom (parse-char #\.))) + parse-domain-atom)))) + +(define parse-top-level-domain + (apply parse-or + (parse-repeat (parse-char char-alphabetic?) 2 2) + (map parse-string + '("arpa" "com" "gov" "mil" "net" "org" "aero" "biz" "coop" + "info" "museum" "name" "pro")))) + +(define (parse-common-domain) + (parse-map-substring + (parse-seq (parse-repeat+ (parse-seq parse-domain-atom (parse-char #\.))) + parse-top-level-domain))) + +(define parse-email-local-part + (parse-token + (lambda (ch) + (or (char-alphabetic? ch) + (char-numeric? ch) + (memv ch '(#\- #\_ #\. #\+)))))) + +(define (parse-email) + ;; no quoted local parts or bang paths + (parse-seq parse-email-local-part + (parse-ignore (parse-char #\@)) + (parse-domain))) + +(define (char-url-fragment? ch) + (or (char-alphabetic? ch) (char-numeric? ch) + (memv ch '(#\_ #\- #\+ #\\ #\= #\~ #\&)))) + +(define (char-url? ch) + (or (char-url-fragment? ch) (memv ch '(#\. #\, #\;)))) + +(define (parse-url-char pred) + (parse-or (parse-char pred) + (parse-seq (parse-char #\%) + (parse-repeat (parse-char char-hex-digit?) 2 2)))) + +(define (parse-uri) + (parse-seq + (parse-identifier) + (parse-ignore + (parse-seq (parse-char #\:) (parse-repeat (parse-char #\/)))) + (parse-domain) + (parse-optional (parse-map (parse-seq (parse-char #\:) + (parse-integer 0 65536)) + cadr)) + (parse-optional + (parse-map-substring + (parse-seq (parse-char #\/) + (parse-repeat (parse-url-char char-url?))))) + (parse-optional + (parse-map + (parse-seq (parse-ignore (parse-char #\?)) + (parse-map-substring + (parse-repeat (parse-url-char char-url?)))) + car)) + (parse-optional + (parse-map + (parse-seq (parse-ignore (parse-char #\#)) + (parse-map-substring + (parse-repeat (parse-url-char char-url-fragment?)))) + car)))) diff --git a/lib/chibi/parse/common.sld b/lib/chibi/parse/common.sld new file mode 100644 index 00000000..5c3ecb5c --- /dev/null +++ b/lib/chibi/parse/common.sld @@ -0,0 +1,10 @@ + +(define-library (chibi parse common) + (export parse-integer parse-unsigned-integer parse-c-integer + parse-real parse-complex + parse-identifier parse-delimited parse-separated parse-records + parse-ipv4-address parse-ipv6-address parse-ip-address + parse-domain parse-common-domain parse-email parse-uri + char-hex-digit? char-octal-digit?) + (import (chibi) (chibi parse)) + (include "common.scm")) diff --git a/lib/chibi/parse/parse.scm b/lib/chibi/parse/parse.scm new file mode 100644 index 00000000..33b23967 --- /dev/null +++ b/lib/chibi/parse/parse.scm @@ -0,0 +1,647 @@ +;; parse.scm -- Parser Combinators +;; Copyright (c) 2013 Alex Shinn. All rights reserved. +;; BSD-style license: http://synthcode.com/license.txt + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; parse stream type +;; +;; Abstraction to treat ports as proper streams so that we can +;; backtrack from previous states. A single Parse-Stream record +;; represents a single buffered chunk of text. + +(define-record-type Parse-Stream + (%make-parse-stream filename port buffer cache offset prev-char tail) + parse-stream? + ;; The file the data came from, for debugging and error reporting. + (filename parse-stream-filename) + ;; The underlying port. + (port parse-stream-port) + ;; A vector of characters read from the port. We use a vector + ;; rather than a string for guaranteed O(1) access. + (buffer parse-stream-buffer) + ;; A vector of caches corresponding to parser successes or failures + ;; starting from the corresponding char. Currently each cache is + ;; just an alist, optimized under the assumption that the number of + ;; possible memoized parsers is relatively small. Note that + ;; memoization is only enabled explicitly. + (cache parse-stream-cache) + ;; The current offset of filled characters in the buffer. + ;; If offset is non-zero, (vector-ref buffer (- offset 1)) is + ;; valid. + (offset parse-stream-offset parse-stream-offset-set!) + ;; The previous char before the beginning of this Parse-Stream. + ;; Used for line/word-boundary checks. + (prev-char parse-stream-prev-char) + ;; The successor Parse-Stream chunk, created on demand and filled + ;; from the same port. + (tail %parse-stream-tail %parse-stream-tail-set!)) + +;; We want to balance avoiding reallocating buffers with avoiding +;; holding many memoized values in memory. +(define default-buffer-size 256) + +(define (make-parse-stream filename . o) + (let ((port (if (pair? o) (car o) (open-input-file filename))) + (len (if (and (pair? o) (pair? (cdr o))) (cadr o) default-buffer-size))) + (%make-parse-stream + filename port (make-vector len #f) (make-vector len '()) 0 #f #f))) + +(define (file->parse-stream filename) + (make-parse-stream filename (open-input-file filename))) + +(define (string->parse-stream str) + (make-parse-stream #f (open-input-string str))) + +(define (parse-stream-tail source) + (or (%parse-stream-tail source) + (let* ((len (vector-length (parse-stream-buffer source))) + (tail (%make-parse-stream (parse-stream-filename source) + (parse-stream-port source) + (make-vector len #f) + (make-vector len '()) + 0 + (parse-stream-last-char source) + #f))) + (%parse-stream-tail-set! source tail) + tail))) + +(define (parse-stream-fill! source i) + (let ((off (parse-stream-offset source)) + (buf (parse-stream-buffer source))) + (if (<= off i) + (do ((off off (+ off 1))) + ((> off i) (parse-stream-offset-set! source off)) + (vector-set! buf off (read-char (parse-stream-port source)))) + #f))) + +(define (parse-stream-start? source i) + (and (zero? i) (not (parse-stream-prev-char source)))) + +(define (parse-stream-end? source i) + (eof-object? (parse-stream-ref source i))) + +(define (parse-stream-ref source i) + (parse-stream-fill! source i) + (vector-ref (parse-stream-buffer source) i)) + +(define (parse-stream-last-char source) + (let ((buf (parse-stream-buffer source))) + (let lp ((i (parse-stream-offset source))) + (if (negative? i) + (parse-stream-prev-char source) + (let ((ch (vector-ref buf i))) + (if (eof-object? ch) + (lp (- i 1)) + ch)))))) + +(define (parse-stream-char-before source i) + (if (> i (parse-stream-offset source)) + (parse-stream-ref source (- i 1)) + (parse-stream-prev-char source))) + +(define (parse-stream-next-source source i) + (if (>= (+ i 1) (vector-length (parse-stream-buffer source))) + (parse-stream-tail source) + source)) + +(define (parse-stream-next-index source i) + (if (>= (+ i 1) (vector-length (parse-stream-buffer source))) + 0 + (+ i 1))) + +(define (parse-stream-close source) + (close-input-port (parse-stream-port source))) + +(define (vector-substring vec start end) + (let ((res (make-string (- end start)))) + (do ((i start (+ i 1))) + ((= i end) res) + (string-set! res (- i start) (vector-ref vec i))))) + +(define (parse-stream-substring s0 i0 s1 i1) + (cond + ((eq? s0 s1) + (parse-stream-fill! s0 i1) + (vector-substring (parse-stream-buffer s0) i0 i1)) + (else + (let lp ((s (parse-stream-next-source s0)) + (res (list (vector-substring (parse-stream-buffer s0) i0 i1)))) + (let ((buf (parse-stream-buffer s))) + (cond + ((eq? s s1) + (apply string-append + (reverse (cons (vector-substring buf 0 i1) res)))) + (else + (lp (parse-stream-next-source s) + (cons (vector-substring buf 0 (vector-length buf)) res))))))))) + +(define (parse-stream-cache-cell s i f) + (assv f (vector-ref (parse-stream-cache s) i))) + +(define (parse-stream-cache-set! s i f x) + (let ((cache (vector-ref (parse-stream-cache s) i))) + (cond + ((assv f cache) + => (lambda (cell) + ;; prefer longer matches + (if (and (pair? (cdr cell)) (< (cadr (cddr cell)) i)) + (set-cdr! cell x)))) + (else + (vector-set! (parse-stream-cache s) i (cons (cons f x) cache)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; the parser interface + +(define (call-with-parse f source index sk . o) + (let ((p (if (string? source) (string->parse-stream source) source)) + (fk (if (pair? o) (car o) (lambda () #f)))) + (f p index sk fk))) + +(define (parse f source . o) + (let ((index (if (pair? o) (car o) 0))) + (call-with-parse f source index (lambda (r s i fk) r)))) + +(define (parse-fully f source . o) + (let ((index (if (pair? o) (car o) 0))) + (call-with-parse + f source index + (lambda (r s i fk) (if (parse-stream-end? s i) r (fk))) + (lambda () (error "incomplete parse"))))) + +(define (parse-fold f kons knil source . o) + (let lp ((p (if (string? source) (string->parse-stream source) source)) + (index (if (pair? o) (car o) 0)) + (acc knil)) + (f p index (lambda (r s i fk) (lp s i (kons r acc))) (lambda () acc)))) + +(define (parse->list f source . o) + (reverse (apply parse-fold cons '() f source o))) + +(define (parse-fully->list f source . o) + (let lp ((p (if (string? source) (string->parse-stream source) source)) + (index (if (pair? o) (car o) 0)) + (acc '())) + (f p index + (lambda (r s i fk) + (if (eof-object? r) acc (lp s i (cons r acc)))) + (lambda () (error "incomplete parse"))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; basic parsing combinators + +(define parse-epsilon + (lambda (source index sk fk) + (sk #t source index fk))) + +(define parse-anything + (lambda (source index sk fk) + (if (parse-stream-end? source index) + (fk) + (sk (parse-stream-ref source index) + (parse-stream-next-source source index) + (parse-stream-next-index source index) + fk)))) + +(define parse-nothing + (lambda (source index sk fk) + (fk))) + +(define (parse-or f . o) + (if (null? o) + f + (let ((g (apply parse-or o))) + (lambda (source index sk fk) + (let ((fk2 (lambda () (g source index sk fk)))) + (f source index sk fk2)))))) + +(define (parse-and f g) + (lambda (source index sk fk) + (f source index (lambda (r s i fk) (g source index sk fk)) fk))) + +(define (parse-not f) + (lambda (source index sk fk) + (f source index (lambda (r s i fk) (fk)) (lambda () (sk #t source index fk))))) + +(define (parse-seq . o) + (cond + ((null? o) + parse-epsilon) + ((null? (cdr o)) + (let ((f (car o))) + (lambda (s i sk fk) + (f s i (lambda (r s i fk) (sk (list r) s i fk)) fk)))) + (else + (let* ((f (car o)) + (o (cdr o)) + (g (car o)) + (o (cdr o)) + (g (if (pair? o) + (apply parse-seq g o) + (lambda (s i sk fk) + (g s i (lambda (r s i fk) (sk (list r) s i fk)) fk))))) + (lambda (source index sk fk) + (f source + index + (lambda (r s i fk) + (g s i (lambda (r2 s i fk) + (let ((r2 (if (eq? r ignored-value) r2 (cons r r2)))) + (sk r2 s i fk))) + fk)) + fk)))))) + +(define (maybe-parse-seq f . o) + (if (null? o) f (apply parse-seq f o))) + +(define (parse-optional f) + (lambda (source index sk fk) + (f source index sk (lambda () (sk #f source index fk))))) + +(define ignored-value (list 'ignore)) + +(define (parse-repeat f . o) + (let ((lo (if (pair? o) (car o) 0)) + (hi (and (pair? o) (pair? (cdr o)) (cadr o)))) + (lambda (source0 index0 sk fk) + (let repeat ((source source0) (index index0) (fk fk) (j 0) (res '())) + (let ((fk (if (>= j lo) + (lambda () (sk (reverse res) source index fk)) + fk))) + (if (and hi (= j hi)) + (sk (reverse res) source index fk) + (f source + index + (lambda (r s i fk) (repeat s i fk (+ j 1) (cons r res))) + fk))))))) + +(define (parse-repeat+ f) + (parse-repeat f 1)) + +(define (parse-map f proc) + (lambda (source index sk fk) + (f source index (lambda (res s i fk) (sk (proc res) s i fk)) fk))) + +(define (parse-map-substring f . o) + (let ((proc (if (pair? o) (car o) (lambda (res) res)))) + (lambda (source index sk fk) + (f source + index + (lambda (res s i fk) + (sk (proc (parse-stream-substring source index s i)) s i fk)) + fk)))) + +(define (parse-ignore f) + (parse-map f (lambda (res) ignored-value))) + +(define (parse-assert f check?) + (lambda (source index sk fk) + (f source + index + (lambda (res s i fk) + (if (check? res) (sk res s i fk) (fk))) + fk))) + +(define (parse-atomic f) + (lambda (source index sk fk) + (f source index (lambda (res s i fk2) (sk res s i fk)) fk))) + +(define (parse-commit f) + (lambda (source index sk fk) + (f source index (lambda (res s i fk) (sk res s i (lambda () #f))) fk))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; boundary checks + +(define parse-beginning + (lambda (source index sk fk) + (if (parse-stream-start? source index) + (sk #t source index fk) + (fk)))) + +(define parse-end + (lambda (source index sk fk) + (if (parse-stream-end? source index) + (sk #t source index fk) + (fk)))) + +(define parse-beginning-of-line + (lambda (source index sk fk) + (let ((before (parse-stream-char-before source index))) + (if (or (not before) (eqv? #\newline before)) + (sk #t source index fk) + (fk))))) + +(define parse-end-of-line + (lambda (source index sk fk) + (if (or (parse-stream-end? source index) + (eqv? #\newline (parse-stream-ref source index))) + (sk #t source index fk) + (fk)))) + +(define (char-word? ch) + (or (char-alphabetic? ch) (eqv? ch #\_))) + +(define parse-beginning-of-word + (lambda (source index sk fk) + (let ((before (parse-stream-char-before source index))) + (if (and (or (not before) (not (char-word? before))) + (not (parse-stream-end? source index)) + (char-word? (parse-stream-ref source index))) + (sk #t source index fk) + (fk))))) + +(define parse-end-of-word + (lambda (source index sk fk) + (let ((before (parse-stream-char-before source index))) + (if (and before + (char-word? before) + (or (parse-stream-end? source index) + (not (char-word? (parse-stream-ref source index))))) + (sk #t source index fk) + (fk))))) + +(define (parse-word . o) + (let ((word (if (pair? o) (car o) (parse-token char-word?)))) + (lambda (source index sk fk) + (parse-seq parse-beginning-of-word + word + parse-end-of-word)))) + +(define (parse-word+ . o) + (let ((pred (if (pair? o) + (lambda (ch) (and (char-word? ch) ((car o) ch))) + char-word?))) + (parse-word (parse-token pred)))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; constant parsers + +(define (parse-char-pred pred) + (lambda (source index sk fk) + (let ((ch (parse-stream-ref source index))) + (if (and (char? ch) (pred ch)) + (sk ch + (parse-stream-next-source source index) + (parse-stream-next-index source index) + fk) + (fk))))) + +(define (x->char-predicate x) + (cond + ((char? x) + (lambda (ch) (eqv? ch x))) + ((char-set? x) + (lambda (ch) (and (char? ch) (char-set-contains? x ch)))) + ((procedure? x) + (lambda (ch) (and (char? ch) (x ch)))) + (else + (error "don't know how to handle char predicate" x)))) + +(define (parse-char x) + (parse-char-pred (x->char-predicate x))) + +(define (parse-not-char x) + (let ((pred (x->char-predicate x))) + (parse-char-pred (lambda (ch) (not (pred ch)))))) + +(define (parse-string x) + (parse-map (apply parse-seq (map parse-char (string->list x))) + list->string)) + +(define (parse-token x) + ;; (parse-map (parse-repeat+ (parse-char x)) list->string) + ;; Tokens are atomic - we don't want to split them at any point in + ;; the middle - so the implementation is slightly more complex than + ;; the above. With a sane grammar the result would be the same + ;; either way, but this provides a useful optimization. + (let ((f (parse-char x))) + (lambda (source0 index0 sk fk) + (let lp ((source source0) (index index0)) + (f source + index + (lambda (r s i fk) (lp s i)) + (lambda () + (if (and (eq? source source0) (eqv? index index0)) + (fk) + (sk (parse-stream-substring source0 index0 source index) + source index fk)))))))) + +;; We provide a subset of SRE syntax, optionally interspersed with +;; existing parsers. These are just translated directly into parser +;; combinators. A future version may translate pieces into a +;; non-backtracking engine where possible. +(define (parse-sre x) + (cond + ((procedure? x) ; an embedded parser + x) + ((or (char? x) (char-set? x)) + (parse-char x)) + ((string? x) + (parse-string x)) + ((null? x) + parse-epsilon) + ((list? x) + (case (car x) + ((: seq) (apply parse-seq (map parse-sre (cdr x)))) + ((or) (apply parse-or (map parse-sre (cdr x)))) + ((and) (apply parse-and (map parse-sre (cdr x)))) + ((not) (apply parse-not (map parse-sre (cdr x)))) + ((*) (parse-repeat (apply maybe-parse-seq (map parse-sre (cdr x))))) + ((+) (parse-repeat+ (apply maybe-parse-seq (map parse-sre (cdr x))))) + ((?) (parse-optional (apply maybe-parse-seq (map parse-sre (cdr x))))) + ((=>) (apply maybe-parse-seq (map parse-sre (cddr x)))) + ((word) (apply parse-word (cdr x))) + ((word+) (apply parse-word+ (cdr x))) + (else (error "unknown sre list parser" x)))) + (else + (case x + ((any) parse-anything) + ((nonl) (parse-char (lambda (ch) (not (eqv? ch #\newline))))) + ((space whitespace) (parse-char char-whitespace?)) + ((digit numeric) (parse-char char-numeric?)) + ((alpha alphabetic) (parse-char char-alphabetic?)) + ((alnum alphanumeric) + (parse-char-pred (lambda (ch) (or (char-alphabetic? ch) (char-numeric? ch))))) + ((lower lower-case) (parse-char char-lower-case?)) + ((upper upper-case) (parse-char char-upper-case?)) + ((word) (parse-word)) + ((bow) parse-beginning-of-word) + ((eow) parse-end-of-word) + ((bol) parse-beginning-of-line) + ((eol) parse-end-of-line) + ((bos) parse-beginning) + ((eos) parse-end) + (else (error "unknown sre parser" x)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; delayed combinators for self-referentiality + +(define-syntax parse-lazy + (syntax-rules () + ((parse-lazy f) + (let ((g (delay f))) + (lambda (source index sk fk) + ((force g) source index sk fk)))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; memoization wrapper for packrat-like parsing + +;; debugging +(define *procedures* '()) +(define (procedure-name f) + (cond ((assq f *procedures*) => cdr) (else #f))) +(define (procedure-name-set! f name) + (set! *procedures* (cons (cons f name) *procedures*))) + +(define (parse-memoize name f) + (if (not (procedure-name f)) (procedure-name-set! f name)) + (lambda (source index sk fk) + (cond + ((parse-stream-cache-cell source index f) + => (lambda (cell) + (if (cdr cell) + (apply sk (append (cdr cell) (list fk))) + (fk)))) + (else + (f source + index + (lambda (res s i fk) + (parse-stream-cache-set! source index f (list res s i)) + (sk res s i fk)) + (lambda () + (if (not (pair? (parse-stream-cache-cell source index f))) + (parse-stream-cache-set! source index f #f)) + (fk))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; syntactic sugar + +;; The four basic interfaces are grammar, define-grammar, and their +;; unmemoized variants grammar/unmemoized and +;; define-grammar/unmemoized. This is optimized for the common case - +;; generally you want to memoize grammars, and may or may not want to +;; memoize the smaller lexical components. + +(define-syntax grammar/unmemoized + (syntax-rules () + ((grammar init (rule (clause . action) ...) ...) + (letrec ((rule (parse-or (grammar-clause clause . action) ...)) + ...) + init)))) + +(define-syntax grammar + (syntax-rules () + ((grammar/memoized init (rule (clause . action) ...) ...) + (letrec ((rule + (parse-memoize + 'rule + (parse-or (grammar-clause clause . action) ...))) + ...) + init)))) + +(define-syntax define-grammar/unmemoized + (syntax-rules () + ((define-grammar name (rule (clause . action) ...) ...) + (begin + (define rule (parse-or (grammar-clause clause . action) ...)) + ... + (define name (list (cons 'rule rule) ...)))))) + +(define-syntax define-grammar + (syntax-rules () + ((define-grammar/memoized name (rule (clause . action) ...) ...) + (begin + (define rule + (parse-memoize 'rule (parse-or (grammar-clause clause . action) ...))) + ... + (define name (list (cons 'rule rule) ...)))))) + +;; Most of the implementation goes into how we parse a single grammar +;; clause. This is hard to read if you're not used to CPS macros. + +(define-syntax grammar-clause + (syntax-rules () + ((grammar-clause clause . action) + (grammar-extract clause () (grammar-action action))))) + +(define-syntax grammar-extract + (syntax-rules (unquote => : seq * + ? or and) + ;; Named patterns + ((grammar-extract (=> name pattern) bindings k) + (grammar-extract pattern bindings (grammar-bind name k))) + ((grammar-extract (=> name pattern ...) bindings k) + (grammar-extract (: pattern ...) bindings (grammar-bind name k))) + ((grammar-extract ,name bindings k) + (grammar-bind name k (parse-sre name) bindings)) + ;; Walk container patterns. + ((grammar-extract (: x y ...) bindings k) + (grammar-extract x bindings (grammar-map parse-seq (y ...) () k))) + ((grammar-extract (* x y ...) bindings k) + (grammar-extract x bindings (grammar-map parse-repeat (y ...) () k))) + ((grammar-extract (+ x y ...) bindings k) + (grammar-extract x bindings (grammar-map parse-repeat+ (y ...) () k))) + ((grammar-extract (? x y ...) bindings k) + (grammar-extract x bindings (grammar-map parse-optional (y ...) () k))) + ((grammar-extract (or x y ...) bindings k) + (grammar-extract x bindings (grammar-map parse-or (y ...) () k))) + ((grammar-extract (and x y ...) bindings k) + (grammar-extract x bindings (grammar-map parse-and (y ...) () k))) + ;; Anything else is an implicitly quasiquoted SRE + ((grammar-extract pattern bindings (k ...)) + (k ... (parse-sre `pattern) bindings)))) + +(define-syntax grammar-bind + (syntax-rules () + ((grammar-bind name (k ...) f ((var tmp) ...)) + (let-syntax ((new-symbol? + (syntax-rules (var ...) + ((new-symbol? name sk fk) sk) + ((new-symbol? _ sk fk) fk)))) + ;; Bind the name only to the first instance in the pattern. + (new-symbol? + random-symbol-to-match + (k ... + (lambda (s i sk fk) + (let ((save-tmp new-tmp)) + (f s i + (lambda (r s i fk) (set! new-tmp r) (sk r s i fk)) + (lambda () (set! new-tmp save-tmp) (fk))))) + ((var tmp) ... (name new-tmp))) + (k ... f ((var tmp) ...))))))) + +(define-syntax grammar-map + (syntax-rules () + ((grammar-map f () (args ...) (k ...) x bindings) + (k ... (f args ... x) bindings)) + ((grammar-map f (y . rest) (args ...) k x bindings) + (grammar-extract y bindings (grammar-map f rest (args ... x) k))))) + +(define-syntax grammar-action + (syntax-rules (=>) + ((grammar-action () parser bindings) + ;; By default just return the result. + (grammar-action (=> (lambda (r s i fk) r)) parser bindings)) + ((grammar-action (=> receiver) parser ((var tmp) ...)) + ;; Explicit => handler. + (lambda (source index sk fk) + (let ((tmp #f) ...) + (parser source + index + (lambda (r s i fk) + (sk (receiver r s i fk) s i fk)) + fk)))) + ((grammar-action (action-expr) parser ()) + ;; Fast path - no named variables. + (let ((f parser)) + (lambda (source index sk fk) + (f source index (lambda (r s i fk) (sk action-expr s i fk)) fk)))) + ((grammar-action (action-expr) parser ((var tmp) ...)) + (lambda (source index sk fk) + (let ((tmp #f) ...) + ;; TODO: Precompute static components of the parser. + ;; We need to bind fresh variables on each parse, so some + ;; components must be reified in this scope. + (parser source + index + (lambda (r s i fk) + (sk (let ((var tmp) ...) action-expr) s i fk)) + fk)))))) diff --git a/tests/parse-tests.scm b/tests/parse-tests.scm new file mode 100644 index 00000000..91e54958 --- /dev/null +++ b/tests/parse-tests.scm @@ -0,0 +1,115 @@ + +(import (chibi) (chibi test) + (chibi char-set) (chibi char-set ascii) + (chibi parse)) + +(test-begin "parse") + +;; basic + +(test-assert (parse parse-epsilon "")) +(test-assert (parse-fully parse-epsilon "")) +(test-error (parse-fully parse-epsilon "a")) + +(test-not (parse parse-anything "")) +(test-assert (parse-fully parse-anything "a")) +(test-error (parse-fully parse-anything "ab")) + +(test-not (parse parse-nothing "")) +(test-not (parse parse-nothing "a")) + +(test-not (parse (parse-char #\a) "")) +(test-assert (parse-fully (parse-char #\a) "a")) +(test-not (parse (parse-char #\a) "b")) +(test-error (parse-fully (parse-char #\a) "ab")) + +(let ((f (parse-seq (parse-char #\a) (parse-char #\b)))) + (test-not (parse f "a")) + (test-not (parse f "b")) + (test-assert (parse f "ab")) + (test-error (parse-fully f "abc"))) + +(let ((f (parse-or (parse-char #\a) (parse-char #\b)))) + (test-not (parse f "")) + (test-assert (parse f "a")) + (test-assert (parse f "b")) + (test-error (parse-fully f "ab"))) + +(let ((f (parse-not (parse-char #\a)))) + (test-assert (parse f "")) + (test-error (parse-fully f "a")) + (test-assert (parse f "b"))) + +(let ((f (parse-repeat (parse-char #\a)))) + (test-assert (parse-fully f "")) + (test-assert (parse-fully f "a")) + (test-assert (parse-fully f "aa")) + (test-assert (parse-fully f "aaa")) + (test-assert (parse f "b")) + (test-assert (parse f "aab")) + (test-error (parse-fully f "aab"))) + +;; grammars + +(let () + (define-grammar calc + (space ((* ,char-set:whitespace))) + (number ((=> n (+ ,char-set:digit)) + (string->number (list->string n)))) + (simple ((=> n ,number) n) + ((: "(" (=> e1 ,term) ")") e1)) + (term-op ("*" *) + ("/" /) + ("%" modulo)) + (term ((: (=> e1 ,simple) ,space (=> op ,term-op) ,space (=> e2 ,term)) + (op e1 e2)) + ((=> e1 ,simple) + e1))) + (test 88 (parse term "4*22")) + (test 42 (parse term "42")) + ;; partial match (grammar isn't checking end) + (test 42 (parse term "42*"))) + +(define calculator + (grammar expr + (space ((: ,char-set:whitespace ,space)) + (() #f)) + (digit ((=> d ,char-set:digit) d)) + (number ((=> n (+ ,digit)) + (string->number (list->string n)))) + (simple ((=> n ,number) n) + ((: "(" (=> e1 ,expr) ")") e1)) + (term-op ("*" *) + ("/" /) + ("%" modulo)) + (term ((: (=> e1 ,simple) ,space (=> op ,term-op) ,space (=> e2 ,term)) + (op e1 e2)) + ((=> e1 ,simple) + e1)) + (expr-op ("+" +) ("-" -)) + (expr ((: ,space (=> e1 ,term) ,space (=> op ,expr-op) ,space (=> e2 ,expr)) + (op e1 e2)) + ((: ,space (=> e1 ,term)) + e1)))) + +(test 42 (parse calculator "42")) +(test 4 (parse calculator "2 + 2")) +(test 23 (parse calculator "2 + 2*10 + 1")) +(test 25 (parse calculator "2+2 * 10+1 * 3")) +(test 41 (parse calculator "(2 + 2) * 10 + 1")) + +;; this takes exponential time without memoization +(define explode + (grammar start + (start ((: ,S eos) #t)) + (S ((+ ,A) #t)) + (A ((: "a" ,S "b") #t) + ((: "a" ,S "c") #t) + ((: "a") #t)))) + +(test-assert (parse explode "aaabb")) +(test-not (parse explode "bbaa")) +(test-assert + (parse explode (string-append (make-string 10 #\a) (make-string 8 #\c)))) + +(test-end)