From 205c60a807285d0270baf3f0ccbea12cc6e4ea68 Mon Sep 17 00:00:00 2001 From: Alex Shinn Date: Wed, 24 Jul 2013 21:36:02 +0900 Subject: [PATCH] Initial version of (chibi regexp). --- lib/chibi/regexp.scm | 795 ++++++++++++++++++++++++++++++++++++++ lib/chibi/regexp.sld | 44 +++ lib/chibi/regexp/pcre.scm | 598 ++++++++++++++++++++++++++++ lib/chibi/regexp/pcre.sld | 7 + tests/re-tests.txt | 135 +++++++ tests/regexp-tests.scm | 214 ++++++++++ 6 files changed, 1793 insertions(+) create mode 100644 lib/chibi/regexp.scm create mode 100644 lib/chibi/regexp.sld create mode 100644 lib/chibi/regexp/pcre.scm create mode 100644 lib/chibi/regexp/pcre.sld create mode 100644 tests/re-tests.txt create mode 100644 tests/regexp-tests.scm diff --git a/lib/chibi/regexp.scm b/lib/chibi/regexp.scm new file mode 100644 index 00000000..e9ec3789 --- /dev/null +++ b/lib/chibi/regexp.scm @@ -0,0 +1,795 @@ +;; regexp.scm -- simple non-bactracking NFA implementation +;; Copyright (c) 2013 Alex Shinn. All rights reserved. +;; BSD-style license: http://synthcode.com/license.txt + +;;; An rx represents a start state and meta-info such as the number +;;; and names of submatches. +(define-record-type Rx + (make-rx start-state num-matches num-save-indexes match-rules match-names) + regexp? + (start-state rx-start-state rx-start-state-set!) + (num-matches rx-num-matches rx-num-matches-set!) + (num-save-indexes rx-num-save-indexes rx-num-save-indexes-set!) + (match-rules rx-rules rx-rules-set!) + (match-names rx-names rx-names-set!)) + +;;; A state is a single nfa state with transition rules. +(define-record-type State + (%make-state accept? chars match match-rule next1 next2) + state? + ;; A boolean indicating if this is an accepting state. + (accept? state-accept? state-accept?-set!) + ;; A char or char-set indicating when we can transition. + ;; Alternately, #f indicates an epsilon transition, while a + ;; procedure of the form (lambda (ch i matches) ...) is a predicate + ;; which should return #t if the char matches. + (chars state-chars state-chars-set!) + ;; A single integer indicating the match position to record. + (match state-match state-match-set!) + ;; The rule for merging ambiguous matches. Can be any of: left, + ;; right, (list i j). Posix semantics are equivalent to using left + ;; for the beginning of a submatch and right for the end. List is + ;; used to capture a list of submatch data in the current match. + (match-rule state-match-rule state-match-rule-set!) + ;; The destination if the char match succeeds. + (next1 state-next1 state-next1-set!) + ;; An optional additional transition used for forking to two states. + (next2 state-next2 state-next2-set!)) + +(define (make-state accept? chars match match-rule next1 next2) + (if (and next1 (not (state? next1))) + (error "expected a state" next1)) + (if (and next2 (not (state? next2))) + (error "expected a state" next2)) + (%make-state accept? chars match match-rule next1 next2)) + +(define (char-set-ci cset) + (let ((res (char-set))) + (for-each + (lambda (ch) + (char-set-adjoin! res (char-upcase ch)) + (char-set-adjoin! res (char-downcase ch))) + (char-set->list cset)) + res)) + +(define (make-char-state ch flags next) + (if (= ~ci? (bitwise-and ~ci? flags)) + (let ((cset (cond ((char? ch) (char-set-ci (char-set ch))) + ((char-set? ch) (char-set-ci ch)) + (else ch)))) + (make-state #f cset #f #f next #f)) + (make-state #f ch #f #f next #f))) +(define (make-fork-state next1 next2) + (make-state #f #f #f #f next1 next2)) +(define (make-epsilon-state next) + (make-fork-state next #f)) +(define (make-accept-state) + (make-state #t #f #f #f #f #f)) + +;; A record holding the current match data - essentially a wrapper +;; around a vector, plus a reference to the RX for meta-info. +(define-record-type Rx-Match + (%make-rx-match matches rx) + rx-match? + (matches rx-match-matches rx-match-matches-set!) + (rx rx-match-rx)) + +(define (rx-match-rules md) + (rx-rules (rx-match-rx md))) +(define (rx-match-names md) + (rx-names (rx-match-rx md))) +(define (make-rx-match len rx) + (%make-rx-match (make-vector len #f) rx)) +(define (make-rx-match-for-rx rx) + (make-rx-match (rx-num-save-indexes rx) rx)) +(define (rx-match-num-matches md) + (vector-length (rx-match-matches md))) + +(define (rx-match-name-offset md name) + (cond ((assq name (rx-match-names md)) => cdr) + (else (error "unknown match name" md name)))) + +(define (rx-match-ref md n) + (vector-ref (rx-match-matches md) + (if (integer? n) + n + (rx-match-name-offset md n)))) + +(define (rx-match-set! md n val) + (vector-set! (rx-match-matches md) n val)) + +(define (copy-rx-match md) + (let* ((src (rx-match-matches md)) + (len (vector-length src)) + (dst (make-vector len #f))) + (do ((i 0 (+ i 1))) + ((= i len) (%make-rx-match dst (rx-match-rx md))) + (vector-set! dst i (vector-ref src i))))) + +;;> Returns the matching result for the given named or indexed +;;> submatch \var{n}, possibly as a list for a submatch-list, or +;;> \scheme{#f} if not matched. + +(define (rx-match-submatch/list md str n) + (let ((n (if (integer? n) n (rx-match-name-offset md n)))) + (cond + ((>= n (vector-length (rx-match-rules md))) + #f) + (else + (let ((rule (vector-ref (rx-match-rules md) n))) + (cond + ((pair? rule) + (let ((start (rx-match-ref md (car rule))) + (end (rx-match-ref md (cdr rule)))) + (and start end (substring-cursor str start end)))) + (else + (let ((res (rx-match-ref md rule))) + (if (pair? res) + (reverse res) + res))))))))) + +;;> Returns the matching substring for the given named or indexed +;;> submatch \var{n}, or \scheme{#f} if not matched. + +(define (rx-match-submatch md str n) + (let ((res (rx-match-submatch/list md str n))) + (if (pair? res) (car res) res))) + +(define (rx-match-submatch-start+end md str n) + (let ((n (if (string-cursor? n) n (rx-match-name-offset md n)))) + (and (< n (vector-length (rx-match-rules md))) + (let ((rule (vector-ref (rx-match-rules md) n))) + (if (pair? rule) + (let ((start (rx-match-ref md (car rule))) + (end (rx-match-ref md (cdr rule)))) + (and start end + (cons (string-offset->index str start) + (string-offset->index str end)))) + #f))))) + +;;> Returns the start index within \var{str} for the given named or +;;> indexed submatch \var{n}, or \scheme{#f} if not matched. + +(define (rx-match-submatch-start md str n) + (cond ((rx-match-submatch-start+end md str n) => car) (else #f))) + +;;> Returns the end index within \var{str} for the given named or +;;> indexed submatch \var{n}, or \scheme{#f} if not matched. + +(define (rx-match-submatch-end md str n) + (cond ((rx-match-submatch-start+end md str n) => cdr) (else #f))) + +(define (rx-match-convert recurse? md str) + (cond + ((vector? md) + (let lp ((i 0) (res '())) + (cond + ((>= i (vector-length md)) + (reverse res)) + ((string-cursor? (vector-ref md i)) + (lp (+ i 2) + (cons (substring-cursor str + (vector-ref md i) + (vector-ref md (+ i 1))) + res))) + (else + (lp (+ i 1) + (cons (rx-match-convert recurse? (vector-ref md i) str) res)))))) + ((list? md) + (if recurse? + (map (lambda (x) (rx-match-convert recurse? x str)) (reverse md)) + (rx-match-convert recurse? (car md) str))) + ((and (pair? md) (string-cursor? (car md)) (string-cursor? (cdr md))) + (substring-cursor str (car md) (cdr md))) + ((rx-match? md) + (rx-match-convert recurse? (rx-match-matches md) str)) + (else + md))) + +;;> Convert an rx-match result to a list of submatches, beginning +;;> with the full match, using \scheme{#f} for unmatched submatches. + +(define (rx-match->list md str) + (rx-match-convert #f md str)) + +;;> Convert an rx-match result to a forest of submatches, beginning +;;> with the full match, using \scheme{#f} for unmatched submatches. + +(define (rx-match->sexp md str) + (rx-match-convert #t md str)) + +;; Collect results from a list match. +(define (match-collect md spec) + (define (match-extract md n) + (let* ((vec (rx-match-matches md)) + (rules (rx-match-rules md)) + (n-rule (vector-ref rules n)) + (rule (vector-ref rules n-rule))) + (if (pair? rule) + (let ((start (rx-match-ref md (car rule))) + (end (rx-match-ref md (cdr rule)))) + (and start end (cons start end))) + (rx-match-ref md rule)))) + (let ((end (cadr spec)) + (vec (rx-match-matches md))) + (let lp ((i (+ 1 (car spec))) + (ls '())) + (if (>= i end) + (reverse ls) + (lp (+ i 1) (cons (match-extract md i) ls)))))) + +;; A searcher represents a single rx state and match information. +(define-record-type Searcher + (make-searcher state matches) + searcher? + (state searcher-state searcher-state-set!) + (matches searcher-matches searcher-matches-set!)) + +;; Merge two rx-matches, preferring the leftmost-longest of their +;; matches. +(define (rx-match>=? m1 m2) + (let ((end (- (vector-length (rx-match-matches m1)) 1))) + (let lp ((i 0)) + (cond + ((>= i end) + #t) + ((and (eqv? (rx-match-ref m1 i) (rx-match-ref m2 i)) + (eqv? (rx-match-ref m1 (+ i 1)) (rx-match-ref m2 (+ i 1)))) + (lp (+ i 2))) + ((and (string-cursor? (rx-match-ref m2 i)) + (string-cursor? (rx-match-ref m2 (+ i 1))) + (or (not (string-cursor? (rx-match-ref m1 i))) + (not (string-cursor? (rx-match-ref m1 (+ i 1)))) + (string-cursor? (rx-match-ref m2 (+ i 1)) + (rx-match-ref m1 (+ i 1)))))) + #f) + (else + #t))))) + +(define (rx-match-max m1 m2) + (if (rx-match>=? m1 m2) m1 m2)) + +;; Merge match data from sr2 into sr1, preferring the leftmost-longest +;; match in the event of a conflict. +(define (searcher-merge! sr1 sr2) + (let ((m (rx-match-max (searcher-matches sr1) (searcher-matches sr2)))) + (searcher-matches-set! sr1 m))) + +(define (searcher-max sr1 sr2) + (if (or (not (searcher? sr2)) + (rx-match>=? (searcher-matches sr1) (searcher-matches sr2))) + sr1 + sr2)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; A posse is a group of searchers. + +(define (make-posse . o) + (make-hash-table eq?)) + +(define posse? hash-table?) +(define (posse-empty? posse) (zero? (hash-table-size posse))) + +(define (posse-ref posse sr) + (hash-table-ref/default posse (searcher-state sr) #f)) +(define (posse-add! posse sr) + (hash-table-set! posse (searcher-state sr) sr)) +(define (posse-clear! posse) + (hash-table-walk posse (lambda (key val) (hash-table-delete! posse key)))) +(define (posse-for-each proc posse) + (hash-table-walk posse (lambda (key val) (proc val)))) + +(define (posse->list posse) + (hash-table-values posse)) +(define (list->posse ls) + (let ((searchers (make-posse))) + (for-each (lambda (sr) (posse-add! searchers sr)) ls) + searchers)) +(define (posse . args) + (list->posse args)) + +(define (make-start-searcher rx) + (make-searcher (rx-start-state rx) (make-rx-match-for-rx rx))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Execution + +;; A transition which doesn't advance the index. + +(define (epsilon-state? st) + (or (not (state-chars st)) + (procedure? (state-chars st)))) + +;; Match the state against a char and index. + +(define (state-matches? st str i ch start end matches) + (let ((matcher (state-chars st))) + (cond + ((char? matcher) + (eqv? matcher ch)) + ((char-set? matcher) + (char-set-contains? matcher ch)) + ((pair? matcher) + (and (char<=? (car matcher) ch) (char<=? ch (cdr matcher)))) + ((procedure? matcher) + (matcher str i ch start end matches)) + ((not matcher)) + (else + (error "unknown state matcher" (state-chars st)))))) + +;; Advance epsilons together - if the State is newly added to the +;; group and is an epsilon state, recursively add the transition. + +(define (posse-advance! new seen accept sr str i start end) + (let advance! ((sr sr)) + (let ((st (searcher-state sr))) + ;; Update match data. + (cond + ((state-match st) + (let ((index (state-match st)) + (matches (searcher-matches sr))) + (cond + ((pair? index) + ;; Submatch list, accumulate and push. + (let* ((prev (rx-match-ref matches (car index))) + (new (cons (match-collect matches (cdr index)) + (if (pair? prev) prev '())))) + (rx-match-set! matches (car index) new))) + (else + (rx-match-set! matches index i)))))) + ;; Follow transitions. + (cond + ((state-accept? st) + (set-cdr! accept (searcher-max sr (cdr accept)))) + ((posse-ref seen sr) + => (lambda (sr-prev) (searcher-merge! sr-prev sr))) + ((epsilon-state? st) + (let ((ch (and (string-cursor (lambda (sr-prev) (searcher-merge! sr-prev sr))) + (else + ;; Add new searcher. + (posse-add! new sr)))))) + +;; Run so long as there is more to match. + +(define (regexp-run search? rx str . o) + (let* ((start (string-start-arg str o)) + (end (string-end-arg str (if (pair? o) (cdr o) o))) + (rx (regexp rx)) + (epsilons (posse)) + (accept (list #f))) + (let lp ((i start) + (searchers1 (posse)) + (searchers2 (posse))) + ;; Advance initial epsilons once from the first index, or every + ;; time when searching. + (cond + ((or search? (string-cursor=? i start)) + (posse-advance! searchers1 epsilons accept (make-start-searcher rx) + str i start end) + (posse-clear! epsilons))) + (cond + ((or (string-cursor>=? i end) + (and (or (not search?) (searcher? (cdr accept))) + (posse-empty? searchers1))) + ;; Terminate when the string is done or there are no more + ;; searchers. If we terminate prematurely and are not + ;; searching, return false. + (and (or search? (string-cursor>=? i end)) + (searcher? (cdr accept)) + (searcher-matches (cdr accept)))) + (else + ;; Otherwise advance normally. + (let ((ch (string-cursor-ref str i)) + (i2 (string-cursor-next str i))) + (posse-for-each ;; NOTE: non-deterministic from hash order + (lambda (sr) + (cond + ((state-matches? (searcher-state sr) str i ch + start end (searcher-matches sr)) + (searcher-state-set! sr (state-next1 (searcher-state sr))) + ;; Epsilons are considered at the next position. + (posse-advance! searchers2 epsilons accept sr str i2 start end) + (posse-clear! epsilons)))) + searchers1) + (posse-clear! searchers1) + (lp i2 searchers2 searchers1))))))) + +;;> Match the given regexp or SRE against the entire string and return +;;> the match data on success. Returns \scheme{#f} on failure. + +(define (regexp-match rx str . o) + (apply regexp-run #f rx str o)) + +;;> Match the given regexp or SRE against the entire string and return +;;> the \scheme{#t} on success. Returns \scheme{#f} on failure. + +(define (regexp-match? rx str . o) + (and (apply regexp-match rx str o) #t)) + +;;> Search for the given regexp or SRE within string and return +;;> the match data on success. Returns \scheme{#f} on failure. + +(define (regexp-search rx str . o) + (apply regexp-run #t rx str o)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Compiling + +(define ~none 0) +(define ~ci? 1) + +(define (parse-flags ls) + (define (symbol->flag s) + (case s ((i ci case-insensitive) ~ci?) (else ~none))) + (let lp ((ls ls) (res ~none)) + (if (not (pair? ls)) + res + (lp (cdr ls) (bitwise-ior res (symbol->flag (car ls))))))) + +(define char-set:nonl + (char-set-difference char-set:full (char-set #\newline))) +(define char-set:control (ucs-range->char-set 0 32)) +(define char-set:word-constituent + (char-set-union char-set:letter char-set:digit (char-set #\_))) +(define (char-word-constituent? ch) + (char-set-contains? char-set:word-constituent ch)) + +(define (match/bos str i ch start end matches) + (string-cursor=? i start)) +(define (match/eos str i ch start end matches) + (string-cursor>=? i end)) +(define (match/bol str i ch start end matches) + (or (string-cursor=? i start) + (eqv? #\newline (string-cursor-ref str (string-cursor-prev str i))))) +(define (match/eol str i ch start end matches) + (or (string-cursor>=? i end) + (eqv? #\newline (string-cursor-ref str i)))) +(define (match/bow str i ch start end matches) + (and (string-cursor=? i end) + (not (char-word-constituent? ch))) + (string-cursor>? i start) + (char-word-constituent? + (string-cursor-ref str (string-cursor-prev str i))))) + +(define (lookup-char-set name) + (case name + ((any) char-set:full) + ((nonl) char-set:nonl) + ((lower-case lower) char-set:lower-case) + ((upper-case upper) char-set:upper-case) + ((alphabetic alpha) char-set:letter) + ((numeric num digit) char-set:digit) + ((alphanumeric alphanum alnum) char-set:letter+digit) + ((punctuation punct) char-set:punctuation) + ((graphic graph) char-set:graphic) + ((whitespace white space) char-set:whitespace) + ((printing print) char-set:printing) + ((control cntrl) char-set:control) + ((hex-digit xdigit hex) char-set:hex-digit) + ((blank) char-set:blank) + ((ascii) char-set:ascii) + (else #f))) + +(define (sre-flatten-ranges orig-ls) + (let lp ((ls orig-ls) (res '())) + (cond + ((null? ls) + (reverse res)) + ((string? (car ls)) + (lp (append (string->list (car ls)) (cdr ls)) res)) + ((null? (cdr ls)) + (error "unbalanced cset / range" orig-ls)) + ((string? (cadr ls)) + (lp (cons (car ls) (append (string->list (cadr ls)) (cddr ls))) res)) + (else + (lp (cddr ls) (cons (cons (car ls) (cadr ls)) res)))))) + +(define (sre->char-set sre) + (cond + ((lookup-char-set sre)) + ((char-set? sre) sre) + ((char? sre) (char-set sre)) + ((pair? sre) + (if (string? (car sre)) + (string->char-set (car sre)) + (case (car sre) + ((/) (sre->char-set + `(or ,@(map (lambda (x) + (ucs-range->char-set + (char->integer (car x)) + (+ 1 (char->integer (cdr x))))) + (sre-flatten-ranges (cdr sre)))))) + ((& and) (apply char-set-intersection (map sre->char-set (cdr sre)))) + ((|\|| or) (apply char-set-union (map sre->char-set (cdr sre)))) + ((~ not) (char-set-complement (sre->char-set `(or ,@(cdr sre))))) + ((-) (char-set-difference (sre->char-set (cadr sre)) + (sre->char-set `(or ,@(cddr sre))))) + (else (error "invalid sre char-set" sre))))) + (else (error "invalid sre char-set" sre)))) + +;;> Compile an \var{sre} into a regexp. + +(define (regexp sre . o) + (define current-index 2) + (define current-match 0) + (define match-names '()) + (define match-rules (list (cons 0 1))) + (define (make-submatch-state sre flags next index) + (let* ((n3 (make-epsilon-state next)) + (n2 (->rx sre flags n3)) + (n1 (make-epsilon-state n2))) + (state-match-set! n1 index) + (state-match-rule-set! n1 'left) + (state-match-set! n3 (+ index 1)) + (state-match-rule-set! n3 'right) + n1)) + (define (->rx sre flags next) + (cond + ;; The base cases chars and strings match literally. + ((char? sre) + (make-char-state sre flags next)) + ((char-set? sre) + (make-char-state sre flags next)) + ((string? sre) + (->rx (cons 'seq (string->list sre)) flags next)) + ((and (symbol? sre) (lookup-char-set sre)) + => (lambda (cset) (make-char-state cset flags next))) + ((symbol? sre) + (case sre + ((epsilon) next) + ((bos) (make-char-state match/bos flags next)) + ((eos) (make-char-state match/eos flags next)) + ((bol) (make-char-state match/bol flags next)) + ((eol) (make-char-state match/eol flags next)) + ((bow) (make-char-state match/bow flags next)) + ((eow) (make-char-state match/eow flags next)) + ((word) (->rx '(word+ any) flags next)) + (else (error "unknown sre" sre)))) + ((pair? sre) + (case (car sre) + ((seq :) + ;; Sequencing. An empty sequence jumps directly to next, + ;; otherwise we join the first element to the sequence formed + ;; of the remaining elements followed by next. + (if (null? (cdr sre)) + next + ;; Make a dummy intermediate to join the states so that + ;; we can generate n1 first, preserving the submatch order. + (let* ((n2 (make-epsilon-state #f)) + (n1 (->rx (cadr sre) flags n2)) + (n3 (->rx (cons 'seq (cddr sre)) flags next))) + (state-next1-set! n2 n3) + n1))) + ((or) + ;; Alternation. An empty alternation always fails. + ;; Otherwise we fork between any of the alternations, each + ;; continuing to next. + (cond + ((null? (cdr sre)) + #f) + ((null? (cddr sre)) + (->rx (cadr sre) flags next)) + (else + (let* ((n1 (->rx (cadr sre) flags next)) + (n2 (->rx (cons 'or (cddr sre)) flags next))) + (make-fork-state n1 n2))))) + ((?) + ;; Optionality. Either match the body or fork to the next + ;; state directly. + (make-fork-state (->rx (cons 'seq (cdr sre)) flags next) next)) + ((*) + ;; Repetition. Introduce two fork states which can jump from + ;; the end of the loop to the beginning and from the + ;; beginning to the end (to skip the first iteration). + (let* ((n2 (make-fork-state next #f)) + (n1 (make-fork-state (->rx (cons 'seq (cdr sre)) flags n2) n2))) + (state-next2-set! n2 n1) + n1)) + ((+) + ;; One-or-more repetition. Same as above but the first + ;; transition is required so the rx is simpler - we only + ;; need one fork from the end of the loop to the beginning. + (let* ((n2 (make-fork-state next #f)) + (n1 (->rx (cons 'seq (cdr sre)) flags n2))) + (state-next2-set! n2 n1) + n1)) + ((=> submatch-named) + ;; Named submatches just record the name for the current + ;; match and rewrite as a non-named submatch. + (set! match-names (cons (cons (cadr sre) current-match) match-names)) + (->rx (cons 'submatch (cddr sre)) flags next)) + ((*=> submatch-named-list) + (set! match-names (cons (cons (cadr sre) current-match) match-names)) + (->rx (cons 'submatch-list (cddr sre)) flags next)) + (($ submatch) + ;; A submatch wraps next with an epsilon transition before + ;; next, setting the start and end index on the result and + ;; wrapped next respectively. + (let ((num current-match) + (index current-index)) + (set! current-match (+ current-match 1)) + (set! current-index (+ current-index 2)) + (set! match-rules `((,index . ,(+ index 1)) ,@match-rules)) + (make-submatch-state (cons 'seq (cdr sre)) flags next index))) + ((*$ submatch-list) + ;; A submatch-list wraps a range of submatch results into a + ;; single match value. + (let* ((num current-match) + (index current-index)) + (set! current-match (+ current-match 1)) + (set! current-index (+ current-index 1)) + (set! match-rules `(,index ,@match-rules)) + (let* ((n2 (make-epsilon-state next)) + (n1 (->rx (cons 'submatch (cdr sre)) flags n2))) + (state-match-set! n2 (list index num current-match)) + (state-match-rule-set! n2 'list) + n1))) + ((~ - & |\|| / and or not) + (make-char-state (sre->char-set sre) flags next)) + ((word) + (->rx `(: bow ,@(cdr sre) eow) flags next)) + ((word+) + (->rx `(word (+ ,(char-set-intersection + char-set:word-constituent + (sre->char-set `(or ,@(cdr sre)))))) + flags + next)) + ((w/case) + (->rx `(: ,@(cdr sre)) (bitwise-and flags (bitwise-not ~ci?)) next)) + ((w/nocase) + (->rx `(: ,@(cdr sre)) (bitwise-ior flags ~ci?) next)) + (else + (if (string? (car sre)) + (make-char-state (sre->char-set sre) flags next) + (error "unknown sre" sre))))))) + (let ((flags (parse-flags (and (pair? o) (car o))))) + (if (regexp? sre) + sre + (let ((start (make-submatch-state sre flags (make-accept-state) 0))) + (make-rx start current-match current-index + (list->vector (reverse match-rules)) match-names))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Utilities + +(define (regexp-fold rx kons knil str . o) + (let* ((rx (regexp rx)) + (finish (if (pair? o) (car o) (lambda (from md str acc) acc))) + (o (if (pair? o) (cdr o) o)) + (start (string-start-arg str o)) + (end (string-end-arg str (if (pair? o) (cdr o) o)))) + (let lp ((i start) + (from start) + (acc knil)) + (cond + ((and (string-cursor (lambda (md) + (let* ((j (rx-match-submatch-end md str 0))) + (lp (if (>= j end) j (string-cursor-next str j)) + j + (kons (string-offset->index str from) md str acc))))) + (else + (finish (string-offset->index str i) #f str acc)))))) + +(define (regexp-extract rx str . o) + (apply regexp-fold + rx + (lambda (from md str a) + (let ((s (rx-match-submatch md str 0))) + (if (equal? s "") a (cons s a)))) + '() + str + (lambda (from md str a) (reverse a)) + o)) + +(define (regexp-split rx str . o) + ;; start and end in indices passed to regexp-fold + (let ((start (if (pair? o) (car o) 0)) + (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str)))) + (regexp-fold + rx + (lambda (from md str a) + (let ((i (rx-match-submatch-start md str 0))) + (if (< from i) (cons (substring str from i) a) a))) + '() + str + (lambda (from md str a) + (reverse (if (< from end) (cons (substring str from end) a) a))) + start + end))) + +(define (regexp-replace rx str subst . o) + (let* ((start (if (pair? o) (car o) 0)) + (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str))) + (m (regexp-search rx str start end))) + (if m + (string-concatenate + (cons + (substring-cursor str + (string-index->offset str start) + (rx-match-submatch-start m str 0)) + (append + (reverse (regexp-apply-match m str subst)) + (list (substring-cursor str + (rx-match-submatch-end m str 0) + (string-index->offset str end)))))) + str))) + +(define (regexp-replace-all rx str subst . o) + (regexp-fold + rx + (lambda (i m str acc) + (let ((m-start (rx-match-submatch-start m str 0))) + (append (regexp-apply-match m str subst) + (if (>= i m-start) + acc + (cons (substring str i m-start) acc))))) + '() + str + (lambda (i m str acc) + (let ((end (string-length str))) + (string-concatenate-reverse + (if (>= i end) + acc + (cons (substring str i end) acc))))))) + +(define (regexp-apply-match m str ls) + (let lp ((ls ls) (res '())) + (cond + ((null? ls) + res) + ((not (pair? ls)) + (lp (list ls) res)) + ((integer? (car ls)) + (lp (cdr ls) (cons (or (rx-match-submatch m str (car ls)) "") res))) + ((procedure? (car ls)) + (lp (cdr ls) (cons ((car ls) m) res))) + ((symbol? (car ls)) + (case (car ls) + ((pre) + (lp (cdr ls) + (cons (substring-cursor str 0 (rx-match-submatch-start m str 0)) + res))) + ((post) + (lp (cdr ls) + (cons (substring str + (rx-match-submatch-end m str 0) + (string-length str)) + res))) + (else + (cond + ((assq (car ls) (rx-match-names m)) + => (lambda (x) (lp (cons (cdr x) (cdr ls)) res))) + (else + (error "unknown match replacement" (car ls))))))) + (else + (lp (cdr ls) (cons (car ls) res)))))) diff --git a/lib/chibi/regexp.sld b/lib/chibi/regexp.sld new file mode 100644 index 00000000..34266a71 --- /dev/null +++ b/lib/chibi/regexp.sld @@ -0,0 +1,44 @@ + +(define-library (chibi regexp) + (export regexp regexp? regexp-match regexp-search + regexp-replace regexp-replace-all + regexp-fold regexp-extract regexp-split + rx-match? rx-match-num-matches + rx-match-submatch rx-match-submatch/list + rx-match->list rx-match->sexp) + (import (scheme base) (srfi 9) (srfi 33) (srfi 38) (srfi 69)) + ;; Chibi's char-set library is more factored than SRFI-14. + (cond-expand + (chibi + (import (chibi) (chibi char-set) (chibi char-set full))) + (else + (import (scheme base) (srfi 14)))) + ;; Use string-cursors where available. + (begin + (define string-cursor? integer?)) + (cond-expand + (chibi + (begin + (define (string-start-arg s o) + (if (pair? o) (string-index->offset s (car o)) (string-cursor-start s))) + (define (string-end-arg s o) + (if (pair? o) (string-index->offset s (car o)) (string-cursor-end s))) + (define (string-concatenate-reverse ls) + (string-concatenate (reverse ls))))) + (else + (begin + (define (string-start-arg s o) + (if (pair? o) (string-index->offset (car o)) 0)) + (define (string-end-arg s o) + (if (pair? o) (string-index->offset (car o)) (string-length s))) + (define string-cursor=? =) + (define string-cursor? >) + (define string-cursor>=? >=) + (define string-cursor-ref string-ref) + (define substring-cursor substring) + (define (string-offset->index str off) off) + (define (string-concatenate-reverse ls) + (apply string-append (reverse ls)))))) + (include "regexp.scm")) diff --git a/lib/chibi/regexp/pcre.scm b/lib/chibi/regexp/pcre.scm new file mode 100644 index 00000000..3129f8e8 --- /dev/null +++ b/lib/chibi/regexp/pcre.scm @@ -0,0 +1,598 @@ + +;; PCRE parsing, adapted from IrRegex. + +(define ~none 0) +(define ~save? 1) +(define ~case-insensitive? 2) +(define ~multi-line? 4) +(define ~single-line? 8) +(define ~ignore-space? 16) + +(define (flag-set? flags i) + (= i (bitwise-and flags i))) +(define (flag-join a b) + (if b (bitwise-ior a b) a)) +(define (flag-clear a b) + (bitwise-and a (bitwise-not b))) + +(define (symbol-list->flags ls) + (let lp ((ls ls) (res ~none)) + (cond + ((null? ls) + res) + ((not (pair? ls)) + (lp (list ls) res)) + (else + (lp (cdr ls) + (flag-join + res + (case (car ls) + ((i ci case-insensitive) ~case-insensitive?) + ((m multi-line) ~multi-line?) + ((s single-line) ~single-line?) + ((x ignore-space) ~ignore-space?) + (else #f)))))))) + +(define posix-escape-sequences + `((#\n . #\newline) + (#\r . #\return) + (#\t . #\tab) + (#\a . #\alarm) + (#\e . #\escape))) + +(define (char-altcase c) + (if (char-upper-case? c) (char-downcase c) (char-upcase c))) + +(define (char-mirror c) + (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c))) + +(define (string-scan-char-escape str c . o) + (let ((end (string-length str))) + (let scan ((i (if (pair? o) (car o) 0))) + (cond ((= i end) #f) + ((eqv? c (string-ref str i)) i) + ((eqv? c #\\) (scan (+ i 2))) + (else (scan (+ i 1))))))) + +(define (string-parse-hex-escape str i end) + (cond + ((>= i end) + (error "incomplete hex escape" str i)) + ((eqv? #\{ (string-ref str i)) + (let ((j (string-scan-char-escape str #\} (+ i 1)))) + (if (not j) + (error "incomplete hex brace escape" str i) + (let* ((s (substring str (+ i 1) j)) + (n (string->number s 16))) + (if n + (list (integer->char n) j) + (error "bad hex brace escape" s)))))) + ((>= (+ i 1) end) + (error "incomplete hex escape" str i)) + (else + (let* ((s (substring str i (+ i 2))) + (n (string->number s 16))) + (if n + (list (integer->char n) (+ i 2)) + (error "bad hex escape" s)))))) + +(define (string-parse-cset str start flags) + (let* ((end (string-length str)) + (invert? (and (< start end) (eqv? #\^ (string-ref str start))))) + (define (cset-union a b) + (cond ((not a) b) + ((not b) a) + ((and (pair? a) (eq? 'or (car a))) `(,@a ,b)) + (else `(or ,a ,b)))) + (define (go i prev-char ones pairs classes) + (if (>= i end) + (error "incomplete char set" str i end)) + (case (string-ref str i) + ((#\]) + (if (and (null? ones) (null? pairs)) + (go (+ i 1) #\] (cons #\] ones) pairs classes) + (list + (let ((res + (cset-union + (cset-union + (and (pair? classes) + `(or ,@classes)) + (and (pair? ones) + `(,(list->string (reverse ones))))) + (and (pair? pairs) + `(/ ,(list->string (reverse pairs))))))) + (if invert? `(~ ,res) res)) + i))) + ((#\-) + (cond + ((or (= i start) + (and (= i (+ start 1)) invert?) + (eqv? #\] (string-ref str (+ i 1)))) + (go (+ i 1) #\- (cons #\- ones) pairs classes)) + ;; alternately permissively allow this as a - + ((not prev-char) + (error "bad pcre char-set, unexpected -" str)) + (else + (let ((ch (string-ref str (+ i 1)))) + (apply + (lambda (c j) + (if (char (lambda (x) (list (cdr x) (+ i 3)))) + ((and (eqv? #\\ ch) + (eqv? (string-ref str (+ i 2)) #\x)) + (string-parse-hex-escape str (+ i 3) end)) + (else + (list ch (+ i 2))))))))) + ((#\[) + (let* ((inv? (eqv? #\^ (string-ref str (+ i 1)))) + (i2 (if inv? (+ i 2) (+ i 1)))) + (case (string-ref str i2) + ((#\:) + (let ((j (string-find str #\: (+ i2 1) end))) + (if (or (>= (+ j 1) end) + (not (eqv? #\] (string-ref str (+ j 1))))) + (error "incomplete character class" str) + (let* ((class (string->symbol (substring str (+ i2 1) j))) + (class (if inv? `(~ ,class) class))) + (go (+ j 2) #f ones pairs (cons class classes)))))) + ((#\= #\.) + (error "collating sequences not supported" str)) + (else + (go (+ i 1) #\[ (cons #\[ ones) pairs classes))))) + ((#\\) + (let ((c (string-ref str (+ i 1)))) + (case c + ((#\d #\D #\s #\S #\w #\W) + (go (+ i 2) #f ones pairs + (cons (pcre->sre (string #\\ c)) classes))) + ((#\x) + (apply + (lambda (c j) (go j c (cons c ones) pairs classes)) + (string-parse-hex-escape str (+ i 2) end))) + (else + (let ((c (cond ((assv c posix-escape-sequences) => cdr) + (else c)))) + (go (+ i 2) c (cons c ones) pairs classes)))))) + (else + => (lambda (c) (go (+ i 1) c (cons c ones) pairs classes))))) + (if invert? + (let ((ones (if (flag-set? flags ~multi-line?) '(#\newline) '()))) + (go (+ start 1) #f ones '() '())) + (go start #f '() '() '())))) + +;; build a (seq ls ...) sre from a list +(define (sre-sequence ls) + (cond + ((null? ls) 'epsilon) + ((null? (cdr ls)) (car ls)) + (else (cons 'seq ls)))) + +;; build a (or ls ...) sre from a list +(define (sre-alternate ls) + (cond + ((null? ls) '(or)) + ((null? (cdr ls)) (car ls)) + (else (cons 'or ls)))) + +;; returns #t if the sre can ever be empty +(define (sre-empty? sre) + (if (pair? sre) + (case (car sre) + ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t) + ((**) (or (not (number? (cadr sre))) (zero? (cadr sre)))) + ((or) (any sre-empty? (cdr sre))) + ((: seq $ submatch => submatch-named + atomic) + (every sre-empty? (cdr sre))) + (else #f)) + (memq sre '(epsilon bos eos bol eol bow eow commit)))) + +;; returns #t if the sre is a */+ repetition +(define (sre-repeater? sre) + (and (pair? sre) + (or (memq (car sre) '(* +)) + (and (memq (car sre) '($ submatch => submatch-named seq :)) + (pair? (cdr sre)) + (null? (cddr sre)) + (sre-repeater? (cadr sre)))))) + +(define (pcre->sre str . o) + (if (not (string? str)) + (error "pcre->sre: expected a string" str)) + (let ((end (string-length str)) + (orig-flags (if (pair? o) (symbol-list->flags (car o)) ~none))) + (let lp ((i 0) (from 0) (flags orig-flags) (res '()) (st '())) + ;; accumulate the substring from..i as literal text + (define (collect) + (if (= i from) res (cons (substring str from i) res))) + ;; like collect but breaks off the last single character when + ;; collecting literal data, as the argument to ?/*/+ etc. + (define (collect/single) + (let ((j (- i 1))) + (cond + ((< j from) + res) + (else + (let ((c (string-ref str j))) + (cond + ((= j from) + (cons c res)) + (else + (cons c (cons (substring str from j) res))))))))) + ;; collects for use as a result, reversing and grouping OR + ;; terms, and some ugly tweaking of `function-like' groups and + ;; conditionals + (define (collect/terms) + (let* ((ls (collect)) + (func + (and (pair? ls) + (memq (last ls) + '(atomic if look-ahead neg-look-ahead + look-behind neg-look-behind + => submatch-named)))) + (prefix (if (and func (memq (car func) '(=> submatch-named))) + (list 'submatch-named (cadr (reverse ls))) + (and func (list (car func))))) + (ls (if func + (if (memq (car func) '(=> submatch-named)) + (reverse (cddr (reverse ls))) + (reverse (cdr (reverse ls)))) + ls))) + (let lp ((ls ls) (term '()) (res '())) + (define (shift) + (cons (sre-sequence term) res)) + (cond + ((null? ls) + (let* ((res (sre-alternate (shift))) + (res (if (flag-set? flags ~save?) + (list 'submatch res) + res))) + (if prefix + (if (eq? 'if (car prefix)) + (cond + ((not (pair? res)) + 'epsilon) + ((memq (car res) + '(look-ahead neg-look-ahead + look-behind neg-look-behind)) + res) + ((eq? 'seq (car res)) + `(if ,(cadr res) + ,(sre-sequence (cddr res)))) + (else + `(if ,(cadadr res) + ,(sre-sequence (cddadr res)) + ,(sre-alternate (cddr res))))) + `(,@prefix ,res)) + res))) + ((eq? 'or (car ls)) (lp (cdr ls) '() (shift))) + (else (lp (cdr ls) (cons (car ls) term) res)))))) + (define (save) + (cons (cons flags (collect)) st)) + ;; main parsing + (cond + ((>= i end) + (if (pair? st) + (error "unterminated parenthesis in regexp" str) + (collect/terms))) + (else + (case (string-ref str i) + ((#\.) + (lp (+ i 1) (+ i 1) flags + (cons (if (flag-set? flags ~single-line?) 'any 'nonl) + (collect)) + st)) + ((#\?) + (let ((res (collect/single))) + (if (null? res) + (error "? can't follow empty pattern" str res) + (let ((x (car res))) + (lp (+ i 1) + (+ i 1) + flags + (cons + (if (pair? x) + (case (car x) + ((*) `(*? ,@(cdr x))) + ((+) `(**? 1 #f ,@(cdr x))) + ((?) `(?? ,@(cdr x))) + ((**) `(**? ,@(cdr x))) + ((=) `(**? ,(cadr x) ,@(cdr x))) + ((>=) `(**? ,(cadr x) #f ,@(cddr x))) + (else `(? ,x))) + `(? ,x)) + (cdr res)) + st))))) + ((#\+ #\*) + (let* ((res (collect/single)) + (x (if (pair? res) (car res) 'epsilon)) + (op (string->symbol (string (string-ref str i))))) + (cond + ((sre-repeater? x) + (error "duplicate repetition (e.g. **) in pattern" str res)) + ((sre-empty? x) + (error "can't repeat empty pattern (e.g. ()*)" str res)) + (else + (lp (+ i 1) (+ i 1) flags + (cons (list op x) (cdr res)) + st))))) + ((#\() + (cond + ((>= (+ i 1) end) + (error "unterminated parenthesis in regexp" str)) + ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case + (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save))) + ((>= (+ i 2) end) + (error "unterminated parenthesis in regexp" str)) + ((eqv? (string-ref str (+ i 1)) #\*) + (error "bad regexp syntax: (*FOO) not supported" str)) + (else ;; (?...) case + (case (string-ref str (+ i 2)) + ((#\#) + (let ((j (string-find str #\) (+ i 3)))) + (lp (+ j i) (min (+ j 1) end) flags (collect) st))) + ((#\:) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) + ((#\=) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(look-ahead) (save))) + ((#\!) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(neg-look-ahead) (save))) + ((#\<) + (cond + ((>= (+ i 3) end) + (error "unterminated parenthesis in regexp" str)) + (else + (case (string-ref str (+ i 3)) + ((#\=) + (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) + '(look-behind) (save))) + ((#\!) + (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) + '(neg-look-behind) (save))) + (else + (let ((j (and (char-alphabetic? + (string-ref str (+ i 3))) + (string-find str #\> (+ i 4))))) + (if (< j end) + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,(string->symbol (substring str (+ i 3) j)) + submatch-named) + (save)) + (error "invalid (?< sequence" str)))))))) + ((#\>) + (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) + '(atomic) (save))) + ;;((#\' #\P) ; named subpatterns + ;; ) + ;;((#\R) ; recursion + ;; ) + ((#\() + (cond + ((>= (+ i 3) end) + (error "unterminated parenthesis in regexp" str)) + ((char-numeric? (string-ref str (+ i 3))) + (let* ((j (string-find str #\) (+ i 3))) + (n (string->number (substring str (+ i 3) j)))) + (if (or (= j end) (not n)) + (error "invalid conditional reference" str) + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,n if) (save))))) + ((char-alphabetic? (string-ref str (+ i 3))) + (let ((j (string-find str #\) (+ i 3)))) + (if (= j end) + (error "invalid named conditional reference" str) + (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) + `(,(string->symbol (substring str (+ i 3) j)) if) + (save))))) + (else + (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) + '(if) (save))))) + ((#\{) + (error "unsupported Perl-style cluster" str)) + (else + (let ((old-flags flags)) + (let lp2 ((j (+ i 2)) (flags flags) (invert? #f)) + (define (join x) + ((if invert? flag-clear flag-join) flags x)) + (cond + ((>= j end) + (error "incomplete cluster" str i)) + (else + (case (string-ref str j) + ((#\i) + (lp2 (+ j 1) (join ~case-insensitive?) invert?)) + ((#\m) + (lp2 (+ j 1) (join ~multi-line?) invert?)) + ((#\x) + (lp2 (+ j 1) (join ~ignore-space?) invert?)) + ((#\-) + (lp2 (+ j 1) flags (not invert?))) + ((#\)) + (lp (+ j 1) (+ j 1) flags (collect) + st)) + ((#\:) + (lp (+ j 1) (+ j 1) flags '() + (cons (cons old-flags (collect)) st))) + (else + (error "unknown regex cluster modifier" str) + ))))))))))) + ((#\)) + (if (null? st) + (error "too many )'s in regexp" str) + (lp (+ i 1) + (+ i 1) + (caar st) + (cons (collect/terms) (cdar st)) + (cdr st)))) + ((#\[) + (apply + (lambda (sre j) + (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)) + (string-parse-cset str (+ i 1) flags))) + ((#\{) + (cond + ((or (>= (+ i 1) end) + (not (or (char-numeric? (string-ref str (+ i 1))) + (eqv? #\, (string-ref str (+ i 1)))))) + (lp (+ i 1) from flags res st)) + (else + (let ((res (collect/single))) + (cond + ((null? res) + (error "{ can't follow empty pattern")) + (else + (let* ((x (car res)) + (tail (cdr res)) + (j (string-find str #\} (+ i 1))) + (s2 (string-split (substring str (+ i 1) j) #\,)) + (n (string->number (car s2))) + (m (and (pair? (cdr s2)) + (string->number (cadr s2))))) + (cond + ((or (= j end) + (not n) + (and (pair? (cdr s2)) + (not (equal? "" (cadr s2))) + (not m))) + (error "invalid {n} repetition syntax" s2)) + ((null? (cdr s2)) + (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st)) + (m + (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st)) + (else + (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st) + ))))))))) + ((#\\) + (cond + ((>= (+ i 1) end) + (error "incomplete escape sequence" str)) + (else + (let ((c (string-ref str (+ i 1)))) + (case c + ((#\d) + (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st)) + ((#\D) + (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st)) + ((#\s) + (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st)) + ((#\S) + (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st)) + ((#\w) + (lp (+ i 2) (+ i 2) flags + `((or alphanumeric ("_")) ,@(collect)) st)) + ((#\W) + (lp (+ i 2) (+ i 2) flags + `((~ (or alphanumeric ("_"))) ,@(collect)) st)) + ((#\b) + (lp (+ i 2) (+ i 2) flags + `((or bow eow) ,@(collect)) st)) + ((#\B) + (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st)) + ((#\A) + (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st)) + ((#\Z) + (lp (+ i 2) (+ i 2) flags + `((? #\newline) eos ,@(collect)) st)) + ((#\z) + (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st)) + ((#\R) + (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st)) + ((#\K) + (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st)) + ;; these two are from Emacs and TRE, but not in PCRE + ((#\<) + (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st)) + ((#\>) + (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st)) + ((#\x) + (apply + (lambda (ch j) + (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st)) + (string-parse-hex-escape str (+ i 2) end))) + ((#\k) + (let ((c (string-ref str (+ i 2)))) + (if (not (memv c '(#\< #\{ #\'))) + (error "bad \\k usage, expected \\k<...>" str) + (let* ((terminal (char-mirror c)) + (j (string-find str terminal (+ i 2))) + (s (substring str (+ i 3) j)) + (backref + (if (flag-set? flags ~case-insensitive?) + 'backref-ci + 'backref))) + (if (= j end) + (error "unterminated named backref" str) + (lp (+ j 1) (+ j 1) flags + `((,backref ,(string->symbol s)) + ,@(collect)) + st)))))) + ((#\Q) ;; \Q..\E escapes + (let ((res (collect))) + (let lp2 ((j (+ i 2))) + (cond + ((>= j end) + (lp j (+ i 2) flags res st)) + ((eqv? #\\ (string-ref str j)) + (cond + ((>= (+ j 1) end) + (lp (+ j 1) (+ i 2) flags res st)) + ((eqv? #\E (string-ref str (+ j 1))) + (lp (+ j 2) (+ j 2) flags + (cons (substring str (+ i 2) j) res) st)) + (else + (lp2 (+ j 2))))) + (else + (lp2 (+ j 1))))))) + ;;((#\p) ; XXXX unicode properties + ;; ) + ;;((#\P) + ;; ) + (else + (cond + ((char-numeric? c) + (let* ((j (string-skip str char-numeric? (+ i 2))) + (backref + (if (flag-set? flags ~case-insensitive?) + 'backref-ci + 'backref)) + (res `((,backref ,(string->number + (substring str (+ i 1) j))) + ,@(collect)))) + (lp j j flags res st))) + ((char-alphabetic? c) + (let ((cell (assv c posix-escape-sequences))) + (if cell + (lp (+ i 2) (+ i 2) flags + (cons (cdr cell) (collect)) st) + (error "unknown escape sequence" str c)))) + (else + (lp (+ i 2) (+ i 1) flags (collect) st))))))))) + ((#\|) + (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st)) + ((#\^) + (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos))) + (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) + ((#\$) + (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos))) + (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) + ((#\space) + (if (flag-set? flags ~ignore-space?) + (lp (+ i 1) (+ i 1) flags (collect) st) + (lp (+ i 1) from flags res st))) + ((#\#) + (if (flag-set? flags ~ignore-space?) + (let ((j (string-find str #\newline (+ i 1)))) + (lp (+ j 1) (min (+ j 1) end) flags (collect) st)) + (lp (+ i 1) from flags res st))) + (else + (lp (+ i 1) from flags res st)))))))) + +(define (pcre->regexp pcre . o) + (regexp (apply pcre->sre pcre o))) diff --git a/lib/chibi/regexp/pcre.sld b/lib/chibi/regexp/pcre.sld new file mode 100644 index 00000000..795220ae --- /dev/null +++ b/lib/chibi/regexp/pcre.sld @@ -0,0 +1,7 @@ + +(define-library (chibi regexp pcre) + (export pcre->sre pcre->regexp) + (import (scheme base) (scheme char) (scheme cxr) + (srfi 1) (srfi 33) + (chibi string) (chibi regexp)) + (include "pcre.scm")) diff --git a/tests/re-tests.txt b/tests/re-tests.txt new file mode 100644 index 00000000..aea08801 --- /dev/null +++ b/tests/re-tests.txt @@ -0,0 +1,135 @@ +abc abc y & abc +abc xbc n - - +abc axc n - - +abc abx n - - +abc xabcy y & abc +abc ababc y & abc +ab*c abc y & abc +ab*bc abc y & abc +ab*bc abbc y & abbc +ab*bc abbbbc y & abbbbc +ab+bc abbc y & abbc +ab+bc abc n - - +ab+bc abq n - - +ab+bc abbbbc y & abbbbc +ab?bc abbc y & abbc +ab?bc abc y & abc +ab?bc abbbbc n - - +ab?c abc y & abc +^abc$ abc y & abc +^abc$ abcc n - - +^abc abcc y & abc +^abc$ aabc n - - +abc$ aabc y & abc +^ abc y & +$ abc y & +a.c abc y & abc +a.c axc y & axc +a.*c axyzc y & axyzc +a.*c axyzd n - - +a[bc]d abc n - - +a[bc]d abd y & abd +a[b-d]e abd n - - +a[b-d]e ace y & ace +a[b-d] aac y & ac +a[-b] a- y & a- +a[b-] a- y & a- +[k] ab n - - +a[b-a] - c - - +a[]b - c - - +a[ - c - - +a] a] y & a] +a[]]b a]b y & a]b +a[^bc]d aed y & aed +a[^bc]d abd n - - +a[^-b]c adc y & adc +a[^-b]c a-c n - - +a[^]b]c a]c n - - +a[^]b]c adc y & adc +ab|cd abc y & ab +ab|cd abcd y & ab +()ef def y &-\1 ef- +()* - c - - +*a - c - - +^* - c - - +$* - c - - +(*)b - c - - +$b b n - - +a\ - c - - +a\(b a(b y &-\1 a(b- +a\(*b ab y & ab +a\(*b a((b y & a((b +a\\b a\b y & a\b +abc) - c - - +(abc - c - - +((a)) abc y &-\1-\2 a-a-a +(a)b(c) abc y &-\1-\2 abc-a-c +a+b+c aabbabc y & abc +a** - c - - +(a*)* - c - - +(a*)+ - c - - +(a|)* - c - - +(a*|b)* - c - - +(a+|b)* ab y &-\1 ab-b +(a+|b)+ ab y &-\1 ab-b +(a+|b)? ab y &-\1 a-a +[^ab]* cde y & cde +(^)* - c - - +(ab|)* - c - - +)( - c - - + abc y & +abc n - - +a* y & +abcd abcd y &-\&-\\& abcd-&-\abcd +a(bc)d abcd y \1-\\1-\\\1 bc-\1-\bc +([abc])*d abbbcd y &-\1 abbbcd-c +([abc])*bcd abcd y &-\1 abcd-a +a|b|c|d|e e y & e +(a|b|c|d|e)f ef y &-\1 ef-e +((a*|b))* - c - - +abcd*efg abcdefg y & abcdefg +ab* xabyabbbz y & ab +ab* xayabbbz y & a +(ab|cd)e abcde y &-\1 cde-cd +[abhgefdc]ij hij y & hij +^(ab|cd)e abcde n x\1y xy +(abc|)ef abcdef y &-\1 ef- +(a|b)c*d abcd y &-\1 bcd-b +(ab|ab*)bc abc y &-\1 abc-a +(?:(a)b|ac) ac y &-\1 ac- +a([bc]*)c* abc y &-\1 abc-bc +a([bc]*)(c*d) abcd y &-\1-\2 abcd-bc-d +a([bc]+)(c*d) abcd y &-\1-\2 abcd-bc-d +a([bc]*)(c+d) abcd y &-\1-\2 abcd-b-cd +a[bcd]*dcdcde adcdcde y & adcdcde +a[bcd]+dcdcde adcdcde n - - +(ab|a)b*c abc y &-\1 abc-ab +(.*)b abc y &-\1 ab-a +((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d +((a)(b)?c)(d) abcd y \1-\2-\3-\4 abc-a-b-d +((a)(b)?c)(d) acd y \1-\2-\3-\4 ac-a--d +((aa)(bb)?cc)(dd) aaccdd y \1-\2-\3-\4 aacc-aa--dd +[ -~]* abc y & abc +[ -~ -~]* abc y & abc +[ -~ -~ -~]* abc y & abc +[ -~ -~ -~ -~]* abc y & abc +[ -~ -~ -~ -~ -~]* abc y & abc +[ -~ -~ -~ -~ -~ -~]* abc y & abc +[ -~ -~ -~ -~ -~ -~ -~]* abc y & abc +[a-zA-Z_][a-zA-Z0-9_]* alpha y & alpha +^a(bc+|b[eh])g|.h$ abh y &-\1 bh- +(bc+d$|ef*g.|h?i(j|k)) effgz y &-\1-\2 effgz-effgz- +(bc+d$|ef*g.|h?i(j|k)) ij y &-\1-\2 ij-ij-j +(bc+d$|ef*g.|h?i(j|k)) effg n - - +(bc+d$|ef*g.|h?i(j|k)) bcdd n - - +(bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz- +((((((((((a))))))))) - c - - +((((((((((a)))))))))) a y &-\10 a-a +(((((((((a))))))))) a y & a +multiple words of text uh-uh n - - +multiple words multiple words, yeah y & multiple words +(.*)c(.*) abcde y &-\1-\2 abcde-ab-de +\((.*), (.*)\) (a, b) y (\2, \1) (b, a) +(we|wee|week)(knights|night) weeknights y &-\1-\2 weeknights-wee-knights +(a([^a])*)* abcaBC y &-\1-\2 abcaBC-aBC-C +a([\/\\]*)b a//\\b y &-\1 a//\\b-//\\ diff --git a/tests/regexp-tests.scm b/tests/regexp-tests.scm new file mode 100644 index 00000000..5747d29b --- /dev/null +++ b/tests/regexp-tests.scm @@ -0,0 +1,214 @@ + +(import (chibi) (chibi regexp) (chibi regexp pcre) + (chibi string) (chibi io) (chibi match) (chibi test)) + +(define (regexp-match->sexp rx str . o) + (let ((res (apply regexp-match rx str o))) + (and res (rx-match->sexp res str)))) + +(define-syntax test-re + (syntax-rules () + ((test-re res rx str start end) + (test res (regexp-match->sexp rx str start end))) + ((test-re res rx str start) + (test-re res rx str start (string-cursor-end str))) + ((test-re res rx str) + (test-re res rx str (string-cursor-start str))))) + +(define (regexp-search->sexp rx str . o) + (let ((res (apply regexp-search rx str o))) + (and res (rx-match->sexp res str)))) + +(define-syntax test-re-search + (syntax-rules () + ((test-re-search res rx str start end) + (test res (regexp-search->sexp rx str start end))) + ((test-re-search res rx str start) + (test-re-search res rx str start (string-cursor-end str))) + ((test-re-search res rx str) + (test-re-search res rx str (string-cursor-start str))))) + +(test-begin "regexp") + +(test-re '("ababc" "abab") + '(: ($ (* "ab")) "c") + "ababc") + +(test-re '("ababc" "abab") + '(: ($ (* "ab")) "c") + "xababc" + 1) + +(test-re-search '("y") '(: "y") "xy") + +(test-re-search '("ababc" "abab") + '(: ($ (* "ab")) "c") + "xababc") + +(test-re #f + '(: (* any) ($ "foo" (* any)) ($ "bar" (* any))) + "fooxbafba") + +(test-re '("fooxbarfbar" "fooxbarf" "bar") + '(: (* any) ($ "foo" (* any)) ($ "bar" (* any))) + "fooxbarfbar") + +(test-re '("abcd" "abcd") + '($ (* (or "ab" "cd"))) + "abcd") + +;; first match is a list of ab's, second match is the last (temporary) cd +(test-re '("abcdc" (("ab") ("cd")) "cd") + '(: (* (*$ (or "ab" "cd"))) "c") + "abcdc") + +(test-re '("ababc" "abab") + '(: bos ($ (* "ab")) "c") + "ababc") +(test-re '("ababc" "abab") + '(: ($ (* "ab")) "c" eos) + "ababc") +(test-re '("ababc" "abab") + '(: bos ($ (* "ab")) "c" eos) + "ababc") +(test-re #f + '(: bos ($ (* "ab")) eos "c") + "ababc") +(test-re #f + '(: ($ (* "ab")) bos "c" eos) + "ababc") + +(test-re '("ababc" "abab") + '(: bol ($ (* "ab")) "c") + "ababc") +(test-re '("ababc" "abab") + '(: ($ (* "ab")) "c" eol) + "ababc") +(test-re '("ababc" "abab") + '(: bol ($ (* "ab")) "c" eol) + "ababc") +(test-re #f + '(: bol ($ (* "ab")) eol "c") + "ababc") +(test-re #f + '(: ($ (* "ab")) bol "c" eol) + "ababc") +(test-re '("\nabc\n" "abc") + '(: (* #\newline) bol ($ (* alpha)) eol (* #\newline)) + "\nabc\n") +(test-re #f + '(: (* #\newline) bol ($ (* alpha)) eol (* #\newline)) + "\n'abc\n") +(test-re #f + '(: (* #\newline) bol ($ (* alpha)) eol (* #\newline)) + "\nabc.\n") + +(test-re '("ababc" "abab") + '(: bow ($ (* "ab")) "c") + "ababc") +(test-re '("ababc" "abab") + '(: ($ (* "ab")) "c" eow) + "ababc") +(test-re '("ababc" "abab") + '(: bow ($ (* "ab")) "c" eow) + "ababc") +(test-re #f + '(: bow ($ (* "ab")) eow "c") + "ababc") +(test-re #f + '(: ($ (* "ab")) bow "c" eow) + "ababc") +(test-re '(" abc " "abc") + '(: (* space) bow ($ (* alpha)) eow (* space)) + " abc ") +(test-re #f + '(: (* space) bow ($ (* alpha)) eow (* space)) + " 'abc ") +(test-re #f + '(: (* space) bow ($ (* alpha)) eow (* space)) + " abc. ") + +(test-re '("beef") + '(* (/"af")) + "beef") + +(test-re '("12345beef" "beef") + '(: (* digit) ($ (* (/"af")))) + "12345beef") + +(test-re '("12345BeeF" "BeeF") + '(: (* digit) (w/nocase ($ (* (/"af"))))) + "12345BeeF") + +(test '("123" "456" "789") (regexp-extract '(+ digit) "abc123def456ghi789")) +(test '("123" "456" "789") (regexp-extract '(* digit) "abc123def456ghi789")) +(test '("abc" "def" "ghi") (regexp-split '(+ digit) "abc123def456ghi789")) +(test '("a" "b" "c" "d" "e" "f" "g" "h" "i") + (regexp-split '(* digit) "abc123def456ghi789")) + +(test "abc def" (regexp-replace '(+ space) "abc \t\n def" " ")) +(test " abc-abc" + (regexp-replace '(: ($ (+ alpha)) ":" (* space)) " abc: " '(1 "-" 1))) +(test " abc- abc" + (regexp-replace '(: ($ (+ alpha)) ":" (* space)) " abc: " '(1 "-" pre 1))) + +(test " abc d ef " (regexp-replace-all '(+ space) " abc \t\n d ef " " ")) + +(define (subst-matches matches input subst) + (define (submatch n) + (rx-match-submatch matches input n)) + (and + matches + (call-with-output-string + (lambda (out) + (call-with-input-string subst + (lambda (in) + (let lp () + (let ((c (read-char in))) + (cond + ((not (eof-object? c)) + (case c + ((#\&) + (display (or (submatch 0) "") out)) + ((#\\) + (let ((c (read-char in))) + (if (char-numeric? c) + (let lp ((res (list c))) + (if (and (char? (peek-char in)) + (char-numeric? (peek-char in))) + (lp (cons (read-char in) res)) + (display + (or (submatch (string->number + (list->string (reverse res)))) + "") + out))) + (write-char c out)))) + (else + (write-char c out))) + (lp))))))))))) + +(define (test-pcre line) + (match (string-split line #\tab) + ((pattern input result subst output) + (let ((name (string-append pattern " " input " " result " " subst))) + (cond + ((equal? "c" result) + (test-error name (regexp-search (pcre->sre pattern) input))) + ((equal? "n" result) + (test-assert name (not (regexp-search (pcre->sre pattern) input)))) + (else + (test name output + (subst-matches (regexp-search (pcre->sre pattern) input) + input + subst)))))) + (else + (error "invalid regex test line" line)))) + +(test-group "pcre" + (call-with-input-file "tests/re-tests.txt" + (lambda (in) + (for-each + (lambda (line) (test-pcre line)) + (port->list read-line in))))) + +(test-end)