mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-06-21 05:16:42 +02:00
Initial version of (chibi regexp).
This commit is contained in:
parent
c3cb2fbdbb
commit
205c60a807
6 changed files with 1793 additions and 0 deletions
795
lib/chibi/regexp.scm
Normal file
795
lib/chibi/regexp.scm
Normal file
|
@ -0,0 +1,795 @@
|
||||||
|
;; regexp.scm -- simple non-bactracking NFA implementation
|
||||||
|
;; Copyright (c) 2013 Alex Shinn. All rights reserved.
|
||||||
|
;; BSD-style license: http://synthcode.com/license.txt
|
||||||
|
|
||||||
|
;;; An rx represents a start state and meta-info such as the number
|
||||||
|
;;; and names of submatches.
|
||||||
|
(define-record-type Rx
|
||||||
|
(make-rx start-state num-matches num-save-indexes match-rules match-names)
|
||||||
|
regexp?
|
||||||
|
(start-state rx-start-state rx-start-state-set!)
|
||||||
|
(num-matches rx-num-matches rx-num-matches-set!)
|
||||||
|
(num-save-indexes rx-num-save-indexes rx-num-save-indexes-set!)
|
||||||
|
(match-rules rx-rules rx-rules-set!)
|
||||||
|
(match-names rx-names rx-names-set!))
|
||||||
|
|
||||||
|
;;; A state is a single nfa state with transition rules.
|
||||||
|
(define-record-type State
|
||||||
|
(%make-state accept? chars match match-rule next1 next2)
|
||||||
|
state?
|
||||||
|
;; A boolean indicating if this is an accepting state.
|
||||||
|
(accept? state-accept? state-accept?-set!)
|
||||||
|
;; A char or char-set indicating when we can transition.
|
||||||
|
;; Alternately, #f indicates an epsilon transition, while a
|
||||||
|
;; procedure of the form (lambda (ch i matches) ...) is a predicate
|
||||||
|
;; which should return #t if the char matches.
|
||||||
|
(chars state-chars state-chars-set!)
|
||||||
|
;; A single integer indicating the match position to record.
|
||||||
|
(match state-match state-match-set!)
|
||||||
|
;; The rule for merging ambiguous matches. Can be any of: left,
|
||||||
|
;; right, (list i j). Posix semantics are equivalent to using left
|
||||||
|
;; for the beginning of a submatch and right for the end. List is
|
||||||
|
;; used to capture a list of submatch data in the current match.
|
||||||
|
(match-rule state-match-rule state-match-rule-set!)
|
||||||
|
;; The destination if the char match succeeds.
|
||||||
|
(next1 state-next1 state-next1-set!)
|
||||||
|
;; An optional additional transition used for forking to two states.
|
||||||
|
(next2 state-next2 state-next2-set!))
|
||||||
|
|
||||||
|
(define (make-state accept? chars match match-rule next1 next2)
|
||||||
|
(if (and next1 (not (state? next1)))
|
||||||
|
(error "expected a state" next1))
|
||||||
|
(if (and next2 (not (state? next2)))
|
||||||
|
(error "expected a state" next2))
|
||||||
|
(%make-state accept? chars match match-rule next1 next2))
|
||||||
|
|
||||||
|
(define (char-set-ci cset)
|
||||||
|
(let ((res (char-set)))
|
||||||
|
(for-each
|
||||||
|
(lambda (ch)
|
||||||
|
(char-set-adjoin! res (char-upcase ch))
|
||||||
|
(char-set-adjoin! res (char-downcase ch)))
|
||||||
|
(char-set->list cset))
|
||||||
|
res))
|
||||||
|
|
||||||
|
(define (make-char-state ch flags next)
|
||||||
|
(if (= ~ci? (bitwise-and ~ci? flags))
|
||||||
|
(let ((cset (cond ((char? ch) (char-set-ci (char-set ch)))
|
||||||
|
((char-set? ch) (char-set-ci ch))
|
||||||
|
(else ch))))
|
||||||
|
(make-state #f cset #f #f next #f))
|
||||||
|
(make-state #f ch #f #f next #f)))
|
||||||
|
(define (make-fork-state next1 next2)
|
||||||
|
(make-state #f #f #f #f next1 next2))
|
||||||
|
(define (make-epsilon-state next)
|
||||||
|
(make-fork-state next #f))
|
||||||
|
(define (make-accept-state)
|
||||||
|
(make-state #t #f #f #f #f #f))
|
||||||
|
|
||||||
|
;; A record holding the current match data - essentially a wrapper
|
||||||
|
;; around a vector, plus a reference to the RX for meta-info.
|
||||||
|
(define-record-type Rx-Match
|
||||||
|
(%make-rx-match matches rx)
|
||||||
|
rx-match?
|
||||||
|
(matches rx-match-matches rx-match-matches-set!)
|
||||||
|
(rx rx-match-rx))
|
||||||
|
|
||||||
|
(define (rx-match-rules md)
|
||||||
|
(rx-rules (rx-match-rx md)))
|
||||||
|
(define (rx-match-names md)
|
||||||
|
(rx-names (rx-match-rx md)))
|
||||||
|
(define (make-rx-match len rx)
|
||||||
|
(%make-rx-match (make-vector len #f) rx))
|
||||||
|
(define (make-rx-match-for-rx rx)
|
||||||
|
(make-rx-match (rx-num-save-indexes rx) rx))
|
||||||
|
(define (rx-match-num-matches md)
|
||||||
|
(vector-length (rx-match-matches md)))
|
||||||
|
|
||||||
|
(define (rx-match-name-offset md name)
|
||||||
|
(cond ((assq name (rx-match-names md)) => cdr)
|
||||||
|
(else (error "unknown match name" md name))))
|
||||||
|
|
||||||
|
(define (rx-match-ref md n)
|
||||||
|
(vector-ref (rx-match-matches md)
|
||||||
|
(if (integer? n)
|
||||||
|
n
|
||||||
|
(rx-match-name-offset md n))))
|
||||||
|
|
||||||
|
(define (rx-match-set! md n val)
|
||||||
|
(vector-set! (rx-match-matches md) n val))
|
||||||
|
|
||||||
|
(define (copy-rx-match md)
|
||||||
|
(let* ((src (rx-match-matches md))
|
||||||
|
(len (vector-length src))
|
||||||
|
(dst (make-vector len #f)))
|
||||||
|
(do ((i 0 (+ i 1)))
|
||||||
|
((= i len) (%make-rx-match dst (rx-match-rx md)))
|
||||||
|
(vector-set! dst i (vector-ref src i)))))
|
||||||
|
|
||||||
|
;;> Returns the matching result for the given named or indexed
|
||||||
|
;;> submatch \var{n}, possibly as a list for a submatch-list, or
|
||||||
|
;;> \scheme{#f} if not matched.
|
||||||
|
|
||||||
|
(define (rx-match-submatch/list md str n)
|
||||||
|
(let ((n (if (integer? n) n (rx-match-name-offset md n))))
|
||||||
|
(cond
|
||||||
|
((>= n (vector-length (rx-match-rules md)))
|
||||||
|
#f)
|
||||||
|
(else
|
||||||
|
(let ((rule (vector-ref (rx-match-rules md) n)))
|
||||||
|
(cond
|
||||||
|
((pair? rule)
|
||||||
|
(let ((start (rx-match-ref md (car rule)))
|
||||||
|
(end (rx-match-ref md (cdr rule))))
|
||||||
|
(and start end (substring-cursor str start end))))
|
||||||
|
(else
|
||||||
|
(let ((res (rx-match-ref md rule)))
|
||||||
|
(if (pair? res)
|
||||||
|
(reverse res)
|
||||||
|
res)))))))))
|
||||||
|
|
||||||
|
;;> Returns the matching substring for the given named or indexed
|
||||||
|
;;> submatch \var{n}, or \scheme{#f} if not matched.
|
||||||
|
|
||||||
|
(define (rx-match-submatch md str n)
|
||||||
|
(let ((res (rx-match-submatch/list md str n)))
|
||||||
|
(if (pair? res) (car res) res)))
|
||||||
|
|
||||||
|
(define (rx-match-submatch-start+end md str n)
|
||||||
|
(let ((n (if (string-cursor? n) n (rx-match-name-offset md n))))
|
||||||
|
(and (< n (vector-length (rx-match-rules md)))
|
||||||
|
(let ((rule (vector-ref (rx-match-rules md) n)))
|
||||||
|
(if (pair? rule)
|
||||||
|
(let ((start (rx-match-ref md (car rule)))
|
||||||
|
(end (rx-match-ref md (cdr rule))))
|
||||||
|
(and start end
|
||||||
|
(cons (string-offset->index str start)
|
||||||
|
(string-offset->index str end))))
|
||||||
|
#f)))))
|
||||||
|
|
||||||
|
;;> Returns the start index within \var{str} for the given named or
|
||||||
|
;;> indexed submatch \var{n}, or \scheme{#f} if not matched.
|
||||||
|
|
||||||
|
(define (rx-match-submatch-start md str n)
|
||||||
|
(cond ((rx-match-submatch-start+end md str n) => car) (else #f)))
|
||||||
|
|
||||||
|
;;> Returns the end index within \var{str} for the given named or
|
||||||
|
;;> indexed submatch \var{n}, or \scheme{#f} if not matched.
|
||||||
|
|
||||||
|
(define (rx-match-submatch-end md str n)
|
||||||
|
(cond ((rx-match-submatch-start+end md str n) => cdr) (else #f)))
|
||||||
|
|
||||||
|
(define (rx-match-convert recurse? md str)
|
||||||
|
(cond
|
||||||
|
((vector? md)
|
||||||
|
(let lp ((i 0) (res '()))
|
||||||
|
(cond
|
||||||
|
((>= i (vector-length md))
|
||||||
|
(reverse res))
|
||||||
|
((string-cursor? (vector-ref md i))
|
||||||
|
(lp (+ i 2)
|
||||||
|
(cons (substring-cursor str
|
||||||
|
(vector-ref md i)
|
||||||
|
(vector-ref md (+ i 1)))
|
||||||
|
res)))
|
||||||
|
(else
|
||||||
|
(lp (+ i 1)
|
||||||
|
(cons (rx-match-convert recurse? (vector-ref md i) str) res))))))
|
||||||
|
((list? md)
|
||||||
|
(if recurse?
|
||||||
|
(map (lambda (x) (rx-match-convert recurse? x str)) (reverse md))
|
||||||
|
(rx-match-convert recurse? (car md) str)))
|
||||||
|
((and (pair? md) (string-cursor? (car md)) (string-cursor? (cdr md)))
|
||||||
|
(substring-cursor str (car md) (cdr md)))
|
||||||
|
((rx-match? md)
|
||||||
|
(rx-match-convert recurse? (rx-match-matches md) str))
|
||||||
|
(else
|
||||||
|
md)))
|
||||||
|
|
||||||
|
;;> Convert an rx-match result to a list of submatches, beginning
|
||||||
|
;;> with the full match, using \scheme{#f} for unmatched submatches.
|
||||||
|
|
||||||
|
(define (rx-match->list md str)
|
||||||
|
(rx-match-convert #f md str))
|
||||||
|
|
||||||
|
;;> Convert an rx-match result to a forest of submatches, beginning
|
||||||
|
;;> with the full match, using \scheme{#f} for unmatched submatches.
|
||||||
|
|
||||||
|
(define (rx-match->sexp md str)
|
||||||
|
(rx-match-convert #t md str))
|
||||||
|
|
||||||
|
;; Collect results from a list match.
|
||||||
|
(define (match-collect md spec)
|
||||||
|
(define (match-extract md n)
|
||||||
|
(let* ((vec (rx-match-matches md))
|
||||||
|
(rules (rx-match-rules md))
|
||||||
|
(n-rule (vector-ref rules n))
|
||||||
|
(rule (vector-ref rules n-rule)))
|
||||||
|
(if (pair? rule)
|
||||||
|
(let ((start (rx-match-ref md (car rule)))
|
||||||
|
(end (rx-match-ref md (cdr rule))))
|
||||||
|
(and start end (cons start end)))
|
||||||
|
(rx-match-ref md rule))))
|
||||||
|
(let ((end (cadr spec))
|
||||||
|
(vec (rx-match-matches md)))
|
||||||
|
(let lp ((i (+ 1 (car spec)))
|
||||||
|
(ls '()))
|
||||||
|
(if (>= i end)
|
||||||
|
(reverse ls)
|
||||||
|
(lp (+ i 1) (cons (match-extract md i) ls))))))
|
||||||
|
|
||||||
|
;; A searcher represents a single rx state and match information.
|
||||||
|
(define-record-type Searcher
|
||||||
|
(make-searcher state matches)
|
||||||
|
searcher?
|
||||||
|
(state searcher-state searcher-state-set!)
|
||||||
|
(matches searcher-matches searcher-matches-set!))
|
||||||
|
|
||||||
|
;; Merge two rx-matches, preferring the leftmost-longest of their
|
||||||
|
;; matches.
|
||||||
|
(define (rx-match>=? m1 m2)
|
||||||
|
(let ((end (- (vector-length (rx-match-matches m1)) 1)))
|
||||||
|
(let lp ((i 0))
|
||||||
|
(cond
|
||||||
|
((>= i end)
|
||||||
|
#t)
|
||||||
|
((and (eqv? (rx-match-ref m1 i) (rx-match-ref m2 i))
|
||||||
|
(eqv? (rx-match-ref m1 (+ i 1)) (rx-match-ref m2 (+ i 1))))
|
||||||
|
(lp (+ i 2)))
|
||||||
|
((and (string-cursor? (rx-match-ref m2 i))
|
||||||
|
(string-cursor? (rx-match-ref m2 (+ i 1)))
|
||||||
|
(or (not (string-cursor? (rx-match-ref m1 i)))
|
||||||
|
(not (string-cursor? (rx-match-ref m1 (+ i 1))))
|
||||||
|
(string-cursor<? (rx-match-ref m2 i) (rx-match-ref m1 i))
|
||||||
|
(and (string-cursor=? (rx-match-ref m2 i) (rx-match-ref m1 i))
|
||||||
|
(string-cursor>? (rx-match-ref m2 (+ i 1))
|
||||||
|
(rx-match-ref m1 (+ i 1))))))
|
||||||
|
#f)
|
||||||
|
(else
|
||||||
|
#t)))))
|
||||||
|
|
||||||
|
(define (rx-match-max m1 m2)
|
||||||
|
(if (rx-match>=? m1 m2) m1 m2))
|
||||||
|
|
||||||
|
;; Merge match data from sr2 into sr1, preferring the leftmost-longest
|
||||||
|
;; match in the event of a conflict.
|
||||||
|
(define (searcher-merge! sr1 sr2)
|
||||||
|
(let ((m (rx-match-max (searcher-matches sr1) (searcher-matches sr2))))
|
||||||
|
(searcher-matches-set! sr1 m)))
|
||||||
|
|
||||||
|
(define (searcher-max sr1 sr2)
|
||||||
|
(if (or (not (searcher? sr2))
|
||||||
|
(rx-match>=? (searcher-matches sr1) (searcher-matches sr2)))
|
||||||
|
sr1
|
||||||
|
sr2))
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; A posse is a group of searchers.
|
||||||
|
|
||||||
|
(define (make-posse . o)
|
||||||
|
(make-hash-table eq?))
|
||||||
|
|
||||||
|
(define posse? hash-table?)
|
||||||
|
(define (posse-empty? posse) (zero? (hash-table-size posse)))
|
||||||
|
|
||||||
|
(define (posse-ref posse sr)
|
||||||
|
(hash-table-ref/default posse (searcher-state sr) #f))
|
||||||
|
(define (posse-add! posse sr)
|
||||||
|
(hash-table-set! posse (searcher-state sr) sr))
|
||||||
|
(define (posse-clear! posse)
|
||||||
|
(hash-table-walk posse (lambda (key val) (hash-table-delete! posse key))))
|
||||||
|
(define (posse-for-each proc posse)
|
||||||
|
(hash-table-walk posse (lambda (key val) (proc val))))
|
||||||
|
|
||||||
|
(define (posse->list posse)
|
||||||
|
(hash-table-values posse))
|
||||||
|
(define (list->posse ls)
|
||||||
|
(let ((searchers (make-posse)))
|
||||||
|
(for-each (lambda (sr) (posse-add! searchers sr)) ls)
|
||||||
|
searchers))
|
||||||
|
(define (posse . args)
|
||||||
|
(list->posse args))
|
||||||
|
|
||||||
|
(define (make-start-searcher rx)
|
||||||
|
(make-searcher (rx-start-state rx) (make-rx-match-for-rx rx)))
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; Execution
|
||||||
|
|
||||||
|
;; A transition which doesn't advance the index.
|
||||||
|
|
||||||
|
(define (epsilon-state? st)
|
||||||
|
(or (not (state-chars st))
|
||||||
|
(procedure? (state-chars st))))
|
||||||
|
|
||||||
|
;; Match the state against a char and index.
|
||||||
|
|
||||||
|
(define (state-matches? st str i ch start end matches)
|
||||||
|
(let ((matcher (state-chars st)))
|
||||||
|
(cond
|
||||||
|
((char? matcher)
|
||||||
|
(eqv? matcher ch))
|
||||||
|
((char-set? matcher)
|
||||||
|
(char-set-contains? matcher ch))
|
||||||
|
((pair? matcher)
|
||||||
|
(and (char<=? (car matcher) ch) (char<=? ch (cdr matcher))))
|
||||||
|
((procedure? matcher)
|
||||||
|
(matcher str i ch start end matches))
|
||||||
|
((not matcher))
|
||||||
|
(else
|
||||||
|
(error "unknown state matcher" (state-chars st))))))
|
||||||
|
|
||||||
|
;; Advance epsilons together - if the State is newly added to the
|
||||||
|
;; group and is an epsilon state, recursively add the transition.
|
||||||
|
|
||||||
|
(define (posse-advance! new seen accept sr str i start end)
|
||||||
|
(let advance! ((sr sr))
|
||||||
|
(let ((st (searcher-state sr)))
|
||||||
|
;; Update match data.
|
||||||
|
(cond
|
||||||
|
((state-match st)
|
||||||
|
(let ((index (state-match st))
|
||||||
|
(matches (searcher-matches sr)))
|
||||||
|
(cond
|
||||||
|
((pair? index)
|
||||||
|
;; Submatch list, accumulate and push.
|
||||||
|
(let* ((prev (rx-match-ref matches (car index)))
|
||||||
|
(new (cons (match-collect matches (cdr index))
|
||||||
|
(if (pair? prev) prev '()))))
|
||||||
|
(rx-match-set! matches (car index) new)))
|
||||||
|
(else
|
||||||
|
(rx-match-set! matches index i))))))
|
||||||
|
;; Follow transitions.
|
||||||
|
(cond
|
||||||
|
((state-accept? st)
|
||||||
|
(set-cdr! accept (searcher-max sr (cdr accept))))
|
||||||
|
((posse-ref seen sr)
|
||||||
|
=> (lambda (sr-prev) (searcher-merge! sr-prev sr)))
|
||||||
|
((epsilon-state? st)
|
||||||
|
(let ((ch (and (string-cursor<? i end) (string-cursor-ref str i))))
|
||||||
|
;; Epsilon transition. If there is a procedure matcher,
|
||||||
|
;; it's a guarded epsilon and needs to be checked.
|
||||||
|
(cond
|
||||||
|
((state-matches? st str i ch start end (searcher-matches sr))
|
||||||
|
(posse-add! seen sr)
|
||||||
|
(let ((next1 (state-next1 st))
|
||||||
|
(next2 (state-next2 st)))
|
||||||
|
(cond
|
||||||
|
(next1
|
||||||
|
(searcher-state-set! sr next1)
|
||||||
|
(advance! sr)))
|
||||||
|
(cond
|
||||||
|
(next2
|
||||||
|
(let ((sr2 (make-searcher
|
||||||
|
next2
|
||||||
|
(copy-rx-match (searcher-matches sr)))))
|
||||||
|
(advance! sr2)))))))))
|
||||||
|
;; Non-special, non-epsilon searcher, add to posse.
|
||||||
|
((posse-ref new sr)
|
||||||
|
;; Merge rx-match for existing searcher.
|
||||||
|
=> (lambda (sr-prev) (searcher-merge! sr-prev sr)))
|
||||||
|
(else
|
||||||
|
;; Add new searcher.
|
||||||
|
(posse-add! new sr))))))
|
||||||
|
|
||||||
|
;; Run so long as there is more to match.
|
||||||
|
|
||||||
|
(define (regexp-run search? rx str . o)
|
||||||
|
(let* ((start (string-start-arg str o))
|
||||||
|
(end (string-end-arg str (if (pair? o) (cdr o) o)))
|
||||||
|
(rx (regexp rx))
|
||||||
|
(epsilons (posse))
|
||||||
|
(accept (list #f)))
|
||||||
|
(let lp ((i start)
|
||||||
|
(searchers1 (posse))
|
||||||
|
(searchers2 (posse)))
|
||||||
|
;; Advance initial epsilons once from the first index, or every
|
||||||
|
;; time when searching.
|
||||||
|
(cond
|
||||||
|
((or search? (string-cursor=? i start))
|
||||||
|
(posse-advance! searchers1 epsilons accept (make-start-searcher rx)
|
||||||
|
str i start end)
|
||||||
|
(posse-clear! epsilons)))
|
||||||
|
(cond
|
||||||
|
((or (string-cursor>=? i end)
|
||||||
|
(and (or (not search?) (searcher? (cdr accept)))
|
||||||
|
(posse-empty? searchers1)))
|
||||||
|
;; Terminate when the string is done or there are no more
|
||||||
|
;; searchers. If we terminate prematurely and are not
|
||||||
|
;; searching, return false.
|
||||||
|
(and (or search? (string-cursor>=? i end))
|
||||||
|
(searcher? (cdr accept))
|
||||||
|
(searcher-matches (cdr accept))))
|
||||||
|
(else
|
||||||
|
;; Otherwise advance normally.
|
||||||
|
(let ((ch (string-cursor-ref str i))
|
||||||
|
(i2 (string-cursor-next str i)))
|
||||||
|
(posse-for-each ;; NOTE: non-deterministic from hash order
|
||||||
|
(lambda (sr)
|
||||||
|
(cond
|
||||||
|
((state-matches? (searcher-state sr) str i ch
|
||||||
|
start end (searcher-matches sr))
|
||||||
|
(searcher-state-set! sr (state-next1 (searcher-state sr)))
|
||||||
|
;; Epsilons are considered at the next position.
|
||||||
|
(posse-advance! searchers2 epsilons accept sr str i2 start end)
|
||||||
|
(posse-clear! epsilons))))
|
||||||
|
searchers1)
|
||||||
|
(posse-clear! searchers1)
|
||||||
|
(lp i2 searchers2 searchers1)))))))
|
||||||
|
|
||||||
|
;;> Match the given regexp or SRE against the entire string and return
|
||||||
|
;;> the match data on success. Returns \scheme{#f} on failure.
|
||||||
|
|
||||||
|
(define (regexp-match rx str . o)
|
||||||
|
(apply regexp-run #f rx str o))
|
||||||
|
|
||||||
|
;;> Match the given regexp or SRE against the entire string and return
|
||||||
|
;;> the \scheme{#t} on success. Returns \scheme{#f} on failure.
|
||||||
|
|
||||||
|
(define (regexp-match? rx str . o)
|
||||||
|
(and (apply regexp-match rx str o) #t))
|
||||||
|
|
||||||
|
;;> Search for the given regexp or SRE within string and return
|
||||||
|
;;> the match data on success. Returns \scheme{#f} on failure.
|
||||||
|
|
||||||
|
(define (regexp-search rx str . o)
|
||||||
|
(apply regexp-run #t rx str o))
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; Compiling
|
||||||
|
|
||||||
|
(define ~none 0)
|
||||||
|
(define ~ci? 1)
|
||||||
|
|
||||||
|
(define (parse-flags ls)
|
||||||
|
(define (symbol->flag s)
|
||||||
|
(case s ((i ci case-insensitive) ~ci?) (else ~none)))
|
||||||
|
(let lp ((ls ls) (res ~none))
|
||||||
|
(if (not (pair? ls))
|
||||||
|
res
|
||||||
|
(lp (cdr ls) (bitwise-ior res (symbol->flag (car ls)))))))
|
||||||
|
|
||||||
|
(define char-set:nonl
|
||||||
|
(char-set-difference char-set:full (char-set #\newline)))
|
||||||
|
(define char-set:control (ucs-range->char-set 0 32))
|
||||||
|
(define char-set:word-constituent
|
||||||
|
(char-set-union char-set:letter char-set:digit (char-set #\_)))
|
||||||
|
(define (char-word-constituent? ch)
|
||||||
|
(char-set-contains? char-set:word-constituent ch))
|
||||||
|
|
||||||
|
(define (match/bos str i ch start end matches)
|
||||||
|
(string-cursor=? i start))
|
||||||
|
(define (match/eos str i ch start end matches)
|
||||||
|
(string-cursor>=? i end))
|
||||||
|
(define (match/bol str i ch start end matches)
|
||||||
|
(or (string-cursor=? i start)
|
||||||
|
(eqv? #\newline (string-cursor-ref str (string-cursor-prev str i)))))
|
||||||
|
(define (match/eol str i ch start end matches)
|
||||||
|
(or (string-cursor>=? i end)
|
||||||
|
(eqv? #\newline (string-cursor-ref str i))))
|
||||||
|
(define (match/bow str i ch start end matches)
|
||||||
|
(and (string-cursor<? i end)
|
||||||
|
(or (string-cursor=? i start)
|
||||||
|
(not (char-word-constituent?
|
||||||
|
(string-cursor-ref str (string-cursor-prev str i)))))
|
||||||
|
(char-word-constituent? ch)))
|
||||||
|
(define (match/eow str i ch start end matches)
|
||||||
|
(and (or (string-cursor>=? i end)
|
||||||
|
(not (char-word-constituent? ch)))
|
||||||
|
(string-cursor>? i start)
|
||||||
|
(char-word-constituent?
|
||||||
|
(string-cursor-ref str (string-cursor-prev str i)))))
|
||||||
|
|
||||||
|
(define (lookup-char-set name)
|
||||||
|
(case name
|
||||||
|
((any) char-set:full)
|
||||||
|
((nonl) char-set:nonl)
|
||||||
|
((lower-case lower) char-set:lower-case)
|
||||||
|
((upper-case upper) char-set:upper-case)
|
||||||
|
((alphabetic alpha) char-set:letter)
|
||||||
|
((numeric num digit) char-set:digit)
|
||||||
|
((alphanumeric alphanum alnum) char-set:letter+digit)
|
||||||
|
((punctuation punct) char-set:punctuation)
|
||||||
|
((graphic graph) char-set:graphic)
|
||||||
|
((whitespace white space) char-set:whitespace)
|
||||||
|
((printing print) char-set:printing)
|
||||||
|
((control cntrl) char-set:control)
|
||||||
|
((hex-digit xdigit hex) char-set:hex-digit)
|
||||||
|
((blank) char-set:blank)
|
||||||
|
((ascii) char-set:ascii)
|
||||||
|
(else #f)))
|
||||||
|
|
||||||
|
(define (sre-flatten-ranges orig-ls)
|
||||||
|
(let lp ((ls orig-ls) (res '()))
|
||||||
|
(cond
|
||||||
|
((null? ls)
|
||||||
|
(reverse res))
|
||||||
|
((string? (car ls))
|
||||||
|
(lp (append (string->list (car ls)) (cdr ls)) res))
|
||||||
|
((null? (cdr ls))
|
||||||
|
(error "unbalanced cset / range" orig-ls))
|
||||||
|
((string? (cadr ls))
|
||||||
|
(lp (cons (car ls) (append (string->list (cadr ls)) (cddr ls))) res))
|
||||||
|
(else
|
||||||
|
(lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))))
|
||||||
|
|
||||||
|
(define (sre->char-set sre)
|
||||||
|
(cond
|
||||||
|
((lookup-char-set sre))
|
||||||
|
((char-set? sre) sre)
|
||||||
|
((char? sre) (char-set sre))
|
||||||
|
((pair? sre)
|
||||||
|
(if (string? (car sre))
|
||||||
|
(string->char-set (car sre))
|
||||||
|
(case (car sre)
|
||||||
|
((/) (sre->char-set
|
||||||
|
`(or ,@(map (lambda (x)
|
||||||
|
(ucs-range->char-set
|
||||||
|
(char->integer (car x))
|
||||||
|
(+ 1 (char->integer (cdr x)))))
|
||||||
|
(sre-flatten-ranges (cdr sre))))))
|
||||||
|
((& and) (apply char-set-intersection (map sre->char-set (cdr sre))))
|
||||||
|
((|\|| or) (apply char-set-union (map sre->char-set (cdr sre))))
|
||||||
|
((~ not) (char-set-complement (sre->char-set `(or ,@(cdr sre)))))
|
||||||
|
((-) (char-set-difference (sre->char-set (cadr sre))
|
||||||
|
(sre->char-set `(or ,@(cddr sre)))))
|
||||||
|
(else (error "invalid sre char-set" sre)))))
|
||||||
|
(else (error "invalid sre char-set" sre))))
|
||||||
|
|
||||||
|
;;> Compile an \var{sre} into a regexp.
|
||||||
|
|
||||||
|
(define (regexp sre . o)
|
||||||
|
(define current-index 2)
|
||||||
|
(define current-match 0)
|
||||||
|
(define match-names '())
|
||||||
|
(define match-rules (list (cons 0 1)))
|
||||||
|
(define (make-submatch-state sre flags next index)
|
||||||
|
(let* ((n3 (make-epsilon-state next))
|
||||||
|
(n2 (->rx sre flags n3))
|
||||||
|
(n1 (make-epsilon-state n2)))
|
||||||
|
(state-match-set! n1 index)
|
||||||
|
(state-match-rule-set! n1 'left)
|
||||||
|
(state-match-set! n3 (+ index 1))
|
||||||
|
(state-match-rule-set! n3 'right)
|
||||||
|
n1))
|
||||||
|
(define (->rx sre flags next)
|
||||||
|
(cond
|
||||||
|
;; The base cases chars and strings match literally.
|
||||||
|
((char? sre)
|
||||||
|
(make-char-state sre flags next))
|
||||||
|
((char-set? sre)
|
||||||
|
(make-char-state sre flags next))
|
||||||
|
((string? sre)
|
||||||
|
(->rx (cons 'seq (string->list sre)) flags next))
|
||||||
|
((and (symbol? sre) (lookup-char-set sre))
|
||||||
|
=> (lambda (cset) (make-char-state cset flags next)))
|
||||||
|
((symbol? sre)
|
||||||
|
(case sre
|
||||||
|
((epsilon) next)
|
||||||
|
((bos) (make-char-state match/bos flags next))
|
||||||
|
((eos) (make-char-state match/eos flags next))
|
||||||
|
((bol) (make-char-state match/bol flags next))
|
||||||
|
((eol) (make-char-state match/eol flags next))
|
||||||
|
((bow) (make-char-state match/bow flags next))
|
||||||
|
((eow) (make-char-state match/eow flags next))
|
||||||
|
((word) (->rx '(word+ any) flags next))
|
||||||
|
(else (error "unknown sre" sre))))
|
||||||
|
((pair? sre)
|
||||||
|
(case (car sre)
|
||||||
|
((seq :)
|
||||||
|
;; Sequencing. An empty sequence jumps directly to next,
|
||||||
|
;; otherwise we join the first element to the sequence formed
|
||||||
|
;; of the remaining elements followed by next.
|
||||||
|
(if (null? (cdr sre))
|
||||||
|
next
|
||||||
|
;; Make a dummy intermediate to join the states so that
|
||||||
|
;; we can generate n1 first, preserving the submatch order.
|
||||||
|
(let* ((n2 (make-epsilon-state #f))
|
||||||
|
(n1 (->rx (cadr sre) flags n2))
|
||||||
|
(n3 (->rx (cons 'seq (cddr sre)) flags next)))
|
||||||
|
(state-next1-set! n2 n3)
|
||||||
|
n1)))
|
||||||
|
((or)
|
||||||
|
;; Alternation. An empty alternation always fails.
|
||||||
|
;; Otherwise we fork between any of the alternations, each
|
||||||
|
;; continuing to next.
|
||||||
|
(cond
|
||||||
|
((null? (cdr sre))
|
||||||
|
#f)
|
||||||
|
((null? (cddr sre))
|
||||||
|
(->rx (cadr sre) flags next))
|
||||||
|
(else
|
||||||
|
(let* ((n1 (->rx (cadr sre) flags next))
|
||||||
|
(n2 (->rx (cons 'or (cddr sre)) flags next)))
|
||||||
|
(make-fork-state n1 n2)))))
|
||||||
|
((?)
|
||||||
|
;; Optionality. Either match the body or fork to the next
|
||||||
|
;; state directly.
|
||||||
|
(make-fork-state (->rx (cons 'seq (cdr sre)) flags next) next))
|
||||||
|
((*)
|
||||||
|
;; Repetition. Introduce two fork states which can jump from
|
||||||
|
;; the end of the loop to the beginning and from the
|
||||||
|
;; beginning to the end (to skip the first iteration).
|
||||||
|
(let* ((n2 (make-fork-state next #f))
|
||||||
|
(n1 (make-fork-state (->rx (cons 'seq (cdr sre)) flags n2) n2)))
|
||||||
|
(state-next2-set! n2 n1)
|
||||||
|
n1))
|
||||||
|
((+)
|
||||||
|
;; One-or-more repetition. Same as above but the first
|
||||||
|
;; transition is required so the rx is simpler - we only
|
||||||
|
;; need one fork from the end of the loop to the beginning.
|
||||||
|
(let* ((n2 (make-fork-state next #f))
|
||||||
|
(n1 (->rx (cons 'seq (cdr sre)) flags n2)))
|
||||||
|
(state-next2-set! n2 n1)
|
||||||
|
n1))
|
||||||
|
((=> submatch-named)
|
||||||
|
;; Named submatches just record the name for the current
|
||||||
|
;; match and rewrite as a non-named submatch.
|
||||||
|
(set! match-names (cons (cons (cadr sre) current-match) match-names))
|
||||||
|
(->rx (cons 'submatch (cddr sre)) flags next))
|
||||||
|
((*=> submatch-named-list)
|
||||||
|
(set! match-names (cons (cons (cadr sre) current-match) match-names))
|
||||||
|
(->rx (cons 'submatch-list (cddr sre)) flags next))
|
||||||
|
(($ submatch)
|
||||||
|
;; A submatch wraps next with an epsilon transition before
|
||||||
|
;; next, setting the start and end index on the result and
|
||||||
|
;; wrapped next respectively.
|
||||||
|
(let ((num current-match)
|
||||||
|
(index current-index))
|
||||||
|
(set! current-match (+ current-match 1))
|
||||||
|
(set! current-index (+ current-index 2))
|
||||||
|
(set! match-rules `((,index . ,(+ index 1)) ,@match-rules))
|
||||||
|
(make-submatch-state (cons 'seq (cdr sre)) flags next index)))
|
||||||
|
((*$ submatch-list)
|
||||||
|
;; A submatch-list wraps a range of submatch results into a
|
||||||
|
;; single match value.
|
||||||
|
(let* ((num current-match)
|
||||||
|
(index current-index))
|
||||||
|
(set! current-match (+ current-match 1))
|
||||||
|
(set! current-index (+ current-index 1))
|
||||||
|
(set! match-rules `(,index ,@match-rules))
|
||||||
|
(let* ((n2 (make-epsilon-state next))
|
||||||
|
(n1 (->rx (cons 'submatch (cdr sre)) flags n2)))
|
||||||
|
(state-match-set! n2 (list index num current-match))
|
||||||
|
(state-match-rule-set! n2 'list)
|
||||||
|
n1)))
|
||||||
|
((~ - & |\|| / and or not)
|
||||||
|
(make-char-state (sre->char-set sre) flags next))
|
||||||
|
((word)
|
||||||
|
(->rx `(: bow ,@(cdr sre) eow) flags next))
|
||||||
|
((word+)
|
||||||
|
(->rx `(word (+ ,(char-set-intersection
|
||||||
|
char-set:word-constituent
|
||||||
|
(sre->char-set `(or ,@(cdr sre))))))
|
||||||
|
flags
|
||||||
|
next))
|
||||||
|
((w/case)
|
||||||
|
(->rx `(: ,@(cdr sre)) (bitwise-and flags (bitwise-not ~ci?)) next))
|
||||||
|
((w/nocase)
|
||||||
|
(->rx `(: ,@(cdr sre)) (bitwise-ior flags ~ci?) next))
|
||||||
|
(else
|
||||||
|
(if (string? (car sre))
|
||||||
|
(make-char-state (sre->char-set sre) flags next)
|
||||||
|
(error "unknown sre" sre)))))))
|
||||||
|
(let ((flags (parse-flags (and (pair? o) (car o)))))
|
||||||
|
(if (regexp? sre)
|
||||||
|
sre
|
||||||
|
(let ((start (make-submatch-state sre flags (make-accept-state) 0)))
|
||||||
|
(make-rx start current-match current-index
|
||||||
|
(list->vector (reverse match-rules)) match-names)))))
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;; Utilities
|
||||||
|
|
||||||
|
(define (regexp-fold rx kons knil str . o)
|
||||||
|
(let* ((rx (regexp rx))
|
||||||
|
(finish (if (pair? o) (car o) (lambda (from md str acc) acc)))
|
||||||
|
(o (if (pair? o) (cdr o) o))
|
||||||
|
(start (string-start-arg str o))
|
||||||
|
(end (string-end-arg str (if (pair? o) (cdr o) o))))
|
||||||
|
(let lp ((i start)
|
||||||
|
(from start)
|
||||||
|
(acc knil))
|
||||||
|
(cond
|
||||||
|
((and (string-cursor<? i end) (regexp-search rx str i end))
|
||||||
|
=> (lambda (md)
|
||||||
|
(let* ((j (rx-match-submatch-end md str 0)))
|
||||||
|
(lp (if (>= j end) j (string-cursor-next str j))
|
||||||
|
j
|
||||||
|
(kons (string-offset->index str from) md str acc)))))
|
||||||
|
(else
|
||||||
|
(finish (string-offset->index str i) #f str acc))))))
|
||||||
|
|
||||||
|
(define (regexp-extract rx str . o)
|
||||||
|
(apply regexp-fold
|
||||||
|
rx
|
||||||
|
(lambda (from md str a)
|
||||||
|
(let ((s (rx-match-submatch md str 0)))
|
||||||
|
(if (equal? s "") a (cons s a))))
|
||||||
|
'()
|
||||||
|
str
|
||||||
|
(lambda (from md str a) (reverse a))
|
||||||
|
o))
|
||||||
|
|
||||||
|
(define (regexp-split rx str . o)
|
||||||
|
;; start and end in indices passed to regexp-fold
|
||||||
|
(let ((start (if (pair? o) (car o) 0))
|
||||||
|
(end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str))))
|
||||||
|
(regexp-fold
|
||||||
|
rx
|
||||||
|
(lambda (from md str a)
|
||||||
|
(let ((i (rx-match-submatch-start md str 0)))
|
||||||
|
(if (< from i) (cons (substring str from i) a) a)))
|
||||||
|
'()
|
||||||
|
str
|
||||||
|
(lambda (from md str a)
|
||||||
|
(reverse (if (< from end) (cons (substring str from end) a) a)))
|
||||||
|
start
|
||||||
|
end)))
|
||||||
|
|
||||||
|
(define (regexp-replace rx str subst . o)
|
||||||
|
(let* ((start (if (pair? o) (car o) 0))
|
||||||
|
(end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str)))
|
||||||
|
(m (regexp-search rx str start end)))
|
||||||
|
(if m
|
||||||
|
(string-concatenate
|
||||||
|
(cons
|
||||||
|
(substring-cursor str
|
||||||
|
(string-index->offset str start)
|
||||||
|
(rx-match-submatch-start m str 0))
|
||||||
|
(append
|
||||||
|
(reverse (regexp-apply-match m str subst))
|
||||||
|
(list (substring-cursor str
|
||||||
|
(rx-match-submatch-end m str 0)
|
||||||
|
(string-index->offset str end))))))
|
||||||
|
str)))
|
||||||
|
|
||||||
|
(define (regexp-replace-all rx str subst . o)
|
||||||
|
(regexp-fold
|
||||||
|
rx
|
||||||
|
(lambda (i m str acc)
|
||||||
|
(let ((m-start (rx-match-submatch-start m str 0)))
|
||||||
|
(append (regexp-apply-match m str subst)
|
||||||
|
(if (>= i m-start)
|
||||||
|
acc
|
||||||
|
(cons (substring str i m-start) acc)))))
|
||||||
|
'()
|
||||||
|
str
|
||||||
|
(lambda (i m str acc)
|
||||||
|
(let ((end (string-length str)))
|
||||||
|
(string-concatenate-reverse
|
||||||
|
(if (>= i end)
|
||||||
|
acc
|
||||||
|
(cons (substring str i end) acc)))))))
|
||||||
|
|
||||||
|
(define (regexp-apply-match m str ls)
|
||||||
|
(let lp ((ls ls) (res '()))
|
||||||
|
(cond
|
||||||
|
((null? ls)
|
||||||
|
res)
|
||||||
|
((not (pair? ls))
|
||||||
|
(lp (list ls) res))
|
||||||
|
((integer? (car ls))
|
||||||
|
(lp (cdr ls) (cons (or (rx-match-submatch m str (car ls)) "") res)))
|
||||||
|
((procedure? (car ls))
|
||||||
|
(lp (cdr ls) (cons ((car ls) m) res)))
|
||||||
|
((symbol? (car ls))
|
||||||
|
(case (car ls)
|
||||||
|
((pre)
|
||||||
|
(lp (cdr ls)
|
||||||
|
(cons (substring-cursor str 0 (rx-match-submatch-start m str 0))
|
||||||
|
res)))
|
||||||
|
((post)
|
||||||
|
(lp (cdr ls)
|
||||||
|
(cons (substring str
|
||||||
|
(rx-match-submatch-end m str 0)
|
||||||
|
(string-length str))
|
||||||
|
res)))
|
||||||
|
(else
|
||||||
|
(cond
|
||||||
|
((assq (car ls) (rx-match-names m))
|
||||||
|
=> (lambda (x) (lp (cons (cdr x) (cdr ls)) res)))
|
||||||
|
(else
|
||||||
|
(error "unknown match replacement" (car ls)))))))
|
||||||
|
(else
|
||||||
|
(lp (cdr ls) (cons (car ls) res))))))
|
44
lib/chibi/regexp.sld
Normal file
44
lib/chibi/regexp.sld
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
|
||||||
|
(define-library (chibi regexp)
|
||||||
|
(export regexp regexp? regexp-match regexp-search
|
||||||
|
regexp-replace regexp-replace-all
|
||||||
|
regexp-fold regexp-extract regexp-split
|
||||||
|
rx-match? rx-match-num-matches
|
||||||
|
rx-match-submatch rx-match-submatch/list
|
||||||
|
rx-match->list rx-match->sexp)
|
||||||
|
(import (scheme base) (srfi 9) (srfi 33) (srfi 38) (srfi 69))
|
||||||
|
;; Chibi's char-set library is more factored than SRFI-14.
|
||||||
|
(cond-expand
|
||||||
|
(chibi
|
||||||
|
(import (chibi) (chibi char-set) (chibi char-set full)))
|
||||||
|
(else
|
||||||
|
(import (scheme base) (srfi 14))))
|
||||||
|
;; Use string-cursors where available.
|
||||||
|
(begin
|
||||||
|
(define string-cursor? integer?))
|
||||||
|
(cond-expand
|
||||||
|
(chibi
|
||||||
|
(begin
|
||||||
|
(define (string-start-arg s o)
|
||||||
|
(if (pair? o) (string-index->offset s (car o)) (string-cursor-start s)))
|
||||||
|
(define (string-end-arg s o)
|
||||||
|
(if (pair? o) (string-index->offset s (car o)) (string-cursor-end s)))
|
||||||
|
(define (string-concatenate-reverse ls)
|
||||||
|
(string-concatenate (reverse ls)))))
|
||||||
|
(else
|
||||||
|
(begin
|
||||||
|
(define (string-start-arg s o)
|
||||||
|
(if (pair? o) (string-index->offset (car o)) 0))
|
||||||
|
(define (string-end-arg s o)
|
||||||
|
(if (pair? o) (string-index->offset (car o)) (string-length s)))
|
||||||
|
(define string-cursor=? =)
|
||||||
|
(define string-cursor<? <)
|
||||||
|
(define string-cursor<=? <=)
|
||||||
|
(define string-cursor>? >)
|
||||||
|
(define string-cursor>=? >=)
|
||||||
|
(define string-cursor-ref string-ref)
|
||||||
|
(define substring-cursor substring)
|
||||||
|
(define (string-offset->index str off) off)
|
||||||
|
(define (string-concatenate-reverse ls)
|
||||||
|
(apply string-append (reverse ls))))))
|
||||||
|
(include "regexp.scm"))
|
598
lib/chibi/regexp/pcre.scm
Normal file
598
lib/chibi/regexp/pcre.scm
Normal file
|
@ -0,0 +1,598 @@
|
||||||
|
|
||||||
|
;; PCRE parsing, adapted from IrRegex.
|
||||||
|
|
||||||
|
(define ~none 0)
|
||||||
|
(define ~save? 1)
|
||||||
|
(define ~case-insensitive? 2)
|
||||||
|
(define ~multi-line? 4)
|
||||||
|
(define ~single-line? 8)
|
||||||
|
(define ~ignore-space? 16)
|
||||||
|
|
||||||
|
(define (flag-set? flags i)
|
||||||
|
(= i (bitwise-and flags i)))
|
||||||
|
(define (flag-join a b)
|
||||||
|
(if b (bitwise-ior a b) a))
|
||||||
|
(define (flag-clear a b)
|
||||||
|
(bitwise-and a (bitwise-not b)))
|
||||||
|
|
||||||
|
(define (symbol-list->flags ls)
|
||||||
|
(let lp ((ls ls) (res ~none))
|
||||||
|
(cond
|
||||||
|
((null? ls)
|
||||||
|
res)
|
||||||
|
((not (pair? ls))
|
||||||
|
(lp (list ls) res))
|
||||||
|
(else
|
||||||
|
(lp (cdr ls)
|
||||||
|
(flag-join
|
||||||
|
res
|
||||||
|
(case (car ls)
|
||||||
|
((i ci case-insensitive) ~case-insensitive?)
|
||||||
|
((m multi-line) ~multi-line?)
|
||||||
|
((s single-line) ~single-line?)
|
||||||
|
((x ignore-space) ~ignore-space?)
|
||||||
|
(else #f))))))))
|
||||||
|
|
||||||
|
(define posix-escape-sequences
|
||||||
|
`((#\n . #\newline)
|
||||||
|
(#\r . #\return)
|
||||||
|
(#\t . #\tab)
|
||||||
|
(#\a . #\alarm)
|
||||||
|
(#\e . #\escape)))
|
||||||
|
|
||||||
|
(define (char-altcase c)
|
||||||
|
(if (char-upper-case? c) (char-downcase c) (char-upcase c)))
|
||||||
|
|
||||||
|
(define (char-mirror c)
|
||||||
|
(case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c)))
|
||||||
|
|
||||||
|
(define (string-scan-char-escape str c . o)
|
||||||
|
(let ((end (string-length str)))
|
||||||
|
(let scan ((i (if (pair? o) (car o) 0)))
|
||||||
|
(cond ((= i end) #f)
|
||||||
|
((eqv? c (string-ref str i)) i)
|
||||||
|
((eqv? c #\\) (scan (+ i 2)))
|
||||||
|
(else (scan (+ i 1)))))))
|
||||||
|
|
||||||
|
(define (string-parse-hex-escape str i end)
|
||||||
|
(cond
|
||||||
|
((>= i end)
|
||||||
|
(error "incomplete hex escape" str i))
|
||||||
|
((eqv? #\{ (string-ref str i))
|
||||||
|
(let ((j (string-scan-char-escape str #\} (+ i 1))))
|
||||||
|
(if (not j)
|
||||||
|
(error "incomplete hex brace escape" str i)
|
||||||
|
(let* ((s (substring str (+ i 1) j))
|
||||||
|
(n (string->number s 16)))
|
||||||
|
(if n
|
||||||
|
(list (integer->char n) j)
|
||||||
|
(error "bad hex brace escape" s))))))
|
||||||
|
((>= (+ i 1) end)
|
||||||
|
(error "incomplete hex escape" str i))
|
||||||
|
(else
|
||||||
|
(let* ((s (substring str i (+ i 2)))
|
||||||
|
(n (string->number s 16)))
|
||||||
|
(if n
|
||||||
|
(list (integer->char n) (+ i 2))
|
||||||
|
(error "bad hex escape" s))))))
|
||||||
|
|
||||||
|
(define (string-parse-cset str start flags)
|
||||||
|
(let* ((end (string-length str))
|
||||||
|
(invert? (and (< start end) (eqv? #\^ (string-ref str start)))))
|
||||||
|
(define (cset-union a b)
|
||||||
|
(cond ((not a) b)
|
||||||
|
((not b) a)
|
||||||
|
((and (pair? a) (eq? 'or (car a))) `(,@a ,b))
|
||||||
|
(else `(or ,a ,b))))
|
||||||
|
(define (go i prev-char ones pairs classes)
|
||||||
|
(if (>= i end)
|
||||||
|
(error "incomplete char set" str i end))
|
||||||
|
(case (string-ref str i)
|
||||||
|
((#\])
|
||||||
|
(if (and (null? ones) (null? pairs))
|
||||||
|
(go (+ i 1) #\] (cons #\] ones) pairs classes)
|
||||||
|
(list
|
||||||
|
(let ((res
|
||||||
|
(cset-union
|
||||||
|
(cset-union
|
||||||
|
(and (pair? classes)
|
||||||
|
`(or ,@classes))
|
||||||
|
(and (pair? ones)
|
||||||
|
`(,(list->string (reverse ones)))))
|
||||||
|
(and (pair? pairs)
|
||||||
|
`(/ ,(list->string (reverse pairs)))))))
|
||||||
|
(if invert? `(~ ,res) res))
|
||||||
|
i)))
|
||||||
|
((#\-)
|
||||||
|
(cond
|
||||||
|
((or (= i start)
|
||||||
|
(and (= i (+ start 1)) invert?)
|
||||||
|
(eqv? #\] (string-ref str (+ i 1))))
|
||||||
|
(go (+ i 1) #\- (cons #\- ones) pairs classes))
|
||||||
|
;; alternately permissively allow this as a -
|
||||||
|
((not prev-char)
|
||||||
|
(error "bad pcre char-set, unexpected -" str))
|
||||||
|
(else
|
||||||
|
(let ((ch (string-ref str (+ i 1))))
|
||||||
|
(apply
|
||||||
|
(lambda (c j)
|
||||||
|
(if (char<? c prev-char)
|
||||||
|
(error "inverted range in pcre char-set" prev-char c)
|
||||||
|
(go j #f (cdr ones) (cons c (cons prev-char pairs))
|
||||||
|
classes)))
|
||||||
|
(cond
|
||||||
|
((and (eqv? #\\ ch)
|
||||||
|
(assv (string-ref str (+ i 2)) posix-escape-sequences))
|
||||||
|
=> (lambda (x) (list (cdr x) (+ i 3))))
|
||||||
|
((and (eqv? #\\ ch)
|
||||||
|
(eqv? (string-ref str (+ i 2)) #\x))
|
||||||
|
(string-parse-hex-escape str (+ i 3) end))
|
||||||
|
(else
|
||||||
|
(list ch (+ i 2)))))))))
|
||||||
|
((#\[)
|
||||||
|
(let* ((inv? (eqv? #\^ (string-ref str (+ i 1))))
|
||||||
|
(i2 (if inv? (+ i 2) (+ i 1))))
|
||||||
|
(case (string-ref str i2)
|
||||||
|
((#\:)
|
||||||
|
(let ((j (string-find str #\: (+ i2 1) end)))
|
||||||
|
(if (or (>= (+ j 1) end)
|
||||||
|
(not (eqv? #\] (string-ref str (+ j 1)))))
|
||||||
|
(error "incomplete character class" str)
|
||||||
|
(let* ((class (string->symbol (substring str (+ i2 1) j)))
|
||||||
|
(class (if inv? `(~ ,class) class)))
|
||||||
|
(go (+ j 2) #f ones pairs (cons class classes))))))
|
||||||
|
((#\= #\.)
|
||||||
|
(error "collating sequences not supported" str))
|
||||||
|
(else
|
||||||
|
(go (+ i 1) #\[ (cons #\[ ones) pairs classes)))))
|
||||||
|
((#\\)
|
||||||
|
(let ((c (string-ref str (+ i 1))))
|
||||||
|
(case c
|
||||||
|
((#\d #\D #\s #\S #\w #\W)
|
||||||
|
(go (+ i 2) #f ones pairs
|
||||||
|
(cons (pcre->sre (string #\\ c)) classes)))
|
||||||
|
((#\x)
|
||||||
|
(apply
|
||||||
|
(lambda (c j) (go j c (cons c ones) pairs classes))
|
||||||
|
(string-parse-hex-escape str (+ i 2) end)))
|
||||||
|
(else
|
||||||
|
(let ((c (cond ((assv c posix-escape-sequences) => cdr)
|
||||||
|
(else c))))
|
||||||
|
(go (+ i 2) c (cons c ones) pairs classes))))))
|
||||||
|
(else
|
||||||
|
=> (lambda (c) (go (+ i 1) c (cons c ones) pairs classes)))))
|
||||||
|
(if invert?
|
||||||
|
(let ((ones (if (flag-set? flags ~multi-line?) '(#\newline) '())))
|
||||||
|
(go (+ start 1) #f ones '() '()))
|
||||||
|
(go start #f '() '() '()))))
|
||||||
|
|
||||||
|
;; build a (seq ls ...) sre from a list
|
||||||
|
(define (sre-sequence ls)
|
||||||
|
(cond
|
||||||
|
((null? ls) 'epsilon)
|
||||||
|
((null? (cdr ls)) (car ls))
|
||||||
|
(else (cons 'seq ls))))
|
||||||
|
|
||||||
|
;; build a (or ls ...) sre from a list
|
||||||
|
(define (sre-alternate ls)
|
||||||
|
(cond
|
||||||
|
((null? ls) '(or))
|
||||||
|
((null? (cdr ls)) (car ls))
|
||||||
|
(else (cons 'or ls))))
|
||||||
|
|
||||||
|
;; returns #t if the sre can ever be empty
|
||||||
|
(define (sre-empty? sre)
|
||||||
|
(if (pair? sre)
|
||||||
|
(case (car sre)
|
||||||
|
((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t)
|
||||||
|
((**) (or (not (number? (cadr sre))) (zero? (cadr sre))))
|
||||||
|
((or) (any sre-empty? (cdr sre)))
|
||||||
|
((: seq $ submatch => submatch-named + atomic)
|
||||||
|
(every sre-empty? (cdr sre)))
|
||||||
|
(else #f))
|
||||||
|
(memq sre '(epsilon bos eos bol eol bow eow commit))))
|
||||||
|
|
||||||
|
;; returns #t if the sre is a */+ repetition
|
||||||
|
(define (sre-repeater? sre)
|
||||||
|
(and (pair? sre)
|
||||||
|
(or (memq (car sre) '(* +))
|
||||||
|
(and (memq (car sre) '($ submatch => submatch-named seq :))
|
||||||
|
(pair? (cdr sre))
|
||||||
|
(null? (cddr sre))
|
||||||
|
(sre-repeater? (cadr sre))))))
|
||||||
|
|
||||||
|
(define (pcre->sre str . o)
|
||||||
|
(if (not (string? str))
|
||||||
|
(error "pcre->sre: expected a string" str))
|
||||||
|
(let ((end (string-length str))
|
||||||
|
(orig-flags (if (pair? o) (symbol-list->flags (car o)) ~none)))
|
||||||
|
(let lp ((i 0) (from 0) (flags orig-flags) (res '()) (st '()))
|
||||||
|
;; accumulate the substring from..i as literal text
|
||||||
|
(define (collect)
|
||||||
|
(if (= i from) res (cons (substring str from i) res)))
|
||||||
|
;; like collect but breaks off the last single character when
|
||||||
|
;; collecting literal data, as the argument to ?/*/+ etc.
|
||||||
|
(define (collect/single)
|
||||||
|
(let ((j (- i 1)))
|
||||||
|
(cond
|
||||||
|
((< j from)
|
||||||
|
res)
|
||||||
|
(else
|
||||||
|
(let ((c (string-ref str j)))
|
||||||
|
(cond
|
||||||
|
((= j from)
|
||||||
|
(cons c res))
|
||||||
|
(else
|
||||||
|
(cons c (cons (substring str from j) res)))))))))
|
||||||
|
;; collects for use as a result, reversing and grouping OR
|
||||||
|
;; terms, and some ugly tweaking of `function-like' groups and
|
||||||
|
;; conditionals
|
||||||
|
(define (collect/terms)
|
||||||
|
(let* ((ls (collect))
|
||||||
|
(func
|
||||||
|
(and (pair? ls)
|
||||||
|
(memq (last ls)
|
||||||
|
'(atomic if look-ahead neg-look-ahead
|
||||||
|
look-behind neg-look-behind
|
||||||
|
=> submatch-named))))
|
||||||
|
(prefix (if (and func (memq (car func) '(=> submatch-named)))
|
||||||
|
(list 'submatch-named (cadr (reverse ls)))
|
||||||
|
(and func (list (car func)))))
|
||||||
|
(ls (if func
|
||||||
|
(if (memq (car func) '(=> submatch-named))
|
||||||
|
(reverse (cddr (reverse ls)))
|
||||||
|
(reverse (cdr (reverse ls))))
|
||||||
|
ls)))
|
||||||
|
(let lp ((ls ls) (term '()) (res '()))
|
||||||
|
(define (shift)
|
||||||
|
(cons (sre-sequence term) res))
|
||||||
|
(cond
|
||||||
|
((null? ls)
|
||||||
|
(let* ((res (sre-alternate (shift)))
|
||||||
|
(res (if (flag-set? flags ~save?)
|
||||||
|
(list 'submatch res)
|
||||||
|
res)))
|
||||||
|
(if prefix
|
||||||
|
(if (eq? 'if (car prefix))
|
||||||
|
(cond
|
||||||
|
((not (pair? res))
|
||||||
|
'epsilon)
|
||||||
|
((memq (car res)
|
||||||
|
'(look-ahead neg-look-ahead
|
||||||
|
look-behind neg-look-behind))
|
||||||
|
res)
|
||||||
|
((eq? 'seq (car res))
|
||||||
|
`(if ,(cadr res)
|
||||||
|
,(sre-sequence (cddr res))))
|
||||||
|
(else
|
||||||
|
`(if ,(cadadr res)
|
||||||
|
,(sre-sequence (cddadr res))
|
||||||
|
,(sre-alternate (cddr res)))))
|
||||||
|
`(,@prefix ,res))
|
||||||
|
res)))
|
||||||
|
((eq? 'or (car ls)) (lp (cdr ls) '() (shift)))
|
||||||
|
(else (lp (cdr ls) (cons (car ls) term) res))))))
|
||||||
|
(define (save)
|
||||||
|
(cons (cons flags (collect)) st))
|
||||||
|
;; main parsing
|
||||||
|
(cond
|
||||||
|
((>= i end)
|
||||||
|
(if (pair? st)
|
||||||
|
(error "unterminated parenthesis in regexp" str)
|
||||||
|
(collect/terms)))
|
||||||
|
(else
|
||||||
|
(case (string-ref str i)
|
||||||
|
((#\.)
|
||||||
|
(lp (+ i 1) (+ i 1) flags
|
||||||
|
(cons (if (flag-set? flags ~single-line?) 'any 'nonl)
|
||||||
|
(collect))
|
||||||
|
st))
|
||||||
|
((#\?)
|
||||||
|
(let ((res (collect/single)))
|
||||||
|
(if (null? res)
|
||||||
|
(error "? can't follow empty pattern" str res)
|
||||||
|
(let ((x (car res)))
|
||||||
|
(lp (+ i 1)
|
||||||
|
(+ i 1)
|
||||||
|
flags
|
||||||
|
(cons
|
||||||
|
(if (pair? x)
|
||||||
|
(case (car x)
|
||||||
|
((*) `(*? ,@(cdr x)))
|
||||||
|
((+) `(**? 1 #f ,@(cdr x)))
|
||||||
|
((?) `(?? ,@(cdr x)))
|
||||||
|
((**) `(**? ,@(cdr x)))
|
||||||
|
((=) `(**? ,(cadr x) ,@(cdr x)))
|
||||||
|
((>=) `(**? ,(cadr x) #f ,@(cddr x)))
|
||||||
|
(else `(? ,x)))
|
||||||
|
`(? ,x))
|
||||||
|
(cdr res))
|
||||||
|
st)))))
|
||||||
|
((#\+ #\*)
|
||||||
|
(let* ((res (collect/single))
|
||||||
|
(x (if (pair? res) (car res) 'epsilon))
|
||||||
|
(op (string->symbol (string (string-ref str i)))))
|
||||||
|
(cond
|
||||||
|
((sre-repeater? x)
|
||||||
|
(error "duplicate repetition (e.g. **) in pattern" str res))
|
||||||
|
((sre-empty? x)
|
||||||
|
(error "can't repeat empty pattern (e.g. ()*)" str res))
|
||||||
|
(else
|
||||||
|
(lp (+ i 1) (+ i 1) flags
|
||||||
|
(cons (list op x) (cdr res))
|
||||||
|
st)))))
|
||||||
|
((#\()
|
||||||
|
(cond
|
||||||
|
((>= (+ i 1) end)
|
||||||
|
(error "unterminated parenthesis in regexp" str))
|
||||||
|
((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case
|
||||||
|
(lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save)))
|
||||||
|
((>= (+ i 2) end)
|
||||||
|
(error "unterminated parenthesis in regexp" str))
|
||||||
|
((eqv? (string-ref str (+ i 1)) #\*)
|
||||||
|
(error "bad regexp syntax: (*FOO) not supported" str))
|
||||||
|
(else ;; (?...) case
|
||||||
|
(case (string-ref str (+ i 2))
|
||||||
|
((#\#)
|
||||||
|
(let ((j (string-find str #\) (+ i 3))))
|
||||||
|
(lp (+ j i) (min (+ j 1) end) flags (collect) st)))
|
||||||
|
((#\:)
|
||||||
|
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save)))
|
||||||
|
((#\=)
|
||||||
|
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
|
||||||
|
'(look-ahead) (save)))
|
||||||
|
((#\!)
|
||||||
|
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
|
||||||
|
'(neg-look-ahead) (save)))
|
||||||
|
((#\<)
|
||||||
|
(cond
|
||||||
|
((>= (+ i 3) end)
|
||||||
|
(error "unterminated parenthesis in regexp" str))
|
||||||
|
(else
|
||||||
|
(case (string-ref str (+ i 3))
|
||||||
|
((#\=)
|
||||||
|
(lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
|
||||||
|
'(look-behind) (save)))
|
||||||
|
((#\!)
|
||||||
|
(lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
|
||||||
|
'(neg-look-behind) (save)))
|
||||||
|
(else
|
||||||
|
(let ((j (and (char-alphabetic?
|
||||||
|
(string-ref str (+ i 3)))
|
||||||
|
(string-find str #\> (+ i 4)))))
|
||||||
|
(if (< j end)
|
||||||
|
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
|
||||||
|
`(,(string->symbol (substring str (+ i 3) j))
|
||||||
|
submatch-named)
|
||||||
|
(save))
|
||||||
|
(error "invalid (?< sequence" str))))))))
|
||||||
|
((#\>)
|
||||||
|
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
|
||||||
|
'(atomic) (save)))
|
||||||
|
;;((#\' #\P) ; named subpatterns
|
||||||
|
;; )
|
||||||
|
;;((#\R) ; recursion
|
||||||
|
;; )
|
||||||
|
((#\()
|
||||||
|
(cond
|
||||||
|
((>= (+ i 3) end)
|
||||||
|
(error "unterminated parenthesis in regexp" str))
|
||||||
|
((char-numeric? (string-ref str (+ i 3)))
|
||||||
|
(let* ((j (string-find str #\) (+ i 3)))
|
||||||
|
(n (string->number (substring str (+ i 3) j))))
|
||||||
|
(if (or (= j end) (not n))
|
||||||
|
(error "invalid conditional reference" str)
|
||||||
|
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
|
||||||
|
`(,n if) (save)))))
|
||||||
|
((char-alphabetic? (string-ref str (+ i 3)))
|
||||||
|
(let ((j (string-find str #\) (+ i 3))))
|
||||||
|
(if (= j end)
|
||||||
|
(error "invalid named conditional reference" str)
|
||||||
|
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
|
||||||
|
`(,(string->symbol (substring str (+ i 3) j)) if)
|
||||||
|
(save)))))
|
||||||
|
(else
|
||||||
|
(lp (+ i 2) (+ i 2) (flag-clear flags ~save?)
|
||||||
|
'(if) (save)))))
|
||||||
|
((#\{)
|
||||||
|
(error "unsupported Perl-style cluster" str))
|
||||||
|
(else
|
||||||
|
(let ((old-flags flags))
|
||||||
|
(let lp2 ((j (+ i 2)) (flags flags) (invert? #f))
|
||||||
|
(define (join x)
|
||||||
|
((if invert? flag-clear flag-join) flags x))
|
||||||
|
(cond
|
||||||
|
((>= j end)
|
||||||
|
(error "incomplete cluster" str i))
|
||||||
|
(else
|
||||||
|
(case (string-ref str j)
|
||||||
|
((#\i)
|
||||||
|
(lp2 (+ j 1) (join ~case-insensitive?) invert?))
|
||||||
|
((#\m)
|
||||||
|
(lp2 (+ j 1) (join ~multi-line?) invert?))
|
||||||
|
((#\x)
|
||||||
|
(lp2 (+ j 1) (join ~ignore-space?) invert?))
|
||||||
|
((#\-)
|
||||||
|
(lp2 (+ j 1) flags (not invert?)))
|
||||||
|
((#\))
|
||||||
|
(lp (+ j 1) (+ j 1) flags (collect)
|
||||||
|
st))
|
||||||
|
((#\:)
|
||||||
|
(lp (+ j 1) (+ j 1) flags '()
|
||||||
|
(cons (cons old-flags (collect)) st)))
|
||||||
|
(else
|
||||||
|
(error "unknown regex cluster modifier" str)
|
||||||
|
)))))))))))
|
||||||
|
((#\))
|
||||||
|
(if (null? st)
|
||||||
|
(error "too many )'s in regexp" str)
|
||||||
|
(lp (+ i 1)
|
||||||
|
(+ i 1)
|
||||||
|
(caar st)
|
||||||
|
(cons (collect/terms) (cdar st))
|
||||||
|
(cdr st))))
|
||||||
|
((#\[)
|
||||||
|
(apply
|
||||||
|
(lambda (sre j)
|
||||||
|
(lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))
|
||||||
|
(string-parse-cset str (+ i 1) flags)))
|
||||||
|
((#\{)
|
||||||
|
(cond
|
||||||
|
((or (>= (+ i 1) end)
|
||||||
|
(not (or (char-numeric? (string-ref str (+ i 1)))
|
||||||
|
(eqv? #\, (string-ref str (+ i 1))))))
|
||||||
|
(lp (+ i 1) from flags res st))
|
||||||
|
(else
|
||||||
|
(let ((res (collect/single)))
|
||||||
|
(cond
|
||||||
|
((null? res)
|
||||||
|
(error "{ can't follow empty pattern"))
|
||||||
|
(else
|
||||||
|
(let* ((x (car res))
|
||||||
|
(tail (cdr res))
|
||||||
|
(j (string-find str #\} (+ i 1)))
|
||||||
|
(s2 (string-split (substring str (+ i 1) j) #\,))
|
||||||
|
(n (string->number (car s2)))
|
||||||
|
(m (and (pair? (cdr s2))
|
||||||
|
(string->number (cadr s2)))))
|
||||||
|
(cond
|
||||||
|
((or (= j end)
|
||||||
|
(not n)
|
||||||
|
(and (pair? (cdr s2))
|
||||||
|
(not (equal? "" (cadr s2)))
|
||||||
|
(not m)))
|
||||||
|
(error "invalid {n} repetition syntax" s2))
|
||||||
|
((null? (cdr s2))
|
||||||
|
(lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st))
|
||||||
|
(m
|
||||||
|
(lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st))
|
||||||
|
(else
|
||||||
|
(lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st)
|
||||||
|
)))))))))
|
||||||
|
((#\\)
|
||||||
|
(cond
|
||||||
|
((>= (+ i 1) end)
|
||||||
|
(error "incomplete escape sequence" str))
|
||||||
|
(else
|
||||||
|
(let ((c (string-ref str (+ i 1))))
|
||||||
|
(case c
|
||||||
|
((#\d)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st))
|
||||||
|
((#\D)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st))
|
||||||
|
((#\s)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st))
|
||||||
|
((#\S)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st))
|
||||||
|
((#\w)
|
||||||
|
(lp (+ i 2) (+ i 2) flags
|
||||||
|
`((or alphanumeric ("_")) ,@(collect)) st))
|
||||||
|
((#\W)
|
||||||
|
(lp (+ i 2) (+ i 2) flags
|
||||||
|
`((~ (or alphanumeric ("_"))) ,@(collect)) st))
|
||||||
|
((#\b)
|
||||||
|
(lp (+ i 2) (+ i 2) flags
|
||||||
|
`((or bow eow) ,@(collect)) st))
|
||||||
|
((#\B)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st))
|
||||||
|
((#\A)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st))
|
||||||
|
((#\Z)
|
||||||
|
(lp (+ i 2) (+ i 2) flags
|
||||||
|
`((? #\newline) eos ,@(collect)) st))
|
||||||
|
((#\z)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st))
|
||||||
|
((#\R)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st))
|
||||||
|
((#\K)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st))
|
||||||
|
;; these two are from Emacs and TRE, but not in PCRE
|
||||||
|
((#\<)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st))
|
||||||
|
((#\>)
|
||||||
|
(lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st))
|
||||||
|
((#\x)
|
||||||
|
(apply
|
||||||
|
(lambda (ch j)
|
||||||
|
(lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st))
|
||||||
|
(string-parse-hex-escape str (+ i 2) end)))
|
||||||
|
((#\k)
|
||||||
|
(let ((c (string-ref str (+ i 2))))
|
||||||
|
(if (not (memv c '(#\< #\{ #\')))
|
||||||
|
(error "bad \\k usage, expected \\k<...>" str)
|
||||||
|
(let* ((terminal (char-mirror c))
|
||||||
|
(j (string-find str terminal (+ i 2)))
|
||||||
|
(s (substring str (+ i 3) j))
|
||||||
|
(backref
|
||||||
|
(if (flag-set? flags ~case-insensitive?)
|
||||||
|
'backref-ci
|
||||||
|
'backref)))
|
||||||
|
(if (= j end)
|
||||||
|
(error "unterminated named backref" str)
|
||||||
|
(lp (+ j 1) (+ j 1) flags
|
||||||
|
`((,backref ,(string->symbol s))
|
||||||
|
,@(collect))
|
||||||
|
st))))))
|
||||||
|
((#\Q) ;; \Q..\E escapes
|
||||||
|
(let ((res (collect)))
|
||||||
|
(let lp2 ((j (+ i 2)))
|
||||||
|
(cond
|
||||||
|
((>= j end)
|
||||||
|
(lp j (+ i 2) flags res st))
|
||||||
|
((eqv? #\\ (string-ref str j))
|
||||||
|
(cond
|
||||||
|
((>= (+ j 1) end)
|
||||||
|
(lp (+ j 1) (+ i 2) flags res st))
|
||||||
|
((eqv? #\E (string-ref str (+ j 1)))
|
||||||
|
(lp (+ j 2) (+ j 2) flags
|
||||||
|
(cons (substring str (+ i 2) j) res) st))
|
||||||
|
(else
|
||||||
|
(lp2 (+ j 2)))))
|
||||||
|
(else
|
||||||
|
(lp2 (+ j 1)))))))
|
||||||
|
;;((#\p) ; XXXX unicode properties
|
||||||
|
;; )
|
||||||
|
;;((#\P)
|
||||||
|
;; )
|
||||||
|
(else
|
||||||
|
(cond
|
||||||
|
((char-numeric? c)
|
||||||
|
(let* ((j (string-skip str char-numeric? (+ i 2)))
|
||||||
|
(backref
|
||||||
|
(if (flag-set? flags ~case-insensitive?)
|
||||||
|
'backref-ci
|
||||||
|
'backref))
|
||||||
|
(res `((,backref ,(string->number
|
||||||
|
(substring str (+ i 1) j)))
|
||||||
|
,@(collect))))
|
||||||
|
(lp j j flags res st)))
|
||||||
|
((char-alphabetic? c)
|
||||||
|
(let ((cell (assv c posix-escape-sequences)))
|
||||||
|
(if cell
|
||||||
|
(lp (+ i 2) (+ i 2) flags
|
||||||
|
(cons (cdr cell) (collect)) st)
|
||||||
|
(error "unknown escape sequence" str c))))
|
||||||
|
(else
|
||||||
|
(lp (+ i 2) (+ i 1) flags (collect) st)))))))))
|
||||||
|
((#\|)
|
||||||
|
(lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st))
|
||||||
|
((#\^)
|
||||||
|
(let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos)))
|
||||||
|
(lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
|
||||||
|
((#\$)
|
||||||
|
(let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos)))
|
||||||
|
(lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
|
||||||
|
((#\space)
|
||||||
|
(if (flag-set? flags ~ignore-space?)
|
||||||
|
(lp (+ i 1) (+ i 1) flags (collect) st)
|
||||||
|
(lp (+ i 1) from flags res st)))
|
||||||
|
((#\#)
|
||||||
|
(if (flag-set? flags ~ignore-space?)
|
||||||
|
(let ((j (string-find str #\newline (+ i 1))))
|
||||||
|
(lp (+ j 1) (min (+ j 1) end) flags (collect) st))
|
||||||
|
(lp (+ i 1) from flags res st)))
|
||||||
|
(else
|
||||||
|
(lp (+ i 1) from flags res st))))))))
|
||||||
|
|
||||||
|
(define (pcre->regexp pcre . o)
|
||||||
|
(regexp (apply pcre->sre pcre o)))
|
7
lib/chibi/regexp/pcre.sld
Normal file
7
lib/chibi/regexp/pcre.sld
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
|
||||||
|
(define-library (chibi regexp pcre)
|
||||||
|
(export pcre->sre pcre->regexp)
|
||||||
|
(import (scheme base) (scheme char) (scheme cxr)
|
||||||
|
(srfi 1) (srfi 33)
|
||||||
|
(chibi string) (chibi regexp))
|
||||||
|
(include "pcre.scm"))
|
135
tests/re-tests.txt
Normal file
135
tests/re-tests.txt
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
abc abc y & abc
|
||||||
|
abc xbc n - -
|
||||||
|
abc axc n - -
|
||||||
|
abc abx n - -
|
||||||
|
abc xabcy y & abc
|
||||||
|
abc ababc y & abc
|
||||||
|
ab*c abc y & abc
|
||||||
|
ab*bc abc y & abc
|
||||||
|
ab*bc abbc y & abbc
|
||||||
|
ab*bc abbbbc y & abbbbc
|
||||||
|
ab+bc abbc y & abbc
|
||||||
|
ab+bc abc n - -
|
||||||
|
ab+bc abq n - -
|
||||||
|
ab+bc abbbbc y & abbbbc
|
||||||
|
ab?bc abbc y & abbc
|
||||||
|
ab?bc abc y & abc
|
||||||
|
ab?bc abbbbc n - -
|
||||||
|
ab?c abc y & abc
|
||||||
|
^abc$ abc y & abc
|
||||||
|
^abc$ abcc n - -
|
||||||
|
^abc abcc y & abc
|
||||||
|
^abc$ aabc n - -
|
||||||
|
abc$ aabc y & abc
|
||||||
|
^ abc y &
|
||||||
|
$ abc y &
|
||||||
|
a.c abc y & abc
|
||||||
|
a.c axc y & axc
|
||||||
|
a.*c axyzc y & axyzc
|
||||||
|
a.*c axyzd n - -
|
||||||
|
a[bc]d abc n - -
|
||||||
|
a[bc]d abd y & abd
|
||||||
|
a[b-d]e abd n - -
|
||||||
|
a[b-d]e ace y & ace
|
||||||
|
a[b-d] aac y & ac
|
||||||
|
a[-b] a- y & a-
|
||||||
|
a[b-] a- y & a-
|
||||||
|
[k] ab n - -
|
||||||
|
a[b-a] - c - -
|
||||||
|
a[]b - c - -
|
||||||
|
a[ - c - -
|
||||||
|
a] a] y & a]
|
||||||
|
a[]]b a]b y & a]b
|
||||||
|
a[^bc]d aed y & aed
|
||||||
|
a[^bc]d abd n - -
|
||||||
|
a[^-b]c adc y & adc
|
||||||
|
a[^-b]c a-c n - -
|
||||||
|
a[^]b]c a]c n - -
|
||||||
|
a[^]b]c adc y & adc
|
||||||
|
ab|cd abc y & ab
|
||||||
|
ab|cd abcd y & ab
|
||||||
|
()ef def y &-\1 ef-
|
||||||
|
()* - c - -
|
||||||
|
*a - c - -
|
||||||
|
^* - c - -
|
||||||
|
$* - c - -
|
||||||
|
(*)b - c - -
|
||||||
|
$b b n - -
|
||||||
|
a\ - c - -
|
||||||
|
a\(b a(b y &-\1 a(b-
|
||||||
|
a\(*b ab y & ab
|
||||||
|
a\(*b a((b y & a((b
|
||||||
|
a\\b a\b y & a\b
|
||||||
|
abc) - c - -
|
||||||
|
(abc - c - -
|
||||||
|
((a)) abc y &-\1-\2 a-a-a
|
||||||
|
(a)b(c) abc y &-\1-\2 abc-a-c
|
||||||
|
a+b+c aabbabc y & abc
|
||||||
|
a** - c - -
|
||||||
|
(a*)* - c - -
|
||||||
|
(a*)+ - c - -
|
||||||
|
(a|)* - c - -
|
||||||
|
(a*|b)* - c - -
|
||||||
|
(a+|b)* ab y &-\1 ab-b
|
||||||
|
(a+|b)+ ab y &-\1 ab-b
|
||||||
|
(a+|b)? ab y &-\1 a-a
|
||||||
|
[^ab]* cde y & cde
|
||||||
|
(^)* - c - -
|
||||||
|
(ab|)* - c - -
|
||||||
|
)( - c - -
|
||||||
|
abc y &
|
||||||
|
abc n - -
|
||||||
|
a* y &
|
||||||
|
abcd abcd y &-\&-\\& abcd-&-\abcd
|
||||||
|
a(bc)d abcd y \1-\\1-\\\1 bc-\1-\bc
|
||||||
|
([abc])*d abbbcd y &-\1 abbbcd-c
|
||||||
|
([abc])*bcd abcd y &-\1 abcd-a
|
||||||
|
a|b|c|d|e e y & e
|
||||||
|
(a|b|c|d|e)f ef y &-\1 ef-e
|
||||||
|
((a*|b))* - c - -
|
||||||
|
abcd*efg abcdefg y & abcdefg
|
||||||
|
ab* xabyabbbz y & ab
|
||||||
|
ab* xayabbbz y & a
|
||||||
|
(ab|cd)e abcde y &-\1 cde-cd
|
||||||
|
[abhgefdc]ij hij y & hij
|
||||||
|
^(ab|cd)e abcde n x\1y xy
|
||||||
|
(abc|)ef abcdef y &-\1 ef-
|
||||||
|
(a|b)c*d abcd y &-\1 bcd-b
|
||||||
|
(ab|ab*)bc abc y &-\1 abc-a
|
||||||
|
(?:(a)b|ac) ac y &-\1 ac-
|
||||||
|
a([bc]*)c* abc y &-\1 abc-bc
|
||||||
|
a([bc]*)(c*d) abcd y &-\1-\2 abcd-bc-d
|
||||||
|
a([bc]+)(c*d) abcd y &-\1-\2 abcd-bc-d
|
||||||
|
a([bc]*)(c+d) abcd y &-\1-\2 abcd-b-cd
|
||||||
|
a[bcd]*dcdcde adcdcde y & adcdcde
|
||||||
|
a[bcd]+dcdcde adcdcde n - -
|
||||||
|
(ab|a)b*c abc y &-\1 abc-ab
|
||||||
|
(.*)b abc y &-\1 ab-a
|
||||||
|
((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d
|
||||||
|
((a)(b)?c)(d) abcd y \1-\2-\3-\4 abc-a-b-d
|
||||||
|
((a)(b)?c)(d) acd y \1-\2-\3-\4 ac-a--d
|
||||||
|
((aa)(bb)?cc)(dd) aaccdd y \1-\2-\3-\4 aacc-aa--dd
|
||||||
|
[ -~]* abc y & abc
|
||||||
|
[ -~ -~]* abc y & abc
|
||||||
|
[ -~ -~ -~]* abc y & abc
|
||||||
|
[ -~ -~ -~ -~]* abc y & abc
|
||||||
|
[ -~ -~ -~ -~ -~]* abc y & abc
|
||||||
|
[ -~ -~ -~ -~ -~ -~]* abc y & abc
|
||||||
|
[ -~ -~ -~ -~ -~ -~ -~]* abc y & abc
|
||||||
|
[a-zA-Z_][a-zA-Z0-9_]* alpha y & alpha
|
||||||
|
^a(bc+|b[eh])g|.h$ abh y &-\1 bh-
|
||||||
|
(bc+d$|ef*g.|h?i(j|k)) effgz y &-\1-\2 effgz-effgz-
|
||||||
|
(bc+d$|ef*g.|h?i(j|k)) ij y &-\1-\2 ij-ij-j
|
||||||
|
(bc+d$|ef*g.|h?i(j|k)) effg n - -
|
||||||
|
(bc+d$|ef*g.|h?i(j|k)) bcdd n - -
|
||||||
|
(bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz-
|
||||||
|
((((((((((a))))))))) - c - -
|
||||||
|
((((((((((a)))))))))) a y &-\10 a-a
|
||||||
|
(((((((((a))))))))) a y & a
|
||||||
|
multiple words of text uh-uh n - -
|
||||||
|
multiple words multiple words, yeah y & multiple words
|
||||||
|
(.*)c(.*) abcde y &-\1-\2 abcde-ab-de
|
||||||
|
\((.*), (.*)\) (a, b) y (\2, \1) (b, a)
|
||||||
|
(we|wee|week)(knights|night) weeknights y &-\1-\2 weeknights-wee-knights
|
||||||
|
(a([^a])*)* abcaBC y &-\1-\2 abcaBC-aBC-C
|
||||||
|
a([\/\\]*)b a//\\b y &-\1 a//\\b-//\\
|
214
tests/regexp-tests.scm
Normal file
214
tests/regexp-tests.scm
Normal file
|
@ -0,0 +1,214 @@
|
||||||
|
|
||||||
|
(import (chibi) (chibi regexp) (chibi regexp pcre)
|
||||||
|
(chibi string) (chibi io) (chibi match) (chibi test))
|
||||||
|
|
||||||
|
(define (regexp-match->sexp rx str . o)
|
||||||
|
(let ((res (apply regexp-match rx str o)))
|
||||||
|
(and res (rx-match->sexp res str))))
|
||||||
|
|
||||||
|
(define-syntax test-re
|
||||||
|
(syntax-rules ()
|
||||||
|
((test-re res rx str start end)
|
||||||
|
(test res (regexp-match->sexp rx str start end)))
|
||||||
|
((test-re res rx str start)
|
||||||
|
(test-re res rx str start (string-cursor-end str)))
|
||||||
|
((test-re res rx str)
|
||||||
|
(test-re res rx str (string-cursor-start str)))))
|
||||||
|
|
||||||
|
(define (regexp-search->sexp rx str . o)
|
||||||
|
(let ((res (apply regexp-search rx str o)))
|
||||||
|
(and res (rx-match->sexp res str))))
|
||||||
|
|
||||||
|
(define-syntax test-re-search
|
||||||
|
(syntax-rules ()
|
||||||
|
((test-re-search res rx str start end)
|
||||||
|
(test res (regexp-search->sexp rx str start end)))
|
||||||
|
((test-re-search res rx str start)
|
||||||
|
(test-re-search res rx str start (string-cursor-end str)))
|
||||||
|
((test-re-search res rx str)
|
||||||
|
(test-re-search res rx str (string-cursor-start str)))))
|
||||||
|
|
||||||
|
(test-begin "regexp")
|
||||||
|
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: ($ (* "ab")) "c")
|
||||||
|
"ababc")
|
||||||
|
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: ($ (* "ab")) "c")
|
||||||
|
"xababc"
|
||||||
|
1)
|
||||||
|
|
||||||
|
(test-re-search '("y") '(: "y") "xy")
|
||||||
|
|
||||||
|
(test-re-search '("ababc" "abab")
|
||||||
|
'(: ($ (* "ab")) "c")
|
||||||
|
"xababc")
|
||||||
|
|
||||||
|
(test-re #f
|
||||||
|
'(: (* any) ($ "foo" (* any)) ($ "bar" (* any)))
|
||||||
|
"fooxbafba")
|
||||||
|
|
||||||
|
(test-re '("fooxbarfbar" "fooxbarf" "bar")
|
||||||
|
'(: (* any) ($ "foo" (* any)) ($ "bar" (* any)))
|
||||||
|
"fooxbarfbar")
|
||||||
|
|
||||||
|
(test-re '("abcd" "abcd")
|
||||||
|
'($ (* (or "ab" "cd")))
|
||||||
|
"abcd")
|
||||||
|
|
||||||
|
;; first match is a list of ab's, second match is the last (temporary) cd
|
||||||
|
(test-re '("abcdc" (("ab") ("cd")) "cd")
|
||||||
|
'(: (* (*$ (or "ab" "cd"))) "c")
|
||||||
|
"abcdc")
|
||||||
|
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: bos ($ (* "ab")) "c")
|
||||||
|
"ababc")
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: ($ (* "ab")) "c" eos)
|
||||||
|
"ababc")
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: bos ($ (* "ab")) "c" eos)
|
||||||
|
"ababc")
|
||||||
|
(test-re #f
|
||||||
|
'(: bos ($ (* "ab")) eos "c")
|
||||||
|
"ababc")
|
||||||
|
(test-re #f
|
||||||
|
'(: ($ (* "ab")) bos "c" eos)
|
||||||
|
"ababc")
|
||||||
|
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: bol ($ (* "ab")) "c")
|
||||||
|
"ababc")
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: ($ (* "ab")) "c" eol)
|
||||||
|
"ababc")
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: bol ($ (* "ab")) "c" eol)
|
||||||
|
"ababc")
|
||||||
|
(test-re #f
|
||||||
|
'(: bol ($ (* "ab")) eol "c")
|
||||||
|
"ababc")
|
||||||
|
(test-re #f
|
||||||
|
'(: ($ (* "ab")) bol "c" eol)
|
||||||
|
"ababc")
|
||||||
|
(test-re '("\nabc\n" "abc")
|
||||||
|
'(: (* #\newline) bol ($ (* alpha)) eol (* #\newline))
|
||||||
|
"\nabc\n")
|
||||||
|
(test-re #f
|
||||||
|
'(: (* #\newline) bol ($ (* alpha)) eol (* #\newline))
|
||||||
|
"\n'abc\n")
|
||||||
|
(test-re #f
|
||||||
|
'(: (* #\newline) bol ($ (* alpha)) eol (* #\newline))
|
||||||
|
"\nabc.\n")
|
||||||
|
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: bow ($ (* "ab")) "c")
|
||||||
|
"ababc")
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: ($ (* "ab")) "c" eow)
|
||||||
|
"ababc")
|
||||||
|
(test-re '("ababc" "abab")
|
||||||
|
'(: bow ($ (* "ab")) "c" eow)
|
||||||
|
"ababc")
|
||||||
|
(test-re #f
|
||||||
|
'(: bow ($ (* "ab")) eow "c")
|
||||||
|
"ababc")
|
||||||
|
(test-re #f
|
||||||
|
'(: ($ (* "ab")) bow "c" eow)
|
||||||
|
"ababc")
|
||||||
|
(test-re '(" abc " "abc")
|
||||||
|
'(: (* space) bow ($ (* alpha)) eow (* space))
|
||||||
|
" abc ")
|
||||||
|
(test-re #f
|
||||||
|
'(: (* space) bow ($ (* alpha)) eow (* space))
|
||||||
|
" 'abc ")
|
||||||
|
(test-re #f
|
||||||
|
'(: (* space) bow ($ (* alpha)) eow (* space))
|
||||||
|
" abc. ")
|
||||||
|
|
||||||
|
(test-re '("beef")
|
||||||
|
'(* (/"af"))
|
||||||
|
"beef")
|
||||||
|
|
||||||
|
(test-re '("12345beef" "beef")
|
||||||
|
'(: (* digit) ($ (* (/"af"))))
|
||||||
|
"12345beef")
|
||||||
|
|
||||||
|
(test-re '("12345BeeF" "BeeF")
|
||||||
|
'(: (* digit) (w/nocase ($ (* (/"af")))))
|
||||||
|
"12345BeeF")
|
||||||
|
|
||||||
|
(test '("123" "456" "789") (regexp-extract '(+ digit) "abc123def456ghi789"))
|
||||||
|
(test '("123" "456" "789") (regexp-extract '(* digit) "abc123def456ghi789"))
|
||||||
|
(test '("abc" "def" "ghi") (regexp-split '(+ digit) "abc123def456ghi789"))
|
||||||
|
(test '("a" "b" "c" "d" "e" "f" "g" "h" "i")
|
||||||
|
(regexp-split '(* digit) "abc123def456ghi789"))
|
||||||
|
|
||||||
|
(test "abc def" (regexp-replace '(+ space) "abc \t\n def" " "))
|
||||||
|
(test " abc-abc"
|
||||||
|
(regexp-replace '(: ($ (+ alpha)) ":" (* space)) " abc: " '(1 "-" 1)))
|
||||||
|
(test " abc- abc"
|
||||||
|
(regexp-replace '(: ($ (+ alpha)) ":" (* space)) " abc: " '(1 "-" pre 1)))
|
||||||
|
|
||||||
|
(test " abc d ef " (regexp-replace-all '(+ space) " abc \t\n d ef " " "))
|
||||||
|
|
||||||
|
(define (subst-matches matches input subst)
|
||||||
|
(define (submatch n)
|
||||||
|
(rx-match-submatch matches input n))
|
||||||
|
(and
|
||||||
|
matches
|
||||||
|
(call-with-output-string
|
||||||
|
(lambda (out)
|
||||||
|
(call-with-input-string subst
|
||||||
|
(lambda (in)
|
||||||
|
(let lp ()
|
||||||
|
(let ((c (read-char in)))
|
||||||
|
(cond
|
||||||
|
((not (eof-object? c))
|
||||||
|
(case c
|
||||||
|
((#\&)
|
||||||
|
(display (or (submatch 0) "") out))
|
||||||
|
((#\\)
|
||||||
|
(let ((c (read-char in)))
|
||||||
|
(if (char-numeric? c)
|
||||||
|
(let lp ((res (list c)))
|
||||||
|
(if (and (char? (peek-char in))
|
||||||
|
(char-numeric? (peek-char in)))
|
||||||
|
(lp (cons (read-char in) res))
|
||||||
|
(display
|
||||||
|
(or (submatch (string->number
|
||||||
|
(list->string (reverse res))))
|
||||||
|
"")
|
||||||
|
out)))
|
||||||
|
(write-char c out))))
|
||||||
|
(else
|
||||||
|
(write-char c out)))
|
||||||
|
(lp)))))))))))
|
||||||
|
|
||||||
|
(define (test-pcre line)
|
||||||
|
(match (string-split line #\tab)
|
||||||
|
((pattern input result subst output)
|
||||||
|
(let ((name (string-append pattern " " input " " result " " subst)))
|
||||||
|
(cond
|
||||||
|
((equal? "c" result)
|
||||||
|
(test-error name (regexp-search (pcre->sre pattern) input)))
|
||||||
|
((equal? "n" result)
|
||||||
|
(test-assert name (not (regexp-search (pcre->sre pattern) input))))
|
||||||
|
(else
|
||||||
|
(test name output
|
||||||
|
(subst-matches (regexp-search (pcre->sre pattern) input)
|
||||||
|
input
|
||||||
|
subst))))))
|
||||||
|
(else
|
||||||
|
(error "invalid regex test line" line))))
|
||||||
|
|
||||||
|
(test-group "pcre"
|
||||||
|
(call-with-input-file "tests/re-tests.txt"
|
||||||
|
(lambda (in)
|
||||||
|
(for-each
|
||||||
|
(lambda (line) (test-pcre line))
|
||||||
|
(port->list read-line in)))))
|
||||||
|
|
||||||
|
(test-end)
|
Loading…
Add table
Reference in a new issue