mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-05-18 21:29:19 +02:00
1248 lines
50 KiB
Scheme
1248 lines
50 KiB
Scheme
;; regexp.scm -- simple non-bactracking NFA implementation
|
|
;; Copyright (c) 2013-2016 Alex Shinn. All rights reserved.
|
|
;; BSD-style license: http://synthcode.com/license.txt
|
|
|
|
;;; An rx represents a start state and meta-info such as the number
|
|
;;; and names of submatches.
|
|
(define-record-type Rx
|
|
(make-rx start-state num-matches num-save-indexes non-greedy-indexes
|
|
match-rules match-names sre)
|
|
regexp?
|
|
(start-state rx-start-state rx-start-state-set!)
|
|
(num-matches rx-num-matches rx-num-matches-set!)
|
|
(num-save-indexes rx-num-save-indexes rx-num-save-indexes-set!)
|
|
(non-greedy-indexes rx-non-greedy-indexes rx-non-greedy-indexes-set!)
|
|
(match-rules rx-rules rx-rules-set!)
|
|
(match-names rx-names rx-names-set!)
|
|
(sre regexp->sre))
|
|
|
|
;; Syntactic sugar.
|
|
(define-syntax rx
|
|
(syntax-rules ()
|
|
((rx sre ...)
|
|
(regexp `(: sre ...)))))
|
|
|
|
;;; A state is a single nfa state with transition rules.
|
|
(define-record-type State
|
|
(%make-state accept? chars match match-rule next1 next2 id)
|
|
state?
|
|
;; A boolean indicating if this is an accepting state.
|
|
(accept? state-accept? state-accept?-set!)
|
|
;; A char or char-set indicating when we can transition.
|
|
;; Alternately, #f indicates an epsilon transition, while a
|
|
;; procedure is a guarded epsilon transition which advances
|
|
;; only if the procedure returns a true value. The signature
|
|
;; is of the form (proc str i ch start end matches).
|
|
(chars state-chars state-chars-set!)
|
|
;; A single integer indicating the match position to record.
|
|
(match state-match state-match-set!)
|
|
;; The rule for merging ambiguous matches. Can be any of: left,
|
|
;; right, (list i j). Posix semantics are equivalent to using left
|
|
;; for the beginning of a submatch and right for the end. List is
|
|
;; used to capture a list of submatch data in the current match.
|
|
(match-rule state-match-rule state-match-rule-set!)
|
|
;; The destination if the char match succeeds.
|
|
(next1 state-next1 state-next1-set!)
|
|
;; An optional additional transition used for forking to two states.
|
|
(next2 state-next2 state-next2-set!)
|
|
;; A unique (per regexp) id for debugging.
|
|
(id state-id))
|
|
|
|
(define (make-state accept? chars match match-rule next1 next2 id)
|
|
(if (and next1 (not (state? next1)))
|
|
(error "expected a state" next1))
|
|
(if (and next2 (not (state? next2)))
|
|
(error "expected a state" next2))
|
|
(%make-state accept? chars match match-rule next1 next2 id))
|
|
|
|
(define ~none 0)
|
|
(define ~ci? 1)
|
|
(define ~ascii? 2)
|
|
(define ~nocapture? 4)
|
|
|
|
(define (flag-set? flags i) (= i (bitwise-and flags i)))
|
|
(define (flag-join a b) (if b (bitwise-ior a b) a))
|
|
(define (flag-clear a b) (bitwise-and a (bitwise-not b)))
|
|
|
|
(define (char-set-ci cset)
|
|
(char-set-fold
|
|
(lambda (ch res)
|
|
(char-set-adjoin! (char-set-adjoin! res (char-upcase ch))
|
|
(char-downcase ch)))
|
|
(char-set)
|
|
cset))
|
|
|
|
(define (make-char-state ch flags next id)
|
|
(if (flag-set? flags ~ci?)
|
|
(let ((cset (cond ((char? ch) (char-set-ci (char-set ch)))
|
|
((char-set? ch) (char-set-ci ch))
|
|
(else ch))))
|
|
(make-state #f cset #f #f next #f id))
|
|
(make-state #f ch #f #f next #f id)))
|
|
(define (make-fork-state next1 next2 id)
|
|
(make-state #f #f #f #f next1 next2 id))
|
|
(define (make-epsilon-state next id)
|
|
(make-fork-state next #f id))
|
|
(define (make-accept-state id)
|
|
(make-state #t #f #f #f #f #f id))
|
|
|
|
;; A record holding the current match data - essentially a wrapper
|
|
;; around a vector, plus a reference to the RX for meta-info.
|
|
(define-record-type Regexp-Match
|
|
(%make-regexp-match matches rx string)
|
|
regexp-match?
|
|
(matches regexp-match-matches regexp-match-matches-set!)
|
|
(rx regexp-match-rx)
|
|
(string regexp-match-string))
|
|
|
|
(define (regexp-match-rules md)
|
|
(rx-rules (regexp-match-rx md)))
|
|
(define (regexp-match-names md)
|
|
(rx-names (regexp-match-rx md)))
|
|
(define (make-regexp-match len rx str)
|
|
(%make-regexp-match (make-vector len #f) rx str))
|
|
(define (make-regexp-match-for-rx rx str)
|
|
(make-regexp-match (rx-num-save-indexes rx) rx str))
|
|
(define (regexp-match-count md)
|
|
(- (quotient (vector-length (regexp-match-matches md)) 2) 1))
|
|
|
|
(define (regexp-match-name-offset md name)
|
|
(let lp ((ls (regexp-match-names md)) (first #f))
|
|
(cond
|
|
((null? ls) (or first (error "unknown match name" md name)))
|
|
((eq? name (caar ls))
|
|
(if (regexp-match-submatch-start+end md (cdar ls))
|
|
(cdar ls)
|
|
(lp (cdr ls) (or first (cdar ls)))))
|
|
(else (lp (cdr ls) first)))))
|
|
|
|
(define (regexp-match-ref md n)
|
|
(vector-ref (regexp-match-matches md)
|
|
(if (integer? n)
|
|
n
|
|
(regexp-match-name-offset md n))))
|
|
|
|
(define (regexp-match-set! md n val)
|
|
(vector-set! (regexp-match-matches md) n val))
|
|
|
|
(define (copy-regexp-match md)
|
|
(let* ((src (regexp-match-matches md))
|
|
(len (vector-length src))
|
|
(dst (make-vector len #f)))
|
|
(do ((i 0 (+ i 1)))
|
|
((= i len)
|
|
(%make-regexp-match dst (regexp-match-rx md) (regexp-match-string md)))
|
|
(vector-set! dst i (vector-ref src i)))))
|
|
|
|
;;> Returns the matching result for the given named or indexed
|
|
;;> submatch \var{n}, possibly as a list for a submatch-list, or
|
|
;;> \scheme{#f} if not matched.
|
|
|
|
(define (regexp-match-submatch/list md n)
|
|
(let ((n (if (integer? n) n (regexp-match-name-offset md n))))
|
|
(cond
|
|
((>= n (vector-length (regexp-match-rules md)))
|
|
#f)
|
|
(else
|
|
(let ((rule (vector-ref (regexp-match-rules md) n)))
|
|
(cond
|
|
((pair? rule)
|
|
(let ((start (regexp-match-ref md (car rule)))
|
|
(end (regexp-match-ref md (cdr rule)))
|
|
(str (regexp-match-string md)))
|
|
(and start end (substring-cursor str start end))))
|
|
(else
|
|
(let ((res (regexp-match-ref md rule)))
|
|
(if (pair? res)
|
|
(reverse res)
|
|
res)))))))))
|
|
|
|
;;> Returns the matching substring for the given named or indexed
|
|
;;> submatch \var{n}, or \scheme{#f} if not matched.
|
|
|
|
(define (regexp-match-submatch md n)
|
|
(let ((res (regexp-match-submatch/list md n)))
|
|
(if (pair? res) (car res) res)))
|
|
|
|
(define (regexp-match-submatch-start+end md n)
|
|
(let ((n (if (integer? n) n (regexp-match-name-offset md n))))
|
|
(and (< n (vector-length (regexp-match-rules md)))
|
|
(let ((rule (vector-ref (regexp-match-rules md) n)))
|
|
(if (pair? rule)
|
|
(let ((start (regexp-match-ref md (car rule)))
|
|
(end (regexp-match-ref md (cdr rule)))
|
|
(str (regexp-match-string md)))
|
|
(and start end
|
|
(cons (string-cursor->index str start)
|
|
(string-cursor->index str end))))
|
|
#f)))))
|
|
|
|
;;> Returns the start index for the given named or indexed submatch
|
|
;;> \var{n}, or \scheme{#f} if not matched.
|
|
|
|
(define (regexp-match-submatch-start md n)
|
|
(cond ((regexp-match-submatch-start+end md n) => car) (else #f)))
|
|
|
|
;;> Returns the end index for the given named or indexed submatch
|
|
;;> \var{n}, or \scheme{#f} if not matched.
|
|
|
|
(define (regexp-match-submatch-end md n)
|
|
(cond ((regexp-match-submatch-start+end md n) => cdr) (else #f)))
|
|
|
|
(define (regexp-match-convert recurse? md str)
|
|
(cond
|
|
((vector? md)
|
|
(let lp ((i 0) (res '()))
|
|
(cond
|
|
((>= i (vector-length md))
|
|
(reverse res))
|
|
((string-cursor? (vector-ref md i))
|
|
(lp (+ i 2)
|
|
(cons (substring-cursor str
|
|
(vector-ref md i)
|
|
(vector-ref md (+ i 1)))
|
|
res)))
|
|
(else
|
|
(lp (+ i 1)
|
|
(cons (regexp-match-convert recurse? (vector-ref md i) str)
|
|
res))))))
|
|
((list? md)
|
|
(if recurse?
|
|
(map (lambda (x) (regexp-match-convert recurse? x str)) (reverse md))
|
|
(regexp-match-convert recurse? (car md) str)))
|
|
((and (pair? md) (string-cursor? (car md)) (string-cursor? (cdr md)))
|
|
(substring-cursor str (car md) (cdr md)))
|
|
((regexp-match? md)
|
|
(regexp-match-convert
|
|
recurse? (regexp-match-matches md) (regexp-match-string md)))
|
|
(else
|
|
md)))
|
|
|
|
;;> Convert an regexp-match result to a list of submatches, beginning
|
|
;;> with the full match, using \scheme{#f} for unmatched submatches.
|
|
|
|
(define (regexp-match->list md)
|
|
(regexp-match-convert #f md #f))
|
|
|
|
;;> Convert an regexp-match result to a forest of submatches, beginning
|
|
;;> with the full match, using \scheme{#f} for unmatched submatches.
|
|
|
|
(define (regexp-match->sexp md)
|
|
(regexp-match-convert #t md #f))
|
|
|
|
;; Collect results from a list match.
|
|
(define (match-collect md spec)
|
|
(define (match-extract md n)
|
|
(let* ((vec (regexp-match-matches md))
|
|
(rules (regexp-match-rules md))
|
|
(n-rule (vector-ref rules n))
|
|
(rule (vector-ref rules n-rule)))
|
|
(if (pair? rule)
|
|
(let ((start (regexp-match-ref md (car rule)))
|
|
(end (regexp-match-ref md (cdr rule))))
|
|
(and start end (cons start end)))
|
|
(regexp-match-ref md rule))))
|
|
(let ((end (cadr spec))
|
|
(vec (regexp-match-matches md)))
|
|
(let lp ((i (+ 1 (car spec)))
|
|
(ls '()))
|
|
(if (>= i end)
|
|
(reverse ls)
|
|
(lp (+ i 1) (cons (match-extract md i) ls))))))
|
|
|
|
;; A searcher represents a single rx state and match information.
|
|
(define-record-type Searcher
|
|
(make-searcher state matches)
|
|
searcher?
|
|
(state searcher-state searcher-state-set!)
|
|
(matches searcher-matches searcher-matches-set!))
|
|
|
|
;; Merge two regexp-matches, preferring the leftmost-longest of their
|
|
;; matches, or shortest for non-greedy matches.
|
|
(define (regexp-match>=? m1 m2)
|
|
(let ((non-greedy-indexes (rx-non-greedy-indexes (regexp-match-rx m1)))
|
|
(end (- (vector-length (regexp-match-matches m1)) 1)))
|
|
(let lp ((i 0))
|
|
(cond
|
|
((>= i end)
|
|
#t)
|
|
((and (eqv? (regexp-match-ref m1 i)
|
|
(regexp-match-ref m2 i))
|
|
(eqv? (regexp-match-ref m1 (+ i 1))
|
|
(regexp-match-ref m2 (+ i 1))))
|
|
(lp (+ i 2)))
|
|
(else
|
|
(not
|
|
(and (string-cursor? (regexp-match-ref m2 i))
|
|
(or (not (string-cursor? (regexp-match-ref m1 i)))
|
|
(string-cursor<? (regexp-match-ref m2 i)
|
|
(regexp-match-ref m1 i))
|
|
;; sanity check for incompletely advanced epsilons
|
|
(and (string-cursor? (regexp-match-ref m1 (+ i 1)))
|
|
(string-cursor<? (regexp-match-ref m1 (+ i 1))
|
|
(regexp-match-ref m1 i)))
|
|
((if (memq (+ i 1) non-greedy-indexes) not values)
|
|
(and
|
|
(string-cursor=? (regexp-match-ref m2 i)
|
|
(regexp-match-ref m1 i))
|
|
(or (not (string-cursor? (regexp-match-ref m2 (+ i 1))))
|
|
(and (string-cursor? (regexp-match-ref m1 (+ i 1)))
|
|
(string-cursor>?
|
|
(regexp-match-ref m2 (+ i 1))
|
|
(regexp-match-ref m1 (+ i 1)))))))))))))))
|
|
|
|
(define (regexp-match-max m1 m2)
|
|
(if (regexp-match>=? m1 m2) m1 m2))
|
|
|
|
;; Merge match data from sr2 into sr1, preferring the leftmost-longest
|
|
;; match in the event of a conflict.
|
|
(define (searcher-merge! sr1 sr2)
|
|
(let ((m (regexp-match-max (searcher-matches sr1) (searcher-matches sr2))))
|
|
(if (not (eq? m (searcher-matches sr1)))
|
|
(searcher-matches-set! sr1 (copy-regexp-match m)))))
|
|
|
|
(define (searcher>=? sr1 sr2)
|
|
(or (not (searcher? sr2))
|
|
(regexp-match>=? (searcher-matches sr1) (searcher-matches sr2))))
|
|
|
|
(define (searcher-start-match sr)
|
|
(regexp-match-ref (searcher-matches sr) 0))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; A posse is a group of searchers.
|
|
|
|
(define (make-posse . o)
|
|
(make-hash-table eq?))
|
|
|
|
(define posse? hash-table?)
|
|
(define (posse-empty? posse) (zero? (hash-table-size posse)))
|
|
|
|
(define (posse-ref posse sr)
|
|
(hash-table-ref/default posse (searcher-state sr) #f))
|
|
(define (posse-add! posse sr)
|
|
(hash-table-set! posse (searcher-state sr) sr))
|
|
(define (posse-clear! posse)
|
|
(hash-table-walk posse (lambda (key val) (hash-table-delete! posse key))))
|
|
(define (posse-for-each proc posse)
|
|
(hash-table-walk posse (lambda (key val) (proc val))))
|
|
(define (posse-every pred posse)
|
|
(hash-table-fold posse (lambda (key val acc) (and acc (pred val))) #t))
|
|
|
|
(define (posse->list posse)
|
|
(hash-table-values posse))
|
|
(define (list->posse ls)
|
|
(let ((searchers (make-posse)))
|
|
(for-each (lambda (sr) (posse-add! searchers sr)) ls)
|
|
searchers))
|
|
(define (posse . args)
|
|
(list->posse args))
|
|
|
|
(define (make-start-searcher rx str)
|
|
(make-searcher (rx-start-state rx) (make-regexp-match-for-rx rx str)))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Execution
|
|
|
|
;; The intermediate state of a regexp search. Differs from a match in that a
|
|
;; match has not necessarily occurred, and includes additional information
|
|
;; needed to resume searching.
|
|
|
|
(define-record-type Regexp-State
|
|
(%make-regexp-state searchers accept string)
|
|
regexp-state?
|
|
(searchers regexp-state-searchers regexp-state-searchers-set!)
|
|
(accept regexp-state-accept regexp-state-accept-set!)
|
|
(string regexp-state-string regexp-state-string-set!))
|
|
|
|
(define (make-regexp-state . o)
|
|
(let ((searchers (if (pair? o) (car o) (posse)))
|
|
(accept (and (pair? o) (pair? (cdr o)) (cadr o))))
|
|
(%make-regexp-state searchers accept #f)))
|
|
|
|
(define (regexp-state-matches state)
|
|
(cond ((regexp-state-accept state) => searcher-matches)
|
|
(else #f)))
|
|
|
|
;; A transition which doesn't advance the index.
|
|
|
|
(define (epsilon-state? st)
|
|
(or (not (state-chars st))
|
|
(procedure? (state-chars st))))
|
|
|
|
;; Match the state against a char and index.
|
|
|
|
(define (state-matches? st str i ch start end matches)
|
|
(let ((matcher (state-chars st)))
|
|
(cond
|
|
((char? matcher)
|
|
(eqv? matcher ch))
|
|
((char-set? matcher)
|
|
(char-set-contains? matcher ch))
|
|
((pair? matcher)
|
|
(and (char<=? (car matcher) ch) (char<=? ch (cdr matcher))))
|
|
((procedure? matcher)
|
|
(matcher str i ch start end matches))
|
|
((not matcher))
|
|
(else
|
|
(error "unknown state matcher" (state-chars st))))))
|
|
|
|
;; Advance epsilons together - if the State is newly added to the
|
|
;; group and is an epsilon state, recursively add the transition.
|
|
|
|
(define (posse-advance! new seen state sr str i start end)
|
|
(let advance! ((sr sr))
|
|
(let ((st (searcher-state sr)))
|
|
;; Update match data.
|
|
(cond
|
|
((state-match st)
|
|
(let* ((index (state-match st))
|
|
(matches (searcher-matches sr))
|
|
(before (copy-regexp-match matches)))
|
|
(cond
|
|
((pair? index)
|
|
;; Submatch list, accumulate and push.
|
|
(let* ((prev (regexp-match-ref matches (car index)))
|
|
(new (cons (match-collect matches (cdr index))
|
|
(if (pair? prev) prev '()))))
|
|
(regexp-match-set! matches (car index) new)))
|
|
((not (and (eq? 'non-greedy-left (state-match-rule st))
|
|
(regexp-match-ref matches index)
|
|
(string-cursor>=? (regexp-match-ref matches index)
|
|
(regexp-match-ref matches (- index 1)))))
|
|
(regexp-match-set! matches index i))))))
|
|
;; Follow transitions.
|
|
(cond
|
|
((state-accept? st)
|
|
(cond
|
|
((searcher>=? sr (regexp-state-accept state))
|
|
(regexp-state-accept-set! state sr)
|
|
(regexp-state-string-set! state str))))
|
|
((posse-ref seen sr)
|
|
=> (lambda (sr-prev) (searcher-merge! sr-prev sr)))
|
|
((epsilon-state? st)
|
|
(let ((ch (and (string-cursor<? i end) (string-cursor-ref str i))))
|
|
;; Epsilon transition. If there is a procedure matcher,
|
|
;; it's a guarded epsilon and needs to be checked.
|
|
(cond
|
|
((state-matches? st str i ch start end (searcher-matches sr))
|
|
(posse-add! seen sr)
|
|
(let* ((next1 (state-next1 st))
|
|
(next2 (state-next2 st))
|
|
(matches (and next2 (searcher-matches sr))))
|
|
(cond
|
|
(next1
|
|
(searcher-state-set! sr next1)
|
|
(advance! (make-searcher next1 (copy-regexp-match (searcher-matches sr))))))
|
|
(cond
|
|
(next2
|
|
(let ((sr2 (make-searcher next2 (copy-regexp-match matches))))
|
|
(advance! sr2)))))))))
|
|
;; Non-special, non-epsilon searcher, add to posse.
|
|
((posse-ref new sr)
|
|
;; Merge regexp-match for existing searcher.
|
|
=> (lambda (sr-prev) (searcher-merge! sr-prev sr)))
|
|
(else
|
|
;; Add new searcher.
|
|
(posse-add! new sr))))))
|
|
|
|
;;> Advances the search until an optimal match is found or the end of the string
|
|
;;> is reached, and returns the resulting regexp state.
|
|
(define (regexp-advance! search? init? rx str start end . o)
|
|
(let ((rx (regexp rx))
|
|
(state (if (pair? o) (car o) (make-regexp-state)))
|
|
(epsilons (posse)))
|
|
(let lp ((i start)
|
|
(searchers1 (posse))
|
|
(searchers2 (posse)))
|
|
;; Advance initial epsilons once from the first index, or every
|
|
;; time when searching.
|
|
(cond
|
|
((or search? (and init? (string-cursor=? i start)))
|
|
(posse-advance! searchers1 epsilons state (make-start-searcher rx str)
|
|
str i start end)
|
|
(posse-clear! epsilons)))
|
|
(cond
|
|
((or (string-cursor>=? i end)
|
|
(and search?
|
|
(searcher? (regexp-state-accept state))
|
|
(let ((accept-start (searcher-start-match (regexp-state-accept state))))
|
|
(posse-every
|
|
(lambda (searcher)
|
|
(string-cursor>? (searcher-start-match searcher)
|
|
accept-start))
|
|
searchers1)))
|
|
(and (not search?)
|
|
(posse-empty? searchers1)))
|
|
;; Terminate when the string is done or there are no more searchers or
|
|
;; we've found an accept state which started before any pending matches.
|
|
;; If we terminate prematurely and are not searching, return false.
|
|
(regexp-state-searchers-set! state searchers1)
|
|
state)
|
|
(else
|
|
;; Otherwise advance normally from searchers1, storing the new state in
|
|
;; searchers2, and recurse swapping the two (to reduce garbage).
|
|
(let ((ch (string-cursor-ref str i))
|
|
(i2 (string-cursor-next str i)))
|
|
(posse-for-each ;; NOTE: non-deterministic from hash order
|
|
(lambda (sr)
|
|
(cond
|
|
((state-matches? (searcher-state sr) str i ch
|
|
start end (searcher-matches sr))
|
|
(searcher-state-set! sr (state-next1 (searcher-state sr)))
|
|
;; Epsilons are considered at the next position.
|
|
(posse-advance! searchers2 epsilons state sr str i2 start end)
|
|
(posse-clear! epsilons))))
|
|
searchers1)
|
|
(posse-clear! searchers1)
|
|
(lp i2 searchers2 searchers1)))))))
|
|
|
|
;; Run so long as there is more to match.
|
|
|
|
(define (regexp-run-offsets search? rx str start end)
|
|
(let ((state (regexp-advance! search? #t rx str start end)))
|
|
(and (searcher? (regexp-state-accept state))
|
|
(let ((matches (searcher-matches (regexp-state-accept state))))
|
|
(and (or search? (string-cursor>=? (regexp-match-ref matches 1) end))
|
|
matches)))))
|
|
|
|
;; Wrapper to determine start and end offsets.
|
|
|
|
(define (regexp-run search? rx str . o)
|
|
(let ((start (string-start-arg str o))
|
|
(end (string-end-arg str (if (pair? o) (cdr o) o))))
|
|
(regexp-run-offsets search? rx str start end)))
|
|
|
|
;;> Match the given regexp or SRE against the entire string and return
|
|
;;> the match data on success. Returns \scheme{#f} on failure.
|
|
|
|
(define (regexp-matches rx str . o)
|
|
(apply regexp-run #f rx str o))
|
|
|
|
;;> Match the given regexp or SRE against the entire string and return
|
|
;;> the \scheme{#t} on success. Returns \scheme{#f} on failure.
|
|
|
|
(define (regexp-matches? rx str . o)
|
|
(and (apply regexp-matches rx str o) #t))
|
|
|
|
;;> Search for the given regexp or SRE within string and return
|
|
;;> the match data on success. Returns \scheme{#f} on failure.
|
|
|
|
(define (regexp-search rx str . o)
|
|
(apply regexp-run #t rx str o))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Compiling
|
|
|
|
(define (parse-flags ls)
|
|
(define (symbol->flag s)
|
|
(case s ((i ci case-insensitive) ~ci?) (else ~none)))
|
|
(let lp ((ls ls) (res ~none))
|
|
(if (not (pair? ls))
|
|
res
|
|
(lp (cdr ls) (flag-join res (symbol->flag (car ls)))))))
|
|
|
|
(define char-set:nonl
|
|
(char-set-difference char-set:full (char-set #\newline)))
|
|
(define char-set:control (ucs-range->char-set 0 32))
|
|
(define char-set:word-constituent
|
|
(char-set-union char-set:letter char-set:digit (char-set #\_)))
|
|
(define %char-set:word-constituent
|
|
(char-set-union %char-set:letter %char-set:digit (char-set #\_)))
|
|
(define (char-word-constituent? ch)
|
|
(char-set-contains? char-set:word-constituent ch))
|
|
(define get-char-set:cased
|
|
(let ((char-set:cased #f))
|
|
(lambda ()
|
|
(if (not char-set:cased)
|
|
(set! char-set:cased
|
|
(char-set-union char-set:upper-case
|
|
char-set:lower-case
|
|
char-set:title-case)))
|
|
char-set:cased)))
|
|
|
|
(define (match/bos str i ch start end matches)
|
|
(string-cursor=? i start))
|
|
(define (match/eos str i ch start end matches)
|
|
(string-cursor>=? i end))
|
|
(define (match/bol str i ch start end matches)
|
|
(or (string-cursor=? i start)
|
|
(eqv? #\newline (string-cursor-ref str (string-cursor-prev str i)))))
|
|
(define (match/eol str i ch start end matches)
|
|
(or (string-cursor>=? i end)
|
|
(eqv? #\newline (string-cursor-ref str i))))
|
|
(define (match/bow str i ch start end matches)
|
|
(and (string-cursor<? i end)
|
|
(or (string-cursor=? i start)
|
|
(not (char-word-constituent?
|
|
(string-cursor-ref str (string-cursor-prev str i)))))
|
|
(char-word-constituent? ch)))
|
|
(define (match/eow str i ch start end matches)
|
|
(and (or (string-cursor>=? i end)
|
|
(not (char-word-constituent? ch)))
|
|
(string-cursor>? i start)
|
|
(char-word-constituent?
|
|
(string-cursor-ref str (string-cursor-prev str i)))))
|
|
(define (match/nwb str i ch start end matches)
|
|
(and (not (match/bow str i ch start end matches))
|
|
(not (match/eow str i ch start end matches))))
|
|
(define (match/bog str i ch start end matches)
|
|
(and (string-cursor<? i end)
|
|
(or (string-cursor=? i start)
|
|
(match/eog str (string-cursor-prev str i) ch start end matches))))
|
|
(define (match/eog str i ch start end matches)
|
|
(and (string-cursor>? i start)
|
|
(or (string-cursor>=? i end)
|
|
(let* ((sci (string-cursor->index str i))
|
|
(sce (string-cursor->index str end))
|
|
(m (regexp-search re:grapheme str sci sce)))
|
|
(and m (<= (regexp-match-submatch-end m 0) sci))))))
|
|
|
|
(define (match/look-ahead sres)
|
|
(let ((rx (regexp `(seq bos ,@sres))))
|
|
(lambda (str i ch start end matches)
|
|
(and (regexp-run-offsets #t rx str i end)
|
|
#t))))
|
|
|
|
(define (match/look-behind sres)
|
|
(let ((rx (regexp `(seq ,@sres eos))))
|
|
(lambda (str i ch start end matches)
|
|
(and (regexp-run-offsets #t rx str start i)
|
|
#t))))
|
|
|
|
(define (match/neg-look-ahead sres)
|
|
(let ((rx (regexp `(seq bos ,@sres))))
|
|
(lambda (str i ch start end matches)
|
|
(not (regexp-run-offsets #t rx str i end)))))
|
|
|
|
(define (match/neg-look-behind sres)
|
|
(let ((rx (regexp `(seq ,@sres eos))))
|
|
(lambda (str i ch start end matches)
|
|
(not (regexp-run-offsets #t rx str start i)))))
|
|
|
|
(define (lookup-char-set name flags)
|
|
(cond
|
|
((flag-set? flags ~ascii?)
|
|
(case name
|
|
((any) char-set:full)
|
|
((nonl) char-set:nonl)
|
|
((lower-case lower)
|
|
(if (flag-set? flags ~ci?) %char-set:letter %char-set:lower-case))
|
|
((upper-case upper)
|
|
(if (flag-set? flags ~ci?) %char-set:letter %char-set:upper-case))
|
|
((title-case title)
|
|
(if (flag-set? flags ~ci?) %char-set:letter (char-set)))
|
|
((alphabetic alpha) %char-set:letter)
|
|
((numeric num digit) %char-set:digit)
|
|
((alphanumeric alphanum alnum) %char-set:letter+digit)
|
|
((punctuation punct) %char-set:punctuation)
|
|
((symbol) %char-set:symbol)
|
|
((graphic graph) %char-set:graphic)
|
|
((word-constituent) %char-set:word-constituent)
|
|
((whitespace white space) %char-set:whitespace)
|
|
((printing print) %char-set:printing)
|
|
((control cntrl) %char-set:iso-control)
|
|
((hex-digit xdigit hex) char-set:hex-digit)
|
|
((ascii) char-set:ascii)
|
|
(else #f)))
|
|
(else
|
|
(case name
|
|
((any) char-set:full)
|
|
((nonl) char-set:nonl)
|
|
((lower-case lower)
|
|
(if (flag-set? flags ~ci?) (get-char-set:cased) char-set:lower-case))
|
|
((upper-case upper)
|
|
(if (flag-set? flags ~ci?) (get-char-set:cased) char-set:upper-case))
|
|
((title-case title)
|
|
(if (flag-set? flags ~ci?) (get-char-set:cased) char-set:title-case))
|
|
((alphabetic alpha) char-set:letter)
|
|
((numeric num digit) char-set:digit)
|
|
((alphanumeric alphanum alnum) char-set:letter+digit)
|
|
((punctuation punct) char-set:punctuation)
|
|
((symbol) char-set:symbol)
|
|
((graphic graph) char-set:graphic)
|
|
((word-constituent) char-set:word-constituent)
|
|
((whitespace white space) char-set:whitespace)
|
|
((printing print) char-set:printing)
|
|
((control cntrl) char-set:control)
|
|
((hex-digit xdigit hex) char-set:hex-digit)
|
|
((ascii) char-set:ascii)
|
|
(else #f)))))
|
|
|
|
(define (sre-flatten-ranges orig-ls)
|
|
(let lp ((ls orig-ls) (res '()))
|
|
(cond
|
|
((null? ls)
|
|
(reverse res))
|
|
((string? (car ls))
|
|
(lp (append (string->list (car ls)) (cdr ls)) res))
|
|
((null? (cdr ls))
|
|
(error "unbalanced cset / range" orig-ls))
|
|
((string? (cadr ls))
|
|
(lp (cons (car ls) (append (string->list (cadr ls)) (cddr ls))) res))
|
|
(else
|
|
(lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))))
|
|
|
|
(define (every pred ls)
|
|
(or (null? ls) (and (pred (car ls)) (every pred (cdr ls)))))
|
|
|
|
(define (char-set-sre? sre)
|
|
(or (char? sre)
|
|
(and (string? sre) (= 1 (string-length sre)))
|
|
(lookup-char-set sre ~none)
|
|
(and (pair? sre)
|
|
(or (string? (car sre))
|
|
(memq (car sre)
|
|
'(char-set / char-range & and ~ complement - difference))
|
|
(and (memq (car sre)
|
|
'(|\|| or w/case w/nocase w/unicode w/ascii))
|
|
(every char-set-sre? (cdr sre)))))))
|
|
|
|
(define (non-greedy-sre? sre)
|
|
(and (pair? sre)
|
|
(or (memq (car sre) '(?? *? **? non-greedy-optional
|
|
non-greedy-zero-or-more non-greedy-repeated))
|
|
(and (memq (car sre) '(: seq w/case w/nocase w/unicode w/ascii))
|
|
(non-greedy-sre? (car (reverse sre))))
|
|
(and (eq? (car sre) 'or)
|
|
(any non-greedy-sre? (cdr sre))))))
|
|
|
|
(define (valid-sre? x)
|
|
(guard (exn (else #f)) (regexp x) #t))
|
|
|
|
(define (sre->char-set sre . o)
|
|
(let ((flags (if (pair? o) (car o) ~none)))
|
|
(define (->cs sre) (sre->char-set sre flags))
|
|
(define (maybe-ci sre)
|
|
(if (flag-set? flags ~ci?) (char-set-ci sre) sre))
|
|
(cond
|
|
((lookup-char-set sre flags))
|
|
((char-set? sre) (maybe-ci sre))
|
|
((char? sre) (maybe-ci (char-set sre)))
|
|
((string? sre)
|
|
(if (= 1 (string-length sre))
|
|
(maybe-ci (string->char-set sre))
|
|
(error "only single char strings can be char-sets")))
|
|
((pair? sre)
|
|
(if (string? (car sre))
|
|
(maybe-ci (string->char-set (car sre)))
|
|
(case (car sre)
|
|
((char-set) (if (null? (cddr sre))
|
|
(maybe-ci (string->char-set (cadr sre)))
|
|
(error "(char-set) takes only one char-set" sre)))
|
|
((/ char-range)
|
|
(->cs
|
|
`(or ,@(map (lambda (x)
|
|
(ucs-range->char-set
|
|
(char->integer (car x))
|
|
(+ 1 (char->integer (cdr x)))))
|
|
(sre-flatten-ranges (cdr sre))))))
|
|
((& and) (apply char-set-intersection (map ->cs (cdr sre))))
|
|
((|\|| or) (apply char-set-union (map ->cs (cdr sre))))
|
|
((~ complement) (char-set-complement (->cs `(or ,@(cdr sre)))))
|
|
((- difference) (char-set-difference (->cs (cadr sre))
|
|
(->cs `(or ,@(cddr sre)))))
|
|
((w/case) (if (null? (cddr sre))
|
|
(sre->char-set (cadr sre) (flag-clear flags ~ci?))
|
|
(error "w/case takes only one char-set" sre)))
|
|
((w/nocase) (if (null? (cddr sre))
|
|
(sre->char-set (cadr sre) (flag-join flags ~ci?))
|
|
(error "w/nocase takes only one char-set" sre)))
|
|
((w/ascii) (if (null? (cddr sre))
|
|
(sre->char-set (cadr sre) (flag-join flags ~ascii?))
|
|
(error "w/ascii takes only one char-set" sre)))
|
|
((w/unicode) (if (null? (cddr sre))
|
|
(sre->char-set (cadr sre) (flag-clear flags ~ascii?))
|
|
(error "w/unicode takes only one char-set" sre)))
|
|
(else (error "invalid sre char-set" sre)))))
|
|
(else (error "invalid sre char-set" sre)))))
|
|
|
|
(define (char-set->sre cset)
|
|
(list (char-set->string cset)))
|
|
|
|
(define (strip-submatches sre)
|
|
(if (pair? sre)
|
|
(case (car sre)
|
|
(($ submatch) (strip-submatches (cons ': (cdr sre))))
|
|
((-> => submatch-named) (strip-submatches (cons ': (cddr sre))))
|
|
(else (cons (strip-submatches (car sre))
|
|
(strip-submatches (cdr sre)))))
|
|
sre))
|
|
|
|
(define (sre-expand-reps from to sre)
|
|
(let ((sre0 (strip-submatches sre)))
|
|
(let lp ((i 0) (res '(:)))
|
|
(if (= i from)
|
|
(cond
|
|
((not to)
|
|
(reverse (cons `(* ,sre) res)))
|
|
((= from to)
|
|
(reverse (cons sre (cdr res))))
|
|
(else
|
|
(let lp ((i (+ i 1)) (res res))
|
|
(if (>= i to)
|
|
(reverse (cons `(? ,sre) res))
|
|
(lp (+ i 1) (cons `(? ,sre0) res))))))
|
|
(lp (+ i 1) (cons sre0 res))))))
|
|
|
|
;;> Compile an \var{sre} into a regexp.
|
|
|
|
(define (regexp sre . o)
|
|
(define current-index 2)
|
|
(define current-match 0)
|
|
(define current-id 0)
|
|
(define match-names '())
|
|
(define match-rules (list (cons 0 1)))
|
|
(define non-greedy-indexes '())
|
|
(define (next-id)
|
|
(let ((res current-id)) (set! current-id (+ current-id 1)) res))
|
|
(define (make-submatch-state sre flags next index)
|
|
(let* ((n3 (make-epsilon-state next (next-id)))
|
|
(n2 (->rx sre flags n3))
|
|
(n1 (make-epsilon-state n2 (next-id)))
|
|
(non-greedy? (non-greedy-sre? sre)))
|
|
(state-match-set! n1 index)
|
|
(state-match-rule-set! n1 'left)
|
|
(state-match-set! n3 (+ index 1))
|
|
(state-match-rule-set! n3 (if non-greedy? 'non-greedy-left 'right))
|
|
(if non-greedy?
|
|
(set! non-greedy-indexes (cons (+ index 1) non-greedy-indexes)))
|
|
n1))
|
|
(define (->rx sre flags next)
|
|
(cond
|
|
;; The base cases chars and strings match literally.
|
|
((char? sre)
|
|
(make-char-state sre flags next (next-id)))
|
|
((char-set? sre)
|
|
(make-char-state sre flags next (next-id)))
|
|
((string? sre)
|
|
(->rx (cons 'seq (string->list sre)) flags next))
|
|
((and (symbol? sre) (lookup-char-set sre flags))
|
|
=> (lambda (cset) (make-char-state cset ~none next (next-id))))
|
|
((symbol? sre)
|
|
(case sre
|
|
((epsilon) next)
|
|
((bos) (make-char-state match/bos flags next (next-id)))
|
|
((eos) (make-char-state match/eos flags next (next-id)))
|
|
((bol) (make-char-state match/bol flags next (next-id)))
|
|
((eol) (make-char-state match/eol flags next (next-id)))
|
|
((bow) (make-char-state match/bow flags next (next-id)))
|
|
((eow) (make-char-state match/eow flags next (next-id)))
|
|
((nwb) (make-char-state match/nwb flags next (next-id)))
|
|
((bog) (make-char-state match/bog flags next (next-id)))
|
|
((eog) (make-char-state match/eog flags next (next-id)))
|
|
((grapheme)
|
|
(->rx
|
|
`(or (: (* ,char-set:hangul-l) (+ ,char-set:hangul-v)
|
|
(* ,char-set:hangul-t))
|
|
(: (* ,char-set:hangul-l) ,char-set:hangul-v
|
|
(* ,char-set:hangul-v) (* ,char-set:hangul-t))
|
|
(: (* ,char-set:hangul-l) ,char-set:hangul-lvt
|
|
(* ,char-set:hangul-t))
|
|
(+ ,char-set:hangul-l)
|
|
(+ ,char-set:hangul-t)
|
|
(+ ,char-set:regional-indicator)
|
|
(: "\r\n")
|
|
(: (~ control ("\r\n"))
|
|
(* ,char-set:extend-or-spacing-mark))
|
|
control)
|
|
flags
|
|
next))
|
|
((word) (->rx '(word+ any) flags next))
|
|
(else (error "unknown sre" sre))))
|
|
((pair? sre)
|
|
(case (car sre)
|
|
((seq :)
|
|
;; Sequencing. An empty sequence jumps directly to next,
|
|
;; otherwise we join the first element to the sequence formed
|
|
;; of the remaining elements followed by next.
|
|
(if (null? (cdr sre))
|
|
next
|
|
;; Make a dummy intermediate to join the states so that
|
|
;; we can generate n1 first, preserving the submatch order.
|
|
(let* ((n2 (make-epsilon-state #f (next-id)))
|
|
(n1 (->rx (cadr sre) flags n2))
|
|
(n3 (->rx (cons 'seq (cddr sre)) flags next)))
|
|
(state-next1-set! n2 n3)
|
|
n1)))
|
|
((or |\||)
|
|
;; Alternation. An empty alternation always fails.
|
|
;; Otherwise we fork between any of the alternations, each
|
|
;; continuing to next.
|
|
(cond
|
|
((null? (cdr sre))
|
|
#f)
|
|
((char-set-sre? sre)
|
|
(make-char-state (sre->char-set sre) flags next (next-id)))
|
|
((null? (cddr sre))
|
|
(->rx (cadr sre) flags next))
|
|
(else
|
|
(let* ((n1 (->rx (cadr sre) flags next))
|
|
(n2 (->rx (cons 'or (cddr sre)) flags next)))
|
|
(make-fork-state n1 n2 (next-id))))))
|
|
((? optional ?? non-greedy-optional)
|
|
;; Optionality. Either match the body or fork to the next
|
|
;; state directly.
|
|
(make-fork-state (->rx (cons 'seq (cdr sre)) flags next)
|
|
next (next-id)))
|
|
((* zero-or-more *? non-greedy-zero-or-more)
|
|
;; Repetition. Introduce two fork states which can jump from
|
|
;; the end of the loop to the beginning and from the
|
|
;; beginning to the end (to skip the first iteration).
|
|
(let* ((n2 (make-fork-state next #f (next-id)))
|
|
(n1 (make-fork-state (->rx (cons 'seq (cdr sre)) flags n2)
|
|
n2 (next-id))))
|
|
(state-next2-set! n2 n1)
|
|
n1))
|
|
((+ one-or-more)
|
|
;; One-or-more repetition. Same as above but the first
|
|
;; transition is required so the rx is simpler - we only
|
|
;; need one fork from the end of the loop to the beginning.
|
|
(let* ((n2 (make-fork-state next #f (next-id)))
|
|
(n1 (->rx (cons 'seq (cdr sre)) flags n2)))
|
|
(state-next2-set! n2 n1)
|
|
n1))
|
|
((= exactly)
|
|
;; Exact repetition.
|
|
(->rx (sre-expand-reps (cadr sre) (cadr sre) (cons 'seq (cddr sre)))
|
|
flags next))
|
|
((>= at-least)
|
|
;; n-or-more repetition.
|
|
(->rx (sre-expand-reps (cadr sre) #f (cons 'seq (cddr sre)))
|
|
flags next))
|
|
((** repeated **? non-greedy-repeated)
|
|
;; n-to-m repetition.
|
|
(->rx (sre-expand-reps (cadr sre) (car (cddr sre))
|
|
(cons 'seq (cdr (cddr sre))))
|
|
flags next))
|
|
((-> => submatch-named)
|
|
;; Named submatches just record the name for the current
|
|
;; match and rewrite as a non-named submatch.
|
|
(cond
|
|
((flag-set? flags ~nocapture?)
|
|
(->rx (cons 'seq (cddr sre)) flags next))
|
|
(else
|
|
(set! match-names
|
|
(cons (cons (cadr sre) (+ 1 current-match)) match-names))
|
|
(->rx (cons 'submatch (cddr sre)) flags next))))
|
|
((*-> *=> submatch-named-list)
|
|
(cond
|
|
((flag-set? flags ~nocapture?)
|
|
(->rx (cons 'seq (cddr sre)) flags next))
|
|
(else
|
|
(set! match-names (cons (cons (cadr sre) current-match) match-names))
|
|
(->rx (cons 'submatch-list (cddr sre)) flags next))))
|
|
(($ submatch)
|
|
;; A submatch wraps next with an epsilon transition before
|
|
;; next, setting the start and end index on the result and
|
|
;; wrapped next respectively.
|
|
(cond
|
|
((flag-set? flags ~nocapture?)
|
|
(->rx (cons 'seq (cdr sre)) flags next))
|
|
(else
|
|
(let ((num current-match)
|
|
(index current-index))
|
|
(set! current-match (+ current-match 1))
|
|
(set! current-index (+ current-index 2))
|
|
(set! match-rules `((,index . ,(+ index 1)) ,@match-rules))
|
|
(make-submatch-state (cons 'seq (cdr sre)) flags next index)))))
|
|
((*$ submatch-list)
|
|
;; A submatch-list wraps a range of submatch results into a
|
|
;; single match value.
|
|
(cond
|
|
((flag-set? flags ~nocapture?)
|
|
(->rx (cons 'seq (cdr sre)) flags next))
|
|
(else
|
|
(let* ((num current-match)
|
|
(index current-index))
|
|
(set! current-match (+ current-match 1))
|
|
(set! current-index (+ current-index 1))
|
|
(set! match-rules `(,index ,@match-rules))
|
|
(let* ((n2 (make-epsilon-state next (next-id)))
|
|
(n1 (->rx (cons 'submatch (cdr sre)) flags n2)))
|
|
(state-match-set! n2 (list index num current-match))
|
|
(state-match-rule-set! n2 'list)
|
|
n1)))))
|
|
((~ - & / complement difference and char-range char-set)
|
|
(make-char-state (sre->char-set sre flags) ~none next (next-id)))
|
|
((word)
|
|
(->rx `(: bow ,@(cdr sre) eow) flags next))
|
|
((word+)
|
|
(->rx `(word (+ ,(if (equal? '(any) (cdr sre))
|
|
'word-constituent
|
|
(char-set-intersection
|
|
char-set:word-constituent
|
|
(sre->char-set `(or ,@(cdr sre)) flags)))))
|
|
flags
|
|
next))
|
|
;; TODO: The look-around assertions are O(n^d) where d is the
|
|
;; nesting depth of the assertions, i.e. quadratic for one
|
|
;; look-ahead, cubic for a look-behind inside a look-ahead,
|
|
;; etc. We could consider instead advancing the look-aheads
|
|
;; together from the current position (and advancing the
|
|
;; look-behinds from the beginning) and checking if the
|
|
;; corresponding state matches. The trick is the look-aheads
|
|
;; don't necessarily have the same length - we have to keep
|
|
;; advancing until they resolve and keep or prune the
|
|
;; corresponding non-look-ahead states accordingly.
|
|
((look-ahead)
|
|
(make-char-state (match/look-ahead (cdr sre)) flags next (next-id)))
|
|
((look-behind)
|
|
(make-char-state (match/look-behind (cdr sre)) flags next (next-id)))
|
|
((neg-look-ahead)
|
|
(make-char-state (match/neg-look-ahead (cdr sre)) flags next (next-id)))
|
|
((neg-look-behind)
|
|
(make-char-state (match/neg-look-behind (cdr sre)) flags next (next-id)))
|
|
((w/case)
|
|
(->rx `(: ,@(cdr sre)) (flag-clear flags ~ci?) next))
|
|
((w/nocase)
|
|
(->rx `(: ,@(cdr sre)) (flag-join flags ~ci?) next))
|
|
((w/unicode)
|
|
(->rx `(: ,@(cdr sre)) (flag-clear flags ~ascii?) next))
|
|
((w/ascii)
|
|
(->rx `(: ,@(cdr sre)) (flag-join flags ~ascii?) next))
|
|
((w/nocapture)
|
|
(->rx `(: ,@(cdr sre)) (flag-join flags ~nocapture?) next))
|
|
(else
|
|
(if (string? (car sre))
|
|
(make-char-state (sre->char-set sre flags) ~none next (next-id))
|
|
(error "unknown sre" sre)))))))
|
|
(let ((flags (parse-flags (and (pair? o) (car o)))))
|
|
(if (regexp? sre)
|
|
sre
|
|
(let ((start (make-submatch-state
|
|
sre flags (make-accept-state (next-id)) 0)))
|
|
;; (define (state->list st)
|
|
;; (let ((seen (make-hash-table eq?))
|
|
;; (count 0))
|
|
;; (reverse
|
|
;; (let lp ((st st) (res '()))
|
|
;; (cond
|
|
;; ((not (state? st)) res)
|
|
;; ((hash-table-ref/default seen st #f) res)
|
|
;; (else
|
|
;; (hash-table-set! seen st count)
|
|
;; (let ((orig-count count))
|
|
;; (set! count (+ count 1))
|
|
;; (let* ((next1 (lp (state-next1 st) '()))
|
|
;; (next2 (lp (state-next2 st) '()))
|
|
;; (this (append
|
|
;; (list (state-id st) ;;orig-count
|
|
;; (cond
|
|
;; ((epsilon-state? st)
|
|
;; (if (state-chars st) '? '-))
|
|
;; ((and (char-set? (state-chars st))
|
|
;; (< (char-set-size (state-chars st)) 5))
|
|
;; (char-set->string (state-chars st)))
|
|
;; ((char? (state-chars st))
|
|
;; (string (state-chars st)))
|
|
;; (else '+))
|
|
;; (cond
|
|
;; ((state-next1 st) => state-id)
|
|
;; (else #f)))
|
|
;; (if (state-next2 st)
|
|
;; (list (state-id (state-next2 st)))
|
|
;; '())
|
|
;; (if (state-match st)
|
|
;; (list (list 'm (state-match st)))
|
|
;; '()))))
|
|
;; (append next2 next1 (cons this res))))))))))
|
|
;;(for-each (lambda (x) (write x) (newline)) (state->list start))
|
|
(make-rx start current-match current-index non-greedy-indexes
|
|
(list->vector (reverse match-rules)) match-names sre)))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;; Utilities
|
|
|
|
;;> The fundamental regexp matching iterator. Repeatedly searches
|
|
;;> \var{str} for the regexp \var{re} so long as a match can be found.
|
|
;;> On each successful match, applies \scheme{(\var{kons} \var{i}
|
|
;;> \var{regexp-match} \var{str} \var{acc})} where \var{i} is the
|
|
;;> index since the last match (beginning with
|
|
;;> \var{start}),\var{regexp-match} is the resulting match, and
|
|
;;> \var{acc} is the result of the previous \var{kons} application,
|
|
;;> beginning with \var{knil}. When no more matches can be found,
|
|
;;> calls \var{finish} with the same arguments, except that
|
|
;;> \var{regexp-match} is \scheme{#f}.
|
|
;;>
|
|
;;> By default \var{finish} just returns \var{acc}.
|
|
|
|
(define (regexp-fold rx kons knil str . o)
|
|
(let* ((rx (regexp rx))
|
|
(finish (if (pair? o) (car o) (lambda (from md str acc) acc)))
|
|
(o (if (pair? o) (cdr o) o))
|
|
(start (string-start-arg str o))
|
|
(end (string-end-arg str (if (pair? o) (cdr o) o))))
|
|
(let lp ((i start)
|
|
(from start)
|
|
(acc knil))
|
|
(cond
|
|
((and (string-cursor<? i end) (regexp-run-offsets #t rx str i end))
|
|
=> (lambda (md)
|
|
(let ((j (regexp-match-ref md 1)))
|
|
(lp (if (and (string-cursor=? i j) (string-cursor<? j end))
|
|
(string-cursor-next str j)
|
|
j)
|
|
j
|
|
(kons (string-cursor->index str from) md str acc)))))
|
|
(else
|
|
(finish (string-cursor->index str from) #f str acc))))))
|
|
|
|
;;> Extracts all non-empty substrings of \var{str} which match
|
|
;;> \var{re} between \var{start} and \var{end} as a list of strings.
|
|
|
|
(define (regexp-extract rx str . o)
|
|
(apply regexp-fold
|
|
rx
|
|
(lambda (from md str a)
|
|
(let ((s (regexp-match-submatch md 0)))
|
|
(if (equal? s "") a (cons s a))))
|
|
'()
|
|
str
|
|
(lambda (from md str a) (reverse a))
|
|
o))
|
|
|
|
;;> Splits \var{str} into a list of strings separated by matches of
|
|
;;> \var{re}.
|
|
|
|
(define (regexp-split rx str . o)
|
|
;; start and end in indices passed to regexp-fold
|
|
(let ((start (if (pair? o) (car o) 0))
|
|
(end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str))))
|
|
(regexp-fold
|
|
rx
|
|
(lambda (from md str a)
|
|
(let ((i (regexp-match-submatch-start md 0))
|
|
(j (regexp-match-submatch-end md 0)))
|
|
(if (eqv? i j)
|
|
a
|
|
(cons j
|
|
(cons (substring str (car a) i) (cdr a))))))
|
|
(cons start '())
|
|
str
|
|
(lambda (from md str a)
|
|
(reverse (cons (substring str (car a) end) (cdr a))))
|
|
start
|
|
end)))
|
|
|
|
;;> Partitions \var{str} into a list of non-empty strings
|
|
;;> matching \var{re}, interspersed with the unmatched portions
|
|
;;> of the string. The first and every odd element is an unmatched
|
|
;;> substring, which will be the empty string if \var{re} matches
|
|
;;> at the beginning of the string or end of the previous match. The
|
|
;;> second and every even element will be a substring matching
|
|
;;> \var{re}. If the final match ends at the end of the string,
|
|
;;> no trailing empty string will be included. Thus, in the
|
|
;;> degenerate case where \var{str} is the empty string, the
|
|
;;> result is \scheme{("")}.
|
|
|
|
(define (regexp-partition rx str . o)
|
|
(let ((start (if (pair? o) (car o) 0))
|
|
(end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str))))
|
|
(define (kons from md str a)
|
|
(let ((i (regexp-match-submatch-start md 0))
|
|
(j (regexp-match-submatch-end md 0)))
|
|
(if (eqv? i j)
|
|
a
|
|
(let ((left (substring str (car a) i)))
|
|
(cons j
|
|
(cons (regexp-match-submatch md 0)
|
|
(cons left (cdr a))))))))
|
|
(define (final from md str a)
|
|
(if (or (< from end) (null? (cdr a)))
|
|
(cons (substring str (car a) end) (cdr a))
|
|
(cdr a)))
|
|
(reverse (regexp-fold rx kons (cons start '()) str final start end))))
|
|
|
|
;;> Returns a new string replacing the \var{count}th match of \var{re}
|
|
;;> in \var{str} the \var{subst}, where the zero-indexed \var{count}
|
|
;;> defaults to zero (i.e. the first match). If there are not
|
|
;;> \var{count} matches, returns the selected substring unmodified.
|
|
|
|
;;> \var{subst} can be a string, an integer or symbol indicating the
|
|
;;> contents of a numbered or named submatch of \var{re},\scheme{'pre}
|
|
;;> for the substring to the left of the match, or \scheme{'post} for
|
|
;;> the substring to the right of the match.
|
|
|
|
;;> The optional parameters \var{start} and \var{end} restrict both
|
|
;;> the matching and the substitution, to the given indices, such that
|
|
;;> the result is equivalent to omitting these parameters and
|
|
;;> replacing on \scheme{(substring str start end)}. As a convenience,
|
|
;;> a value of \scheme{#f} for \var{end} is equivalent to
|
|
;;> \scheme{(string-length str)}.
|
|
|
|
(define (regexp-replace rx str subst . o)
|
|
(let* ((start (if (and (pair? o) (car o)) (car o) 0))
|
|
(o (if (pair? o) (cdr o) '()))
|
|
(end (if (and (pair? o) (car o)) (car o) (string-length str)))
|
|
(o (if (pair? o) (cdr o) '()))
|
|
(count (if (pair? o) (car o) 0)))
|
|
(let lp ((i start) (count count))
|
|
(let ((m (regexp-search rx str i end)))
|
|
(cond
|
|
((not m) str)
|
|
((positive? count)
|
|
(lp (regexp-match-submatch-end m 0) (- count 1)))
|
|
(else
|
|
(string-concatenate
|
|
(cons
|
|
(substring str start (regexp-match-submatch-start m 0))
|
|
(append
|
|
(reverse (regexp-apply-match m str subst start end))
|
|
(list (substring str (regexp-match-submatch-end m 0) end)))))))))))
|
|
|
|
;;> Equivalent to \var{regexp-replace}, but replaces all occurrences
|
|
;;> of \var{re} in \var{str}.
|
|
|
|
(define (regexp-replace-all rx str subst . o)
|
|
(let* ((start (if (and (pair? o) (car o)) (car o) 0))
|
|
(o (if (pair? o) (cdr o) '()))
|
|
(end (if (and (pair? o) (car o)) (car o) (string-length str))))
|
|
(regexp-fold
|
|
rx
|
|
(lambda (i m str acc)
|
|
(let ((m-start (regexp-match-submatch-start m 0)))
|
|
(append (regexp-apply-match m str subst start end)
|
|
(if (>= i m-start)
|
|
acc
|
|
(cons (substring str i m-start) acc)))))
|
|
'()
|
|
str
|
|
(lambda (i m str acc)
|
|
(let ((end (string-length str)))
|
|
(string-concatenate-reverse
|
|
(if (>= i end)
|
|
acc
|
|
(cons (substring str i end) acc)))))
|
|
start end)))
|
|
|
|
(define (regexp-apply-match m str ls start end)
|
|
(let lp ((ls ls) (res '()))
|
|
(cond
|
|
((null? ls)
|
|
res)
|
|
((not (pair? ls))
|
|
(lp (list ls) res))
|
|
((integer? (car ls))
|
|
(lp (cdr ls) (cons (or (regexp-match-submatch m (car ls)) "") res)))
|
|
((procedure? (car ls))
|
|
(lp (cdr ls) (cons ((car ls) m) res)))
|
|
((symbol? (car ls))
|
|
(case (car ls)
|
|
((pre)
|
|
(lp (cdr ls)
|
|
(cons (substring str start (regexp-match-submatch-start m 0))
|
|
res)))
|
|
((post)
|
|
(lp (cdr ls)
|
|
(cons (substring str (regexp-match-submatch-end m 0) end)
|
|
res)))
|
|
(else
|
|
(cond
|
|
((assq (car ls) (regexp-match-names m))
|
|
=> (lambda (x) (lp (cons (cdr x) (cdr ls)) res)))
|
|
(else
|
|
(error "unknown match replacement" (car ls)))))))
|
|
(else
|
|
(lp (cdr ls) (cons (car ls) res))))))
|
|
|
|
(define re:grapheme (regexp 'grapheme))
|