Allow for a chunked regexp API by exposing low-level tools.

This commit is contained in:
Alex Shinn 2024-03-17 22:21:23 +09:00
parent 5b27b01f91
commit b303bf3611
2 changed files with 62 additions and 28 deletions

View file

@ -300,11 +300,9 @@
(if (not (eq? m (searcher-matches sr1))) (if (not (eq? m (searcher-matches sr1)))
(searcher-matches-set! sr1 (copy-regexp-match m))))) (searcher-matches-set! sr1 (copy-regexp-match m)))))
(define (searcher-max sr1 sr2) (define (searcher>=? sr1 sr2)
(if (or (not (searcher? sr2)) (or (not (searcher? sr2))
(regexp-match>=? (searcher-matches sr1) (searcher-matches sr2))) (regexp-match>=? (searcher-matches sr1) (searcher-matches sr2))))
sr1
sr2))
(define (searcher-start-match sr) (define (searcher-start-match sr)
(regexp-match-ref (searcher-matches sr) 0)) (regexp-match-ref (searcher-matches sr) 0))
@ -344,6 +342,26 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Execution ;; Execution
;; The intermediate state of a regexp search. Differs from a match in that a
;; match has not necessarily occurred, and includes additional information
;; needed to resume searching.
(define-record-type Regexp-State
(%make-regexp-state searchers accept string)
regexp-state?
(searchers regexp-state-searchers regexp-state-searchers-set!)
(accept regexp-state-accept regexp-state-accept-set!)
(string regexp-state-string regexp-state-string-set!))
(define (make-regexp-state . o)
(let ((searchers (if (pair? o) (car o) (posse)))
(accept (and (pair? o) (pair? (cdr o)) (cadr o))))
(%make-regexp-state searchers accept #f)))
(define (regexp-state-matches state)
(cond ((regexp-state-accept state) => searcher-matches)
(else #f)))
;; A transition which doesn't advance the index. ;; A transition which doesn't advance the index.
(define (epsilon-state? st) (define (epsilon-state? st)
@ -370,7 +388,7 @@
;; Advance epsilons together - if the State is newly added to the ;; Advance epsilons together - if the State is newly added to the
;; group and is an epsilon state, recursively add the transition. ;; group and is an epsilon state, recursively add the transition.
(define (posse-advance! new seen accept sr str i start end) (define (posse-advance! new seen state sr str i start end)
(let advance! ((sr sr)) (let advance! ((sr sr))
(let ((st (searcher-state sr))) (let ((st (searcher-state sr)))
;; Update match data. ;; Update match data.
@ -394,7 +412,10 @@
;; Follow transitions. ;; Follow transitions.
(cond (cond
((state-accept? st) ((state-accept? st)
(set-cdr! accept (searcher-max sr (cdr accept)))) (cond
((searcher>=? sr (regexp-state-accept state))
(regexp-state-accept-set! state sr)
(regexp-state-string-set! state str))))
((posse-ref seen sr) ((posse-ref seen sr)
=> (lambda (sr-prev) (searcher-merge! sr-prev sr))) => (lambda (sr-prev) (searcher-merge! sr-prev sr)))
((epsilon-state? st) ((epsilon-state? st)
@ -424,27 +445,27 @@
;; Add new searcher. ;; Add new searcher.
(posse-add! new sr)))))) (posse-add! new sr))))))
;; Run so long as there is more to match. ;;> Advances the search until an optimal match is found or the end of the string
;;> is reached, and returns the resulting regexp state.
(define (regexp-run-offsets search? rx str start end) (define (regexp-advance! search? init? rx str start end . o)
(let ((rx (regexp rx)) (let ((rx (regexp rx))
(epsilons (posse)) (state (if (pair? o) (car o) (make-regexp-state)))
(accept (list #f))) (epsilons (posse)))
(let lp ((i start) (let lp ((i start)
(searchers1 (posse)) (searchers1 (posse))
(searchers2 (posse))) (searchers2 (posse)))
;; Advance initial epsilons once from the first index, or every ;; Advance initial epsilons once from the first index, or every
;; time when searching. ;; time when searching.
(cond (cond
((or search? (string-cursor=? i start)) ((or search? (and init? (string-cursor=? i start)))
(posse-advance! searchers1 epsilons accept (make-start-searcher rx str) (posse-advance! searchers1 epsilons state (make-start-searcher rx str)
str i start end) str i start end)
(posse-clear! epsilons))) (posse-clear! epsilons)))
(cond (cond
((or (string-cursor>=? i end) ((or (string-cursor>=? i end)
(and search? (and search?
(searcher? (cdr accept)) (searcher? (regexp-state-accept state))
(let ((accept-start (searcher-start-match (cdr accept)))) (let ((accept-start (searcher-start-match (regexp-state-accept state))))
(posse-every (posse-every
(lambda (searcher) (lambda (searcher)
(string-cursor>? (searcher-start-match searcher) (string-cursor>? (searcher-start-match searcher)
@ -452,31 +473,38 @@
searchers1))) searchers1)))
(and (not search?) (and (not search?)
(posse-empty? searchers1))) (posse-empty? searchers1)))
;; Terminate when the string is done or there are no more ;; Terminate when the string is done or there are no more searchers or
;; searchers. If we terminate prematurely and are not ;; we've found an accept state which started before any pending matches.
;; searching, return false. ;; If we terminate prematurely and are not searching, return false.
(and (searcher? (cdr accept)) (regexp-state-searchers-set! state searchers1)
(let ((matches (searcher-matches (cdr accept)))) state)
(and (or search? (string-cursor>=? (regexp-match-ref matches 1)
end))
(searcher-matches (cdr accept))))))
(else (else
;; Otherwise advance normally. ;; Otherwise advance normally from searchers1, storing the new state in
;; searchers2, and recurse swapping the two (to reduce garbage).
(let ((ch (string-cursor-ref str i)) (let ((ch (string-cursor-ref str i))
(i2 (string-cursor-next str i))) (i2 (string-cursor-next str i)))
(posse-for-each ;; NOTE: non-deterministic from hash order (posse-for-each ;; NOTE: non-deterministic from hash order
(lambda (sr) (lambda (sr)
(cond (cond
((state-matches? (searcher-state sr) str i ch ((state-matches? (searcher-state sr) str i ch
start end (searcher-matches sr)) start end (searcher-matches sr))
(searcher-state-set! sr (state-next1 (searcher-state sr))) (searcher-state-set! sr (state-next1 (searcher-state sr)))
;; Epsilons are considered at the next position. ;; Epsilons are considered at the next position.
(posse-advance! searchers2 epsilons accept sr str i2 start end) (posse-advance! searchers2 epsilons state sr str i2 start end)
(posse-clear! epsilons)))) (posse-clear! epsilons))))
searchers1) searchers1)
(posse-clear! searchers1) (posse-clear! searchers1)
(lp i2 searchers2 searchers1))))))) (lp i2 searchers2 searchers1)))))))
;; Run so long as there is more to match.
(define (regexp-run-offsets search? rx str start end)
(let ((state (regexp-advance! search? #t rx str start end)))
(and (searcher? (regexp-state-accept state))
(let ((matches (searcher-matches (regexp-state-accept state))))
(and (or search? (string-cursor>=? (regexp-match-ref matches 1) end))
matches)))))
;; Wrapper to determine start and end offsets. ;; Wrapper to determine start and end offsets.
(define (regexp-run search? rx str . o) (define (regexp-run search? rx str . o)

View file

@ -10,7 +10,13 @@
regexp-match? regexp-match-count regexp-match? regexp-match-count
regexp-match-submatch regexp-match-submatch/list regexp-match-submatch regexp-match-submatch/list
regexp-match-submatch-start regexp-match-submatch-end regexp-match-submatch-start regexp-match-submatch-end
regexp-match->list regexp-match->sexp) regexp-match->list regexp-match->sexp
;; low-level
regexp-advance! regexp-state?
make-regexp-state regexp-state-accept
regexp-state-searchers regexp-state-matches
regexp-match-ref
)
(import (srfi 69)) (import (srfi 69))
;; Chibi's char-set library is more factored than SRFI-14. ;; Chibi's char-set library is more factored than SRFI-14.
(cond-expand (cond-expand