;; PCRE parsing, adapted from IrRegex. (define ~none 0) (define ~save? 1) (define ~case-insensitive? 2) (define ~multi-line? 4) (define ~single-line? 8) (define ~ignore-space? 16) (define (flag-set? flags i) (= i (bitwise-and flags i))) (define (flag-join a b) (if b (bitwise-ior a b) a)) (define (flag-clear a b) (bitwise-and a (bitwise-not b))) (define (symbol-list->flags ls) (let lp ((ls ls) (res ~none)) (cond ((null? ls) res) ((not (pair? ls)) (lp (list ls) res)) (else (lp (cdr ls) (flag-join res (case (car ls) ((i ci case-insensitive) ~case-insensitive?) ((m multi-line) ~multi-line?) ((s single-line) ~single-line?) ((x ignore-space) ~ignore-space?) (else #f)))))))) (define posix-escape-sequences `((#\n . #\newline) (#\r . #\return) (#\t . #\tab) (#\a . #\alarm) (#\e . #\escape))) (define (char-altcase c) (if (char-upper-case? c) (char-downcase c) (char-upcase c))) (define (char-mirror c) (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c))) (define (string-scan-char-escape str c . o) (let ((end (string-length str))) (let scan ((i (if (pair? o) (car o) 0))) (cond ((= i end) #f) ((eqv? c (string-ref str i)) i) ((eqv? c #\\) (scan (+ i 2))) (else (scan (+ i 1))))))) (define (string-find/index str ch start . o) (let* ((end (if (pair? o) (car o) (string-length str))) (i (string-find str ch (string-index->cursor str start) (string-index->cursor str end)))) (string-cursor->index str i))) (define (string-parse-hex-escape str i end) (cond ((>= i end) (error "incomplete hex escape" str i)) ((eqv? #\{ (string-ref str i)) (let ((j (string-scan-char-escape str #\} (+ i 1)))) (if (not j) (error "incomplete hex brace escape" str i) (let* ((s (substring str (+ i 1) j)) (n (string->number s 16))) (if n (list (integer->char n) j) (error "bad hex brace escape" s)))))) ((>= (+ i 1) end) (error "incomplete hex escape" str i)) (else (let* ((s (substring str i (+ i 2))) (n (string->number s 16))) (if n (list (integer->char n) (+ i 2)) (error "bad hex escape" s)))))) (define (string-parse-cset str start flags) (let* ((end (string-length str)) (invert? (and (< start end) (eqv? #\^ (string-ref str start))))) (define (cset-union a b) (cond ((not a) b) ((not b) a) ((and (pair? a) (eq? 'or (car a))) `(,@a ,b)) (else `(or ,a ,b)))) (define (go i prev-char ones pairs classes) (if (>= i end) (error "incomplete char set" str i end)) (case (string-ref str i) ((#\]) (if (and (null? ones) (null? pairs)) (go (+ i 1) #\] (cons #\] ones) pairs classes) (list (let ((res (cset-union (cset-union (and (pair? classes) `(or ,@classes)) (and (pair? ones) `(,(list->string (reverse ones))))) (and (pair? pairs) `(/ ,(list->string (reverse pairs))))))) (if invert? `(~ ,res) res)) i))) ((#\-) (cond ((or (= i start) (and (= i (+ start 1)) invert?) (eqv? #\] (string-ref str (+ i 1)))) (go (+ i 1) #\- (cons #\- ones) pairs classes)) ;; alternately permissively allow this as a - ((not prev-char) (error "bad pcre char-set, unexpected -" str)) (else (let ((ch (string-ref str (+ i 1)))) (apply (lambda (c j) (if (char (lambda (x) (list (cdr x) (+ i 3)))) ((and (eqv? #\\ ch) (eqv? (string-ref str (+ i 2)) #\x)) (string-parse-hex-escape str (+ i 3) end)) (else (list ch (+ i 2))))))))) ((#\[) (let* ((inv? (eqv? #\^ (string-ref str (+ i 1)))) (i2 (if inv? (+ i 2) (+ i 1)))) (case (string-ref str i2) ((#\:) (let ((j (string-find/index str #\: (+ i2 1) end))) (if (or (>= (+ j 1) end) (not (eqv? #\] (string-ref str (+ j 1))))) (error "incomplete character class" str) (let* ((class (string->symbol (substring str (+ i2 1) j))) (class (if inv? `(~ ,class) class))) (go (+ j 2) #f ones pairs (cons class classes)))))) ((#\= #\.) (error "collating sequences not supported" str)) (else (go (+ i 1) #\[ (cons #\[ ones) pairs classes))))) ((#\\) (let ((c (string-ref str (+ i 1)))) (case c ((#\d #\D #\s #\S #\w #\W) (go (+ i 2) #f ones pairs (cons (pcre->sre (string #\\ c)) classes))) ((#\x) (apply (lambda (c j) (go j c (cons c ones) pairs classes)) (string-parse-hex-escape str (+ i 2) end))) (else (let ((c (cond ((assv c posix-escape-sequences) => cdr) (else c)))) (go (+ i 2) c (cons c ones) pairs classes)))))) (else => (lambda (c) (go (+ i 1) c (cons c ones) pairs classes))))) (if invert? (let ((ones (if (flag-set? flags ~multi-line?) '(#\newline) '()))) (go (+ start 1) #f ones '() '())) (go start #f '() '() '())))) ;; build a (seq ls ...) sre from a list (define (sre-sequence ls) (cond ((null? ls) 'epsilon) ((null? (cdr ls)) (car ls)) (else (cons 'seq ls)))) ;; build a (or ls ...) sre from a list (define (sre-alternate ls) (cond ((null? ls) '(or)) ((null? (cdr ls)) (car ls)) (else (cons 'or ls)))) ;; returns #t if the sre can ever be empty (define (sre-empty? sre) (if (pair? sre) (case (car sre) ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t) ((**) (or (not (number? (cadr sre))) (zero? (cadr sre)))) ((or) (any sre-empty? (cdr sre))) ((: seq $ submatch => submatch-named + atomic) (every sre-empty? (cdr sre))) (else #f)) (memq sre '(epsilon bos eos bol eol bow eow commit)))) ;; returns #t if the sre is a */+ repetition (define (sre-repeater? sre) (and (pair? sre) (or (memq (car sre) '(* +)) (and (memq (car sre) '($ submatch => submatch-named seq :)) (pair? (cdr sre)) (null? (cddr sre)) (sre-repeater? (cadr sre)))))) (define (pcre->sre str . o) (if (not (string? str)) (error "pcre->sre: expected a string" str)) (let ((end (string-length str)) (orig-flags (if (pair? o) (symbol-list->flags (car o)) ~none))) (let lp ((i 0) (from 0) (flags orig-flags) (res '()) (st '())) ;; accumulate the substring from..i as literal text (define (collect) (if (= i from) res (cons (substring str from i) res))) ;; like collect but breaks off the last single character when ;; collecting literal data, as the argument to ?/*/+ etc. (define (collect/single) (let ((j (- i 1))) (cond ((< j from) res) (else (let ((c (string-ref str j))) (cond ((= j from) (cons c res)) (else (cons c (cons (substring str from j) res))))))))) ;; collects for use as a result, reversing and grouping OR ;; terms, and some ugly tweaking of `function-like' groups and ;; conditionals (define (collect/terms) (let* ((ls (collect)) (func (and (pair? ls) (memq (last ls) '(atomic if look-ahead neg-look-ahead look-behind neg-look-behind => submatch-named)))) (prefix (if (and func (memq (car func) '(=> submatch-named))) (list 'submatch-named (cadr (reverse ls))) (and func (list (car func))))) (ls (if func (if (memq (car func) '(=> submatch-named)) (reverse (cddr (reverse ls))) (reverse (cdr (reverse ls)))) ls))) (let lp ((ls ls) (term '()) (res '())) (define (shift) (cons (sre-sequence term) res)) (cond ((null? ls) (let* ((res (sre-alternate (shift))) (res (if (flag-set? flags ~save?) (list 'submatch res) res))) (if prefix (if (eq? 'if (car prefix)) (cond ((not (pair? res)) 'epsilon) ((memq (car res) '(look-ahead neg-look-ahead look-behind neg-look-behind)) res) ((eq? 'seq (car res)) `(if ,(cadr res) ,(sre-sequence (cddr res)))) (else `(if ,(cadadr res) ,(sre-sequence (cddadr res)) ,(sre-alternate (cddr res))))) `(,@prefix ,res)) res))) ((eq? 'or (car ls)) (lp (cdr ls) '() (shift))) (else (lp (cdr ls) (cons (car ls) term) res)))))) (define (save) (cons (cons flags (collect)) st)) ;; main parsing (cond ((>= i end) (if (pair? st) (error "unterminated parenthesis in regexp" str) (collect/terms))) (else (case (string-ref str i) ((#\.) (lp (+ i 1) (+ i 1) flags (cons (if (flag-set? flags ~single-line?) 'any 'nonl) (collect)) st)) ((#\?) (let ((res (collect/single))) (if (null? res) (error "? can't follow empty pattern" str res) (let ((x (car res))) (lp (+ i 1) (+ i 1) flags (cons (if (pair? x) (case (car x) ((*) `(*? ,@(cdr x))) ((+) `(**? 1 #f ,@(cdr x))) ((?) `(?? ,@(cdr x))) ((**) `(**? ,@(cdr x))) ((=) `(**? ,(cadr x) ,@(cdr x))) ((>=) `(**? ,(cadr x) #f ,@(cddr x))) (else `(? ,x))) `(? ,x)) (cdr res)) st))))) ((#\+ #\*) (let* ((res (collect/single)) (x (if (pair? res) (car res) 'epsilon)) (op (string->symbol (string (string-ref str i))))) (cond ((sre-repeater? x) (error "duplicate repetition (e.g. **) in pattern" str res)) ((sre-empty? x) (error "can't repeat empty pattern (e.g. ()*)" str res)) (else (lp (+ i 1) (+ i 1) flags (cons (list op x) (cdr res)) st))))) ((#\() (cond ((>= (+ i 1) end) (error "unterminated parenthesis in regexp" str)) ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save))) ((>= (+ i 2) end) (error "unterminated parenthesis in regexp" str)) ((eqv? (string-ref str (+ i 1)) #\*) (error "bad regexp syntax: (*FOO) not supported" str)) (else ;; (?...) case (case (string-ref str (+ i 2)) ((#\#) (let ((j (string-find/index str #\) (+ i 3)))) (lp (+ j i) (min (+ j 1) end) flags (collect) st))) ((#\:) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) ((#\=) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '(look-ahead) (save))) ((#\!) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '(neg-look-ahead) (save))) ((#\<) (cond ((>= (+ i 3) end) (error "unterminated parenthesis in regexp" str)) (else (case (string-ref str (+ i 3)) ((#\=) (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) '(look-behind) (save))) ((#\!) (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) '(neg-look-behind) (save))) (else (let ((j (and (char-alphabetic? (string-ref str (+ i 3))) (string-find/index str #\> (+ i 4))))) (if (< j end) (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) `(,(string->symbol (substring str (+ i 3) j)) submatch-named) (save)) (error "invalid (?< sequence" str)))))))) ((#\>) (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '(atomic) (save))) ;;((#\' #\P) ; named subpatterns ;; ) ;;((#\R) ; recursion ;; ) ((#\() (cond ((>= (+ i 3) end) (error "unterminated parenthesis in regexp" str)) ((char-numeric? (string-ref str (+ i 3))) (let* ((j (string-find/index str #\) (+ i 3))) (n (string->number (substring str (+ i 3) j)))) (if (or (= j end) (not n)) (error "invalid conditional reference" str) (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) `(,n if) (save))))) ((char-alphabetic? (string-ref str (+ i 3))) (let ((j (string-find/index str #\) (+ i 3)))) (if (= j end) (error "invalid named conditional reference" str) (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) `(,(string->symbol (substring str (+ i 3) j)) if) (save))))) (else (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) '(if) (save))))) ((#\{) (error "unsupported Perl-style cluster" str)) (else (let ((old-flags flags)) (let lp2 ((j (+ i 2)) (flags flags) (invert? #f)) (define (join x) ((if invert? flag-clear flag-join) flags x)) (cond ((>= j end) (error "incomplete cluster" str i)) (else (case (string-ref str j) ((#\i) (lp2 (+ j 1) (join ~case-insensitive?) invert?)) ((#\m) (lp2 (+ j 1) (join ~multi-line?) invert?)) ((#\x) (lp2 (+ j 1) (join ~ignore-space?) invert?)) ((#\-) (lp2 (+ j 1) flags (not invert?))) ((#\)) (lp (+ j 1) (+ j 1) flags (collect) st)) ((#\:) (lp (+ j 1) (+ j 1) flags '() (cons (cons old-flags (collect)) st))) (else (error "unknown regex cluster modifier" str) ))))))))))) ((#\)) (if (null? st) (error "too many )'s in regexp" str) (lp (+ i 1) (+ i 1) (caar st) (cons (collect/terms) (cdar st)) (cdr st)))) ((#\[) (apply (lambda (sre j) (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)) (string-parse-cset str (+ i 1) flags))) ((#\{) (cond ((or (>= (+ i 1) end) (not (or (char-numeric? (string-ref str (+ i 1))) (eqv? #\, (string-ref str (+ i 1)))))) (lp (+ i 1) from flags res st)) (else (let ((res (collect/single))) (cond ((null? res) (error "{ can't follow empty pattern")) (else (let* ((x (car res)) (tail (cdr res)) (j (string-find/index str #\} (+ i 1))) (s2 (string-split (substring str (+ i 1) j) #\,)) (n (string->number (car s2))) (m (and (pair? (cdr s2)) (string->number (cadr s2))))) (cond ((or (= j end) (not n) (and (pair? (cdr s2)) (not (equal? "" (cadr s2))) (not m))) (error "invalid {n} repetition syntax" s2)) ((null? (cdr s2)) (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st)) (m (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st)) (else (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st) ))))))))) ((#\\) (cond ((>= (+ i 1) end) (error "incomplete escape sequence" str)) (else (let ((c (string-ref str (+ i 1)))) (case c ((#\d) (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st)) ((#\D) (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st)) ((#\s) (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st)) ((#\S) (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st)) ((#\w) (lp (+ i 2) (+ i 2) flags `((or alphanumeric ("_")) ,@(collect)) st)) ((#\W) (lp (+ i 2) (+ i 2) flags `((~ (or alphanumeric ("_"))) ,@(collect)) st)) ((#\b) (lp (+ i 2) (+ i 2) flags `((or bow eow) ,@(collect)) st)) ((#\B) (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st)) ((#\A) (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st)) ((#\Z) (lp (+ i 2) (+ i 2) flags `((? #\newline) eos ,@(collect)) st)) ((#\z) (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st)) ((#\R) (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st)) ((#\K) (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st)) ;; these two are from Emacs and TRE, but not in PCRE ((#\<) (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st)) ((#\>) (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st)) ((#\x) (apply (lambda (ch j) (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st)) (string-parse-hex-escape str (+ i 2) end))) ((#\k) (let ((c (string-ref str (+ i 2)))) (if (not (memv c '(#\< #\{ #\'))) (error "bad \\k usage, expected \\k<...>" str) (let* ((terminal (char-mirror c)) (j (string-find/index str terminal (+ i 2))) (s (substring str (+ i 3) j)) (backref (if (flag-set? flags ~case-insensitive?) 'backref-ci 'backref))) (if (= j end) (error "unterminated named backref" str) (lp (+ j 1) (+ j 1) flags `((,backref ,(string->symbol s)) ,@(collect)) st)))))) ((#\Q) ;; \Q..\E escapes (let ((res (collect))) (let lp2 ((j (+ i 2))) (cond ((>= j end) (lp j (+ i 2) flags res st)) ((eqv? #\\ (string-ref str j)) (cond ((>= (+ j 1) end) (lp (+ j 1) (+ i 2) flags res st)) ((eqv? #\E (string-ref str (+ j 1))) (lp (+ j 2) (+ j 2) flags (cons (substring str (+ i 2) j) res) st)) (else (lp2 (+ j 2))))) (else (lp2 (+ j 1))))))) ;;((#\p) ; XXXX unicode properties ;; ) ;;((#\P) ;; ) (else (cond ((char-numeric? c) (let* ((j (string-skip str char-numeric? (+ i 2))) (backref (if (flag-set? flags ~case-insensitive?) 'backref-ci 'backref)) (res `((,backref ,(string->number (substring str (+ i 1) j))) ,@(collect)))) (lp j j flags res st))) ((char-alphabetic? c) (let ((cell (assv c posix-escape-sequences))) (if cell (lp (+ i 2) (+ i 2) flags (cons (cdr cell) (collect)) st) (error "unknown escape sequence" str c)))) (else (lp (+ i 2) (+ i 1) flags (collect) st))))))))) ((#\|) (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st)) ((#\^) (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos))) (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) ((#\$) (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos))) (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) ((#\space) (if (flag-set? flags ~ignore-space?) (lp (+ i 1) (+ i 1) flags (collect) st) (lp (+ i 1) from flags res st))) ((#\#) (if (flag-set? flags ~ignore-space?) (let ((j (string-find/index str #\newline (+ i 1)))) (lp (+ j 1) (min (+ j 1) end) flags (collect) st)) (lp (+ i 1) from flags res st))) (else (lp (+ i 1) from flags res st)))))))) (define (pcre->regexp pcre . o) (regexp (apply pcre->sre pcre o)))