chibi-scheme/lib/chibi/regexp/pcre.scm
2016-03-29 22:25:09 +09:00

604 lines
24 KiB
Scheme

;; PCRE parsing, adapted from IrRegex.
(define ~none 0)
(define ~save? 1)
(define ~case-insensitive? 2)
(define ~multi-line? 4)
(define ~single-line? 8)
(define ~ignore-space? 16)
(define (flag-set? flags i)
(= i (bitwise-and flags i)))
(define (flag-join a b)
(if b (bitwise-ior a b) a))
(define (flag-clear a b)
(bitwise-and a (bitwise-not b)))
(define (symbol-list->flags ls)
(let lp ((ls ls) (res ~none))
(cond
((null? ls)
res)
((not (pair? ls))
(lp (list ls) res))
(else
(lp (cdr ls)
(flag-join
res
(case (car ls)
((i ci case-insensitive) ~case-insensitive?)
((m multi-line) ~multi-line?)
((s single-line) ~single-line?)
((x ignore-space) ~ignore-space?)
(else #f))))))))
(define posix-escape-sequences
`((#\n . #\newline)
(#\r . #\return)
(#\t . #\tab)
(#\a . #\alarm)
(#\e . #\escape)))
(define (char-altcase c)
(if (char-upper-case? c) (char-downcase c) (char-upcase c)))
(define (char-mirror c)
(case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c)))
(define (string-scan-char-escape str c . o)
(let ((end (string-length str)))
(let scan ((i (if (pair? o) (car o) 0)))
(cond ((= i end) #f)
((eqv? c (string-ref str i)) i)
((eqv? c #\\) (scan (+ i 2)))
(else (scan (+ i 1)))))))
(define (string-find/index str ch start . o)
(let* ((end (if (pair? o) (car o) (string-length str)))
(i (string-find str ch (string-index->cursor str start)
(string-index->cursor str end))))
(string-cursor->index str i)))
(define (string-parse-hex-escape str i end)
(cond
((>= i end)
(error "incomplete hex escape" str i))
((eqv? #\{ (string-ref str i))
(let ((j (string-scan-char-escape str #\} (+ i 1))))
(if (not j)
(error "incomplete hex brace escape" str i)
(let* ((s (substring str (+ i 1) j))
(n (string->number s 16)))
(if n
(list (integer->char n) j)
(error "bad hex brace escape" s))))))
((>= (+ i 1) end)
(error "incomplete hex escape" str i))
(else
(let* ((s (substring str i (+ i 2)))
(n (string->number s 16)))
(if n
(list (integer->char n) (+ i 2))
(error "bad hex escape" s))))))
(define (string-parse-cset str start flags)
(let* ((end (string-length str))
(invert? (and (< start end) (eqv? #\^ (string-ref str start)))))
(define (cset-union a b)
(cond ((not a) b)
((not b) a)
((and (pair? a) (eq? 'or (car a))) `(,@a ,b))
(else `(or ,a ,b))))
(define (go i prev-char ones pairs classes)
(if (>= i end)
(error "incomplete char set" str i end))
(case (string-ref str i)
((#\])
(if (and (null? ones) (null? pairs))
(go (+ i 1) #\] (cons #\] ones) pairs classes)
(list
(let ((res
(cset-union
(cset-union
(and (pair? classes)
`(or ,@classes))
(and (pair? ones)
`(,(list->string (reverse ones)))))
(and (pair? pairs)
`(/ ,(list->string (reverse pairs)))))))
(if invert? `(~ ,res) res))
i)))
((#\-)
(cond
((or (= i start)
(and (= i (+ start 1)) invert?)
(eqv? #\] (string-ref str (+ i 1))))
(go (+ i 1) #\- (cons #\- ones) pairs classes))
;; alternately permissively allow this as a -
((not prev-char)
(error "bad pcre char-set, unexpected -" str))
(else
(let ((ch (string-ref str (+ i 1))))
(apply
(lambda (c j)
(if (char<? c prev-char)
(error "inverted range in pcre char-set" prev-char c)
(go j #f (cdr ones) (cons c (cons prev-char pairs))
classes)))
(cond
((and (eqv? #\\ ch)
(assv (string-ref str (+ i 2)) posix-escape-sequences))
=> (lambda (x) (list (cdr x) (+ i 3))))
((and (eqv? #\\ ch)
(eqv? (string-ref str (+ i 2)) #\x))
(string-parse-hex-escape str (+ i 3) end))
(else
(list ch (+ i 2)))))))))
((#\[)
(let* ((inv? (eqv? #\^ (string-ref str (+ i 1))))
(i2 (if inv? (+ i 2) (+ i 1))))
(case (string-ref str i2)
((#\:)
(let ((j (string-find/index str #\: (+ i2 1) end)))
(if (or (>= (+ j 1) end)
(not (eqv? #\] (string-ref str (+ j 1)))))
(error "incomplete character class" str)
(let* ((class (string->symbol (substring str (+ i2 1) j)))
(class (if inv? `(~ ,class) class)))
(go (+ j 2) #f ones pairs (cons class classes))))))
((#\= #\.)
(error "collating sequences not supported" str))
(else
(go (+ i 1) #\[ (cons #\[ ones) pairs classes)))))
((#\\)
(let ((c (string-ref str (+ i 1))))
(case c
((#\d #\D #\s #\S #\w #\W)
(go (+ i 2) #f ones pairs
(cons (pcre->sre (string #\\ c)) classes)))
((#\x)
(apply
(lambda (c j) (go j c (cons c ones) pairs classes))
(string-parse-hex-escape str (+ i 2) end)))
(else
(let ((c (cond ((assv c posix-escape-sequences) => cdr)
(else c))))
(go (+ i 2) c (cons c ones) pairs classes))))))
(else
=> (lambda (c) (go (+ i 1) c (cons c ones) pairs classes)))))
(if invert?
(let ((ones (if (flag-set? flags ~multi-line?) '(#\newline) '())))
(go (+ start 1) #f ones '() '()))
(go start #f '() '() '()))))
;; build a (seq ls ...) sre from a list
(define (sre-sequence ls)
(cond
((null? ls) 'epsilon)
((null? (cdr ls)) (car ls))
(else (cons 'seq ls))))
;; build a (or ls ...) sre from a list
(define (sre-alternate ls)
(cond
((null? ls) '(or))
((null? (cdr ls)) (car ls))
(else (cons 'or ls))))
;; returns #t if the sre can ever be empty
(define (sre-empty? sre)
(if (pair? sre)
(case (car sre)
((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t)
((**) (or (not (number? (cadr sre))) (zero? (cadr sre))))
((or) (any sre-empty? (cdr sre)))
((: seq $ submatch => submatch-named + atomic)
(every sre-empty? (cdr sre)))
(else #f))
(memq sre '(epsilon bos eos bol eol bow eow commit))))
;; returns #t if the sre is a */+ repetition
(define (sre-repeater? sre)
(and (pair? sre)
(or (memq (car sre) '(* +))
(and (memq (car sre) '($ submatch => submatch-named seq :))
(pair? (cdr sre))
(null? (cddr sre))
(sre-repeater? (cadr sre))))))
(define (pcre->sre str . o)
(if (not (string? str))
(error "pcre->sre: expected a string" str))
(let ((end (string-length str))
(orig-flags (if (pair? o) (symbol-list->flags (car o)) ~none)))
(let lp ((i 0) (from 0) (flags orig-flags) (res '()) (st '()))
;; accumulate the substring from..i as literal text
(define (collect)
(if (= i from) res (cons (substring str from i) res)))
;; like collect but breaks off the last single character when
;; collecting literal data, as the argument to ?/*/+ etc.
(define (collect/single)
(let ((j (- i 1)))
(cond
((< j from)
res)
(else
(let ((c (string-ref str j)))
(cond
((= j from)
(cons c res))
(else
(cons c (cons (substring str from j) res)))))))))
;; collects for use as a result, reversing and grouping OR
;; terms, and some ugly tweaking of `function-like' groups and
;; conditionals
(define (collect/terms)
(let* ((ls (collect))
(func
(and (pair? ls)
(memq (last ls)
'(atomic if look-ahead neg-look-ahead
look-behind neg-look-behind
=> submatch-named))))
(prefix (if (and func (memq (car func) '(=> submatch-named)))
(list 'submatch-named (cadr (reverse ls)))
(and func (list (car func)))))
(ls (if func
(if (memq (car func) '(=> submatch-named))
(reverse (cddr (reverse ls)))
(reverse (cdr (reverse ls))))
ls)))
(let lp ((ls ls) (term '()) (res '()))
(define (shift)
(cons (sre-sequence term) res))
(cond
((null? ls)
(let* ((res (sre-alternate (shift)))
(res (if (flag-set? flags ~save?)
(list 'submatch res)
res)))
(if prefix
(if (eq? 'if (car prefix))
(cond
((not (pair? res))
'epsilon)
((memq (car res)
'(look-ahead neg-look-ahead
look-behind neg-look-behind))
res)
((eq? 'seq (car res))
`(if ,(cadr res)
,(sre-sequence (cddr res))))
(else
`(if ,(cadadr res)
,(sre-sequence (cddadr res))
,(sre-alternate (cddr res)))))
`(,@prefix ,res))
res)))
((eq? 'or (car ls)) (lp (cdr ls) '() (shift)))
(else (lp (cdr ls) (cons (car ls) term) res))))))
(define (save)
(cons (cons flags (collect)) st))
;; main parsing
(cond
((>= i end)
(if (pair? st)
(error "unterminated parenthesis in regexp" str)
(collect/terms)))
(else
(case (string-ref str i)
((#\.)
(lp (+ i 1) (+ i 1) flags
(cons (if (flag-set? flags ~single-line?) 'any 'nonl)
(collect))
st))
((#\?)
(let ((res (collect/single)))
(if (null? res)
(error "? can't follow empty pattern" str res)
(let ((x (car res)))
(lp (+ i 1)
(+ i 1)
flags
(cons
(if (pair? x)
(case (car x)
((*) `(*? ,@(cdr x)))
((+) `(**? 1 #f ,@(cdr x)))
((?) `(?? ,@(cdr x)))
((**) `(**? ,@(cdr x)))
((=) `(**? ,(cadr x) ,@(cdr x)))
((>=) `(**? ,(cadr x) #f ,@(cddr x)))
(else `(? ,x)))
`(? ,x))
(cdr res))
st)))))
((#\+ #\*)
(let* ((res (collect/single))
(x (if (pair? res) (car res) 'epsilon))
(op (string->symbol (string (string-ref str i)))))
(cond
((sre-repeater? x)
(error "duplicate repetition (e.g. **) in pattern" str res))
((sre-empty? x)
(error "can't repeat empty pattern (e.g. ()*)" str res))
(else
(lp (+ i 1) (+ i 1) flags
(cons (list op x) (cdr res))
st)))))
((#\()
(cond
((>= (+ i 1) end)
(error "unterminated parenthesis in regexp" str))
((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case
(lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save)))
((>= (+ i 2) end)
(error "unterminated parenthesis in regexp" str))
((eqv? (string-ref str (+ i 1)) #\*)
(error "bad regexp syntax: (*FOO) not supported" str))
(else ;; (?...) case
(case (string-ref str (+ i 2))
((#\#)
(let ((j (string-find/index str #\) (+ i 3))))
(lp (+ j i) (min (+ j 1) end) flags (collect) st)))
((#\:)
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save)))
((#\=)
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
'(look-ahead) (save)))
((#\!)
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
'(neg-look-ahead) (save)))
((#\<)
(cond
((>= (+ i 3) end)
(error "unterminated parenthesis in regexp" str))
(else
(case (string-ref str (+ i 3))
((#\=)
(lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
'(look-behind) (save)))
((#\!)
(lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
'(neg-look-behind) (save)))
(else
(let ((j (and (char-alphabetic?
(string-ref str (+ i 3)))
(string-find/index str #\> (+ i 4)))))
(if (< j end)
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
`(,(string->symbol (substring str (+ i 3) j))
submatch-named)
(save))
(error "invalid (?< sequence" str))))))))
((#\>)
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
'(atomic) (save)))
;;((#\' #\P) ; named subpatterns
;; )
;;((#\R) ; recursion
;; )
((#\()
(cond
((>= (+ i 3) end)
(error "unterminated parenthesis in regexp" str))
((char-numeric? (string-ref str (+ i 3)))
(let* ((j (string-find/index str #\) (+ i 3)))
(n (string->number (substring str (+ i 3) j))))
(if (or (= j end) (not n))
(error "invalid conditional reference" str)
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
`(,n if) (save)))))
((char-alphabetic? (string-ref str (+ i 3)))
(let ((j (string-find/index str #\) (+ i 3))))
(if (= j end)
(error "invalid named conditional reference" str)
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
`(,(string->symbol (substring str (+ i 3) j)) if)
(save)))))
(else
(lp (+ i 2) (+ i 2) (flag-clear flags ~save?)
'(if) (save)))))
((#\{)
(error "unsupported Perl-style cluster" str))
(else
(let ((old-flags flags))
(let lp2 ((j (+ i 2)) (flags flags) (invert? #f))
(define (join x)
((if invert? flag-clear flag-join) flags x))
(cond
((>= j end)
(error "incomplete cluster" str i))
(else
(case (string-ref str j)
((#\i)
(lp2 (+ j 1) (join ~case-insensitive?) invert?))
((#\m)
(lp2 (+ j 1) (join ~multi-line?) invert?))
((#\x)
(lp2 (+ j 1) (join ~ignore-space?) invert?))
((#\-)
(lp2 (+ j 1) flags (not invert?)))
((#\))
(lp (+ j 1) (+ j 1) flags (collect)
st))
((#\:)
(lp (+ j 1) (+ j 1) flags '()
(cons (cons old-flags (collect)) st)))
(else
(error "unknown regex cluster modifier" str)
)))))))))))
((#\))
(if (null? st)
(error "too many )'s in regexp" str)
(lp (+ i 1)
(+ i 1)
(caar st)
(cons (collect/terms) (cdar st))
(cdr st))))
((#\[)
(apply
(lambda (sre j)
(lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))
(string-parse-cset str (+ i 1) flags)))
((#\{)
(cond
((or (>= (+ i 1) end)
(not (or (char-numeric? (string-ref str (+ i 1)))
(eqv? #\, (string-ref str (+ i 1))))))
(lp (+ i 1) from flags res st))
(else
(let ((res (collect/single)))
(cond
((null? res)
(error "{ can't follow empty pattern"))
(else
(let* ((x (car res))
(tail (cdr res))
(j (string-find/index str #\} (+ i 1)))
(s2 (string-split (substring str (+ i 1) j) #\,))
(n (string->number (car s2)))
(m (and (pair? (cdr s2))
(string->number (cadr s2)))))
(cond
((or (= j end)
(not n)
(and (pair? (cdr s2))
(not (equal? "" (cadr s2)))
(not m)))
(error "invalid {n} repetition syntax" s2))
((null? (cdr s2))
(lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st))
(m
(lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st))
(else
(lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st)
)))))))))
((#\\)
(cond
((>= (+ i 1) end)
(error "incomplete escape sequence" str))
(else
(let ((c (string-ref str (+ i 1))))
(case c
((#\d)
(lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st))
((#\D)
(lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st))
((#\s)
(lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st))
((#\S)
(lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st))
((#\w)
(lp (+ i 2) (+ i 2) flags
`((or alphanumeric ("_")) ,@(collect)) st))
((#\W)
(lp (+ i 2) (+ i 2) flags
`((~ (or alphanumeric ("_"))) ,@(collect)) st))
((#\b)
(lp (+ i 2) (+ i 2) flags
`((or bow eow) ,@(collect)) st))
((#\B)
(lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st))
((#\A)
(lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st))
((#\Z)
(lp (+ i 2) (+ i 2) flags
`((? #\newline) eos ,@(collect)) st))
((#\z)
(lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st))
((#\R)
(lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st))
((#\K)
(lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st))
;; these two are from Emacs and TRE, but not in PCRE
((#\<)
(lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st))
((#\>)
(lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st))
((#\x)
(apply
(lambda (ch j)
(lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st))
(string-parse-hex-escape str (+ i 2) end)))
((#\k)
(let ((c (string-ref str (+ i 2))))
(if (not (memv c '(#\< #\{ #\')))
(error "bad \\k usage, expected \\k<...>" str)
(let* ((terminal (char-mirror c))
(j (string-find/index str terminal (+ i 2)))
(s (substring str (+ i 3) j))
(backref
(if (flag-set? flags ~case-insensitive?)
'backref-ci
'backref)))
(if (= j end)
(error "unterminated named backref" str)
(lp (+ j 1) (+ j 1) flags
`((,backref ,(string->symbol s))
,@(collect))
st))))))
((#\Q) ;; \Q..\E escapes
(let ((res (collect)))
(let lp2 ((j (+ i 2)))
(cond
((>= j end)
(lp j (+ i 2) flags res st))
((eqv? #\\ (string-ref str j))
(cond
((>= (+ j 1) end)
(lp (+ j 1) (+ i 2) flags res st))
((eqv? #\E (string-ref str (+ j 1)))
(lp (+ j 2) (+ j 2) flags
(cons (substring str (+ i 2) j) res) st))
(else
(lp2 (+ j 2)))))
(else
(lp2 (+ j 1)))))))
;;((#\p) ; XXXX unicode properties
;; )
;;((#\P)
;; )
(else
(cond
((char-numeric? c)
(let* ((j (string-skip str char-numeric? (+ i 2)))
(backref
(if (flag-set? flags ~case-insensitive?)
'backref-ci
'backref))
(res `((,backref ,(string->number
(substring str (+ i 1) j)))
,@(collect))))
(lp j j flags res st)))
((char-alphabetic? c)
(let ((cell (assv c posix-escape-sequences)))
(if cell
(lp (+ i 2) (+ i 2) flags
(cons (cdr cell) (collect)) st)
(error "unknown escape sequence" str c))))
(else
(lp (+ i 2) (+ i 1) flags (collect) st)))))))))
((#\|)
(lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st))
((#\^)
(let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos)))
(lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
((#\$)
(let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos)))
(lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
((#\space)
(if (flag-set? flags ~ignore-space?)
(lp (+ i 1) (+ i 1) flags (collect) st)
(lp (+ i 1) from flags res st)))
((#\#)
(if (flag-set? flags ~ignore-space?)
(let ((j (string-find/index str #\newline (+ i 1))))
(lp (+ j 1) (min (+ j 1) end) flags (collect) st))
(lp (+ i 1) from flags res st)))
(else
(lp (+ i 1) from flags res st))))))))
(define (pcre->regexp pcre . o)
(regexp (apply pcre->sre pcre o)))