chibi-scheme/lib/chibi/regexp/pcre.scm
Alex Shinn 8b5eb68238 File descriptors maintain a reference count of ports open on them
They can be close()d explicitly with close-file-descriptor, and
will close() on gc, but only explicitly closing the last port on
them will close the fileno.  Notably needed for network sockets
where we open separate input and output ports on the same socket.
2014-02-20 22:32:50 +09:00

598 lines
24 KiB
Scheme

;; PCRE parsing, adapted from IrRegex.
(define ~none 0)
(define ~save? 1)
(define ~case-insensitive? 2)
(define ~multi-line? 4)
(define ~single-line? 8)
(define ~ignore-space? 16)
(define (flag-set? flags i)
(= i (bitwise-and flags i)))
(define (flag-join a b)
(if b (bitwise-ior a b) a))
(define (flag-clear a b)
(bitwise-and a (bitwise-not b)))
(define (symbol-list->flags ls)
(let lp ((ls ls) (res ~none))
(cond
((null? ls)
res)
((not (pair? ls))
(lp (list ls) res))
(else
(lp (cdr ls)
(flag-join
res
(case (car ls)
((i ci case-insensitive) ~case-insensitive?)
((m multi-line) ~multi-line?)
((s single-line) ~single-line?)
((x ignore-space) ~ignore-space?)
(else #f))))))))
(define posix-escape-sequences
`((#\n . #\newline)
(#\r . #\return)
(#\t . #\tab)
(#\a . #\alarm)
(#\e . #\escape)))
(define (char-altcase c)
(if (char-upper-case? c) (char-downcase c) (char-upcase c)))
(define (char-mirror c)
(case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c)))
(define (string-scan-char-escape str c . o)
(let ((end (string-length str)))
(let scan ((i (if (pair? o) (car o) 0)))
(cond ((= i end) #f)
((eqv? c (string-ref str i)) i)
((eqv? c #\\) (scan (+ i 2)))
(else (scan (+ i 1)))))))
(define (string-parse-hex-escape str i end)
(cond
((>= i end)
(error "incomplete hex escape" str i))
((eqv? #\{ (string-ref str i))
(let ((j (string-scan-char-escape str #\} (+ i 1))))
(if (not j)
(error "incomplete hex brace escape" str i)
(let* ((s (substring str (+ i 1) j))
(n (string->number s 16)))
(if n
(list (integer->char n) j)
(error "bad hex brace escape" s))))))
((>= (+ i 1) end)
(error "incomplete hex escape" str i))
(else
(let* ((s (substring str i (+ i 2)))
(n (string->number s 16)))
(if n
(list (integer->char n) (+ i 2))
(error "bad hex escape" s))))))
(define (string-parse-cset str start flags)
(let* ((end (string-length str))
(invert? (and (< start end) (eqv? #\^ (string-ref str start)))))
(define (cset-union a b)
(cond ((not a) b)
((not b) a)
((and (pair? a) (eq? 'or (car a))) `(,@a ,b))
(else `(or ,a ,b))))
(define (go i prev-char ones pairs classes)
(if (>= i end)
(error "incomplete char set" str i end))
(case (string-ref str i)
((#\])
(if (and (null? ones) (null? pairs))
(go (+ i 1) #\] (cons #\] ones) pairs classes)
(list
(let ((res
(cset-union
(cset-union
(and (pair? classes)
`(or ,@classes))
(and (pair? ones)
`(,(list->string (reverse ones)))))
(and (pair? pairs)
`(/ ,(list->string (reverse pairs)))))))
(if invert? `(~ ,res) res))
i)))
((#\-)
(cond
((or (= i start)
(and (= i (+ start 1)) invert?)
(eqv? #\] (string-ref str (+ i 1))))
(go (+ i 1) #\- (cons #\- ones) pairs classes))
;; alternately permissively allow this as a -
((not prev-char)
(error "bad pcre char-set, unexpected -" str))
(else
(let ((ch (string-ref str (+ i 1))))
(apply
(lambda (c j)
(if (char<? c prev-char)
(error "inverted range in pcre char-set" prev-char c)
(go j #f (cdr ones) (cons c (cons prev-char pairs))
classes)))
(cond
((and (eqv? #\\ ch)
(assv (string-ref str (+ i 2)) posix-escape-sequences))
=> (lambda (x) (list (cdr x) (+ i 3))))
((and (eqv? #\\ ch)
(eqv? (string-ref str (+ i 2)) #\x))
(string-parse-hex-escape str (+ i 3) end))
(else
(list ch (+ i 2)))))))))
((#\[)
(let* ((inv? (eqv? #\^ (string-ref str (+ i 1))))
(i2 (if inv? (+ i 2) (+ i 1))))
(case (string-ref str i2)
((#\:)
(let ((j (string-find str #\: (+ i2 1) end)))
(if (or (>= (+ j 1) end)
(not (eqv? #\] (string-ref str (+ j 1)))))
(error "incomplete character class" str)
(let* ((class (string->symbol (substring str (+ i2 1) j)))
(class (if inv? `(~ ,class) class)))
(go (+ j 2) #f ones pairs (cons class classes))))))
((#\= #\.)
(error "collating sequences not supported" str))
(else
(go (+ i 1) #\[ (cons #\[ ones) pairs classes)))))
((#\\)
(let ((c (string-ref str (+ i 1))))
(case c
((#\d #\D #\s #\S #\w #\W)
(go (+ i 2) #f ones pairs
(cons (pcre->sre (string #\\ c)) classes)))
((#\x)
(apply
(lambda (c j) (go j c (cons c ones) pairs classes))
(string-parse-hex-escape str (+ i 2) end)))
(else
(let ((c (cond ((assv c posix-escape-sequences) => cdr)
(else c))))
(go (+ i 2) c (cons c ones) pairs classes))))))
(else
=> (lambda (c) (go (+ i 1) c (cons c ones) pairs classes)))))
(if invert?
(let ((ones (if (flag-set? flags ~multi-line?) '(#\newline) '())))
(go (+ start 1) #f ones '() '()))
(go start #f '() '() '()))))
;; build a (seq ls ...) sre from a list
(define (sre-sequence ls)
(cond
((null? ls) 'epsilon)
((null? (cdr ls)) (car ls))
(else (cons 'seq ls))))
;; build a (or ls ...) sre from a list
(define (sre-alternate ls)
(cond
((null? ls) '(or))
((null? (cdr ls)) (car ls))
(else (cons 'or ls))))
;; returns #t if the sre can ever be empty
(define (sre-empty? sre)
(if (pair? sre)
(case (car sre)
((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t)
((**) (or (not (number? (cadr sre))) (zero? (cadr sre))))
((or) (any sre-empty? (cdr sre)))
((: seq $ submatch => submatch-named + atomic)
(every sre-empty? (cdr sre)))
(else #f))
(memq sre '(epsilon bos eos bol eol bow eow commit))))
;; returns #t if the sre is a */+ repetition
(define (sre-repeater? sre)
(and (pair? sre)
(or (memq (car sre) '(* +))
(and (memq (car sre) '($ submatch => submatch-named seq :))
(pair? (cdr sre))
(null? (cddr sre))
(sre-repeater? (cadr sre))))))
(define (pcre->sre str . o)
(if (not (string? str))
(error "pcre->sre: expected a string" str))
(let ((end (string-length str))
(orig-flags (if (pair? o) (symbol-list->flags (car o)) ~none)))
(let lp ((i 0) (from 0) (flags orig-flags) (res '()) (st '()))
;; accumulate the substring from..i as literal text
(define (collect)
(if (= i from) res (cons (substring str from i) res)))
;; like collect but breaks off the last single character when
;; collecting literal data, as the argument to ?/*/+ etc.
(define (collect/single)
(let ((j (- i 1)))
(cond
((< j from)
res)
(else
(let ((c (string-ref str j)))
(cond
((= j from)
(cons c res))
(else
(cons c (cons (substring str from j) res)))))))))
;; collects for use as a result, reversing and grouping OR
;; terms, and some ugly tweaking of `function-like' groups and
;; conditionals
(define (collect/terms)
(let* ((ls (collect))
(func
(and (pair? ls)
(memq (last ls)
'(atomic if look-ahead neg-look-ahead
look-behind neg-look-behind
=> submatch-named))))
(prefix (if (and func (memq (car func) '(=> submatch-named)))
(list 'submatch-named (cadr (reverse ls)))
(and func (list (car func)))))
(ls (if func
(if (memq (car func) '(=> submatch-named))
(reverse (cddr (reverse ls)))
(reverse (cdr (reverse ls))))
ls)))
(let lp ((ls ls) (term '()) (res '()))
(define (shift)
(cons (sre-sequence term) res))
(cond
((null? ls)
(let* ((res (sre-alternate (shift)))
(res (if (flag-set? flags ~save?)
(list 'submatch res)
res)))
(if prefix
(if (eq? 'if (car prefix))
(cond
((not (pair? res))
'epsilon)
((memq (car res)
'(look-ahead neg-look-ahead
look-behind neg-look-behind))
res)
((eq? 'seq (car res))
`(if ,(cadr res)
,(sre-sequence (cddr res))))
(else
`(if ,(cadadr res)
,(sre-sequence (cddadr res))
,(sre-alternate (cddr res)))))
`(,@prefix ,res))
res)))
((eq? 'or (car ls)) (lp (cdr ls) '() (shift)))
(else (lp (cdr ls) (cons (car ls) term) res))))))
(define (save)
(cons (cons flags (collect)) st))
;; main parsing
(cond
((>= i end)
(if (pair? st)
(error "unterminated parenthesis in regexp" str)
(collect/terms)))
(else
(case (string-ref str i)
((#\.)
(lp (+ i 1) (+ i 1) flags
(cons (if (flag-set? flags ~single-line?) 'any 'nonl)
(collect))
st))
((#\?)
(let ((res (collect/single)))
(if (null? res)
(error "? can't follow empty pattern" str res)
(let ((x (car res)))
(lp (+ i 1)
(+ i 1)
flags
(cons
(if (pair? x)
(case (car x)
((*) `(*? ,@(cdr x)))
((+) `(**? 1 #f ,@(cdr x)))
((?) `(?? ,@(cdr x)))
((**) `(**? ,@(cdr x)))
((=) `(**? ,(cadr x) ,@(cdr x)))
((>=) `(**? ,(cadr x) #f ,@(cddr x)))
(else `(? ,x)))
`(? ,x))
(cdr res))
st)))))
((#\+ #\*)
(let* ((res (collect/single))
(x (if (pair? res) (car res) 'epsilon))
(op (string->symbol (string (string-ref str i)))))
(cond
((sre-repeater? x)
(error "duplicate repetition (e.g. **) in pattern" str res))
((sre-empty? x)
(error "can't repeat empty pattern (e.g. ()*)" str res))
(else
(lp (+ i 1) (+ i 1) flags
(cons (list op x) (cdr res))
st)))))
((#\()
(cond
((>= (+ i 1) end)
(error "unterminated parenthesis in regexp" str))
((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case
(lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save)))
((>= (+ i 2) end)
(error "unterminated parenthesis in regexp" str))
((eqv? (string-ref str (+ i 1)) #\*)
(error "bad regexp syntax: (*FOO) not supported" str))
(else ;; (?...) case
(case (string-ref str (+ i 2))
((#\#)
(let ((j (string-find str #\) (+ i 3))))
(lp (+ j i) (min (+ j 1) end) flags (collect) st)))
((#\:)
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save)))
((#\=)
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
'(look-ahead) (save)))
((#\!)
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
'(neg-look-ahead) (save)))
((#\<)
(cond
((>= (+ i 3) end)
(error "unterminated parenthesis in regexp" str))
(else
(case (string-ref str (+ i 3))
((#\=)
(lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
'(look-behind) (save)))
((#\!)
(lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
'(neg-look-behind) (save)))
(else
(let ((j (and (char-alphabetic?
(string-ref str (+ i 3)))
(string-find str #\> (+ i 4)))))
(if (< j end)
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
`(,(string->symbol (substring str (+ i 3) j))
submatch-named)
(save))
(error "invalid (?< sequence" str))))))))
((#\>)
(lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
'(atomic) (save)))
;;((#\' #\P) ; named subpatterns
;; )
;;((#\R) ; recursion
;; )
((#\()
(cond
((>= (+ i 3) end)
(error "unterminated parenthesis in regexp" str))
((char-numeric? (string-ref str (+ i 3)))
(let* ((j (string-find str #\) (+ i 3)))
(n (string->number (substring str (+ i 3) j))))
(if (or (= j end) (not n))
(error "invalid conditional reference" str)
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
`(,n if) (save)))))
((char-alphabetic? (string-ref str (+ i 3)))
(let ((j (string-find str #\) (+ i 3))))
(if (= j end)
(error "invalid named conditional reference" str)
(lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
`(,(string->symbol (substring str (+ i 3) j)) if)
(save)))))
(else
(lp (+ i 2) (+ i 2) (flag-clear flags ~save?)
'(if) (save)))))
((#\{)
(error "unsupported Perl-style cluster" str))
(else
(let ((old-flags flags))
(let lp2 ((j (+ i 2)) (flags flags) (invert? #f))
(define (join x)
((if invert? flag-clear flag-join) flags x))
(cond
((>= j end)
(error "incomplete cluster" str i))
(else
(case (string-ref str j)
((#\i)
(lp2 (+ j 1) (join ~case-insensitive?) invert?))
((#\m)
(lp2 (+ j 1) (join ~multi-line?) invert?))
((#\x)
(lp2 (+ j 1) (join ~ignore-space?) invert?))
((#\-)
(lp2 (+ j 1) flags (not invert?)))
((#\))
(lp (+ j 1) (+ j 1) flags (collect)
st))
((#\:)
(lp (+ j 1) (+ j 1) flags '()
(cons (cons old-flags (collect)) st)))
(else
(error "unknown regex cluster modifier" str)
)))))))))))
((#\))
(if (null? st)
(error "too many )'s in regexp" str)
(lp (+ i 1)
(+ i 1)
(caar st)
(cons (collect/terms) (cdar st))
(cdr st))))
((#\[)
(apply
(lambda (sre j)
(lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))
(string-parse-cset str (+ i 1) flags)))
((#\{)
(cond
((or (>= (+ i 1) end)
(not (or (char-numeric? (string-ref str (+ i 1)))
(eqv? #\, (string-ref str (+ i 1))))))
(lp (+ i 1) from flags res st))
(else
(let ((res (collect/single)))
(cond
((null? res)
(error "{ can't follow empty pattern"))
(else
(let* ((x (car res))
(tail (cdr res))
(j (string-find str #\} (+ i 1)))
(s2 (string-split (substring str (+ i 1) j) #\,))
(n (string->number (car s2)))
(m (and (pair? (cdr s2))
(string->number (cadr s2)))))
(cond
((or (= j end)
(not n)
(and (pair? (cdr s2))
(not (equal? "" (cadr s2)))
(not m)))
(error "invalid {n} repetition syntax" s2))
((null? (cdr s2))
(lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st))
(m
(lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st))
(else
(lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st)
)))))))))
((#\\)
(cond
((>= (+ i 1) end)
(error "incomplete escape sequence" str))
(else
(let ((c (string-ref str (+ i 1))))
(case c
((#\d)
(lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st))
((#\D)
(lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st))
((#\s)
(lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st))
((#\S)
(lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st))
((#\w)
(lp (+ i 2) (+ i 2) flags
`((or alphanumeric ("_")) ,@(collect)) st))
((#\W)
(lp (+ i 2) (+ i 2) flags
`((~ (or alphanumeric ("_"))) ,@(collect)) st))
((#\b)
(lp (+ i 2) (+ i 2) flags
`((or bow eow) ,@(collect)) st))
((#\B)
(lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st))
((#\A)
(lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st))
((#\Z)
(lp (+ i 2) (+ i 2) flags
`((? #\newline) eos ,@(collect)) st))
((#\z)
(lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st))
((#\R)
(lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st))
((#\K)
(lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st))
;; these two are from Emacs and TRE, but not in PCRE
((#\<)
(lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st))
((#\>)
(lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st))
((#\x)
(apply
(lambda (ch j)
(lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st))
(string-parse-hex-escape str (+ i 2) end)))
((#\k)
(let ((c (string-ref str (+ i 2))))
(if (not (memv c '(#\< #\{ #\')))
(error "bad \\k usage, expected \\k<...>" str)
(let* ((terminal (char-mirror c))
(j (string-find str terminal (+ i 2)))
(s (substring str (+ i 3) j))
(backref
(if (flag-set? flags ~case-insensitive?)
'backref-ci
'backref)))
(if (= j end)
(error "unterminated named backref" str)
(lp (+ j 1) (+ j 1) flags
`((,backref ,(string->symbol s))
,@(collect))
st))))))
((#\Q) ;; \Q..\E escapes
(let ((res (collect)))
(let lp2 ((j (+ i 2)))
(cond
((>= j end)
(lp j (+ i 2) flags res st))
((eqv? #\\ (string-ref str j))
(cond
((>= (+ j 1) end)
(lp (+ j 1) (+ i 2) flags res st))
((eqv? #\E (string-ref str (+ j 1)))
(lp (+ j 2) (+ j 2) flags
(cons (substring str (+ i 2) j) res) st))
(else
(lp2 (+ j 2)))))
(else
(lp2 (+ j 1)))))))
;;((#\p) ; XXXX unicode properties
;; )
;;((#\P)
;; )
(else
(cond
((char-numeric? c)
(let* ((j (string-skip str char-numeric? (+ i 2)))
(backref
(if (flag-set? flags ~case-insensitive?)
'backref-ci
'backref))
(res `((,backref ,(string->number
(substring str (+ i 1) j)))
,@(collect))))
(lp j j flags res st)))
((char-alphabetic? c)
(let ((cell (assv c posix-escape-sequences)))
(if cell
(lp (+ i 2) (+ i 2) flags
(cons (cdr cell) (collect)) st)
(error "unknown escape sequence" str c))))
(else
(lp (+ i 2) (+ i 1) flags (collect) st)))))))))
((#\|)
(lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st))
((#\^)
(let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos)))
(lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
((#\$)
(let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos)))
(lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
((#\space)
(if (flag-set? flags ~ignore-space?)
(lp (+ i 1) (+ i 1) flags (collect) st)
(lp (+ i 1) from flags res st)))
((#\#)
(if (flag-set? flags ~ignore-space?)
(let ((j (string-find str #\newline (+ i 1))))
(lp (+ j 1) (min (+ j 1) end) flags (collect) st))
(lp (+ i 1) from flags res st)))
(else
(lp (+ i 1) from flags res st))))))))
(define (pcre->regexp pcre . o)
(regexp (apply pcre->sre pcre o)))