mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-07-09 05:57:36 +02:00
Adding grapheme/bog/eog support.
Fixing offset computations in regexp-fold.
This commit is contained in:
parent
d543583e88
commit
906d071756
3 changed files with 77 additions and 13 deletions
|
@ -381,12 +381,10 @@
|
|||
|
||||
;; Run so long as there is more to match.
|
||||
|
||||
(define (regexp-run search? rx str . o)
|
||||
(let* ((start (string-start-arg str o))
|
||||
(end (string-end-arg str (if (pair? o) (cdr o) o)))
|
||||
(rx (regexp rx))
|
||||
(epsilons (posse))
|
||||
(accept (list #f)))
|
||||
(define (regexp-run-offsets search? rx str start end)
|
||||
(let ((rx (regexp rx))
|
||||
(epsilons (posse))
|
||||
(accept (list #f)))
|
||||
(let lp ((i start)
|
||||
(searchers1 (posse))
|
||||
(searchers2 (posse)))
|
||||
|
@ -425,6 +423,13 @@
|
|||
(posse-clear! searchers1)
|
||||
(lp i2 searchers2 searchers1)))))))
|
||||
|
||||
;; Wrapper to determine start and end offsets.
|
||||
|
||||
(define (regexp-run search? rx str . o)
|
||||
(let ((start (string-start-arg str o))
|
||||
(end (string-end-arg str (if (pair? o) (cdr o) o))))
|
||||
(regexp-run-offsets search? rx str start end)))
|
||||
|
||||
;;> Match the given regexp or SRE against the entire string and return
|
||||
;;> the match data on success. Returns \scheme{#f} on failure.
|
||||
|
||||
|
@ -484,6 +489,39 @@
|
|||
(string-cursor>? i start)
|
||||
(char-word-constituent?
|
||||
(string-cursor-ref str (string-cursor-prev str i)))))
|
||||
(define (match/bog str i ch start end matches)
|
||||
(and
|
||||
(string-cursor<? i end)
|
||||
(or (string-cursor=? i start)
|
||||
(let ((ch0 (string-cursor-ref str (string-cursor-prev str i))))
|
||||
(cond
|
||||
((eqv? ch0 #\return)
|
||||
(not (eqv? ch #\newline)))
|
||||
((char-set-contains? char-set:control ch0))
|
||||
((char-set-contains? char-set:regional-indicator ch0)
|
||||
(not (char-set-contains? char-set:regional-indicator ch)))
|
||||
((char-set-contains? char-set:hangul-l ch0)
|
||||
(not (or (char-set-contains? char-set:hangul-l ch0)
|
||||
(char-set-contains? char-set:hangul-lv ch0)
|
||||
(char-set-contains? char-set:hangul-lvt ch0)
|
||||
(char-set-contains? char-set:hangul-v ch0)
|
||||
(char-set-contains? char-set:hangul-t ch0))))
|
||||
((or (char-set-contains? char-set:hangul-lv ch0)
|
||||
(char-set-contains? char-set:hangul-v ch0))
|
||||
(not (or (char-set-contains? char-set:hangul-v ch0)
|
||||
(char-set-contains? char-set:hangul-t ch0))))
|
||||
((char-set-contains? char-set:hangul-t ch0)
|
||||
(not (char-set-contains? char-set:hangul-t ch0)))
|
||||
((char-set-contains? char-set:hangul-lvt ch0)
|
||||
(not (char-set-contains? char-set:hangul-t ch0)))
|
||||
(else
|
||||
(not (char-set-contains? char-set:extend-or-spacing-mark ch))))))))
|
||||
(define (match/eog str i ch start end matches)
|
||||
(and (string-cursor>? i start)
|
||||
(or (string-cursor>=? i end)
|
||||
(let* ((i2 (string-cursor-next str i))
|
||||
(ch2 (string-cursor-ref str i2)))
|
||||
(match/bog str i2 ch2 start end matches)))))
|
||||
|
||||
(define (lookup-char-set name flags)
|
||||
(case name
|
||||
|
@ -608,6 +646,21 @@
|
|||
((eol) (make-char-state match/eol flags next))
|
||||
((bow) (make-char-state match/bow flags next))
|
||||
((eow) (make-char-state match/eow flags next))
|
||||
((bog) (make-char-state match/bog flags next))
|
||||
((eog) (make-char-state match/eog flags next))
|
||||
((grapheme)
|
||||
(->rx
|
||||
`(or (: "\r\n")
|
||||
(: (* ,char-set:hangul-l)
|
||||
(or ,char-set:hangul-lvt
|
||||
(: (? ,char-set:hangul-lv) (* ,char-set:hangul-v)))
|
||||
(* ,char-set:hangul-t))
|
||||
(+ ,char-set:regional-indicator)
|
||||
control
|
||||
(: (~ control ("\r\n"))
|
||||
(+ ,char-set:extend-or-spacing-mark)))
|
||||
flags
|
||||
next))
|
||||
((word) (->rx '(word+ any) flags next))
|
||||
(else (error "unknown sre" sre))))
|
||||
((pair? sre)
|
||||
|
@ -743,10 +796,12 @@
|
|||
(from start)
|
||||
(acc knil))
|
||||
(cond
|
||||
((and (string-cursor<? i end) (regexp-search rx str i end))
|
||||
((and (string-cursor<? i end) (regexp-run-offsets #t rx str i end))
|
||||
=> (lambda (md)
|
||||
(let* ((j (rx-match-submatch-end md str 0)))
|
||||
(lp (if (>= j end) j (string-cursor-next str j))
|
||||
(let ((j (rx-match-ref md 1)))
|
||||
(lp (if (and (string-cursor=? i j) (string-cursor<? j end))
|
||||
(string-cursor-next str j)
|
||||
j)
|
||||
j
|
||||
(kons (string-offset->index str from) md str acc)))))
|
||||
(else
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
(import (chibi) (srfi 9) (chibi char-set) (chibi char-set full)))
|
||||
(else
|
||||
(import (scheme base) (srfi 14))))
|
||||
(import (chibi char-set boundary))
|
||||
;; Use string-cursors where available.
|
||||
(begin
|
||||
(define string-cursor? integer?))
|
||||
|
|
|
@ -11,9 +11,9 @@
|
|||
((test-re res rx str start end)
|
||||
(test res (regexp-match->sexp rx str start end)))
|
||||
((test-re res rx str start)
|
||||
(test-re res rx str start (string-cursor-end str)))
|
||||
(test-re res rx str start (string-length str)))
|
||||
((test-re res rx str)
|
||||
(test-re res rx str (string-cursor-start str)))))
|
||||
(test-re res rx str 0))))
|
||||
|
||||
(define (regexp-search->sexp rx str . o)
|
||||
(let ((res (apply regexp-search rx str o)))
|
||||
|
@ -24,9 +24,9 @@
|
|||
((test-re-search res rx str start end)
|
||||
(test res (regexp-search->sexp rx str start end)))
|
||||
((test-re-search res rx str start)
|
||||
(test-re-search res rx str start (string-cursor-end str)))
|
||||
(test-re-search res rx str start (string-length str)))
|
||||
((test-re-search res rx str)
|
||||
(test-re-search res rx str (string-cursor-start str)))))
|
||||
(test-re-search res rx str 0))))
|
||||
|
||||
(test-begin "regexp")
|
||||
|
||||
|
@ -143,12 +143,20 @@
|
|||
(test-re #f '(* lower) "abcD")
|
||||
(test-re '("abcD") '(w/nocase (* lower)) "abcD")
|
||||
|
||||
(test-re '("한") 'grapheme "한")
|
||||
(test-re '("글") 'grapheme "글")
|
||||
|
||||
(test '("123" "456" "789") (regexp-extract '(+ digit) "abc123def456ghi789"))
|
||||
(test '("123" "456" "789") (regexp-extract '(* digit) "abc123def456ghi789"))
|
||||
(test '("abc" "def" "ghi") (regexp-split '(+ digit) "abc123def456ghi789"))
|
||||
(test '("a" "b" "c" "d" "e" "f" "g" "h" "i")
|
||||
(regexp-split '(* digit) "abc123def456ghi789"))
|
||||
(test '("a" "b") (regexp-split '(+ whitespace) "a b"))
|
||||
(test '("한" "글")
|
||||
(regexp-extract
|
||||
'grapheme
|
||||
(utf8->string '#u8(#xe1 #x84 #x92 #xe1 #x85 #xa1 #xe1 #x86 #xab
|
||||
#xe1 #x84 #x80 #xe1 #x85 #xb3 #xe1 #x86 #xaf))))
|
||||
|
||||
(test "abc def" (regexp-replace '(+ space) "abc \t\n def" " "))
|
||||
(test " abc-abc"
|
||||
|
|
Loading…
Add table
Reference in a new issue