From 906d0717569d4d0651d56b3077424e726eaa03ba Mon Sep 17 00:00:00 2001 From: Alex Shinn Date: Tue, 29 Oct 2013 06:49:13 +0900 Subject: [PATCH] Adding grapheme/bog/eog support. Fixing offset computations in regexp-fold. --- lib/chibi/regexp.scm | 73 ++++++++++++++++++++++++++++++++++++------ lib/chibi/regexp.sld | 1 + tests/regexp-tests.scm | 16 ++++++--- 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/lib/chibi/regexp.scm b/lib/chibi/regexp.scm index 4f132696..68fa28ea 100644 --- a/lib/chibi/regexp.scm +++ b/lib/chibi/regexp.scm @@ -381,12 +381,10 @@ ;; Run so long as there is more to match. -(define (regexp-run search? rx str . o) - (let* ((start (string-start-arg str o)) - (end (string-end-arg str (if (pair? o) (cdr o) o))) - (rx (regexp rx)) - (epsilons (posse)) - (accept (list #f))) +(define (regexp-run-offsets search? rx str start end) + (let ((rx (regexp rx)) + (epsilons (posse)) + (accept (list #f))) (let lp ((i start) (searchers1 (posse)) (searchers2 (posse))) @@ -425,6 +423,13 @@ (posse-clear! searchers1) (lp i2 searchers2 searchers1))))))) +;; Wrapper to determine start and end offsets. + +(define (regexp-run search? rx str . o) + (let ((start (string-start-arg str o)) + (end (string-end-arg str (if (pair? o) (cdr o) o)))) + (regexp-run-offsets search? rx str start end))) + ;;> Match the given regexp or SRE against the entire string and return ;;> the match data on success. Returns \scheme{#f} on failure. @@ -484,6 +489,39 @@ (string-cursor>? i start) (char-word-constituent? (string-cursor-ref str (string-cursor-prev str i))))) +(define (match/bog str i ch start end matches) + (and + (string-cursor? i start) + (or (string-cursor>=? i end) + (let* ((i2 (string-cursor-next str i)) + (ch2 (string-cursor-ref str i2))) + (match/bog str i2 ch2 start end matches))))) (define (lookup-char-set name flags) (case name @@ -608,6 +646,21 @@ ((eol) (make-char-state match/eol flags next)) ((bow) (make-char-state match/bow flags next)) ((eow) (make-char-state match/eow flags next)) + ((bog) (make-char-state match/bog flags next)) + ((eog) (make-char-state match/eog flags next)) + ((grapheme) + (->rx + `(or (: "\r\n") + (: (* ,char-set:hangul-l) + (or ,char-set:hangul-lvt + (: (? ,char-set:hangul-lv) (* ,char-set:hangul-v))) + (* ,char-set:hangul-t)) + (+ ,char-set:regional-indicator) + control + (: (~ control ("\r\n")) + (+ ,char-set:extend-or-spacing-mark))) + flags + next)) ((word) (->rx '(word+ any) flags next)) (else (error "unknown sre" sre)))) ((pair? sre) @@ -743,10 +796,12 @@ (from start) (acc knil)) (cond - ((and (string-cursor (lambda (md) - (let* ((j (rx-match-submatch-end md str 0))) - (lp (if (>= j end) j (string-cursor-next str j)) + (let ((j (rx-match-ref md 1))) + (lp (if (and (string-cursor=? i j) (string-cursorindex str from) md str acc))))) (else diff --git a/lib/chibi/regexp.sld b/lib/chibi/regexp.sld index eec956de..5a47539d 100644 --- a/lib/chibi/regexp.sld +++ b/lib/chibi/regexp.sld @@ -13,6 +13,7 @@ (import (chibi) (srfi 9) (chibi char-set) (chibi char-set full))) (else (import (scheme base) (srfi 14)))) + (import (chibi char-set boundary)) ;; Use string-cursors where available. (begin (define string-cursor? integer?)) diff --git a/tests/regexp-tests.scm b/tests/regexp-tests.scm index 16c45a4c..7ac6610c 100644 --- a/tests/regexp-tests.scm +++ b/tests/regexp-tests.scm @@ -11,9 +11,9 @@ ((test-re res rx str start end) (test res (regexp-match->sexp rx str start end))) ((test-re res rx str start) - (test-re res rx str start (string-cursor-end str))) + (test-re res rx str start (string-length str))) ((test-re res rx str) - (test-re res rx str (string-cursor-start str))))) + (test-re res rx str 0)))) (define (regexp-search->sexp rx str . o) (let ((res (apply regexp-search rx str o))) @@ -24,9 +24,9 @@ ((test-re-search res rx str start end) (test res (regexp-search->sexp rx str start end))) ((test-re-search res rx str start) - (test-re-search res rx str start (string-cursor-end str))) + (test-re-search res rx str start (string-length str))) ((test-re-search res rx str) - (test-re-search res rx str (string-cursor-start str))))) + (test-re-search res rx str 0)))) (test-begin "regexp") @@ -143,12 +143,20 @@ (test-re #f '(* lower) "abcD") (test-re '("abcD") '(w/nocase (* lower)) "abcD") +(test-re '("한") 'grapheme "한") +(test-re '("글") 'grapheme "글") + (test '("123" "456" "789") (regexp-extract '(+ digit) "abc123def456ghi789")) (test '("123" "456" "789") (regexp-extract '(* digit) "abc123def456ghi789")) (test '("abc" "def" "ghi") (regexp-split '(+ digit) "abc123def456ghi789")) (test '("a" "b" "c" "d" "e" "f" "g" "h" "i") (regexp-split '(* digit) "abc123def456ghi789")) (test '("a" "b") (regexp-split '(+ whitespace) "a b")) +(test '("한" "글") + (regexp-extract + 'grapheme + (utf8->string '#u8(#xe1 #x84 #x92 #xe1 #x85 #xa1 #xe1 #x86 #xab + #xe1 #x84 #x80 #xe1 #x85 #xb3 #xe1 #x86 #xaf)))) (test "abc def" (regexp-replace '(+ space) "abc \t\n def" " ")) (test " abc-abc"