diff --git a/lib/chibi/regexp.scm b/lib/chibi/regexp.scm index 72a99ac3..1c3092c1 100644 --- a/lib/chibi/regexp.scm +++ b/lib/chibi/regexp.scm @@ -45,6 +45,7 @@ (define ~none 0) (define ~ci? 1) +(define ~ascii? 2) (define (flag-set? flags i) (= i (bitwise-and flags i))) (define (flag-join a b) (if b (bitwise-ior a b) a)) @@ -194,7 +195,8 @@ ((and (pair? md) (string-cursor? (car md)) (string-cursor? (cdr md))) (substring-cursor str (car md) (cdr md))) ((regexp-match? md) - (regexp-match-convert recurse? (regexp-match-matches md) (regexp-match-string md))) + (regexp-match-convert + recurse? (regexp-match-matches md) (regexp-match-string md))) (else md))) @@ -245,15 +247,19 @@ (cond ((>= i end) #t) - ((and (eqv? (regexp-match-ref m1 i) (regexp-match-ref m2 i)) - (eqv? (regexp-match-ref m1 (+ i 1)) (regexp-match-ref m2 (+ i 1)))) + ((and (eqv? (regexp-match-ref m1 i) + (regexp-match-ref m2 i)) + (eqv? (regexp-match-ref m1 (+ i 1)) + (regexp-match-ref m2 (+ i 1)))) (lp (+ i 2))) ((and (string-cursor? (regexp-match-ref m2 i)) (string-cursor? (regexp-match-ref m2 (+ i 1))) (or (not (string-cursor? (regexp-match-ref m1 i))) (not (string-cursor? (regexp-match-ref m1 (+ i 1)))) - (string-cursor? (regexp-match-ref m2 (+ i 1)) (regexp-match-ref m1 (+ i 1)))))) #f) @@ -469,6 +475,8 @@ (define char-set:control (ucs-range->char-set 0 32)) (define char-set:word-constituent (char-set-union char-set:letter char-set:digit (char-set #\_))) +(define %char-set:word-constituent + (char-set-union %char-set:letter %char-set:digit (char-set #\_))) (define (char-word-constituent? ch) (char-set-contains? char-set:word-constituent ch)) @@ -529,26 +537,47 @@ (match/bog str i2 ch2 start end matches))))) (define (lookup-char-set name flags) - (case name - ((any) char-set:full) - ((nonl) char-set:nonl) - ((lower-case lower) - (if (flag-set? flags ~ci?) char-set:letter char-set:lower-case)) - ((upper-case upper) - (if (flag-set? flags ~ci?) char-set:letter char-set:upper-case)) - ((alphabetic alpha) char-set:letter) - ((numeric num digit) char-set:digit) - ((alphanumeric alphanum alnum) char-set:letter+digit) - ((punctuation punct) char-set:punctuation) - ((graphic graph) char-set:graphic) - ((word-constituent) char-set:word-constituent) - ((whitespace white space) char-set:whitespace) - ((printing print) char-set:printing) - ((control cntrl) char-set:control) - ((hex-digit xdigit hex) char-set:hex-digit) - ((blank) char-set:blank) - ((ascii) char-set:ascii) - (else #f))) + (cond + ((flag-set? flags ~ascii?) + (case name + ((any) char-set:full) + ((nonl) char-set:nonl) + ((lower-case lower) + (if (flag-set? flags ~ci?) %char-set:letter %char-set:lower-case)) + ((upper-case upper) + (if (flag-set? flags ~ci?) %char-set:letter %char-set:upper-case)) + ((alphabetic alpha) %char-set:letter) + ((numeric num digit) %char-set:digit) + ((alphanumeric alphanum alnum) %char-set:letter+digit) + ((punctuation punct) %char-set:punctuation) + ((graphic graph) %char-set:graphic) + ((word-constituent) %char-set:word-constituent) + ((whitespace white space) %char-set:whitespace) + ((printing print) %char-set:printing) + ((control cntrl) %char-set:iso-control) + ((hex-digit xdigit hex) char-set:hex-digit) + ((ascii) char-set:ascii) + (else #f))) + (else + (case name + ((any) char-set:full) + ((nonl) char-set:nonl) + ((lower-case lower) + (if (flag-set? flags ~ci?) char-set:letter char-set:lower-case)) + ((upper-case upper) + (if (flag-set? flags ~ci?) char-set:letter char-set:upper-case)) + ((alphabetic alpha) char-set:letter) + ((numeric num digit) char-set:digit) + ((alphanumeric alphanum alnum) char-set:letter+digit) + ((punctuation punct) char-set:punctuation) + ((graphic graph) char-set:graphic) + ((word-constituent) char-set:word-constituent) + ((whitespace white space) char-set:whitespace) + ((printing print) char-set:printing) + ((control cntrl) char-set:control) + ((hex-digit xdigit hex) char-set:hex-digit) + ((ascii) char-set:ascii) + (else #f))))) (define (sre-flatten-ranges orig-ls) (let lp ((ls orig-ls) (res '())) @@ -777,6 +806,10 @@ (->rx `(: ,@(cdr sre)) (flag-clear flags ~ci?) next)) ((w/nocase) (->rx `(: ,@(cdr sre)) (flag-join flags ~ci?) next)) + ((w/unicode) + (->rx `(: ,@(cdr sre)) (flag-clear flags ~ascii?) next)) + ((w/ascii) + (->rx `(: ,@(cdr sre)) (flag-join flags ~ascii?) next)) (else (if (string? (car sre)) (make-char-state (sre->char-set sre flags) ~none next) diff --git a/lib/chibi/regexp.sld b/lib/chibi/regexp.sld index ee68df38..ed1cb7ad 100644 --- a/lib/chibi/regexp.sld +++ b/lib/chibi/regexp.sld @@ -10,9 +10,32 @@ ;; Chibi's char-set library is more factored than SRFI-14. (cond-expand (chibi - (import (chibi) (srfi 9) (chibi char-set) (chibi char-set full))) + (import (chibi) (srfi 9) (chibi char-set) + (chibi char-set full) + (prefix (chibi char-set ascii) %))) (else - (import (scheme base) (srfi 14)))) + (import (scheme base) (srfi 14)) + (begin + (define %char-set:letter + (char-set-intersection char-set:ascii char-set:letter)) + (define %char-set:lower-case + (char-set-intersection char-set:ascii char-set:lower-case)) + (define %char-set:upper-case + (char-set-intersection char-set:ascii char-set:upper-case)) + (define %char-set:digit + (char-set-intersection char-set:ascii char-set:digit)) + (define %char-set:letter+digit + (char-set-intersection char-set:ascii char-set:letter+digit)) + (define %char-set:punctuation + (char-set-intersection char-set:ascii char-set:punctuation)) + (define %char-set:graphic + (char-set-intersection char-set:ascii char-set:graphic)) + (define %char-set:whitespace + (char-set-intersection char-set:ascii char-set:whitespace)) + (define %char-set:printing + (char-set-intersection char-set:ascii char-set:printing)) + (define %char-set:iso-control + (char-set-intersection char-set:ascii char-set:iso-control))))) (import (chibi char-set boundary)) ;; Use string-cursors where available. (begin diff --git a/tests/regexp-tests.scm b/tests/regexp-tests.scm index 1447fd31..87bafb87 100644 --- a/tests/regexp-tests.scm +++ b/tests/regexp-tests.scm @@ -143,6 +143,12 @@ (test-re #f '(* lower) "abcD") (test-re '("abcD") '(w/nocase (* lower)) "abcD") +(test-re '("кириллица") '(* alpha) "кириллица") +(test-re #f '(w/ascii (* alpha)) "кириллица") + +(test-re '("12345") '(* digit) "12345") +(test-re #f '(w/ascii (* digit)) "12345") + (test-re '("한") 'grapheme "한") (test-re '("글") 'grapheme "글")