Avoiding case-folding large, pre-defined Unicode char-sets.

This commit is contained in:
Alex Shinn 2013-07-27 15:32:22 +09:00
parent 131c8e93d4
commit 8afb59f9ac
2 changed files with 49 additions and 37 deletions

View file

@ -43,6 +43,13 @@
(error "expected a state" next2)) (error "expected a state" next2))
(%make-state accept? chars match match-rule next1 next2)) (%make-state accept? chars match match-rule next1 next2))
(define ~none 0)
(define ~ci? 1)
(define (flag-set? flags i) (= i (bitwise-and flags i)))
(define (flag-join a b) (if b (bitwise-ior a b) a))
(define (flag-clear a b) (bitwise-and a (bitwise-not b)))
(define (char-set-ci cset) (define (char-set-ci cset)
(let ((res (char-set))) (let ((res (char-set)))
(for-each (for-each
@ -53,7 +60,7 @@
res)) res))
(define (make-char-state ch flags next) (define (make-char-state ch flags next)
(if (= ~ci? (bitwise-and ~ci? flags)) (if (flag-set? flags ~ci?)
(let ((cset (cond ((char? ch) (char-set-ci (char-set ch))) (let ((cset (cond ((char? ch) (char-set-ci (char-set ch)))
((char-set? ch) (char-set-ci ch)) ((char-set? ch) (char-set-ci ch))
(else ch)))) (else ch))))
@ -439,16 +446,13 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Compiling ;; Compiling
(define ~none 0)
(define ~ci? 1)
(define (parse-flags ls) (define (parse-flags ls)
(define (symbol->flag s) (define (symbol->flag s)
(case s ((i ci case-insensitive) ~ci?) (else ~none))) (case s ((i ci case-insensitive) ~ci?) (else ~none)))
(let lp ((ls ls) (res ~none)) (let lp ((ls ls) (res ~none))
(if (not (pair? ls)) (if (not (pair? ls))
res res
(lp (cdr ls) (bitwise-ior res (symbol->flag (car ls))))))) (lp (cdr ls) (flag-join res (symbol->flag (car ls)))))))
(define char-set:nonl (define char-set:nonl
(char-set-difference char-set:full (char-set #\newline))) (char-set-difference char-set:full (char-set #\newline)))
@ -481,12 +485,14 @@
(char-word-constituent? (char-word-constituent?
(string-cursor-ref str (string-cursor-prev str i))))) (string-cursor-ref str (string-cursor-prev str i)))))
(define (lookup-char-set name) (define (lookup-char-set name flags)
(case name (case name
((any) char-set:full) ((any) char-set:full)
((nonl) char-set:nonl) ((nonl) char-set:nonl)
((lower-case lower) char-set:lower-case) ((lower-case lower)
((upper-case upper) char-set:upper-case) (if (flag-set? flags ~ci?) char-set:letter char-set:lower-case))
((upper-case upper)
(if (flag-set? flags ~ci?) char-set:letter char-set:upper-case))
((alphabetic alpha) char-set:letter) ((alphabetic alpha) char-set:letter)
((numeric num digit) char-set:digit) ((numeric num digit) char-set:digit)
((alphanumeric alphanum alnum) char-set:letter+digit) ((alphanumeric alphanum alnum) char-set:letter+digit)
@ -514,28 +520,31 @@
(else (else
(lp (cddr ls) (cons (cons (car ls) (cadr ls)) res)))))) (lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))))
(define (sre->char-set sre) (define (sre->char-set sre . o)
(let ((flags (if (pair? o) (car o) ~none)))
(define (->cs sre) (sre->char-set sre flags))
(cond (cond
((lookup-char-set sre)) ((lookup-char-set sre flags))
((char-set? sre) sre) ((char-set? sre) (char-set-ci sre))
((char? sre) (char-set sre)) ((char? sre) (char-set-ci (char-set sre)))
((pair? sre) ((pair? sre)
(if (string? (car sre)) (if (string? (car sre))
(string->char-set (car sre)) (string->char-set (car sre))
(case (car sre) (case (car sre)
((/) (sre->char-set ((/) (->cs
`(or ,@(map (lambda (x) `(or ,@(map (lambda (x)
(char-set-ci
(ucs-range->char-set (ucs-range->char-set
(char->integer (car x)) (char->integer (car x))
(+ 1 (char->integer (cdr x))))) (+ 1 (char->integer (cdr x))))))
(sre-flatten-ranges (cdr sre)))))) (sre-flatten-ranges (cdr sre))))))
((& and) (apply char-set-intersection (map sre->char-set (cdr sre)))) ((& and) (apply char-set-intersection (map ->cs (cdr sre))))
((|\|| or) (apply char-set-union (map sre->char-set (cdr sre)))) ((|\|| or) (apply char-set-union (map ->cs (cdr sre))))
((~ not) (char-set-complement (sre->char-set `(or ,@(cdr sre))))) ((~ not) (char-set-complement (->cs `(or ,@(cdr sre)))))
((-) (char-set-difference (sre->char-set (cadr sre)) ((-) (char-set-difference (->cs (cadr sre))
(sre->char-set `(or ,@(cddr sre))))) (->cs `(or ,@(cddr sre)))))
(else (error "invalid sre char-set" sre)))))
(else (error "invalid sre char-set" sre))))) (else (error "invalid sre char-set" sre)))))
(else (error "invalid sre char-set" sre))))
;;> Compile an \var{sre} into a regexp. ;;> Compile an \var{sre} into a regexp.
@ -562,8 +571,8 @@
(make-char-state sre flags next)) (make-char-state sre flags next))
((string? sre) ((string? sre)
(->rx (cons 'seq (string->list sre)) flags next)) (->rx (cons 'seq (string->list sre)) flags next))
((and (symbol? sre) (lookup-char-set sre)) ((and (symbol? sre) (lookup-char-set sre flags))
=> (lambda (cset) (make-char-state cset flags next))) => (lambda (cset) (make-char-state cset ~none next)))
((symbol? sre) ((symbol? sre)
(case sre (case sre
((epsilon) next) ((epsilon) next)
@ -655,22 +664,22 @@
(state-match-rule-set! n2 'list) (state-match-rule-set! n2 'list)
n1))) n1)))
((~ - & |\|| / and or not) ((~ - & |\|| / and or not)
(make-char-state (sre->char-set sre) flags next)) (make-char-state (sre->char-set sre flags) ~none next))
((word) ((word)
(->rx `(: bow ,@(cdr sre) eow) flags next)) (->rx `(: bow ,@(cdr sre) eow) flags next))
((word+) ((word+)
(->rx `(word (+ ,(char-set-intersection (->rx `(word (+ ,(char-set-intersection
char-set:word-constituent char-set:word-constituent
(sre->char-set `(or ,@(cdr sre)))))) (sre->char-set `(or ,@(cdr sre)) flags))))
flags flags
next)) next))
((w/case) ((w/case)
(->rx `(: ,@(cdr sre)) (bitwise-and flags (bitwise-not ~ci?)) next)) (->rx `(: ,@(cdr sre)) (flag-clear flags ~ci?) next))
((w/nocase) ((w/nocase)
(->rx `(: ,@(cdr sre)) (bitwise-ior flags ~ci?) next)) (->rx `(: ,@(cdr sre)) (flag-join flags ~ci?) next))
(else (else
(if (string? (car sre)) (if (string? (car sre))
(make-char-state (sre->char-set sre) flags next) (make-char-state (sre->char-set sre flags) ~none next)
(error "unknown sre" sre))))))) (error "unknown sre" sre)))))))
(let ((flags (parse-flags (and (pair? o) (car o))))) (let ((flags (parse-flags (and (pair? o) (car o)))))
(if (regexp? sre) (if (regexp? sre)

View file

@ -140,6 +140,9 @@
'(: (* digit) (w/nocase ($ (* (/"af"))))) '(: (* digit) (w/nocase ($ (* (/"af")))))
"12345BeeF") "12345BeeF")
(test-re #f '(* lower) "abcD")
(test-re '("abcD") '(w/nocase (* lower)) "abcD")
(test '("123" "456" "789") (regexp-extract '(+ digit) "abc123def456ghi789")) (test '("123" "456" "789") (regexp-extract '(+ digit) "abc123def456ghi789"))
(test '("123" "456" "789") (regexp-extract '(* digit) "abc123def456ghi789")) (test '("123" "456" "789") (regexp-extract '(* digit) "abc123def456ghi789"))
(test '("abc" "def" "ghi") (regexp-split '(+ digit) "abc123def456ghi789")) (test '("abc" "def" "ghi") (regexp-split '(+ digit) "abc123def456ghi789"))