mirror of
https://github.com/ashinn/chibi-scheme.git
synced 2025-07-07 05:06:37 +02:00
Avoiding case-folding large, pre-defined Unicode char-sets.
This commit is contained in:
parent
131c8e93d4
commit
8afb59f9ac
2 changed files with 49 additions and 37 deletions
|
@ -43,6 +43,13 @@
|
||||||
(error "expected a state" next2))
|
(error "expected a state" next2))
|
||||||
(%make-state accept? chars match match-rule next1 next2))
|
(%make-state accept? chars match match-rule next1 next2))
|
||||||
|
|
||||||
|
(define ~none 0)
|
||||||
|
(define ~ci? 1)
|
||||||
|
|
||||||
|
(define (flag-set? flags i) (= i (bitwise-and flags i)))
|
||||||
|
(define (flag-join a b) (if b (bitwise-ior a b) a))
|
||||||
|
(define (flag-clear a b) (bitwise-and a (bitwise-not b)))
|
||||||
|
|
||||||
(define (char-set-ci cset)
|
(define (char-set-ci cset)
|
||||||
(let ((res (char-set)))
|
(let ((res (char-set)))
|
||||||
(for-each
|
(for-each
|
||||||
|
@ -53,7 +60,7 @@
|
||||||
res))
|
res))
|
||||||
|
|
||||||
(define (make-char-state ch flags next)
|
(define (make-char-state ch flags next)
|
||||||
(if (= ~ci? (bitwise-and ~ci? flags))
|
(if (flag-set? flags ~ci?)
|
||||||
(let ((cset (cond ((char? ch) (char-set-ci (char-set ch)))
|
(let ((cset (cond ((char? ch) (char-set-ci (char-set ch)))
|
||||||
((char-set? ch) (char-set-ci ch))
|
((char-set? ch) (char-set-ci ch))
|
||||||
(else ch))))
|
(else ch))))
|
||||||
|
@ -439,16 +446,13 @@
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; Compiling
|
;; Compiling
|
||||||
|
|
||||||
(define ~none 0)
|
|
||||||
(define ~ci? 1)
|
|
||||||
|
|
||||||
(define (parse-flags ls)
|
(define (parse-flags ls)
|
||||||
(define (symbol->flag s)
|
(define (symbol->flag s)
|
||||||
(case s ((i ci case-insensitive) ~ci?) (else ~none)))
|
(case s ((i ci case-insensitive) ~ci?) (else ~none)))
|
||||||
(let lp ((ls ls) (res ~none))
|
(let lp ((ls ls) (res ~none))
|
||||||
(if (not (pair? ls))
|
(if (not (pair? ls))
|
||||||
res
|
res
|
||||||
(lp (cdr ls) (bitwise-ior res (symbol->flag (car ls)))))))
|
(lp (cdr ls) (flag-join res (symbol->flag (car ls)))))))
|
||||||
|
|
||||||
(define char-set:nonl
|
(define char-set:nonl
|
||||||
(char-set-difference char-set:full (char-set #\newline)))
|
(char-set-difference char-set:full (char-set #\newline)))
|
||||||
|
@ -481,12 +485,14 @@
|
||||||
(char-word-constituent?
|
(char-word-constituent?
|
||||||
(string-cursor-ref str (string-cursor-prev str i)))))
|
(string-cursor-ref str (string-cursor-prev str i)))))
|
||||||
|
|
||||||
(define (lookup-char-set name)
|
(define (lookup-char-set name flags)
|
||||||
(case name
|
(case name
|
||||||
((any) char-set:full)
|
((any) char-set:full)
|
||||||
((nonl) char-set:nonl)
|
((nonl) char-set:nonl)
|
||||||
((lower-case lower) char-set:lower-case)
|
((lower-case lower)
|
||||||
((upper-case upper) char-set:upper-case)
|
(if (flag-set? flags ~ci?) char-set:letter char-set:lower-case))
|
||||||
|
((upper-case upper)
|
||||||
|
(if (flag-set? flags ~ci?) char-set:letter char-set:upper-case))
|
||||||
((alphabetic alpha) char-set:letter)
|
((alphabetic alpha) char-set:letter)
|
||||||
((numeric num digit) char-set:digit)
|
((numeric num digit) char-set:digit)
|
||||||
((alphanumeric alphanum alnum) char-set:letter+digit)
|
((alphanumeric alphanum alnum) char-set:letter+digit)
|
||||||
|
@ -514,28 +520,31 @@
|
||||||
(else
|
(else
|
||||||
(lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))))
|
(lp (cddr ls) (cons (cons (car ls) (cadr ls)) res))))))
|
||||||
|
|
||||||
(define (sre->char-set sre)
|
(define (sre->char-set sre . o)
|
||||||
(cond
|
(let ((flags (if (pair? o) (car o) ~none)))
|
||||||
((lookup-char-set sre))
|
(define (->cs sre) (sre->char-set sre flags))
|
||||||
((char-set? sre) sre)
|
(cond
|
||||||
((char? sre) (char-set sre))
|
((lookup-char-set sre flags))
|
||||||
((pair? sre)
|
((char-set? sre) (char-set-ci sre))
|
||||||
(if (string? (car sre))
|
((char? sre) (char-set-ci (char-set sre)))
|
||||||
(string->char-set (car sre))
|
((pair? sre)
|
||||||
(case (car sre)
|
(if (string? (car sre))
|
||||||
((/) (sre->char-set
|
(string->char-set (car sre))
|
||||||
`(or ,@(map (lambda (x)
|
(case (car sre)
|
||||||
(ucs-range->char-set
|
((/) (->cs
|
||||||
(char->integer (car x))
|
`(or ,@(map (lambda (x)
|
||||||
(+ 1 (char->integer (cdr x)))))
|
(char-set-ci
|
||||||
(sre-flatten-ranges (cdr sre))))))
|
(ucs-range->char-set
|
||||||
((& and) (apply char-set-intersection (map sre->char-set (cdr sre))))
|
(char->integer (car x))
|
||||||
((|\|| or) (apply char-set-union (map sre->char-set (cdr sre))))
|
(+ 1 (char->integer (cdr x))))))
|
||||||
((~ not) (char-set-complement (sre->char-set `(or ,@(cdr sre)))))
|
(sre-flatten-ranges (cdr sre))))))
|
||||||
((-) (char-set-difference (sre->char-set (cadr sre))
|
((& and) (apply char-set-intersection (map ->cs (cdr sre))))
|
||||||
(sre->char-set `(or ,@(cddr sre)))))
|
((|\|| or) (apply char-set-union (map ->cs (cdr sre))))
|
||||||
(else (error "invalid sre char-set" sre)))))
|
((~ not) (char-set-complement (->cs `(or ,@(cdr sre)))))
|
||||||
(else (error "invalid sre char-set" sre))))
|
((-) (char-set-difference (->cs (cadr sre))
|
||||||
|
(->cs `(or ,@(cddr sre)))))
|
||||||
|
(else (error "invalid sre char-set" sre)))))
|
||||||
|
(else (error "invalid sre char-set" sre)))))
|
||||||
|
|
||||||
;;> Compile an \var{sre} into a regexp.
|
;;> Compile an \var{sre} into a regexp.
|
||||||
|
|
||||||
|
@ -562,8 +571,8 @@
|
||||||
(make-char-state sre flags next))
|
(make-char-state sre flags next))
|
||||||
((string? sre)
|
((string? sre)
|
||||||
(->rx (cons 'seq (string->list sre)) flags next))
|
(->rx (cons 'seq (string->list sre)) flags next))
|
||||||
((and (symbol? sre) (lookup-char-set sre))
|
((and (symbol? sre) (lookup-char-set sre flags))
|
||||||
=> (lambda (cset) (make-char-state cset flags next)))
|
=> (lambda (cset) (make-char-state cset ~none next)))
|
||||||
((symbol? sre)
|
((symbol? sre)
|
||||||
(case sre
|
(case sre
|
||||||
((epsilon) next)
|
((epsilon) next)
|
||||||
|
@ -655,22 +664,22 @@
|
||||||
(state-match-rule-set! n2 'list)
|
(state-match-rule-set! n2 'list)
|
||||||
n1)))
|
n1)))
|
||||||
((~ - & |\|| / and or not)
|
((~ - & |\|| / and or not)
|
||||||
(make-char-state (sre->char-set sre) flags next))
|
(make-char-state (sre->char-set sre flags) ~none next))
|
||||||
((word)
|
((word)
|
||||||
(->rx `(: bow ,@(cdr sre) eow) flags next))
|
(->rx `(: bow ,@(cdr sre) eow) flags next))
|
||||||
((word+)
|
((word+)
|
||||||
(->rx `(word (+ ,(char-set-intersection
|
(->rx `(word (+ ,(char-set-intersection
|
||||||
char-set:word-constituent
|
char-set:word-constituent
|
||||||
(sre->char-set `(or ,@(cdr sre))))))
|
(sre->char-set `(or ,@(cdr sre)) flags))))
|
||||||
flags
|
flags
|
||||||
next))
|
next))
|
||||||
((w/case)
|
((w/case)
|
||||||
(->rx `(: ,@(cdr sre)) (bitwise-and flags (bitwise-not ~ci?)) next))
|
(->rx `(: ,@(cdr sre)) (flag-clear flags ~ci?) next))
|
||||||
((w/nocase)
|
((w/nocase)
|
||||||
(->rx `(: ,@(cdr sre)) (bitwise-ior flags ~ci?) next))
|
(->rx `(: ,@(cdr sre)) (flag-join flags ~ci?) next))
|
||||||
(else
|
(else
|
||||||
(if (string? (car sre))
|
(if (string? (car sre))
|
||||||
(make-char-state (sre->char-set sre) flags next)
|
(make-char-state (sre->char-set sre flags) ~none next)
|
||||||
(error "unknown sre" sre)))))))
|
(error "unknown sre" sre)))))))
|
||||||
(let ((flags (parse-flags (and (pair? o) (car o)))))
|
(let ((flags (parse-flags (and (pair? o) (car o)))))
|
||||||
(if (regexp? sre)
|
(if (regexp? sre)
|
||||||
|
|
|
@ -140,6 +140,9 @@
|
||||||
'(: (* digit) (w/nocase ($ (* (/"af")))))
|
'(: (* digit) (w/nocase ($ (* (/"af")))))
|
||||||
"12345BeeF")
|
"12345BeeF")
|
||||||
|
|
||||||
|
(test-re #f '(* lower) "abcD")
|
||||||
|
(test-re '("abcD") '(w/nocase (* lower)) "abcD")
|
||||||
|
|
||||||
(test '("123" "456" "789") (regexp-extract '(+ digit) "abc123def456ghi789"))
|
(test '("123" "456" "789") (regexp-extract '(+ digit) "abc123def456ghi789"))
|
||||||
(test '("123" "456" "789") (regexp-extract '(* digit) "abc123def456ghi789"))
|
(test '("123" "456" "789") (regexp-extract '(* digit) "abc123def456ghi789"))
|
||||||
(test '("abc" "def" "ghi") (regexp-split '(+ digit) "abc123def456ghi789"))
|
(test '("abc" "def" "ghi") (regexp-split '(+ digit) "abc123def456ghi789"))
|
||||||
|
|
Loading…
Add table
Reference in a new issue