#lang scheme/base
(require (for-syntax scheme/base scheme/match srfi/13/string)
parser-tools/lex)
(provide define-abstract-regexps make-rx make-lex)
(define-for-syntax regexp-keywords
'(union sequence kleene* kleene+ maybe complement range any save))
(define-syntax (define-abstract-regexps stx)
(define (interp x env)
(define (interp* xs)
(map (lambda (x)
(interp x env))
xs))
(syntax-case x ()
[(op are ...)
(and (identifier? #'op)
(memq (syntax->datum #'op) regexp-keywords))
`(,(syntax->datum #'op) ,@(interp* (syntax->list #'(are ...))))]
[datum
(let ([v (syntax->datum #'datum)])
(or (string? v) (char? v)))
(syntax->datum #'datum)]
[id
(identifier? #'id)
(cond
[(assf (λ (i) (free-identifier=? #'id i)) env) => cdr]
[else (raise-syntax-error 'define-abstract-regexps "unbound regexp variable" x x)])]))
(syntax-case stx ()
[(_ [name binding] ...)
(let loop ([names (syntax->list #'(name ...))]
[bindings (syntax->list #'(binding ...))]
[env '()])
(if (null? names)
(with-syntax ([(b ...) (reverse (map cdr env))])
#'(begin (define-syntax name 'b) ...))
(loop (cdr names)
(cdr bindings)
(cons (cons (car names) (interp (car bindings) env))
env))))]))
(define-for-syntax (string-append-map c->s s)
(string-fold-right (lambda (c rest)
(string-append (c->s c) rest))
""
s))
(define-for-syntax rx-special-chars
'(#\? #\| #\^ #\$ #\& #\( #\) #\{ #\} #\[ #\] #\+ #\- #\* #\. #\\))
(define-for-syntax (escape-rx c)
(if (memq c rx-special-chars)
(string #\\ c)
(string c)))
(define-for-syntax (single-char? x)
(or (char? x)
(and (string? x) (= (string-length x) 1))))
(define-syntax (make-rx stx)
(define (char-are? are)
(match are
[(list 'range c1 c2) #t]
[(list 'union (? char-are?) ...) #t]
[(list 'sequence (? char-are?)) #t]
[(list 'any) #t]
[(? char?) #t]
[(? string?) (= (string-length are) 1)]
[_ #f]))
(define (char-are->rx char-are)
(match char-are
[(list 'range c1 c2) (format "~a-~a" (escape-rx c1) (escape-rx c2))]
[(list 'union char-ares ...) (string-append-map escape-rx char-ares)]
[(list 'sequence char-are) (char-are->rx char-are)]
[(list 'any) "."]
[(? char?) (escape-rx char-are)]
[(? string?) (string-append-map escape-rx char-are)]
[_ (error 'are->rx "must match exactly one character")]))
(define (simple-are? are)
(match are
[(? char-are?) #t]
[(list 'range c1 c2) #t]
[(list 'union (? simple-are?)) #t]
[(list 'sequence (? simple-are?)) #t]
[(list 'any) #t]
[(list 'complement char-ares ...) #t]
[(list 'save ares ...) #t]
[(? char?) #t]
[(? string?) (= (string-length are) 1)]
[_ #f]))
(define (are->rx are)
(match are
[(list 'range char-are1 char-are2)
(format "[~a-~a]" (escape-rx char-are1) (escape-rx char-are2))]
[(list 'kleene* are)
(format "~a*" (are->sub-rx are))]
[(list 'kleene* ares ...)
(format "(?:~a)*" (apply string-append (map are->rx ares)))]
[(list 'kleene+ are)
(format "~a+" (are->sub-rx are))]
[(list 'kleene+ ares ...)
(format "(?:~a)+" (apply string-append (map are->rx ares)))]
[(list 'union are)
(are->rx are)]
[(list 'union (? char-are? ares) ...)
(apply string-append (cons "[" (append (map char-are->rx ares) (list "]"))))]
[(list 'union ares ...)
(string-join (map are->sub-rx ares) "|" 'infix)]
[(list 'sequence are)
(are->rx are)]
[(list 'sequence ares ...)
(apply string-append (map are->sub-rx ares))]
[(list 'any)
"."]
[(list 'maybe are)
(format "~a?" (are->sub-rx are))]
[(list 'maybe ares ...)
(format "(?:~a)?" (apply string-append (map are->rx ares)))]
[(list 'complement char-ares ...)
(format "[^~a]" (apply string-append (map char-are->rx char-ares)))]
[(list 'save are)
(format "(~a)" (are->rx are))]
[(? string?) (string-append-map escape-rx are)]
[(? char?) (escape-rx are)]))
(define (are->sub-rx are)
(if (simple-are? are)
(are->rx are)
(format "(?:~a)" (are->rx are))))
(syntax-case stx ()
[(_ id)
(identifier? #'id)
(with-syntax ([rx (string-append "^" (are->sub-rx (syntax-local-value #'id)))])
#'(regexp rx))]))
(define-lex-trans make-lex
(letrec ([are->sre (lambda (are)
(match are
[(list 'range char-are1 char-are2)
(with-syntax ([c1 char-are1]
[c2 char-are2])
#'(char-range c1 c2))]
[(list 'kleene* ares ...)
(with-syntax ([(ares* ...) (map are->sre ares)])
#'(repetition 0 +inf.0 (concatenation ares* ...)))]
[(list 'kleene+ ares ...)
(with-syntax ([(ares* ...) (map are->sre ares)])
#'(repetition 1 +inf.0 (concatenation ares* ...)))]
[(list 'union ares ...)
(with-syntax ([(ares* ...) (map are->sre ares)])
#'(union ares* ...))]
[(list 'sequence ares ...)
(with-syntax ([(ares* ...) (map are->sre ares)])
#'(concatenation ares* ...))]
[(list 'any)
#'(any-char)]
[(list 'maybe ares ...)
(with-syntax ([(ares* ...) (map are->sre ares)])
#'(repetition 0 1 (concatenation ares* ...)))]
[(list 'complement char-ares ...)
(with-syntax ([re (apply string-append
(map (lambda (x)
(cond
[(string? x) x]
[(char? x) (string x)]
[else (raise-syntax-error
'make-lex
"complement can only contain chars or strings")]))
char-ares))])
#'(char-complement (char-set re)))]
[(list 'save are)
(are->sre are)]
[(? string?)
(with-syntax ([s are])
#'s)]
[(? char?)
(with-syntax ([c are])
#'c)]))])
(lambda (stx)
(syntax-case stx ()
[(_ id)
(identifier? #'id)
(are->sre (syntax-local-value #'id))]))))
(define-syntax define-rx
(syntax-rules ()
[(_ name binding)
(begin
(define-abstract-regexps [t binding])
(define name (make-rx t)))]))