(module lexer mzscheme
(require (lib "lex.ss" "parser-tools")
(lib "contract.ss")
(lib "etc.ss")
(prefix : (lib "lex-sre.ss" "parser-tools")))
(provide/contract [plt-lexer (input-port? . -> . position-token?)])
(define-tokens token (atom special-atom quoter-prefix prefix suffix space end))
(define-lex-abbrev digit (char-set "0123456789"))
(define-lex-abbrev opener-prefix
(:or ""
"#"
"#hash"
"#hasheq"
(:: "#" (:+ digit))
(:: "#;")))
(define-lex-abbrev openers (:or (:: opener-prefix (:or "(" "[" "{"))))
(define-lex-abbrev closers (:or ")" "]" "}"))
(define-lex-abbrev quoters (:or "'" "`" "," ",@"
"#;" "#&" "#'" "#`" "#," "#,@"
"#cs" "#ci"))
(define-lex-abbrev nl (:or "\r\n"
"\n"
"\r"))
(define-lex-abbrev atom-delims (:or whitespace (char-set "\",'`;()[]{}")))
(define-lex-abbrev escaped-symbol-chars
(:or (:: "|" (:* (char-complement (char-set "|"))) "|")
(:: "\\" any-char)))
(define-lex-abbrev pound-prefix-free-atom
(:& (complement (:: "#" any-string))
atom-chars))
(define-lex-abbrev atom-chars
(:+ (:or (:& (char-complement (char-set "|\\"))
(char-complement (:& any-char atom-delims)))
escaped-symbol-chars)))
(define-lex-abbrev pounded-atoms
(:or "#t" "#f" "#T" "#F"
(:: "#" (char-set "eibodx") pound-prefix-free-atom) (:: "#%" pound-prefix-free-atom)
(:: "#:" atom-chars) (:: "#\\" any-char)
(:: "#\\" alphabetic alphabetic (:* alphabetic))
(:: "#\\" (:? any-char) (:+ digit))
(:: "#" (:+ digit) "#")
(:: "#" (:+ digit) "=")))
(define-lex-abbrev string-literal (::
(:? (:or "#rx" "#px"))
(:? "#") "\""
(:* (:or (char-complement (char-set "\"\\"))
(:: #\\ any-char)))
"\""))
(define-lex-abbrev line-comment (:: ";" (:& any-string
(complement (:: any-string nl any-string)))))
(define (get-here-string i error-k)
(local
[(define (get-offset i)
(let-values (((x y offset) (port-next-location i)))
offset))
(define (special-read-line i)
(let ((next (peek-char-or-special i)))
(cond
((or (eq? next #\newline) (not (char? next)))
null)
(else
(read-char i)
(cons next (special-read-line i))))))]
(let* ((ender (list->string (special-read-line i)))
(next-char (peek-char-or-special i)))
(cond
((or (equal? ender "") (not (eq? #\newline next-char)))
(error-k (string-append "#<<" ender)))
(else
(read-char i)
(let loop ((acc (list (string-append "#<<" ender "\n"))))
(let* ((next-line (list->string (special-read-line i)))
(next-char (peek-char-or-special i)))
(cond
((not (or (char? next-char) (eof-object? next-char))) (error-k (apply string-append (reverse! (cons next-line acc)))))
((equal? next-line ender) (apply string-append (reverse! (cons next-line acc))))
((eof-object? next-char)
(error-k (apply string-append (reverse! (cons next-line acc)))))
(else
(read-char i)
(loop (cons (string-append next-line "\n") acc)))))))))))
(define (get-nested-comment ip)
(local ((define nested-lexer
(begin-lifted
(lexer
("#|"
(lambda (self i loop input-port)
(cons "#|"
(loop (self input-port) (add1 i)))))
("|#"
(lambda (self i loop input-port)
(cond
[(= i 1)
(list "|#")]
[else
(cons "|#"
(loop (self input-port) (sub1 i)))])))
(any-char
(lambda (self i loop input-port)
(cons lexeme (loop (self input-port) i))))
((eof)
(lambda (self i loop input-port)
(error 'get-nested-comment "expected |#")))))))
(apply string-append
(cons "#|"
(let loop ([next-action (nested-lexer ip)]
[depth 1])
(next-action nested-lexer depth loop ip))))))
(define continuation-line-lexer
(lexer
((:: (:* (:: (complement (:: any-string nl any-string)) "\\" nl))
(:: (complement (:: any-string nl any-string))) nl)
lexeme)))
(define plt-lexer
(local ((define lexer
(lexer-src-pos
("#!"
(cond
[(= 0 (position-offset start-pos))
(token-atom
(string-append "#!" (continuation-line-lexer input-port)))]
[else (token-atom lexeme)]))
(openers
(token-prefix lexeme))
(closers
(token-suffix lexeme))
((repetition 1 +inf.0 whitespace)
(token-space lexeme))
(quoters
(token-quoter-prefix lexeme))
(pounded-atoms
(token-atom lexeme))
(pound-prefix-free-atom
(token-atom lexeme))
(line-comment
(token-atom lexeme))
(string-literal
(token-atom lexeme))
("#<<"
(token-atom (get-here-string input-port
(lambda (recovery) recovery))))
("#|"
(token-atom (get-nested-comment input-port)))
((special-comment)
(begin
(token-special-atom lexeme)))
((special)
(token-special-atom lexeme))
((eof) (token-end lexeme))
(any-char (token-atom lexeme)))))
(lambda (ip)
(lexer ip)))))