(module tokenize mzscheme
(require (lib "list.ss" "srfi" "1")
(lib "string.ss")
(lib "class.ss")
(lib "match.ss")
(lib "etc.ss")
"ast.ss"
"cursor.ss"
"../config.ss"
"../exn.ss"
"regexps.ss"
"token.ss")
(define k 3)
(define scan-newlines? (make-parameter #f))
(define scan-infix-operator? (make-parameter #f))
(define (digit? ch)
(and (memq ch '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9))
#t))
(define (hex-digit? ch)
(and (memq ch '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9
#\A #\B #\C #\D #\E #\F
#\a #\b #\c #\d #\e #\f))
#t))
(define (oct-digit? ch)
(and (memq ch '(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7))
#t))
(define single-escape-characters
'((#\' . #\')
(#\" . #\")
(#\\ . #\\)
(#\b . #\backspace)
(#\f . #\page)
(#\n . #\newline)
(#\r . #\return)
(#\t . #\tab)
(#\v . #\vtab)))
(define (single-escape-char? ch)
(and (assq ch single-escape-characters)
#t))
(define (unescape-chars radix . chars)
(integer->char (string->number (list->string chars) radix)))
(define tokenizer<%>
(interface ()
fail fail/loc done? current-token match must-match peek-token peek-token/infix-operator peek-token/same-line read-token read-token/infix-operator read-token/same-line unread-token skip-whitespace ))
(define tokenizer%
(class* object% (tokenizer<%>)
(init port [name (object-name port)])
(define source port)
(define filename name)
(define cursor (make-cursor k))
(port-count-lines! source)
(public fail fail/loc show-state
done? current-token
(token:match match)
(token:must-match must-match)
peek-token peek-token/infix-operator peek-token/same-line
read-token read-token/infix-operator read-token/same-line
unread-token
skip-whitespace)
(define (current-position)
(let-values ([(line col offset) (port-next-location source)])
(make-position offset line col)))
(define (fail/loc loc text fmt . args)
(raise (make-exn:fail:javascript:syntax (string->immutable-string (apply format fmt args))
(current-continuation-marks)
this
loc
text)))
(define (fail fmt . args)
(send/apply this fail/loc #f #f fmt args))
(define (show-state . args)
(unless (null? args)
(apply fprintf (current-error-port) args)
(fprintf (current-error-port) ": "))
(let ([upcoming (peek-string 5 0 source)])
(fprintf (current-error-port) "~a [~v...]~n" cursor (if (eof-object? upcoming) "" upcoming))
#f))
(define (unescape-string str)
(let loop ([chars (string->list str)]
[result null])
(match chars
[() (list->string (reverse result))]
[(#\\) (fail "unterminated string literal")]
[(#\\ (and ec (? single-escape-char?)) . rest)
(loop rest (cons (cdr (assq ec single-escape-characters)) result))]
[(#\\ #\x (and d1 (? hex-digit?)) (and d2 (? hex-digit?)) . rest)
(loop rest (cons (unescape-chars 16 d1 d2) result))]
[(#\\ #\u (and d1 (? hex-digit?))
(and d2 (? hex-digit?))
(and d3 (? hex-digit?))
(and d4 (? hex-digit?)) . rest)
(loop rest (cons (unescape-chars 16 d1 d2 d3 d4) result))]
[(#\\ (and d1 (or #\0 #\1 #\2 #\3))
(and d2 (? oct-digit?))
(and d3 (? oct-digit?)) . rest)
(loop rest (cons (unescape-chars 8 d1 d2 d3) result))]
[(#\\ (and d1 (? oct-digit?))
(and d2 (? oct-digit?)) . rest)
(loop rest (cons (unescape-chars 8 d1 d2) result))]
[(#\\ (and d1 (? oct-digit?)) . rest)
(loop rest (cons (unescape-chars 8 d1) result))]
[(#\\ c . rest)
(loop rest (cons c result))]
[(c . rest)
(loop rest (cons c result))])))
(define (parse-regexp-pattern str)
str)
(define (done?)
(eq? (token-type (peek-token)) 'END))
(define (current-token)
(cursor-current cursor))
(define (token:match tt)
(and (eq? (token-type (peek-token)) tt)
(read-token)))
(define (token:must-match tt)
(unless (token:match tt)
(fail "missing ~a" (string-downcase (symbol->string tt))))
(current-token))
(define (skip-whitespace)
(let ([match (regexp-match-peek-positions #rx"^[ \t\v]+" source)])
(when match
(read-string (cdar match) source))
#f))
(define @
(opt-lambda (start [end (current-position)])
(make-region filename start end)))
(define-syntax within-region
(syntax-rules ()
[(_ type e ...)
(let ([start (current-position)]
[result (begin e ...)]
[end (current-position)])
(make-token type result (@ start end)))]))
(define (advance! n)
(when (positive? n)
(set! cursor (cursor-advance cursor read-next-token))
(advance! (sub1 n))))
(define (peek-token/same-line)
(parameterize ([scan-newlines? #t])
(peek-token)))
(define peek-token/infix-operator
(opt-lambda ([skip 0])
(parameterize ([scan-infix-operator? #t])
(peek-token skip))))
(define peek-token
(opt-lambda ([skip 0])
(if (zero? skip)
(begin0 (read-token)
(unread-token))
(begin (read-token)
(begin0 (peek-token (sub1 skip))
(unread-token))))))
(define (read-token/same-line)
(parameterize ([scan-newlines? #t])
(read-token)))
(define read-token/infix-operator
(opt-lambda ([skip 0])
(parameterize ([scan-infix-operator? #t])
(read-token skip))))
(define read-token
(opt-lambda ([skip 0])
(advance! (add1 skip))
(let ([token (cursor-current cursor)])
(cond
[(and (not (scan-newlines?))
(eq? (token-type token) 'NEWLINE))
(read-token)]
[(and (not (scan-infix-operator?))
(prefix-operator? (token-type token))
(infix-operator? (token-type token)))
(make-token 'UNARY (token-contents token) (token-location token))]
[else token]))))
(define (read-next-token)
(skip-whitespace)
(cond
[(eof-object? (peek-char source))
(make-token 'END #f (@ (current-position) (current-position)))]
[(regexp-match-peek-positions rx:empty source)
=> (lambda (match)
(let ([token (within-region 'NEWLINE
(length (regexp-match* #rx"\n" (read-string (cdar match) source))))])
(if (> (token-contents token) 0)
token
(read-next-token))))]
[(regexp-match-peek-positions rx:float source)
=> (lambda (match)
(within-region 'NUMBER
(string->number (read-string (cdar match) source))))]
[(regexp-match-peek-positions rx:integer source)
=> (lambda (match)
(within-region 'NUMBER
(string->number (read-string (cdar match) source))))]
[(regexp-match-peek-positions rx:identifier source)
=> (lambda (match)
(let* ([start (current-position)]
[contents (read-string (cdar match) source)]
[sym (string->symbol contents)])
(if (memq sym (lexical-keywords))
(make-token sym sym (@ start))
(make-token 'ID sym (@ start)))))]
[(regexp-match-peek-positions rx:string source)
=> (lambda (match)
(within-region 'STRING
(let ([str (read-string (cdar match) source)])
(unescape-string (substring str 1 (- (string-length str) 1))))))]
[(and (not (scan-infix-operator?))
(regexp-match-peek-positions rx:regexp source))
=> (lambda (match)
(within-region 'REGEXP
(let* ([str (read-string (cdar match) source)]
[pattern (substring str (car (list-ref match 1)) (cdr (list-ref match 1)))]
[flags (cond
[(list-ref match 2)
=> (lambda (pair)
(parse-regexp-pattern
(substring str (car pair) (cdr pair))))]
[else #f])])
(make-regexp-contents pattern
(and flags (regexp-match #rx"g" flags) #t)
(and flags (regexp-match #rx"i" flags) #f)))))]
[(regexp-match-peek-positions #rx"^==(?:=)?" source)
=> (lambda (match)
(let ([start (current-position)]
[operator (string->symbol (read-string (cdar match) source))])
(make-token operator operator (@ start))))]
[(regexp-match-peek-positions rx:assignment-operator source)
=> (lambda (match)
(within-region 'ASSIGN
(string->symbol (read-string (cdar match) source))))]
[(regexp-match-peek-positions rx:operator source)
=> (lambda (match)
(let ([start (current-position)]
[operator (string->symbol (read-string (cdar match) source))])
(make-token operator operator (@ start))))]
[else (fail "illegal token")]))
(define (unread-token)
(set! cursor (cursor-rewind cursor))
(let ([token (cursor-current cursor)])
(when (and token
(not (scan-newlines?))
(eq? (token-type token) 'NEWLINE))
(unread-token))))
(super-make-object)))
(define (scan str)
(let ([t (make-object tokenizer% (open-input-string str))])
(lambda ()
(send t read-token))))
(provide
tokenizer%
(struct exn:fail:javascript ())
(struct exn:fail:javascript:syntax (source))
scan))