#lang at-exp racket/base
(require racket/generator
racket/list
racket/sequence
racket/string
racket/set
data/gvector
(for-syntax racket/base)
(planet dyoo/while-loop))
(provide generate-tokens
(struct-out exn:fail:token)
(struct-out exn:fail:indentation))
(define-syntax (set!* stx)
(syntax-case stx ()
[(= id1 id-rest ... val)
(andmap identifier? (syntax->list #'(id1 id-rest ...)))
(syntax/loc stx
(let ([v val])
(set! id1 v)
(set! id-rest v) ...))]))
(define-syntax (++ stx)
(syntax-case stx ()
[(_ id)
(identifier? #'id)
(syntax/loc stx
(set! id (add1 id)))]))
(define-syntax (-- stx)
(syntax-case stx ()
[(_ id)
(identifier? #'id)
(syntax/loc stx
(set! id (sub1 id)))]))
(define-struct (exn:fail:token exn:fail) (loc))
(define-struct (exn:fail:indentation exn:fail) (loc))
(define (string-right-ref str n)
(string-ref str (- (string-length str) n)))
(define (slice-end str n)
(substring str
(max (- (string-length str) n)
0)))
(define (slice-front str n)
(substring str 0 (min n (string-length str))))
(define (rstrip-newlines s)
(regexp-replace #px"[\r\n]+$" s ""))
(define (my-gvector-pop! gv)
(define last-index (sub1 (gvector-count gv)))
(define val (gvector-ref gv last-index))
(gvector-remove! gv last-index)
val)
(define (my-gvector-last gv)
(define last-index (sub1 (gvector-count gv)))
(gvector-ref gv last-index))
(define (my-gvector-member x gv)
(let/ec return
(for ([elt (in-gvector gv)])
(when (equal? x elt)
(return #t)))
(return #f)))
(define NAME 'NAME)
(define NUMBER 'NUMBER)
(define STRING 'STRING)
(define OP 'OP)
(define COMMENT 'COMMENT)
(define NL 'NL)
(define NEWLINE 'NEWLINE)
(define DEDENT 'DEDENT)
(define INDENT 'INDENT)
(define ERRORTOKEN 'ERRORTOKEN)
(define ENDMARKER 'ENDMARKER)
(define (group . choices)
(string-append "(" (string-join choices "|") ")"))
(define (any . choices)
(string-append (apply group choices) "*"))
(define (maybe . choices)
(string-append (apply group choices) "?"))
(define r string-append)
(define Whitespace "[ \f\t]*")
(define Comment "#[^\r\n]*")
(define Ignore (string-append Whitespace
(any (string-append "\\\r?\n" Whitespace))
(maybe Comment)))
(define Name @r{[a-zA-Z_]\w*})
(define Hexnumber @r{0[xX][\da-fA-F]+[lL]?})
(define Octnumber @r{(0[oO][0-7]+)|(0[0-7]*)[lL]?})
(define Binnumber @r{0[bB][01]+[lL]?})
(define Decnumber @r{[1-9]\d*[lL]?})
(define Intnumber (group Hexnumber Binnumber Octnumber Decnumber))
(define Exponent @r{[eE][-+]?\d+})
(define Pointfloat (string-append
(group @r{\d+\.\d*} @r{\.\d+})
(maybe Exponent)))
(define Expfloat (string-append @r{\d+} Exponent))
(define Floatnumber (group Pointfloat Expfloat))
(define Imagnumber (group @r{\d+[jJ]}
(string-append Floatnumber @r{[jJ]})))
(define Number (group Imagnumber Floatnumber Intnumber))
(define Single @r{[^'\\]*(?:\\.[^'\\]*)*'})
(define Double @r{[^"\\]*(?:\\.[^"\\]*)*"})
;; Tail end of ''' string.
(define Single3 @r{[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''})
;; Tail end of """ string.
(define Double3 @r{[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""})
(define Triple (group @r{[uU]?[rR]?'''} @r{[uU]?[rR]?"""}))
;; Single-line ' or " string.
(define String (group @r{[uU]?[rR]?'[^@r["\n"]'\\]*(?:\\.[^@r["\n"]'\\]*)*'}
@r{[uU]?[rR]?"[^@r["\n"]"\\]*(?:\\.[^@r["\n"]"\\]*)*"}))
(define Operator (group @r{\*\*=?} @r{>>=?} @r{<<=?} @r{<>} @r{!=}
@r{//=?}
@r{[+\-*/%&|^=<>]=?}
@r{~}))
(define Bracket "[][(){}]")
(define Special (group "\r?\n" @r|{[:(define Funny (group Operator Bracket Special))
(define PlainToken (group Number Funny String Name))
(define Token (string-append Ignore PlainToken))
(define ContStr (group (string-append @r{[uU]?[rR]?'[^@r["\n"]'\\]*(?:\\.[^@r["\n"]'\\]*)*}
(group "'" "\\\r?\n"))
(string-append @r{[uU]?[rR]?"[^@r["\n"]"\\]*(?:\\.[^@r["\n"]"\\]*)*}
(group @r{"} "\\\r?\n"))))
(define PseudoExtras (group "\\\r?\n" Comment Triple))
(define PseudoToken
(string-append Whitespace (group PseudoExtras Number Funny ContStr Name)))
(define-values (tokenprog pseudoprog single3prog double3prog)
(apply values
(map (lambda (x)
(pregexp (string-append "^" x)))
(list Token PseudoToken Single3 Double3))))
(define endprogs
(hash @r{'} (pregexp (string-append "^" Single))
@r{"} (pregexp (string-append "^" Double))
@r{'''} single3prog
@r{"""} double3prog
@r{r'''} single3prog
@r{r"""} double3prog
@r{u'''} single3prog
@r{u"""} double3prog
@r{ur'''} single3prog
@r{ur"""} double3prog
@r{R'''} single3prog
@r{R"""} double3prog
@r{U'''} single3prog
@r{U"""} double3prog
@r{uR'''} single3prog
@r{uR"""} double3prog
@r{Ur'''} single3prog
@r{Ur"""} double3prog
@r{UR'''} single3prog
@r{UR"""} double3prog
@r{b'''} single3prog
@r{b"""} double3prog
@r{br'''} single3prog
@r{br"""} double3prog
@r{B'''} single3prog
@r{B"""} double3prog
@r{bR'''} single3prog
@r{bR"""} double3prog
@r{Br'''} single3prog
@r{Br"""} double3prog
@r{BR'''} single3prog
@r{BR"""} double3prog
@r{r} #f
@r{R} #f
@r{u} #f
@r{U} #f
@r{b} #f
@r{B} #f))
(define triple-quoted
(set "'''" @r{"""}
"r'''" @r{r"""} "R'''" @r{R"""}
"u'''" @r{u"""} "U'''" @r{U"""}
"ur'''" @r{ur"""} "Ur'''" @r{Ur"""}
"uR'''" @r{uR"""} "UR'''" @r{UR"""}
"b'''" @r{b"""} "B'''" @r{B"""}
"br'''" @r{br"""} "Br'''" @r{Br"""}
"bR'''" @r{bR"""} "BR'''" @r{BR"""}))
(define single-quoted
(set "'" @r{"}
"r'" @r{r"} "R'" @r{R"}
"u'" @r{u"} "U'" @r{U"}
"ur'" @r{ur"} "Ur'" @r{Ur"}
"uR'" @r{uR"} "UR'" @r{UR"}
"b'" @r{b"} "B'" @r{B"}
"br'" @r{br"} "Br'" @r{Br"}
"bR'" @r{bR"} "BR'" @r{BR"}))
(define tabsize 8)
(define (in-lines/preserve-newlines ip)
(in-generator
(let loop ()
(define a-match (regexp-match #px"^([^\r\n]*)(\r\n|\n|\r|$)"
ip))
(when (and a-match (not (bytes=? (first a-match) #"")))
(yield (bytes->string/utf-8 (first a-match)))
(loop)))))
(define (generate-tokens ip)
(in-generator
(define-values (read-line-not-exhausted? read-line)
(sequence-generate (in-lines/preserve-newlines ip)))
(define lnum 0)
(define strstart (list 0 0))
(define start 0)
(define end 0)
(define pos #f)
(define max #f)
(define column 0)
(define parenlev 0)
(define continued? #f)
(define namechars (apply set (string->list "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")))
(define numchars (apply set (string->list "0123456789")))
(define contstr "")
(define needcont? #f)
(define contline #f)
(define indents (gvector 0))
(define line "")
(define endprog #px"")
(let ([yield-list (lambda args (yield args))])
(while #t (if (read-line-not-exhausted?)
(set! line (read-line))
(set! line ""))
(++ lnum)
(set! pos 0)
(set! max (string-length line))
(cond
[(> (string-length contstr) 0) (when (string=? line "")
(raise (exn:fail:token "EOF in multi-line string")
(current-continuation-marks)
strstart))
(define endmatch (regexp-match-positions endprog line))
(cond
[endmatch
(set!* pos end
(cdr (first endmatch)))
(yield-list STRING
(string-append contstr (substring line 0 end))
strstart
(list lnum end)
(string-append contline line))
(set! contstr "")
(set! needcont? #f)
(set! contline #f)]
[(and needcont?
(not (string=? (slice-end line 2) "\\\n"))
(not (string=? (slice-end line 3) "\\\r\n")))
(yield-list ERRORTOKEN
(string-append contstr line)
strstart
(list lnum (string-length line))
contline)
(set! contstr "")
(set! contline #f)
(continue)]
[else
(set! contstr (string-append contstr line))
(set! contline (string-append contline line))
(continue)])]
[(and (= parenlev 0)
(not continued?)) (when (string=? line "")
(break))
(set! column 0)
(while (< pos max) (cond
[(char=? (string-ref line pos) #\space)
(++ column)]
[(char=? (string-ref line pos) #\tab)
(set! column (* tabsize (add1 (quotient column tabsize))))]
[(char=? (string-ref line pos) #\page)
(set! column 0)]
[else
(break)])
(++ pos))
(when (= pos max)
(break))
(when (member (string-ref line pos) (list #\# #\return #\newline))
(cond
[(char=? (string-ref line pos) #\#)
(define comment-token (rstrip-newlines (substring line pos)))
(define nl-pos (+ pos (string-length comment-token)))
(yield-list COMMENT
comment-token
(list lnum pos)
(list lnum (+ pos (string-length comment-token)))
line)
(yield-list NL
(substring line nl-pos)
(list lnum nl-pos)
(list lnum (string-length line))
line)]
[else
(yield-list (if (char=? (string-ref line pos) #\#) COMMENT NL)
(string (string-ref line pos))
(list lnum pos)
(list lnum (string-length line))
line)])
(continue))
(when (> column (my-gvector-last indents)) (gvector-add! indents column)
(yield-list INDENT
(substring line 0 pos)
(list lnum 0)
(list lnum pos)
line))
(while (< column (my-gvector-last indents))
(unless (my-gvector-member column indents)
(raise (exn:fail:indentation "unindent does not match any outer indentation level"
(current-continuation-marks)
(list "<tokenize>" lnum pos line)))
(my-gvector-pop! indents)
(yield-list DEDENT
""
(list lnum pos)
(list lnum pos)
line)))]
[else (if (= (string-length line) 0)
(raise (exn:fail:token "EOF in multi-line statement"
(current-continuation-marks)
(list lnum 0)))
(set! continued? #f))])
(while (< pos max)
(define pseudomatch (regexp-match-positions pseudoprog line pos))
(cond [pseudomatch (set! start (car (second pseudomatch)))
(set! end (cdr (second pseudomatch)))
(define spos (list lnum start))
(define epos (list lnum end))
(set! pos end)
(define token (substring line start end))
(define initial (string-ref line start))
(cond
[(or (set-member? numchars initial)
(and (char=? initial #\.) (not (string=? token ".")))) (yield-list NUMBER token spos epos line)]
[(or (char=? initial #\return) (char=? initial #\newline))
(yield-list (if (> parenlev 0) NL NEWLINE)
token spos epos line)]
[(char=? initial #\#)
(when (regexp-match #px"\n$" token)
(error 'generate-tokens "Assertion error: token ends with newline"))
(yield-list COMMENT token spos epos line)]
[(set-member? triple-quoted token)
(set! endprog (hash-ref endprogs token))
(define endmatch (regexp-match-positions endprog line pos))
(cond
[endmatch
(set! pos (cdr (first endmatch)))
(set! token (substring line start pos))
(yield-list STRING token spos (list lnum pos) line)]
[else
(set! strstart (list lnum start)) (set! contstr (substring line start))
(set! contline line)
(break)])]
[(or (set-member? single-quoted (string initial))
(set-member? single-quoted (slice-front token 2))
(set-member? single-quoted (slice-front token 3)))
(cond
[(char=? (string-right-ref token 1) #\newline) (set! strstart (list lnum start))
(set! endprog (or (hash-ref endprogs (string initial) #f)
(hash-ref endprogs (string (string-ref token 1)) #f)
(hash-ref endprogs (string (string-ref token 2)) #f)))
(set! contstr (substring line start))
(set! needcont? #t)
(set! contline line)
(break)]
[else (yield-list STRING token spos epos line)])]
[(set-member? namechars initial) (yield-list NAME token spos epos line)]
[(char=? initial #\\) (set! continued? #t)]
[else
(cond [(or (char=? initial #\() (char=? initial #\[) (char=? initial #\{))
(++ parenlev)]
[(or (char=? initial #\)) (char=? initial #\]) (char=? initial #\}))
(-- parenlev)])
(yield-list OP token spos epos line)])]
[else
(yield-list ERRORTOKEN
(string-ref line pos)
(list lnum pos)
(list lnum (+ pos 1))
line)
(++ pos)])))
(for ([indent (sequence-tail indents 1)]) (yield-list DEDENT
""
(list lnum 0)
(list lnum 0)
""))
(yield-list ENDMARKER
""
(list lnum 0)
(list lnum 0)
""))))
(define (exercising)
(sequence->list (generate-tokens (open-input-string "
def f(x):
return x * x
r'''hi, this is
a test
'''
"))))