#lang at-exp racket
(require parser-tools/lex
parser-tools/yacc
(prefix-in : parser-tools/lex-sre)
racket/contract
scribble/srcdoc
srfi/9 (prefix-in srfi19: (only-in srfi/19 string->date))
(only-in srfi/13 substring/shared string-trim-right)
(prefix-in xml: (only-in xml xexpr?)))
(require/doc scheme/base
scribble/manual)
(define-record-type :wikitext
(wikitext m b)
wikitext?
(b wikitext-body)
(m wikitext-metadata set-wikitext-metadata!))
(define wikitext/c
(flat-named-contract "wiki-text" wikitext?))
(provide/doc
(proc-doc/names wikitext? (-> any/c boolean?) (x)
@{Returns @scheme[#t] if @var{x} is a parsed wikitext object.})
(proc-doc/names parse (-> (or/c port? string?) wikitext/c) (source)
@{Parse the @var{source} into a wikitext object.})
(proc-doc/names body (-> wikitext/c (listof xml:xexpr?)) (wikitext)
@{Extract the body of the document from the wikitext object.})
(proc-doc/names lookup (-> wikitext/c symbol? (or/c string? false/c)) (wikitext key)
@{Retrieve the metadata value corresponding to key @var{key}.
If several were specified, they are concatenated.})
(proc-doc/names lookup/multiple
(-> wikitext/c symbol? (listof string?))
(wikitext key)
@{Retrieve the multiple metadata values corresponding to @var{key},
or an empty list if there was none.})
(proc-doc/names lookup-parsed
(-> wikitext/c symbol? any)
(wikitext key)
@{Like @scheme[lookup], except that, depending on the key,
the value is returned as a parsed object. The only ones treated specially
are @scheme['date] and @scheme['updated], which are returned as
SRFI-19 date objects.})
(proc-doc/names lookup-keys (-> wikitext/c (listof symbol?)) (wikitext)
@{Return the list of available keys.})
(proc-doc/names set-metadata!
(-> wikitext/c symbol? string? any)
(wikitext key value)
@{Set a metadata key to have the given value.
This appends the value, so that the value changes for LOOKUP,
but is extended for LOOKUP/MULTIPLE.}))
(provide list-lexemes)
(define-syntax receive
(syntax-rules ()
((receive formals expression body ...)
(call-with-values (lambda () expression)
(lambda formals body ...)))))
(define-syntax assert
(syntax-rules ()
((_ test)
(when (not test)
(error (format "Assertion failed: ~s => ~s" (quote test) test))))))
(define in-typed-block?
(let ((in-block #f))
(case-lambda
((block-type)
(and in-block (eqv? block-type in-block)))
((new-in value)
(set! in-block new-in)
value))))
(define-lex-abbrevs
(horizontal-whitespace (:or #\space #\tab))
(simple-punctuation (:or #\, #\. #\? #\' #\; #\: #\! #\@ #\-))
(url-string (:: (:? (:+ lower-case ) "://")
(:+ (:or alphabetic
(:/ #\0 #\9)
#\/ #\& #\+ #\_ #\= #\# #\@ #\: #\. #\, #\- #\~
#\space
#\( #\)
#\? #\%
))))
(eol-char (union #\newline #\return))
(chars-to-line-end (:* (char-complement eol-char))))
(define url-regexp-string "(http|ftp)://[A-Za-z0-9/&+_=#@:.,~-]+[A-Za-z0-9/#]")
(define-lex-abbrev string-contents
(:or alphabetic numeric simple-punctuation horizontal-whitespace))
(define-tokens block-value-tokens
(HEADER METADATA-KEYWORD BESCAPED-TEXT
QUOTED-PARAGRAPH-LINE QUOTED-PARAGRAPH-BLANK-LINE
UL-ITEM OL-ITEM
TABLE-ROW
ORDINARY-LINE))
(define-empty-tokens block-empty-tokens
(blank-line horizontal-line EOF))
(define block-lexer
(lexer-src-pos
((eof) 'EOF)
((:+ (:: (:* horizontal-whitespace) eol-char)) (token-blank-line))
((:: (:>= 1 #\=) chars-to-line-end eol-char)
(let ((parts (regexp-match #px"^(=+)\\s*(.*[^=\\s])[=\\s]*$" lexeme)))
(if parts
(token-HEADER (cons (string-length (list-ref parts 1))
(list-ref parts 2)))
(token-HEADER (cons (count-chars #\= lexeme) "HEADER")))))
((:: "{{{" (:* horizontal-whitespace) eol-char)
(let loop ((lines '())
(first-line? #t))
(cond ((regexp-try-match #rx"^}}}" input-port)
=> (λ (m) (token-BESCAPED-TEXT (apply string-append (reverse lines)))))
(else
(let ((line (read-line input-port 'any)))
(if (eof-object? line)
(token-BESCAPED-TEXT (apply string-append (reverse lines)))
(loop (cons (if first-line? line (string-append "\n" line))
lines)
#f)))))))
((:: (:* horizontal-whitespace) (:+ (:: ">" (:* horizontal-whitespace))))
(let ((line (read-line input-port 'any))
(depth (count-chars #\> lexeme)))
(if (= (string-length line) 0)
(token-QUOTED-PARAGRAPH-BLANK-LINE (list depth))
(token-QUOTED-PARAGRAPH-LINE (list depth line)))))
((:: "::" (:+ graphic) (:* horizontal-whitespace) chars-to-line-end eol-char)
(cond ((regexp-match #px"^::(\\S+)\\s+(.*[^\\s])\\s*$" lexeme)
=> (λ (parts)
(token-METADATA-KEYWORD (cons (string->symbol (list-ref parts 1))
(list-ref parts 2)))))
((regexp-match #px"^::(\\S+)" lexeme)
=> (λ (parts)
(token-METADATA-KEYWORD (cons (string->symbol (list-ref parts 1)) ""))))
(else (error (format "What? Lexeme ~s matched keywords and didn't match!" lexeme)))))
((:: (:* horizontal-whitespace) (:+ "*") chars-to-line-end eol-char)
(let* ((parts (regexp-match #px"^\\s*(\\*+)\\s*(.*)" lexeme))
(n-hashes (count-chars #\* (list-ref parts 1))))
(assert parts)
(if (and (= n-hashes 2)
(not (in-typed-block? 'block-ul)))
(token-ORDINARY-LINE (chop lexeme)) (in-typed-block? 'block-ul (token-UL-ITEM (list n-hashes
(chop (list-ref parts 2))))))))
((:: (:* horizontal-whitespace) (:+ "#") chars-to-line-end eol-char)
(let* ((parts (regexp-match #px"^\\s*(#+)\\s*(.*)" lexeme))
(n-stars (count-chars #\# (list-ref parts 1))))
(assert parts)
(if (and (= n-stars 2)
(not (in-typed-block? 'block-ol)))
(token-ORDINARY-LINE (chop lexeme)) (in-typed-block? 'block-ol (token-OL-ITEM (list n-stars
(chop (list-ref parts 2))))))))
((:: (:* horizontal-whitespace) "|" chars-to-line-end (:? eol-char))
(token-TABLE-ROW lexeme))
((repetition 4 +inf.0 "-")
(token-horizontal-line))
((char-complement eol-char)
(token-ORDINARY-LINE (string-append lexeme (read-line input-port 'any))))
))
(define block-parser
(parser
(start start)
(end EOF)
(tokens block-value-tokens block-empty-tokens)
(error (lambda (grammar? token value start end)
(error 'parser "block ~a error at ~a:~a token=~s~%"
(if grammar? "grammar" "lexer")
(position-line start) (position-col start)
(if value
(cons token value)
token))))
(src-pos)
(grammar
(start (() 'empty)
((error start) (error 'parser "Fatal block parser error: ~s" $2))
((block-sequence)
(let loop ((md (make-immutable-hasheqv '()))
(content $1)
(res '()))
(cond ((null? content) (wikitext md (reverse res)))
((eqv? (caar content) '*METADATA*)
(let ((k (cadar content))
(v (cddar content)))
(loop (hash-update md
k
(λ (orig)
(cond ((pair? orig) (append orig (list v)))
(orig (list orig v))
(else v)))
#f)
(cdr content)
res)))
(else
(loop md
(cdr content)
(cons (car content) res)))))))
(block-sequence
((block-element block-sequence) (cons $1 $2))
((block-element blank-line block-sequence) (cons $1 $3))
((block-element blank-line) (list $1)) ((block-element) (list $1)))
(block-element
((metadata-line) `(*METADATA* . ,$1))
((header) $1)
((paragraph) `(p . ,(parse-paragraph (apply string-append $1))))
((quoted-block-of-paragraphs) (pack-paragraphs 'blockquote 'p (collect-paragraphs $1)))
((block-ul) (in-typed-block? #f (pack-paragraphs 'ul 'li (collect-paragraphs $1))))
((block-ol) (in-typed-block? #f
(pack-paragraphs 'ol 'li (collect-paragraphs $1))))
((table) `(table . ,$1))
((horizontal-line) '(hr))
((BESCAPED-TEXT) (list 'pre $1)))
(header
((HEADER) (cons (case (car $1)
((1) 'h1)
((2) 'h2)
((3) 'h3)
((4) 'h4)
((5) 'h5)
(else (error 'parser "Unrecognised header with ~a '='" (car $1))))
(parse-paragraph (cdr $1)))))
(paragraph
((ORDINARY-LINE paragraph) `(,$1 " " . ,$2))
((ORDINARY-LINE) (list $1)))
(quoted-block-of-paragraphs
((quoted-paragraph quoted-block-of-paragraphs) (append $1 $2))
((quoted-paragraph) $1))
(quoted-paragraph
((QUOTED-PARAGRAPH-BLANK-LINE) (list $1))
((QUOTED-PARAGRAPH-LINE quoted-paragraph)
(if (= (car $1) (caar $2))
(cons (if (null? (cdar $2)) $1 (append $1 (list " ") (cdar $2)))
(cdr $2))
(cons $1 $2)))
((QUOTED-PARAGRAPH-LINE) (list $1)))
(block-ul
((ul-item block-ul) (cons $1 $2))
((ul-item) (list $1)))
(ul-item
((UL-ITEM paragraph) (append $1 (list " ") $2))
((UL-ITEM) $1))
(block-ol
((ol-item block-ol) (cons $1 $2))
((ol-item) (list $1)))
(ol-item
((OL-ITEM paragraph) (append $1 (list " ") $2))
((OL-ITEM) $1))
(table
((TABLE-ROW table) (cons `(tr . ,(parse-paragraph $1 #t)) $2))
((TABLE-ROW) `((tr . ,(parse-paragraph $1 #t)))))
(metadata-line
((METADATA-KEYWORD) $1)))))
(define (count-chars ch s)
(let loop ((i (- (string-length s) 1))
(n 0))
(cond ((< i 0) n)
((char=? (string-ref s i) ch) (loop (- i 1) (+ n 1)))
(else (loop (- i 1) n)))))
(define (collect-paragraphs paras)
(define (get-sublists l)
(cond ((null? l) (values 0 '() '()))
((null? (cdr l)) (values (caar l) (list (cdar l)) '()))
(else
(let ((collect-level (caar l)))
(let loop ((tail-of-l l)
(head-of-l '()))
(if (and (pair? tail-of-l)
(= collect-level (caar tail-of-l)))
(loop (cdr tail-of-l) (cons (car tail-of-l) head-of-l))
(values collect-level (map cdr (reverse head-of-l)) tail-of-l)))))))
(define (enlist n l)
(if (> n 1)
(list (enlist (- n 1) l))
l))
(let loop ((rest paras)
(result '()))
(receive (level head tail)
(get-sublists rest)
(if (null? head)
(apply append (reverse result))
(loop tail (cons (enlist level head) result))))))
(define (pack-paragraphs outer inner paragraphs)
(define (pack-em para)
(if (list? (car para))
(cons outer (map pack-em para))
(cons inner (parse-paragraph (apply string-append para)))))
(pack-em paragraphs))
(define-tokens paragraph-value-tokens
(PSTRING PCHAR PESCAPED-TEXT OLINK OIMG OGENERAL))
(define-empty-tokens paragraph-empty-tokens
(EMPH STRONG QUOTE LINEBREAK LINKSEP TABLE-HEADER-SEPARATOR CLINK CIMG))
(define paragraph-lexer
(lexer
((eof) 'EOF)
((:: "~" any-char)
(token-PCHAR (string-ref lexeme 1)))
("//" 'EMPH)
("**" 'STRONG)
("##" 'MONOSPACE)
("\"" 'QUOTE)
("\\\\" 'LINEBREAK) ((:: "[[" url-string)
(token-OLINK (substring/shared lexeme 2 (string-length lexeme))))
((:: (:* horizontal-whitespace) "|=" (:* horizontal-whitespace))
'TABLE-HEADER-SEPARATOR)
((:: (:* horizontal-whitespace) "|" (:* horizontal-whitespace))
'LINKSEP)
("]]" 'CLINK)
((:: "<<" (:+ alphabetic) (:* horizontal-whitespace))
(token-OGENERAL (car (regexp-match #rx"[a-zA-Z]+" lexeme))))
(">>" 'CGENERAL)
("{{{"
(let ((escaped-text (regexp-match #rx"(.*?)(}}}|$)" input-port)))
(token-PESCAPED-TEXT (bytes->string/utf-8 (cadr escaped-text)))))
("{{" (let ((escaped-text
(regexp-try-match #px"^[[:alnum:]/][[:alnum:]/&+_=#@:.,~()?%-]*" input-port)))
(if escaped-text
(token-OIMG (bytes->string/utf-8 (car escaped-text)))
(token-PSTRING "{{"))))
("}}" 'CIMG)
((:+ string-contents)
(token-PSTRING lexeme))
(any-char
(token-PCHAR (string-ref lexeme 0)))))
(define (push-stack stack obj)
(if (and (string? obj)
(not (null? stack))
(string? (car stack)))
(cons (string-append (car stack) obj) (cdr stack))
(cons obj stack)))
(define (pop-stack stack . args)
(let ((args? (and (not (null? args))
(andmap symbol? args))))
(if args?
(let loop ((res '())
(s stack))
(cond ((null? s)
(values #f stack))
((memv (car s) args)
=> (λ (l)
(values (cons (car l) res)
(cdr s))))
((symbol? (car s))
(values #f stack))
(else
(loop (cons (car s) res)
(cdr s)))))
(let loop ((res '())
(s stack))
(cond ((null? s)
(values res '()))
((symbol? (car s))
(values (cons (car s) res)
(cdr s)))
(else
(loop (cons (car s) res)
(cdr s))))))))
(define (parse-paragraph str . rest)
(define (spot-urls s)
(apply string-append
(let loop ((start 0)
(p (regexp-match-positions* url-regexp-string s))
(res '()))
(cond ((null? p)
(append res (list (substring s start (string-length s)))))
((and (>= (caar p) 1)
(char=? (string-ref s (- (caar p) 1)) #\~))
(let ((url-string (regexp-replace* "//" (substring s (caar p) (cdar p)) "~//")))
(loop (cdar p)
(cdr p)
(append res
(list (substring s start (- (caar p) 1)) url-string)))))
((and (>= (caar p) 2)
(string=? (substring s (- (caar p) 2) (caar p)) "[["))
(loop (cdar p)
(cdr p)
(append res
(list (substring s start (cdar p))))))
(else
(loop (cdar p)
(cdr p)
(append res
(list (substring s start (caar p))
"[[" (substring s (caar p) (cdar p))
"]]"))))))))
(define (clear-stack stack)
(receive (content popped-stack)
(pop-stack stack)
(cond ((null? content)
(error "clear-stack called with null stack"))
((symbol? (car content))
(case (car content)
((OLINK) (clear-stack (push-stack popped-stack (make-a-link content))))
((OIMG) (clear-stack (push-stack popped-stack (make-an-img content))))
((OGENERAL) (clear-stack (push-stack popped-stack (make-general-element content))))
((td)
(let ((last-el (list-ref content (- (length content) 1))))
(cond ((null? (cdr content)) (clear-stack popped-stack))
((and (string? last-el)
(string-trim-right last-el))
=> (λ (trimmed)
(if (= (string-length trimmed) 0)
(clear-stack popped-stack) (clear-stack
(push-stack popped-stack
(append (drop-right content 1) (list trimmed)))))))
(else
(clear-stack (push-stack popped-stack content))))))
(else
(clear-stack (push-stack popped-stack content)))))
(else
content))))
(define (make-a-link content)
(assert (and (pair? content) (pair? (cdr content))))
(cond ((null? (cddr content)) `(a ((href ,(normalise-link (caadr content)))) ,(caadr content)))
(else
`(a ((href ,(normalise-link (caadr content)))) ,@(cddr content)))))
(define (make-an-img content)
(assert (and (pair? content)
(pair? (cdr content))))
`(img ((src ,(caadr content))
. ,(if (null? (cddr content))
'()
(map (λ (s)
(cond ((regexp-match #rx"^([a-zA-Z0-9]+)=(.+)" s)
=> (λ (m) `(,(string->symbol (cadr m)) ,(caddr m))))
(else
`(alt ,s))))
(regexp-split #px";\\b" (caddr content)))))))
(define (make-general-element content)
(assert (and (pair? content) (pair? (cdr content))))
(cons (caadr content) (cddr content)))
(let ((get-lexeme (let ((p (open-input-string (spot-urls str))))
(lambda () (paragraph-lexer p))))
(parsing-table-row? (and (not (null? rest)) (car rest))))
(let loop ((stack '()))
(let ((t (get-lexeme)))
(case (token-name t)
((EOF) (clear-stack stack))
((PSTRING)
(loop (push-stack stack (token-value t))))
((PESCAPED-TEXT)
(loop
(push-stack stack
(escape-xml-chars-in-string (token-value t)))))
((PCHAR)
(loop
(push-stack stack
(escape-char (token-value t)))))
((EMPH)
(receive (content s2)
(pop-stack stack 'em)
(loop (push-stack s2 (or content 'em)))))
((STRONG)
(receive (content s2)
(pop-stack stack 'strong)
(loop (push-stack s2 (or content 'strong)))))
((MONOSPACE)
(receive (content s2)
(pop-stack stack 'code)
(loop (push-stack s2 (or content 'code)))))
((QUOTE)
(receive (content s2)
(pop-stack stack 'q)
(loop (push-stack s2 (or content 'q)))))
((OLINK)
(loop
(push-stack (push-stack stack 'OLINK)
(list (token-value t)))))
((CLINK)
(receive (content s2)
(pop-stack stack 'OLINK)
(loop (push-stack s2 (if content (make-a-link content) "]]")))))
((LINKSEP)
(if parsing-table-row?
(receive (content s2)
(pop-stack stack 'OLINK 'td 'th)
(case (and content (car content))
((OLINK) (loop stack))
((td th) (loop (push-stack (push-stack s2 content) 'td)))
(else (loop (push-stack s2 'td)))))
(loop stack)))
((TABLE-HEADER-SEPARATOR)
(receive (content s2)
(pop-stack stack 'td 'th)
(if content
(loop (push-stack (push-stack s2 content) 'th))
(loop (push-stack s2 'th)))))
((OIMG)
(loop
(push-stack (push-stack stack 'OIMG)
(list (token-value t)))))
((CIMG)
(receive (content s2)
(pop-stack stack 'OIMG)
(loop (push-stack s2 (if content (make-an-img content) "}}")))))
((OGENERAL)
(loop (push-stack (push-stack stack 'OGENERAL) (list (string->symbol (token-value t))))))
((CGENERAL)
(receive (content s2)
(pop-stack stack 'OGENERAL)
(loop (push-stack s2 (if content (make-general-element content) ">>")))))
((LINEBREAK)
(loop (push-stack stack '(br))))
(else
(error 'parse-paragraph "Unexpected token ~s~a"
(token-name t)
(if (token? t)
(format " (~s)" (token-value t))
""))))))))
(define (escape-char c)
(case c
((#\<) "<")
((#\>) ">")
((#\&) "&")
(else (string c))
))
(define (escape-xml-chars-in-string s)
(regexp-replace* #rx"[&<>]"
s
(lambda (match)
(escape-char (string-ref match 0)))))
(define (chop s)
(substring/shared s 0 (- (string-length s) 1)))
(define (normalise-link s)
(if (regexp-match? (regexp "^[a-z]*://") s)
s (string-downcase (regexp-replace* (regexp " +") s "_"))))
(define (list-lexemes p lexer-spec)
(let ((get-lexeme (if (eqv? lexer-spec 'block)
(λ () (position-token-token (block-lexer p)))
(λ () (paragraph-lexer p)))))
(let loop ()
(let ((l (get-lexeme)))
(if (eqv? l 'EOF)
'()
(cons l (loop)))))))
(define (parse source)
(define (make-wiki-parser p)
(port-count-lines! p)
(lambda ()
(block-parser (lambda () (block-lexer p)))))
(cond ((port? source)
((make-wiki-parser source)))
((string? source)
(parse (open-input-string source)))
(else
(error 'parser "Can't parse source of type ~s" source))))
(define (body w)
(wikitext-body w))
(define (lookup w key)
(let ((value (hash-ref (wikitext-metadata w) key #f)))
(cond ((not value)
#f)
((pair? value)
(list-ref value (- (length value) 1)))
(else
value))))
(define (lookup/multiple w key)
(let ((value (hash-ref (wikitext-metadata w) key #f)))
(cond ((not value)
#f)
((pair? value)
value)
((string=? value "")
'())
(else
(list value)))))
(define (parse-date s)
(define (parse-date* s formats)
(with-handlers ((exn:fail?
(lambda (exn)
(parse-date* s (cdr formats)))))
(and (not (null? formats))
(srfi19:string->date s (car formats)))))
(if (regexp-match? #rx"^ *[0-9][0-9][0-9][0-9]" s)
(parse-date* s '("~Y~m~dT~H~M~S" "~Y~m~dT~H~M" "~Y~m~d" "~Y~b~d" "~Y~B~d"))
(parse-date* s '("~d~b~Y" "~d~B~Y" "~d~m~Y"))))
(define (lookup-parsed w key)
(let ((val (lookup w key)))
(case key
((date updated) (and val (parse-date val)))
(else val))))
(define (lookup-keys w)
(hash-map (wikitext-metadata w) (lambda (k v) k)))
(define (set-metadata! w key value)
(let ((prev (lookup/multiple w key)))
(set-wikitext-metadata! w
(hash-set (wikitext-metadata w)
key
(if prev
(append prev (list value))
value)))))