#lang scheme (require "common.ss") (provide new-parser add-item add-items re txt kw switch-phase sub-parse sub-parse-return cons-out parse-text) ;[:title Simple Text Parser] ;: This module provides a simple text parser that can read strings ;: and turn them into data without first building lexems (although it ;: can be used to either lex or parse). ;: ;: More complex or faster parsers may require the use of the intergrated ;: @schememodname[parser-tools]. ;: ;: A parser is given a list of matcher procedures and associated action procedures. ;: A matcher is generally a regexp, the associated action turns ;: the matched text into something else. ;: On the input string, the parser recursively looks ;: for the matcher that matches the earliest ;: character and applies its action. ;: $no-match-proc is applied to the portion of the string (before the first matched ;: character) that has not been matched. ;: ;: The parser has an internal state, the "phase", where it is possible ;: to define local parsers that only work when the parser is in that phase. ;: Actions can make the parser switch to a given phase. ;: Automata transitions can then easily be defined. ;: ;: Instead of switching to another phase, it is also possible to set the ;: parser into a "sub-parser" mode, and to provide the sub-parser with a callback ;: that will be applied only once the sub-parser has returned. ;: ;: ;: The fastest and easiest way to understand how it works is probably to ;: look at the examples in the @filepath{examples} directory. ;: Somes simple examples are also given at the end of this page. ;: See also the @filepath{defs-parser.ss} source file for a more complex usage. ;[:section Priorities] ;: When parsing a string, among all matchers of the current phase, ;: the matcher which action is triggered is the one that matches the earliest ;: character in the string. ;: If several matchers apply, then only the @bold{last} added matcher is chosen. ;: In $add-items, the priority is for the matcher that is defined the lowest in the ;: source file. ; EXAMPLES ;[:convention parser parser?] ;: @defproc[(parser? [p any/c]) boolean?]{ ;: Returns $#t if $p is a parser, #f otherwise. ;: } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Keyword Searcher in Texts ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; useful for syntax colorization in slideshow ; each keyword is associated with a function ; so it is more general ;[:section Main Functions] (define-struct item ; trouver un autre nom car trop ambigu et deja utilisé ? ; name : any ; search-proc : string -> (( pos-begin . pos-end)) (search-proc do-proc) #:transparent) ; phase is a function (void) -> bool (define-struct parser ; output is a parameter containing the current output (items no-match-proc phase output appender sub-parsers) #:mutable #:transparent) (define (new-parser [no-match-proc identity] #:phase [phase 'start] #:appender [appender string-append]) ;:-> parser? ;: [no-match-proc procedure?] ;: [appender procedure?] ;: Creates a new parser with default behavior $no-match-proc, starting in phase $phase. ;: All the outputs generated byt the parser are then appended with $appender. (make-parser '() no-match-proc (make-parameter phase) (make-parameter '()) appender '())) ; Sub-parsers stack: (define (push-sub-parser parser sub) (set-parser-sub-parsers! parser (cons sub (parser-sub-parsers parser)))) (define (pop-sub-parser parser) (let ([subs (parser-sub-parsers parser)]) (set-parser-sub-parsers! parser (rest subs)) (first subs))) ; Each keyword is associated to a procedure that will be applied ; to the keyword string when it will be found in the text. ; General function to add an item; ; match-proc : string -> (list (or #f (start . end))) ; (just like regexp-match-position) (define (add-item-general parser search-proc output-proc) (set-parser-items! parser (cons (make-item search-proc output-proc) (parser-items parser)))) ; useful to easily define phase comparisons (define (eq-phase? phase) (λ(p)(equal? phase p))) (define (add-item-phase parser search-proc output-proc phase?) (add-item-general parser (λ(text)(if (phase? ((parser-phase parser))) (search-proc text) #f)) output-proc)) (define current-parser (make-parameter #f)) (define (to-out out) ; turns a user-given out into a correct out (cond [(procedure? out) out] [else (to-proc out)])) (define (to-phase? phase?) ; turns a user-given phase? into a phase? test-proc (cond [(procedure? phase?) phase?] [(equal? #t phase?) (λ args #t)] [else (eq-phase? phase?)])) (define (add-no-match-cond parser phase? out) (let ([no-match (parser-no-match-proc parser)]) (set-parser-no-match-proc! parser (λ args (if (phase? ((parser-phase parser))) (apply out args) (apply no-match args)))))) (define (add-item parser phase? in out) ;:-> void? ;: [in (or/c #t procedure? list? symbol? string?)] ;: [out (or/c procedure? symbol? string?)] ;: Adds the matcher $in and its associated action $out ;: to $parser. ;: The matcher will match only when the parser is in a phase that ;: returns $#t when applied to $phase?. ;: ;: If $phase? is a procedure, it will be used as is to match the parser's phase. ;: If $phase? equals $#t it will be changed to @scheme[(λ args #t)] ;: such that it matches any phase. ;: Any other value of $phase will be turned into a procedure that matches ;: this value with $equal?. ;: ;: If $in is a string it will be turned into a procedure that matches ;: the corresponding pregexp. ;: If $in is a symbol, it will be turned into a procedure that matches ;: the corresponding pregexp with word boundaries on both sides, (useful ;: for matching names or programming languages keywords). ;: If $in is a list, then $add-item is called recursively on each member ;: of $in with the same $parser, $phase? and $out. ;: If $in equals $#t, it will modify the $no-match-proc procedure ;: to add the corresponding action when $phase? applies to the parser. ;: In the end, $in has returns the same kind of values as $regexp-match-positions. ;: ;: $out must be a procedure that accepts the same number of arguments as ;: the number of values returned by the matcher $in. ;: For example, if $in is @scheme["aa(b+)c(d+)e"], then $out must ;: take 3 arguments (one for the whole string, and two for the b's and the d's). ;: If $out is not a procedure, it will be turned into a procedure that accepts ;: any number of arguments and returns $out. (cond [(list? in) (for-each (λ(in)(add-item parser phase? in out)) in)] [(equal? #t in) (add-no-match-cond parser (to-phase? phase?) (to-out out))] [else (add-item-phase parser (cond [(procedure? in) in] [(symbol? in) (kw in)] [(string? in) (re in)] [else (error "unknown type for " in)]) (to-out out) (to-phase? phase?))] )) (define-syntax-rule (add-items parser [phase? [search-proc output-proc] ...] ...) ;: The general form for adding several items at once. ;: See the examples at the end of this page. (parameterize ([current-parser parser]) (let ([p phase?]) ; externalize phase? from the inner "..." ; otherwise it gets counted (begin (add-item parser p search-proc output-proc) ...)) ...)) (define (parse-text parser #:phase [phase ((parser-phase parser))] . text) ;:-> (listof any/c) ;: [text string?] ;: Parses $text with $parser, starting in phase $phase, which is the current phase ;: by default. ;: ;: It is thus possible to call the parser inside the parsing phase, i.e ;: once a portion of the text has been parsed, it can be given to the parser ;: itself in some phase to make further transformations. ;: This is not the same as sub-parsing because there is no callback. (parameterize ([current-parser parser] [(parser-phase parser) phase] [(parser-output parser) '()]) (let ([line-number 0] [the-line ""]) (with-handlers ([exn:fail? (λ(e)(printf "PARSE-ERROR: around line ~a~n~a~n" (round (/ line-number 2)) the-line) (raise e))]) (for ([line (add-between (append-map (λ(line)(regexp-split "\n" line)) text) "\n")]) (++ line-number) ; for debug (set! the-line line) ; for debug (parse-line parser line)))) (output-append parser ((parser-output parser))) )) ;[:section Matchers] ;: This section describes matching functions that can be used in the ;: $in argument of $add-item and $add-items. ; IN procs: (define (re s) ;:-> procedure? ;: Turns $s into a pregexp and returns a procedure ;: that takes an input string and applies ;: $regexp-match-positions on that string with the pregexp $s. (λ(text)(regexp-match-positions (pregexp s) text))) (define (txt s) ;:-> procedure? ;: Same as $re but regexp-quotes $s beforehand, so that the string $s ;: is matched exactly. (re (regexp-quote s))) (define (kw s) ;:-> procedure? ;: Same as $txt but adds word-boundaries around $s. (re (string-append "\\b" (regexp-quote (to-string s)) "\\b"))) ;[:section Actions] ;: This section describes action functions that can be used in the ;: $out argument of $add-item and $add-items. (define (switch-phase phase) ;:-> string? ;: Sets the parser in the phase $phase and returns @scheme[""]. ((parser-phase (current-parser)) phase) "") (define (sub-parse new-phase [callback identity] #:appender [appender (parser-appender (current-parser))]) ;:-> string? ;: [appender procedure?] ;: [callback procedure?] ;: Sets the current parser in sub-parse mode and switches to $new-phase. ;: The result of the sub-parse is appended with $appender, which by default ;: is the same as the parser's. ;: When the sub-parser has finished parsing ;: (it has returned with $sub-parse-return), ;: $callback is called with the result of the sub-parse and the result of ;: $callback is added to the current parser result. ;: ;: Sub-parsers can be called recursively, once in a sub-parsing mode ;: or in the $callback. ;: ;: Returns @scheme[""]. (start-sub-parser (current-parser) new-phase callback appender) "") (define (cons-out out) ;:-> void? ;: By default, the parser agglomerates the return values ;: of the action procedures. ;: The function $cons-out can be used to add a value to the parser ;: without being a return value of an action. ;: Should be rarely useful. (cons-output (current-parser) out)) (define (sub-parse-return [out #f]) ;: Adds $out to the current parser result and returns ;: from the current sub-parsing mode. (when out (cons-out out)) (terminate-sub-parser (current-parser))) ; this is also the return value! (define-struct sub-parser (parser old-output old-phase new-phase callback appender) #:mutable) (define (start-sub-parser parser new-phase callback appender) (let ([sub (make-sub-parser parser ((parser-output parser)) ((parser-phase parser)) new-phase callback appender)]) (push-sub-parser parser sub) ((parser-phase parser) new-phase) ((parser-output parser) '()) ; fresh-new-output )) (define (terminate-sub-parser parser) (let* ([sub-parser (pop-sub-parser parser)] [output ((parser-output parser))]) ; (printf "Terminating sub-parser: ~a -> ~a ~n ~a ~n" ; (sub-parser-new-phase sub-parser) ; (sub-parser-old-phase sub-parser) ; (parser-sub-parsers parser) ; ) ; restore previous settings ((parser-phase parser) (sub-parser-old-phase sub-parser)) ((parser-output parser) (sub-parser-old-output sub-parser)) ; call to the callback function with the accumulated local output: ((sub-parser-callback sub-parser) (output-append parser output #:appender (sub-parser-appender sub-parser))) )) (define (find-first-matcher parser text) (let ([matches (filter car (for/list ([item (parser-items parser)]) (let ([i-match ((item-search-proc item) text)]) (list (if i-match i-match #f) item))))]) (if (empty? matches) (list (list (cons (string-length text) (string-length text))) #f) (argmin caaar matches)))) ; argmin stops on the first min, so priority to the last added items (cons) (define (positions->strings str lpos) (map (λ(pos)(if pos (substring str (car pos) (cdr pos)) pos)) lpos)) (define (cons-output parser x) ((parser-output parser) (cons x ((parser-output parser))))) (define (parse-line parser text) (let loop ([text text]) (let* ([first-matcher (find-first-matcher parser text)] [start (caaar first-matcher)] [end (cdaar first-matcher)]) (cons-output parser ((parser-no-match-proc parser) (substring text 0 start))) (when (< start end) ; there is a match (cons-output parser (apply (item-do-proc (second first-matcher)) (positions->strings text (car first-matcher))))) (let ([rest-str (substring text end)]) (unless (string=? "" rest-str) (loop rest-str))) ))) (define (output-append parser output #:appender [appender (parser-appender parser)]) (apply appender (reverse output))) ;[:section Examples] ;[:examples] ;(let ([p (new-parser)]) ; (add-items ; p ; ('start ; ["pl(.[^p]?)p" (λ(s x)(string-append " -gl" x "tch- "))] ; ["ou" "aï"] ; [#t string-upcase])) ; (parse-text p "youcoudouplipcoudouploup" "toupouchou")) ; ;(let ([tree-parser ; (new-parser #:appender ; (λ vals (remove* '(||) vals)))]) ; (add-items ; tree-parser ; ('start ; [#t string->symbol] ; ["\\s+" '||] ; ["\\(" (λ(s)(sub-parse 'start)'||)] ; ["\\)" (λ(s)(sub-parse-return))] ; )) ; (parse-text ; tree-parser ; "tree:(root (node1 (leaf1 leaf2) ;leaf3) (node2 ; leaf4 (node3 leaf5) leaf6) leaf7)")) ;[:end-examples] ;YaïCaïDaï -glitch- CaïDaï -gloutch- ;TaïPaïCHaï ;: Note that the result of the last example is Scheme data, not a string.