#lang scheme/base ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; NET.plt ;; ;; abstraction of common network behaviors and services ;; ;; Bonzai Lab, LLC. All rights reserved. ;; ;; Licensed under LGPL. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; phrase.ss - parsing the phrase as defined in rfc2822 ;; yc 2/13/2010 - first version (require "depend.ss" "encoded-word.ss" "quoted-string.ss" ) ;; this is the atext... hmm... most of the characters here will be *rejected*... (define p:atext/unicode (char-when (lambda (c) (and (not (char-whitespace? c)) (not (memq c '(#\, #\< #\> #\: #\;))))))) (define p:atom (seq atom <- (one-many p:atext/unicode) (return (list->string atom)))) (define p:word (choice p:encoded-word p:qstring p:atom)) ;; what we need is something to alternatively picking things up... ;; token WORD will suck in the tailing (define p:phrase (seq (zero-one whitespaces (return #t)) w <- p:word words <- (zero-many (seq ws <- whitespaces word <- p:word (return (list (list->string ws) word)))) (return (string-join (flatten (cons w words)) "")))) (define read-phrase (make-reader p:phrase #:eof? #f)) ;; another way of reading phrase would be to read anything, and an (define p:text (char-when (lambda (c) (and (not (char-whitespace? c)))))) (define p:text* (seq v <- (one-many p:text) (return (list->string v)))) (define p:TEXT (choice p:encoded-word p:text*)) (define p:whitespaces/str (seq v <- whitespaces (return (list->string v)))) (define p:freeform (seq ws <- p:whitespaces/str t <- p:TEXT texts <- (zero-many (seq ws <- p:whitespaces/str t <- p:TEXT (return (string-append ws t)))) (return (string-join (list* ws t texts) "")))) ;; subject should use this one... (define read-freeform (make-reader p:freeform #:eof? #f)) ;; what about the 3rd form? the parameters are not a full phrase, instead ;; they are just tokens... ;; they must not be anything that have spaces unless it is incorporated into a ;; #| NO-WS-CTL = %d1-8 / ; US-ASCII control characters %d11 / ; that do not include the %d12 / ; carriage return, line feed, %d14-31 / ; and white space characters %d127 CTL -> d1-d32,d127 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,127 tspecials := "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "\" / <"> "/" / "[" / "]" / "?" / "=" ; Must be in quoted-string, ; to use within parameter values ;;|# (define (token-special-char? c) (memq c '(#\( #\) #\< #\> #\@ #\, #\; #\: #\\ #\" #\/ #\[ #\] #\? #\=))) (define p:token-char (char-when (lambda (c) (and (< 32 (char->integer c) 128) (not (token-special-char? c)))))) (define p:atom/token (seq v <- (one-many p:token-char) (return (list->string v)))) ;; in this particular case we can also embed encoded-word into quoted-string... (define p:param-value (choice p:quoted-encoded-word p:encoded-word quoted-string p:atom/token)) (define read-param-value (make-reader p:param-value #:eof? #f)) ;; encode-phrase ;; do the following... ;; if it's all ascii, check to see if it has one of the tspecial chars, and if so ;; make it into quoted-string... ;; if it's not all ascii, then convert it into encoded word... unless it's more than ;; 48 characters, do not split it into 2... (define (encode-phrase phrase) (let-values (((ascii latin-1 unicode) (string-char-ratios phrase))) (cond ((= ascii 1) (if (string-char-or? phrase token-special-char?) (encode-qstring phrase) phrase)) ((> ascii 0.7) (encode-encoded-word* "utf-8" "Q" phrase)) (else (encode-encoded-word* "utf-8" "B" phrase))))) (define (encode-param-value param) (let-values (((ascii latin-1 unicode) (string-char-ratios param))) (cond ((= ascii 1) (if (string-char-or? param char-whitespace?) (encode-qstring param) param)) ((> ascii 0.7) (encode-encoded-word "utf-8" "Q" param)) (else (encode-encoded-word "utf-8" "B" param))))) (provide/contract (p:phrase Parser/c) (read-phrase Reader/c) (p:freeform Parser/c) (read-freeform Reader/c) (p:param-value Parser/c) (read-param-value Reader/c) (encode-phrase (-> string? string?)) (encode-param-value (-> string? string?)) ) #| let's fix the phrase issue. currently phrase is not working correctly... 1 - it's unable to parse all of the words 2 - phrase as defined in 2822 are not used *everywhere* - it's mainly used with email addresses 3 - phrase as defined in 2822 cannot be used within quoted-strings, but in real-life they exists... (specifically as a filename in content-disposition for CJK filenames) 4 - phrase does not really exist for header such as subject, because subject does not make use of quoted-strings (but it does make use of encoded-words) so most likely I will need to break things up. in email address we can use the following: phrase as defined in 2822 (ATOM & quoted-string) & encoded-word as defined in 2047 in subject we can use regular text (no quoted-string), but we need to look for encoded-word token in content-disposition params we need to parse for quoted-string, but also encoded-word within quoted-string... so it's okay to help parse for encoded-word within quoted string... but within subject we should not parse for that... remember that if there are multiple of them together then they should not ;;|#