id.ss
; Module header is generated automatically
#cs(module id mzscheme
(require "common.ss")
(require "myenv.ss")
(require "access-remote.ss")
(require "sxpathlib.ss")

;; Creation and manipulation of the ID-index
;; Provides the DTD parser for extracting ID attribute declarations
;
; This software is in Public Domain.
; IT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
;
; Please send bug reports and comments to:
;   lizorkin@hotbox.ru    Dmitry Lizorkin
;
; ID-index provides fast access to XML elements by their unique ID.
; ID-index has the following structure:
;  id-index = ( (id . element) (id . element) ... )
; i.e.
;  id-index = (list
;               (cons id element)
;               (cons id element)
;              ...)
; where
;  id - (a string) element's unique ID
;  element - an SXML presentation of an element
;
; Creation of an id-index generally consists of two steps.
; On the first step, a document declaration (internal and external DTD)
; is read and information of ID attributes is extracted.
; This is presented in a following form:
;  id-attrs = ( (elem-name attr-name attr-name attr-name ...)
;               (elem-name attr-name attr-name attr-name ...) ... )
; i.e.
;  id-attrs = (list
;               (cons
;                 elem-name
;                 (list attr-name attr-name attr-name ...)
;               (cons
;                 elem-name
;                 (list attr-name attr-name attr-name ...)
;               ...)
; where
;  elem-name - (a symbol) a name of the element
;  attr-name - (a symbol) element's attribute having an ID type
;
; On the second step, if an SXML presentation of the document is available,
; 'id-attrs' are used for forming an 'id-index'.
; If there is no SXML presentation for a document yet, both steps are
; performed as a single function call - to a specialized SSAX parser.
; This parser constructs an SXML presentation and an 'id-index'
; in a single pass
;
; ATTENTION:
; 1. Only non-qualified 'elem-name' and 'attr-name' are correctly supported
; 2. Parameter entity reference (PEReference) is NOT supported

;=========================================================================
; Functions which read XML document declaration

;------------------------------------------------
; Trivial functions that ignore symbols

; Function reads a whitespace (S production)
(define (id:process-s port)
  (let ((symb (peek-char port)))
    (cond((eof-object? symb) symb)
         ((char=? symb #\space) (read-char port)
                                (id:process-s port))
         ((char=? symb #\return) (read-char port)
                                 (id:process-s port))
         ((char=? symb #\newline)(read-char port)
                                 (id:process-s port))
         ((char=? symb #\tab)(read-char port)
                             (id:process-s port))
         (else symb))))


; Ignores all symbols until template-symbol
(define (id:ignore-until templ-sym port)
  (let loop ((symb (peek-char port)))
    (cond((eof-object? symb) symb)
         ((equal? symb templ-sym) (read-char port)
                                  symb)
         (else (read-char port)
               (loop (peek-char port))))))


;------------------------------------------------
; These functions perform reading from a file

; Read N symbols from a port
(define (id:read-n num port)
  (id:process-s port)
  (let loop ((num num) (res '()))
    (if(= num 0) 
       (list->string (reverse res))
       (let((symb (peek-char port)))
         (cond((eof-object? symb) symb)
              (else (read-char port)
                    (loop (- num 1) (cons symb res))))))))


; This function reads a name - a sequence of characters ending with
; a whitespace or '<'. '>', '(', ')', '[', ']', '|'
(define (id:read-name port)
  (id:process-s port)
  (let loop ((res ""))
    (let ((symb (peek-char port)))
      (cond((eof-object? symb) res)
           ((member symb '(#\space #\tab #\return #\newline
                           #\< #\> #\( #\) #\[ #\] #\|)) 
                    res)
           (else (loop (string-append res (string (read-char port)))))))))


; This function reads a literal
;  literal   ::=    ('"' [^"]* '"') | ("'" [^']* "'") 
; A string is returned
(define (id:process-literal port)
  (id:process-s port)
  (let((quot (peek-char port)))
    (if(eof-object? quot)  ; an incorrect situaltion
       ""
       (let((quot (if (char=? (read-char port) #\") #\" #\')))
         (let loop ((res '()))
           (let((symb (peek-char port)))
             (cond
               ((eof-object? symb) (list->string (reverse res)))
               ((char=? symb quot)  ; end of the string
                (read-char port)
                (list->string (reverse res)))
               (else
                (read-char port)
                (loop (cons symb res))))))))))
         

;------------------------------------------------
; Miscellaneous

; Converts a string into small letters
(define (id:to-small str)
  (let loop ((arg (string->list str)) (res '()))
    (cond((null? arg) (list->string (reverse res)))
         ((char-upper-case? (car arg))
           (loop (cdr arg) (cons (char-downcase (car arg)) res)))
         (else (loop (cdr arg) (cons (car arg) res))))))


; Takes an 'id-attrs' which can contain equal element names
; Returns a new 'id-attrs' where all element names are unique
(define (id:unite-id-attrs id-attrs)
  (let loop ((id-attrs id-attrs)
             (new '()))
    (if
     (null? id-attrs)
     new
     (let rpt ((elem-name (caar id-attrs))
               (atts (cdar id-attrs))
               (rest (cdr id-attrs))
               (id-attrs '()))
       (cond
         ((null? rest) 
          (loop id-attrs (cons (cons elem-name atts) new)))
         ((equal? (caar rest) elem-name)
          (rpt elem-name 
               (append atts (cdar rest))
               (cdr rest)
               id-attrs))
         (else
          (rpt elem-name atts (cdr rest) (cons (car rest) id-attrs))))))))
    
  
;------------------------------------------------
; Parsing XML productions concerning document declaration
; These functions are not intendes for error detection, they assume that
; the document is correct

; This function ignores information related to a PI production [16]
; [16]    PI    ::=    '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
; It looks for an ending '?>' template
(define (id:ignore-PI port)
  (id:ignore-until #\? port)
  (let ((symb (peek-char port)))
    (cond((eof-object? symb) symb)
         ((equal? symb #\>) (read-char port)
                            symb)
         (else (id:ignore-PI port)))))


; This function ignores information related to a Comment production [15]
; [15]    Comment    ::=    '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
; The starting '<!' has been already processed
; The function looks for an ending '-->' template
(define (id:ignore-comment port)
  (read-char port)   ; it is '-'
  (read-char port)   ; it is '-'
  (id:ignore-until #\- port)
  (let((sym1 (peek-char port)))
    (cond((eof-object? sym1) sym1)
         ((char=? sym1 #\-) (read-char port)
                            (let((sym2 (read-char port)))  ; must be '>'
                              sym2))
         (else (id:ignore-comment port)))))


; This function processes AttType production ([54]-[59] in XML specification)
; [54]  AttType    ::=    StringType | TokenizedType | EnumeratedType 
; [55]  StringType    ::=    'CDATA'
; [56]  TokenizedType    ::=    'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
;                               | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
; [57]  EnumeratedType    ::=    NotationType | Enumeration 
; [58]  NotationType    ::=  'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
; [59]  Enumeration    ::=  '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
; The function returnd #t if the attribute has an ID type and #f otherwise
(define (id:AttType-ID? port)
  (let((type (id:to-small (id:read-name port))))
    (cond((string=? type "id") #t)
         ((string=? type "notation")
           (id:process-s port)
           (read-char port)  ; it is #\(
           (id:ignore-until #\) port)
           #f)
         ((and (string=? type "") (char=? (peek-char port) #\())   ; see [59]
           (id:ignore-until #\) port)
           #f)
         (else #f))))


; This function processes DefaultDecl production ([60] in XML specification)
; [60]    DefaultDecl    ::=    '#REQUIRED'
;                               | '#IMPLIED'
;                               | (('#FIXED' S)? AttValue)
; The result is always #t
(define (id:process-DefaultDecl port)
  (let((type (id:to-small (id:read-name port))))
    (cond((string=? type "#fixed")
           (id:read-name port)   ; reads a default value
           #t)  
         (else #t))))


; This function processes AttDef production ([53] in XML specification)
; [53]    AttDef    ::=    S Name S AttType S DefaultDecl
; If an attribute has an ID type, (list attribule-name) is returned
; (a list of one element). Otherwise, function returns an empty list
(define (id:process-AttDef port)
  (let((att-name (string->symbol (id:read-name port))))
    (let((bool (id:AttType-ID? port)))
      (id:process-DefaultDecl port)
      (if bool (list att-name) '()))))
                

; The function processes AttlistDecl production ([52] in XML specification)
; [52]    AttlistDecl    ::=    '<!ATTLIST' S Name AttDef* S? '>'
; The starting '<!ATTLIST' has been already processed
; 'id-attrs' are returned as a result
(define (id:process-AttlistDecl port)
  (let((element-name (string->symbol (id:read-name port))))
    (let loop ((atts '()))
      (id:process-s port)
      (cond((char=? (peek-char port) #\>) ; no more attributes will be declared
             (read-char port)
             (if(null? atts)
                '()
                (list (cons element-name atts))))
           (else
             (loop (append (id:process-AttDef port) atts)))))))


; This function processes a multiple markupdecl production [29]
; [29]    markupdecl    ::=    elementdecl | AttlistDecl | EntityDecl
;                              | NotationDecl | PI | Comment
; 'id-attrs' are returned as a result
(define (id:process-markupdecl* port)
  (let loop ((id-attrs '()))
    (let((beg (id:read-n 2 port)))
      (cond((eof-object? beg) id-attrs)   ; the file is over
           ((string=? beg "]>") id-attrs)   ; the end of the markupdecl
           ((string=? beg "<?")   ; processing instruction
             (id:ignore-PI port)
             (loop id-attrs))
           ((and (string=? beg "<!") (char=? (peek-char port) #\-)) ; a comment
             (id:ignore-comment port)
             (loop id-attrs))
           ((string=? beg "<!")   ; AttlistDecl or any other declarations
             (let ((name (id:to-small (id:read-name port))))
               (cond((string=? name "attlist") 
                      (loop (append (id:process-AttlistDecl port) id-attrs)))
                    (else 
                      (id:ignore-until #\> port)
                      (loop id-attrs)))))
           (else   ; an error condition
             (cerr "Error in markupdecl production: unexpected " beg nl)
             (id:ignore-until #\> port)
             id-attrs)))))
                 

; This function processes a doctypedecl production ([75] in XML specification)
; [75]    ExternalID    ::=    'SYSTEM' S SystemLiteral
;                              | 'PUBLIC' S PubidLiteral S SystemLiteral
; The function ignores a PubidLiteral
; 'id-attrs' are returned as a result
(define (id:process-ExternalID port)
  (let((system-literal
        (let((name (id:to-small (id:read-name port))))
          (cond
            ((string=? name "system")
             (id:process-literal port))
            ((string=? name "public")
             (id:process-literal port)
             (id:process-literal port))
            (else #f)))))
    (if(not system-literal)
       '()  ; an incorrect situation
       (let((external-port (open-input-resource system-literal)))
         (if(not external-port)
            '()   ; a failure
            (let((id-attrs (id:process-markupdecl* external-port)))
              (close-input-port external-port)
              id-attrs))))))
         

; This function processes a doctypedecl production ([28] in XML specification)
; [28]    doctypedecl    ::=    '<!DOCTYPE' S Name (S ExternalID)?
;                               S? ('[' (markupdecl | DeclSep)* ']' S?)? '>'
; The function doesn't process a DeclSep (this is a PEReference which
; this programme doesn't support)
; The starting '<!DOCTYPE' has been already processed
; 'id-attrs' are returned as a result
(define (id:process-doctypedecl port)
  (let((name (id:read-name port)))  ; root element's name
    (id:process-s port)
    (let((symb (peek-char port)))
      (cond
        ((eof-object? symb) '())  ; an incorrect situation
        ((char=? symb #\[)
         (read-char port)
         (id:process-markupdecl* port))
        (else
         (let((id-attrs (id:process-ExternalID port)))
           (id:process-s port)
           (let((symb (peek-char port)))
             (cond
               ((eof-object? symb) id-attrs)  ; an incorrect situation
               ((char=? symb #\[)
                (read-char port)
                (append id-attrs (id:process-markupdecl* port)))
               (else  ; an incorrect situation
                id-attrs)))))))))
                   

; This function processes a prolog production ([22] in XML specification)
; [1]    document    ::=    prolog element Misc*
; [22]    prolog    ::=    XMLDecl? Misc* (doctypedecl Misc*)?
; [23]    XMLDecl    ::=    '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
; [27]    Misc    ::=    Comment | PI | S
; 'id-attrs' are returned as a result
(define (id:process-prolog port)
  (let((beg (id:read-n 2 port)))
    (cond((eof-object? beg) '())  ; a file is over - strange...
         ((string=? beg "<?")   ; PI or XMLDecl
          (id:ignore-PI port)
          (id:process-prolog port))
         ((and (string=? beg "<!") (char=? (peek-char port) #\-))   ; a comment
          (id:ignore-comment port)
          (id:process-prolog port))
         ((string=? beg "<!")   ; doctypedecl expected
          (let ((name (id:to-small (id:read-name port))))
            (cond((string=? name "doctype") 
                  (id:process-doctypedecl port))
                 (else 
                  (cerr "doctypedecl production expected" nl)
                  '()))))
         (else   ; element begins, there was no doctypedecl
          '()))))



;=========================================================================
; Two-step id-index creation (user level functions)
; We use this variant when we already have an SXML presentation of the
; document

;------------------------------------------------
; The first step - creating 'id-attrs'

; Read the DTD
;  uri-string - a URI for the DTD location (a string)
; 'id-attrs' are returned as a result
(define (id:read-external-dtd uri-string)
  (let((port (open-input-resource uri-string)))
    (if(not port)
       '()  ; a situation of an error
       (let((id-attrs (id:unite-id-attrs (id:process-markupdecl* port))))
         (close-input-port port)
         id-attrs))))


; Read the declaration from the document's prolog.
; If prolog contains a reference to an external DTD, it is processed either
;  uri-string - a URI for the document location (a string)
; 'id-attrs' are returned as a result
(define (id:read-document-declaration uri-string)
  (let((port (open-input-resource uri-string)))
    (if(not port)
       '()  ; a situation of an error
       (let((id-attrs (id:unite-id-attrs (id:process-prolog port))))
         (close-input-port port)
         id-attrs))))


;------------------------------------------------
; The second step - creating an 'id-index' using 'id-attrs'

; This function forms an 'id-index' and inserts it in the document
;  document - a root node of the document (SXML presentation)
;  id-attrs - the result of the previous step
; A new SXML document is returned. It contains an auxiliary list with an
; 'id-index subtree. If the source document already contains such a
; subtree, it will be replaced. Other subtrees in an auxiliary list will
; remain unchanged.
(define (SXML->SXML+id document id-attrs)
  (let((aux-subtrees
        (let((aux ((select-kids (ntype?? '@@)) document)))
          (if(null? aux)
             '()
             (let rpt ((res '())
                       (to-see (cdar aux)))
               (cond
                 ((null? to-see) (reverse res))
                 ((equal? (caar to-see) 'id-index) (rpt res (cdr to-see)))
                 (else (rpt (cons (car to-see) res)
                            (cdr to-see)))))))))
    (let loop ((nodeset (list document))
               (id-index '()))
      (if(null? nodeset)
         (let((kids ((select-kids
                      (lambda (node)
                        (not (and (pair? node) (equal? (car node) '@@)))))
                     document)))
           (cons* '*TOP*
                  (cons* '@@
                         (cons 'id-index id-index)
                         aux-subtrees)
                  kids))         
         (let((cur-node (car nodeset)))
           (cond
             ((not (pair? cur-node))  ; a text node
              (loop (cdr nodeset) id-index))
             ((assoc (car cur-node) id-attrs)
              =>
              (lambda (lst)
                (let((id-values
                      ((select-kids (lambda (x) #t))
                       ((sxml:filter (lambda (x) (member (car x) (cdr lst))))
                        ((select-kids (lambda (x) #t))
                         ((select-kids (ntype?? '@)) cur-node))))))
                  (loop
                   (append 
                    ((select-kids (ntype?? '*)) (car nodeset))
                    (cdr nodeset))
                   (append
                    id-index
                    (map
                     (lambda (x) (cons x cur-node))
                     id-values))))))
             (else
              (loop
               (append ((select-kids (ntype?? '*)) (car nodeset)) (cdr nodeset))
               id-index))))))))



;=========================================================================
; Some stuff for a SSAX multi parser

;------------------------------------------------
; Id-related part of the seed
;  id:seed = (list  id-attrs  id-index)
;  id-attrs, id-index - see a head comment

; Mutator
(define (id:make-seed id-attrs id-index)
  (list id-attrs id-index))


; Accessors
(define (id:seed-attrs id:seed)
  (car id:seed))
        
(define (id:seed-index id:seed)
  (cadr id:seed))
        

;------------------------------------------------
; Handler units

; This function is called by the NEW-LEVEL-SEED handler
; A new 'id:seed' is returned
(define (id:new-level-seed-handler id:seed)
  id:seed)


; This function is called by the FINISH-ELEMENT handler
; A new 'id:seed' is returned
(define (id:finish-element-handler elem-gi attributes id:seed element)
  (cond
    ((assoc elem-gi (id:seed-attrs id:seed))
     => 
     (lambda (lst)
       (let loop ((atts attributes)
                  (id-index (id:seed-index id:seed)))
         (if
          (null? atts)
          (id:make-seed (id:seed-attrs id:seed) id-index)
          (let((att (car atts)))
            (cond
              ((pair? (car att))  ; namespace aware
               (loop (cdr atts) id-index))
              ((member (car att) (cdr lst))
               (loop (cdr atts)
                     (cons (cons (cdr att) element) id-index)))
              (else
               (loop (cdr atts) id-index))))))))
    (else
     id:seed)))

         
; This function is called by the DOCTYPE handler
; A new 'id:seed' is returned
(define (id:doctype-handler port systemid internal-subset?)
  (let((id-attrs
        (if
         (not systemid)
         '()   ; systemid not supplied
         (let((external-port (open-input-resource systemid)))
           (if
            (not external-port)
            '()   ; a failure
            (let((id-attrs (id:process-markupdecl* external-port)))
              (close-input-port external-port)
              id-attrs))))))
    (let((id-attrs
          (if
           internal-subset?
           (id:unite-id-attrs 
            (append id-attrs (id:process-markupdecl* port)))
           (id:unite-id-attrs id-attrs))))
      (id:make-seed id-attrs '()))))


; This function constructs the member of an axuiliary list
(define (id:ending-action id:seed)
  (let((id-index (id:seed-index id:seed)))
    (cons 'id-index id-index)))

                      


(provide (all-defined)))