├── .gitignore ├── CHANGES ├── PORTING-TO-DO ├── README ├── info.rkt └── sxml ├── ddo-axes.rkt ├── ddo-txpath.rkt ├── info.rkt ├── lazy-ssax.rkt ├── lazy-xpath.rkt ├── main.rkt ├── modif.rkt ├── scribblings ├── .gitignore ├── SXML.scm.tmp ├── all-exported.scrbl ├── extract-provides.rkt ├── extracted-sperber.scrbl ├── sax-parsing.scrbl ├── serialization.scrbl ├── sxml-rep.scrbl ├── sxml.scrbl ├── sxpath.scrbl ├── sxslt.scrbl └── util.rkt ├── serializer.rkt ├── ssax ├── SSAX-code.rkt ├── SXML-tree-trans.rkt ├── access-remote.rkt ├── doc.txt ├── errors-and-warnings.rkt ├── id.rkt ├── info.rkt ├── input-parse.rkt ├── multi-parser.rkt ├── myenv.rkt ├── parse-error.rkt ├── ssax-prim.rkt ├── ssax.rkt ├── sxpathlib.rkt ├── tests │ └── vinput-parse.scm ├── util.rkt └── xlink-parser.rkt ├── sxml-tools.rkt ├── sxpath-ext.rkt ├── sxpath.rkt ├── tests ├── README ├── if.xml ├── ssax-tests.rkt ├── tests.rkt ├── vSXML-to-HTML.rkt └── vSXML-tree-trans.rkt ├── txpath.rkt ├── xpath-ast.rkt ├── xpath-context_xlink.rkt └── xpath-parser.rkt /.gitignore: -------------------------------------------------------------------------------- 1 | compiled/ 2 | doc/ 3 | *~ 4 | planet-docs/ 5 | coverage/ 6 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | - (require (planet clements/sxml2)) includes both sxml and ssax 2 | - taking out the (unused) run-test define in SSAX-code.rkt 3 | - moving tests (such as they are) to a sub-directory 4 | -------------------------------------------------------------------------------- /PORTING-TO-DO: -------------------------------------------------------------------------------- 1 | - ADD DOCUMENTATION !!! 2 | - add a test suite-- what's in the CVS repo? 3 | 4 | - decide what to do with all the version strings 5 | - remove the "bug reports to lisovsky/lizorkin" messages 6 | - change to set-mcar! to get mutation back? 7 | 8 | - (ryanc) add (or find?) sxml-top?, sxml-element?, etc predicates 9 | - (ryanc) parameters and/or kw args for continuable errors / warnings 10 | - (ryanc) parameters for defaults, eg namespace-abbreviations 11 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This repository represents a PLT package containing SXML, a family of XML manipulation 2 | tools designed by Oleg Kiselyov and written by lots of people, including 3 | 4 | - Oleg Kiselyov, 5 | - Kirill Lisovsky, 6 | - Dmitry Lizorkin, 7 | - Michael Sperber, 8 | - Neil Van Dyke, 9 | - Ryan Culpepper, 10 | - John Clements, 11 | 12 | ... and others that I've forgotten (pull requests welcome!). 13 | -------------------------------------------------------------------------------- /info.rkt: -------------------------------------------------------------------------------- 1 | #lang setup/infotab 2 | 3 | (define collection 'multi) 4 | (define deps '("base" "srfi-lib")) 5 | (define build-deps '("racket-doc" "scribble-lib")) 6 | -------------------------------------------------------------------------------- /sxml/info.rkt: -------------------------------------------------------------------------------- 1 | #lang setup/infotab 2 | 3 | (define name "sxml") 4 | (define blurb 5 | (list "Collection of tools for processing markup documents " 6 | "in the form of S-expressions")) 7 | (define primary-file "main.rkt") 8 | (define homepage "http://modis.ispras.ru/Lizorkin/sxml-tutorial.html") 9 | (define categories '(xml)) 10 | (define compile-omit-paths '("tests/" "ssax/tests/")) 11 | (define repositories '("4.x")) 12 | (define release-notes (list "bug fixes, serializers can now accept paths")) 13 | (define version "2012-02-16 10:11") 14 | 15 | (define scribblings '(("scribblings/sxml.scrbl" (multi-page)))) 16 | -------------------------------------------------------------------------------- /sxml/lazy-ssax.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require racket/promise 3 | srfi/13/string 4 | "ssax/parse-error.rkt" 5 | "ssax/SSAX-code.rkt" 6 | "lazy-xpath.rkt") 7 | (provide (all-defined-out)) 8 | 9 | ;; A specialized lazy XML->SXML parser 10 | ; Is heavily based on continuations 11 | 12 | ;------------------------------------------------- 13 | ; Preliminary helper functions 14 | 15 | ; A helper that forces all descendants of a given node or a nodeset 16 | (define (lazy:force-descendants node) 17 | (cond 18 | ((lazy:promise? node) ; force it 19 | (lazy:force-descendants (force node))) 20 | ((pair? node) ; not null 21 | (for-each lazy:force-descendants node)) 22 | (else ; null or not pair 23 | #t ; nothing to be done 24 | ))) 25 | 26 | ; Returns the list containing of all members of the argument list except 27 | ; for the last member 28 | (define (lazy:except-last lst) 29 | (if 30 | (or (null? lst) ; this shouldn't happen 31 | (null? (cdr lst))) 32 | '() 33 | (cons (car lst) (lazy:except-last (cdr lst))))) 34 | 35 | ;------------------------------------------------- 36 | ; 37 | 38 | ; Returns the common part of the seed 39 | (define (lazy:seed-common seed) 40 | ((if (null? (cdr seed)) ; a short seed 41 | car caddr) 42 | seed)) 43 | 44 | ; A monad-like handler 45 | ; Replaces the common part of the seed 46 | (define (lazy:replace-common seed new-common) 47 | (if (null? (cdr seed)) ; a short seed 48 | (list new-common) 49 | (list (car seed) 50 | (cadr seed) 51 | new-common 52 | (cadddr seed)))) 53 | 54 | ; Produces a lazy SXML document, which corresponds to reading a source 55 | ; document in a stream-wise fashion 56 | (define (lazy:xml->sxml port namespace-prefix-assig) 57 | (let ((namespaces 58 | (map (lambda (el) 59 | (list* #f (car el) (ssax:uri-string->symbol (cdr el)))) 60 | namespace-prefix-assig)) 61 | (RES-NAME->SXML 62 | (lambda (res-name) 63 | (string->symbol 64 | (string-append 65 | (symbol->string (car res-name)) 66 | ":" 67 | (symbol->string (cdr res-name))))))) 68 | ((lambda (result) 69 | ; We assume that nobody follows the document element 70 | (if (null? namespace-prefix-assig) 71 | (cons '*TOP* (lazy:except-last result)) 72 | (cons 73 | '*TOP* 74 | (cons 75 | `(@@ (*NAMESPACES* 76 | ,@(map 77 | (lambda (ns) (list (car ns) (cdr ns))) 78 | namespace-prefix-assig))) 79 | (lazy:except-last result))))) 80 | (call-with-current-continuation ; we grab the continuation to escape from parsing 81 | (lambda (result-k) 82 | ; seed ::= (list result-k state-k common-seed level) 83 | ; result-k - continuation on what to do with the current result portion 84 | ; state-k - continuation to return to SSAX state on this level of XML 85 | ; tree hierarchy 86 | ; common-seed - general seed information 87 | ; level - level of a current node in a tree hierarchy 88 | ((ssax:make-parser 89 | NEW-LEVEL-SEED 90 | (lambda (elem-gi attributes namespaces expected-content seed) 91 | ;(pp (cons elem-gi (cadddr seed))) 92 | (if 93 | (or (null? (cdr seed)) ; short seed 94 | (> (cadddr seed) 3)) ; deep level 95 | (list '()) ; work like a conventional SSAX parser 96 | (let ((attrs 97 | (attlist-fold 98 | (lambda (attr accum) 99 | (cons (list 100 | (if (symbol? (car attr)) (car attr) 101 | (RES-NAME->SXML (car attr))) 102 | (cdr attr)) accum)) 103 | '() attributes))) 104 | (call-with-current-continuation 105 | (lambda (new-level-k) ; how to parse next 106 | ((car seed) ; return the result 107 | (let ((elem-content 108 | ; A promise to continue parsing 109 | (call-with-current-continuation ; where to put the result 110 | (lambda (elem-k) 111 | (new-level-k 112 | (list ; now form a seed 113 | elem-k ; what to do with result 114 | new-level-k ; SSAX state on this level 115 | '() ; common-seed is empty 116 | (+ (cadddr seed) 1) ; increase level 117 | )))))) 118 | (append 119 | ; Previous string content 120 | (ssax:reverse-collect-str-drop-ws (caddr seed)) 121 | (list 122 | (cons 123 | (if (symbol? elem-gi) elem-gi 124 | (RES-NAME->SXML elem-gi)) 125 | (if (null? attrs) elem-content 126 | (cons (cons '@ attrs) elem-content))) 127 | ; The following siblings of this element 128 | (delay 129 | (call-with-current-continuation ; where to put the result 130 | (lambda (foll-k) 131 | ; First we force the parsing of the current element 132 | (lazy:force-descendants elem-content) 133 | ; Than continue parsing 134 | ((cadr seed) ; recover the parent level of nesting 135 | (list 136 | foll-k ; what to do with result 137 | (cadr seed) 138 | '() ; common-seed is empty 139 | (cadddr seed) ; the same level for siblings 140 | )))))))))))))) 141 | 142 | FINISH-ELEMENT 143 | (lambda (elem-gi attributes namespaces parent-seed seed) 144 | (if 145 | (null? (cdr seed)) ; a short seed 146 | (let ((common (ssax:reverse-collect-str-drop-ws 147 | (lazy:seed-common seed))) 148 | (attrs 149 | (attlist-fold 150 | (lambda (attr accum) 151 | (cons (list 152 | (if (symbol? (car attr)) (car attr) 153 | (RES-NAME->SXML (car attr))) 154 | (cdr attr)) accum)) 155 | '() attributes))) 156 | (lazy:replace-common 157 | parent-seed 158 | (cons 159 | (cons 160 | (if (symbol? elem-gi) elem-gi 161 | (RES-NAME->SXML elem-gi)) 162 | (if (null? attrs) common 163 | (cons (cons '@ attrs) common))) 164 | (lazy:seed-common parent-seed)))) 165 | ; Otherwise - just return the remaining character content 166 | ((car seed) ; continuation 167 | (ssax:reverse-collect-str-drop-ws 168 | (lazy:seed-common seed))))) 169 | 170 | CHAR-DATA-HANDLER 171 | (lambda (string1 string2 seed) 172 | ;(pp (list string1 string2 seed)) 173 | (lazy:replace-common 174 | seed 175 | (if (string-null? string2) 176 | (cons string1 (lazy:seed-common seed)) 177 | (list* string2 string1 (lazy:seed-common seed))))) 178 | 179 | DOCTYPE 180 | (lambda (port docname systemid internal-subset? seed) 181 | (when internal-subset? 182 | (ssax:warn port 183 | "Internal DTD subset is not currently handled ") 184 | (ssax:skip-internal-dtd port)) 185 | (ssax:warn port "DOCTYPE DECL " docname " " 186 | systemid " found and skipped") 187 | (values #f '() namespaces seed)) 188 | 189 | UNDECL-ROOT 190 | (lambda (elem-gi seed) 191 | (values #f '() namespaces seed)) 192 | 193 | PI 194 | ((*DEFAULT* . 195 | (lambda (port pi-tag seed) 196 | (lazy:replace-common 197 | seed 198 | (cons 199 | (list '*PI* pi-tag (ssax:read-pi-body-as-string port)) 200 | (lazy:seed-common seed)))))) 201 | ) 202 | port 203 | (list ; form initial seed 204 | result-k ; put the result 205 | (lambda (seed) ; dummy top-level parser state that produces '() 206 | ((car seed) ; where to put the result nodeset 207 | '())) 208 | '() 209 | 1 ; level for the document element 210 | ))))))) 211 | -------------------------------------------------------------------------------- /sxml/main.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | 3 | (require "sxml-tools.rkt") 4 | (require "sxpath-ext.rkt") 5 | (require "xpath-parser.rkt") 6 | (require "txpath.rkt") 7 | (require "sxpath.rkt") 8 | (require "xpath-ast.rkt") 9 | (require "xpath-context_xlink.rkt") 10 | (require "ddo-axes.rkt") 11 | (require "ddo-txpath.rkt") 12 | (require "lazy-xpath.rkt") 13 | (require "lazy-ssax.rkt") 14 | (require "modif.rkt") 15 | (require "serializer.rkt") 16 | (provide (all-from-out "sxml-tools.rkt")) 17 | (provide (all-from-out "sxpath-ext.rkt")) 18 | (provide (all-from-out "xpath-parser.rkt")) 19 | (provide (all-from-out "txpath.rkt")) 20 | (provide (all-from-out "sxpath.rkt")) 21 | (provide (all-from-out "xpath-ast.rkt")) 22 | (provide (all-from-out "xpath-context_xlink.rkt")) 23 | (provide (all-from-out "ddo-axes.rkt")) 24 | (provide (all-from-out "ddo-txpath.rkt")) 25 | (provide (all-from-out "lazy-xpath.rkt")) 26 | (provide (all-from-out "lazy-ssax.rkt")) 27 | (provide (all-from-out "modif.rkt")) 28 | (provide (all-from-out "serializer.rkt")) 29 | 30 | (require "ssax/multi-parser.rkt" 31 | "ssax/sxpathlib.rkt" 32 | "ssax/SXML-tree-trans.rkt" 33 | "ssax/SSAX-code.rkt") 34 | (provide ssax:make-parser 35 | ssax:multi-parser 36 | (all-from-out "ssax/sxpathlib.rkt") 37 | pre-post-order 38 | ssax:xml->sxml) 39 | -------------------------------------------------------------------------------- /sxml/scribblings/.gitignore: -------------------------------------------------------------------------------- 1 | racket.css 2 | scribble-common.js 3 | scribble-style.css 4 | scribble.css 5 | sxml.html 6 | -------------------------------------------------------------------------------- /sxml/scribblings/extract-provides.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require racket/runtime-path) 4 | 5 | (define-runtime-path parent-dir "..") 6 | 7 | (define-namespace-anchor ns-anchor) 8 | 9 | (define (interleave requires-and-provides) 10 | (define reqs (map 11 | (lambda (x) 12 | (list 'section (second x))) 13 | (filter (lambda (x) (equal? (first x) '#%require)) 14 | requires-and-provides))) 15 | (define pros (filter (lambda (x) (equal? (first x) '#%provide)) 16 | requires-and-provides)) 17 | (apply append 18 | (map list reqs pros))) 19 | 20 | (module+ main 21 | (call-with-input-file (build-path parent-dir "ssax" "ssax.rkt") 22 | (lambda (port) 23 | (parameterize ([current-directory (build-path parent-dir "ssax")] 24 | [read-accept-reader #t] 25 | [current-namespace (namespace-anchor->namespace ns-anchor)]) 26 | (interleave 27 | (rest (fourth (syntax->datum (expand (read-syntax "main.rkt" port)))))))))) 28 | 29 | -------------------------------------------------------------------------------- /sxml/scribblings/sax-parsing.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/doc 2 | @(require scribble/manual 3 | "util.rkt" 4 | (for-label sxml)) 5 | 6 | @title[#:tag "ssax"]{SAX Parsing} 7 | 8 | @defproc[(ssax:xml->sxml [port input-port?] 9 | [namespace-prefix-assig (listof (cons/c symbol? string?))]) 10 | sxml?]{ 11 | 12 | Reads an XML document (which can be a single XML element) from 13 | @racket[port], and returns the corresponding SXML (@racket[_top]) 14 | representation. The @racket[namespace-prefix-assig] association list 15 | provides shortened forms to be used in place of namespaces. 16 | 17 | @examples[#:eval the-eval 18 | (ssax:xml->sxml 19 | (open-input-string 20 | "abcd") 21 | '()) 22 | #| should be '(*TOP* (zippy (pippy (|@| (pigtails "2")) "ab") "cd")) |# 23 | 24 | (ssax:xml->sxml 25 | (open-input-string 26 | "4") 27 | '()) 28 | 29 | (ssax:xml->sxml 30 | (open-input-string 31 | "4") 32 | '((v . "vehicles"))) 33 | ] 34 | } 35 | 36 | 37 | @defproc[(sxml:document [url-string string?] [namespace-prefix-assig any/c]) sxml?]{ 38 | Given a local file URI, return the corresponding SXML representation. 39 | 40 | NOTE: currently, this appears to work only for local documents. 41 | 42 | NAMESPACE-PREFIX-ASSIG - is passed as-is to the SSAX parser: there it is 43 | used for assigning certain user prefixes to certain namespaces. 44 | 45 | NAMESPACE-PREFIX-ASSIG is an optional argument and has an effect for an 46 | XML resource only. For an HTML resource requested, NAMESPACE-PREFIX-ASSIG 47 | is silently ignored. 48 | 49 | So, for instance, if the file @filepath{/tmp/foo.xml} contains an XML file, 50 | you should be able to call 51 | 52 | @racketblock[ 53 | (sxml:document "file:///tmp/foo")] 54 | 55 | (Note the plethora of slashes required by the URI format.) 56 | 57 | 58 | } 59 | 60 | 61 | @;{has to be a defform because of Oleg's approach to argument naming. 62 | can't see a way in defform to describe result.} 63 | @defform[#:kind "procedure" 64 | (ssax:make-parser new-level-seed-spec 65 | finish-element-spec 66 | char-data-handler-spec 67 | tag-spec ...) 68 | #:grammar 69 | [(new-level-seed-spec NEW-LEVEL-SEED new-level-seed-proc) 70 | (finish-element-spec FINISH-ELEMENT finish-element-proc) 71 | (char-data-handler-spec CHAR-DATA-HANDLER char-data-handler-proc) 72 | (tag-spec tag tag-proc)]]{ 73 | 74 | Returns a procedure of two arguments, an input port @racket[xml-port], and an 75 | object @racket[init-seed]. That procedure will parse the XML document 76 | produced by @racket[xml-port], and the object @racket[init-seed], according to the 77 | specifications @racket[new-level-seed-spec], @racket[finish-element-spec], 78 | @racket[char-data-handler-spec], and @racket[tag-spec]s, and will return an object 79 | of the same type as @racket[init-seed]. 80 | 81 | 82 | @racket[new-level-seed-spec] consists of the tag @racket[NEW-LEVEL-SEED] in upper 83 | case, followed by a procedure @racket[new-level-seed-proc]. This procedure 84 | must take the arguments @racket[element-name], @racket[attributes], @racket[namespaces], 85 | @racket[expected-content], and @racket[seed]. It must return an object of the same 86 | type as @racket[init-seed]. 87 | 88 | 89 | @racket[finish-element-spec] consists of the tag @racket[FINISH-ELEMENT] in upper 90 | case, followed by a procedure @racket[finish-element-proc]. This procedure 91 | must take the arguments @racket[element-name], @racket[attributes], @racket[namespaces], 92 | @racket[parent-seed], and @racket[seed]. It must return an object of the same type 93 | as @racket[init-seed]. 94 | 95 | @racket[char-data-handler-spec] consists of the tag @racket[CHAR-DATA-HANDLER] in 96 | upper case, followed by a procedure @racket[char-data-handler-proc]. This 97 | procedure must take the arguments @racket[string-1], @racket[string-2], and @racket[seed]. 98 | It must return an object of the same type as @racket[init-seed]. 99 | 100 | 101 | `tag-spec': TODO. 102 | 103 | 104 | Here's an example that returns a string containing the text, after removing markup, from the 105 | XML document produced by the input port `in'. 106 | 107 | @codeblock|{ 108 | #lang racket 109 | 110 | (require racket/string sxml) 111 | 112 | (define (remove-markup xml-port) 113 | (let* ([parser 114 | (ssax:make-parser NEW-LEVEL-SEED remove-markup-nls 115 | FINISH-ELEMENT remove-markup-fe 116 | CHAR-DATA-HANDLER remove-markup-cdh)] 117 | [strings (parser xml-port null)]) 118 | (string-join (reverse strings) ""))) 119 | 120 | (define (remove-markup-nls gi attributes namespaces expected-content 121 | seed) 122 | seed) 123 | 124 | (define (remove-markup-fe gi attributes namespaces parent-seed seed) 125 | seed) 126 | 127 | (define (remove-markup-cdh string-1 string-2 seed) 128 | (let ([seed (cons string-1 seed)]) 129 | (if (non-empty-string? string-2) 130 | (cons string-2 seed) 131 | seed))) 132 | 133 | (remove-markup 134 | (open-input-string 135 | "Hello, world!")) 136 | }| 137 | } 138 | -------------------------------------------------------------------------------- /sxml/scribblings/serialization.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/doc 2 | @(require scribble/manual 3 | "util.rkt" 4 | (for-label sxml)) 5 | 6 | @title[#:tag "srl"]{Serialization} 7 | 8 | @defproc*[([(srl:sxml->xml [sxml-obj sxml?]) 9 | string?] 10 | [(srl:sxml->xml [sxml-obj sxml?] 11 | [dest (or/c output-port? path-string?)]) 12 | void?])]{ 13 | 14 | Serializes the SXML node or nodeset @racket[sxml-obj] into XML, with 15 | indentation to facilitate readability by a human. 16 | 17 | If @racket[dest] is not supplied, the function returns a string that 18 | contains the serialized representation of the @racket[sxml-obj]. If 19 | @racket[dest] is supplied and is a port, the functions write the 20 | serialized representation of @racket[sxml-obj] to this port. If 21 | @racket[dest] is supplied and is a string, this string is treated as 22 | an output filename, the serialized representation of 23 | @racket[sxml-obj] is written to that filename. If a file with the 24 | given name already exists, the effect is unspecified. 25 | 26 | @examples[#:eval the-eval 27 | (srl:sxml->xml '(zippy (pippy (|@| (pigtails "2")) "ab") "bc")) 28 | (srl:sxml->xml '(zippy (pippy (|@| (pigtails "2")) "ab") "bc") 29 | (current-output-port)) 30 | (srl:sxml->xml (for/fold ([body '(nothing)]) ([i (in-range 5)]) 31 | `(doll (|@| (level ,(number->string i))) ,body)) 32 | (current-output-port)) 33 | ] 34 | } 35 | 36 | @defproc*[([(srl:sxml->xml-noindent [sxml-obj sxml?]) 37 | string?] 38 | [(srl:sxml->xml-noindent [sxml-obj sxml?] 39 | [dest (or/c output-port? path-string?)]) 40 | void?])]{ 41 | 42 | Like @racket[srl:sxml->xml] but without indentation. 43 | 44 | @examples[#:eval the-eval 45 | (srl:sxml->xml-noindent 46 | '(zippy (pippy (|@| (pigtails "2")) "ab") "bc")) 47 | (srl:sxml->xml-noindent 48 | '(zippy (pippy (|@| (pigtails "2")) "ab") "bc") 49 | (current-output-port)) 50 | (srl:sxml->xml-noindent 51 | (for/fold ([body '(nothing)]) ([i (in-range 5)]) 52 | `(doll (|@| (level ,(number->string i))) ,body)) 53 | (current-output-port)) 54 | ] 55 | } 56 | 57 | @defproc*[([(srl:sxml->html [sxml-obj sxml?]) 58 | string?] 59 | [(srl:sxml->html [sxml-obj sxml?] 60 | [dest (or/c output-port? path-string?)]) 61 | void?])]{ 62 | 63 | Serializes the SXML node or nodeset @racket[sxml-obj] into HTML, with 64 | indentation to facilitate readability by a human. 65 | 66 | If @racket[dest] is not supplied, the functions return a string that 67 | contains the serialized representation of the @racket[sxml-obj]. If 68 | @racket[dest] is supplied and is a port, the functions write the 69 | serialized representation of @racket[sxml-obj] to this port. If 70 | @racket[dest] is supplied and is a string, this string is treated as 71 | an output filename, the serialized representation of 72 | @racket[sxml-obj] is written to that filename. If a file with the 73 | given name already exists, the effect is unspecified. 74 | 75 | NOTE: As far as I can tell, the result of this transformation 76 | is more accurately described as XHTML than as HTML. The most noticeable 77 | difference is that certain tags may be rendered surprisingly by 78 | many browsers when they are expressed in the @tt{} form 79 | rather than in the @tt{} form. Giving these tags an 80 | empty string as a body will force them to render correctly. The 81 | list of tags for which the W3 consortium recommends using the 82 | expanded form is this: 83 | @itemlist[@item{param} 84 | @item{meta} 85 | @item{link} 86 | @item{isindex} 87 | @item{input} 88 | @item{img} 89 | @item{hr} 90 | @item{frame} 91 | @item{col} 92 | @item{br} 93 | @item{basefont} 94 | @item{base} 95 | @item{area}] 96 | 97 | Should this function automatically treat these differently? Yes, probably 98 | so. 99 | } 100 | 101 | @defproc*[([(srl:sxml->html-noindent [sxml-obj sxml?]) 102 | string?] 103 | [(srl:sxml->html-noindent [sxml-obj sxml?] 104 | [dest (or/c output-port? path-string?)]) 105 | void?])]{ 106 | 107 | Like @racket[srl:sxml->html] but without indentation. 108 | } 109 | -------------------------------------------------------------------------------- /sxml/scribblings/sxml-rep.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/doc 2 | @(require scribble/manual 3 | "util.rkt" 4 | (for-label sxml)) 5 | 6 | @title{SXML} 7 | 8 | @deftech{SXML} is a representation of XML elements using 9 | unique s-expressions. The following grammar describes the structure of SXML: 10 | 11 | @(define ATSIGN (racketidfont "@")) 12 | 13 | @racketgrammar*[ 14 | #:literals (*TOP* *PI* *COMMENT* *ENTITY* URI *NAMESPACES* |@|) 15 | [top (*TOP* maybe-annotations 16 | PI ... 17 | comment ... 18 | element)] 19 | [element (name maybe-annot-attributes child ...)] 20 | [annot-attributes (|@| attribute ... maybe-annotations)] 21 | [attribute (name maybe-value maybe-annotations)] 22 | [child element 23 | character-data-string 24 | PI 25 | comment 26 | entity] 27 | [PI (*PI* pi-target 28 | maybe-annotations 29 | processing-instruction-content-string)] 30 | [comment (*COMMENT* comment-string)] 31 | [entity (*ENTITY* public-id-string system-id-string)] 32 | [name local-name 33 | exp-name] 34 | [local-name @#,elem{symbol conforming to XML Namespace recommendation}] 35 | [exp-name @#,elem{symbol of the form @racket[_namespace-id]@litchar{:}@racket[_local-name]}] 36 | [namespace-id URI-symbol 37 | user-ns-shortcut-symbol] 38 | [namespaces (*NAMESPACES* namespace-assoc ...)] 39 | [namespace-assoc (namespace-id uri-string maybe-original-prefix)] 40 | [annotations (|@| maybe-namespaces annotation ...)] 41 | [annotation @#,elem{not yet specified}] 42 | ] 43 | 44 | Some tools, such as SXPath, use the following coarse approximation of 45 | SXML structure for simplicity: 46 | 47 | @racketgrammar*[ 48 | #:literals (*TOP* *PI* *COMMENT* *ENTITY* URI *NAMESPACES* |@|) 49 | [node (name . node-list) 50 | string] 51 | [node-list (node ...)] 52 | [name local-name exp-name |@| *TOP* *PI* *COMMENT* *ENTITY* *NAMESPACES*] 53 | ] 54 | 55 | In short, an XML element is represented as a list consisting of its 56 | tag name as a symbol followed by its children nodes. If the XML 57 | element has attributes, they come immediately after the tag symbol, in 58 | a list tagged by an @racket[|@|] symbol. 59 | 60 | For example, the XML element 61 | 62 | @tt{defjkl} 63 | 64 | is represented by the SXML datum 65 | 66 | @racket['(abc "def" (ghi) "jkl")] 67 | 68 | and the XML element 69 | 70 | @tt{Barry White} 71 | 72 | is represented by the SXML datum 73 | 74 | @racket['(customer (|@| (specialness "gazonga")) "Barry White")] 75 | 76 | NOTE! Some of the sxml libraries, particularly sxml:modify, depend 77 | on the fact that sxml elements in a legal document are all "unique"; 78 | as I understand it, the requirement is that no two subtrees of a given 79 | SXML document can be 'eq?' to each other. This can easily 80 | occur when rewriting a tree, for instance a pass that inserts `(delete-me) 81 | in multiple places. 82 | 83 | That's the easy part. Things get more tricky when you start talking 84 | about documents and namespaces. 85 | 86 | Refer to @hyperlink["http://okmij.org/ftp/Scheme/SXML.html"]{the 87 | original SXML specification} for a more detailed explanation of the 88 | representation, including examples. 89 | 90 | 91 | @;{ ============================================================ } 92 | 93 | @section{SXML Functions} 94 | 95 | @defproc[(sxml:element? [v any/c]) boolean?]{ 96 | 97 | Returns @racket[#t] if @racket[v] is a list starting with a symbol 98 | that is not a special symbol, @racket[#f] otherwise. 99 | 100 | @examples[#:eval the-eval 101 | (sxml:element? '(p "blah")) 102 | (sxml:element? '(*COMMENT* "ignore me")) 103 | (sxml:element? '(|@| (href "link.html"))) 104 | ] 105 | } 106 | 107 | @defproc[(ntype-names?? [tags (listof symbol?)]) 108 | (-> any/c boolean?)]{ 109 | 110 | Given a list of allowable tag names, returns a predicate that 111 | recognizes @racket[_element]s with those tags. 112 | 113 | @examples[#:eval the-eval 114 | ((ntype-names?? '(a p)) '(p "blah")) 115 | ((ntype-names?? '(a p)) '(br)) 116 | ] 117 | } 118 | 119 | @defproc[(ntype?? [crit symbol?]) 120 | (-> any/c boolean?)]{ 121 | 122 | If @racket[_crit] is a special symbol, a predicate is returned that 123 | accepts the following classes of @racket[_node]: 124 | 125 | @itemlist[ 126 | @item{@racket['|@|]: an @racket[_annot-attributes] node} 127 | @item{@racket['*]: any @racket[_element] (@racket[sxml:element?])} 128 | @item{@racket['*any*]: any @racket[_node]} 129 | @item{@racket['*text*]: any string} 130 | @item{@racket['*data*]: anything except a pair (@racket[_element])} 131 | @item{@racket['*COMMENT*]: a @racket[_comment] node} 132 | @item{@racket['*PI*]: a @racket[_PI] (processing instruction) node} 133 | @item{@racket['*ENTITY*]: an @racket[_entity] node} 134 | ] 135 | 136 | Otherwise, it is an ordinary tag name, and a predicate is returned 137 | that recognizes @racket[_element]s with that tag. 138 | 139 | @examples[#:eval the-eval 140 | ((ntype?? '*) "blah") 141 | ((ntype?? '*) '(p "blah")) 142 | ((ntype?? '*text*) "blah") 143 | ((ntype?? '*text*) '(p "blah")) 144 | ] 145 | } 146 | 147 | @defproc[(ntype-namespace-id?? [ns-id (or/c string? #f)]) 148 | (-> any/c boolean?)]{ 149 | 150 | Returns a predicate that recognizes @racket[_element]s with tags 151 | belonging to the namespace @racket[ns-id]. If @racket[ns-id] is 152 | @racket[#f], the predicate recognizes elements whose tags have no 153 | namespace. 154 | 155 | @examples[#:eval the-eval 156 | ((ntype-namespace-id?? "atom") '(atom:id "blah")) 157 | ((ntype-namespace-id?? "atom") '(atomic "section")) 158 | ((ntype-namespace-id?? #f) '(atomic "section")) 159 | ] 160 | } 161 | 162 | @defproc[(sxml:node? [v any/c]) boolean?]{ 163 | 164 | Returns @racket[#t] for anything except an attribute list (that is, 165 | a list whose first element is @racket['|@|]). 166 | 167 | Note that the set of values accepted by @racket[sxml:node?] is 168 | different from the non-terminal @racket[_node]. 169 | 170 | @examples[#:eval the-eval 171 | (sxml:node? '(a (|@| (href "link.html")) "blah")) 172 | (sxml:node? '(|@| (href "link.html"))) 173 | ] 174 | } 175 | 176 | @defproc[(sxml:attr-list [node _node]) 177 | (listof _attribute)]{ 178 | 179 | If @racket[node] is an @racket[_element], returns its list of 180 | attributes (or @racket['()]) if it has no attributes; for all other 181 | types of @racket[_node], returns @racket['()]. 182 | 183 | @examples[#:eval the-eval 184 | (sxml:attr-list '(a (|@| (href "link.html")) "blah")) 185 | (sxml:attr-list '(p "blah")) 186 | (sxml:attr-list "blah") 187 | ] 188 | } 189 | 190 | @;{ ============================================================ } 191 | @;{ -- From sxml-tools.rkt -- } 192 | 193 | @defproc[(sxml:attr-list-node [elem sxml:element?]) 194 | (or/c #f (cons/c '|@| (listof @#,racket[_attribute])))]{ 195 | 196 | Returns an element's attribute list node, or @racket[#f] it is has 197 | none. Compare @racket[sxml:attr-list]. 198 | 199 | @examples[#:eval the-eval 200 | (sxml:attr-list-node '(a (|@| (href "link.html")) "blah")) 201 | (sxml:attr-list-node '(p "blah")) 202 | ] 203 | } 204 | 205 | @;{ 206 | sxml:attr-as-list 207 | sxml:aux-list-node 208 | sxml:aux-as-list 209 | } 210 | 211 | @defproc[(sxml:empty-element? [elem sxml:element?]) 212 | boolean?]{ 213 | 214 | Returns @racket[#t] if @racket[elem] has no nested elements, text 215 | nodes, etc. The element may have attributes. 216 | 217 | @examples[#:eval the-eval 218 | (sxml:empty-element? '(br)) 219 | (sxml:empty-element? '(p "blah")) 220 | (sxml:empty-element? '(link (|@| (rel "self") (href "here.html")))) 221 | ] 222 | } 223 | 224 | @;{ 225 | sxml:shallow-normalized? 226 | sxml:normalized 227 | sxml:shallow-minimized? 228 | sxml:minimized? 229 | sxml:name ;; what is domain??? 230 | } 231 | 232 | @defproc[(sxml:element-name [elem sxml:element?]) 233 | symbol?]{ 234 | 235 | Returns an element's tag. 236 | } 237 | 238 | @defproc[(sxml:ncname [qualified-name symbol?]) 239 | string?]{ 240 | 241 | Returns the local part of a qualified name. 242 | } 243 | 244 | @defproc[(sxml:name->ns-id [qualified-name symbol?]) 245 | (or/c string? #f)]{ 246 | 247 | Returns the namespace part of a qualified name. 248 | } 249 | 250 | @defproc[(sxml:content [node-or-nodeset (or/c _node nodeset?)]) 251 | (listof _node)]{ 252 | Returns the contents (elements and text nodes) of an element or 253 | nodeset. 254 | } 255 | 256 | @defproc[(sxml:text [node-or-nodeset (or/c _node nodeset?)]) 257 | string?]{ 258 | 259 | Returns a string consisting of all of the character data immediately 260 | within @racket[node-or-nodeset]. 261 | 262 | @examples[#:eval the-eval 263 | (sxml:text '(p (em "red") " fish; " (em "blue") " fish")) 264 | ] 265 | } 266 | 267 | @defproc[(sxml:attr [elem sxml:element?] 268 | [attr-name symbol?]) 269 | (or/c string? #f)]{ 270 | 271 | Gets the value of the @racket[attr-name] attribute of @racket[elem]. 272 | } 273 | 274 | @;{ 275 | sxml:content-raw 276 | sxml:attr-list-u 277 | sxml:aux-list 278 | sxml:aux-list-u 279 | sxml:aux-node 280 | sxml:aux-nodes 281 | sxml:attr-from-list 282 | sxml:num-attr 283 | sxml:attr-u 284 | sxml:ns-list 285 | sxml:ns-id->nodes 286 | sxml:ns-id->uri 287 | sxml:ns-uri->nodes 288 | sxml:ns-id 289 | sxml:ns-uri 290 | sxml:ns-prefix 291 | } 292 | 293 | @defproc[(sxml:change-content [elem sxml:element?] 294 | [new-content (listof _child)]) 295 | sxml:element?]{ 296 | 297 | Replaces the content of @racket[elem] with @racket[new-content], 298 | preserving its attributes and auxiliary information. 299 | } 300 | 301 | @defproc[(sxml:change-attrlist [elem sxml:element?] 302 | [new-attrlist (listof _attribute)]) 303 | sxml:element?]{ 304 | 305 | Replaces the attributes of @racket[elem] with @racket[new-attrlist], 306 | preserving its contents and auxiliary information. 307 | } 308 | 309 | @defproc[(sxml:change-name [elem sxml:element?] 310 | [tag symbol?]) 311 | sxml:element?]{ 312 | 313 | Changes the tag name of @racket[elem], preserving its attributes, 314 | auxiliary information, and contents. 315 | } 316 | 317 | @defproc[(sxml:set-attr [elem sxml:element?] 318 | [attr (list/c symbol? any/c)]) 319 | sxml:element?]{ 320 | 321 | Returns an element like @racket[elem] but with the attribute 322 | @racket[attr], which replaces any existing attribute with 323 | @racket[attr]'s key. 324 | } 325 | 326 | @defproc[(sxml:add-attr [elem sxml:element?] 327 | [attr (list/c symbol? any/c)]) 328 | (or/c sxml:element? #f)]{ 329 | 330 | Like @racket[sxml:set-attr], but returns @racket[#f] if 331 | @racket[elem] already contains an attribute with @racket[attr]'s 332 | key. 333 | } 334 | 335 | @defproc[(sxml:change-attr [elem sxml:element?] 336 | [attr (list/c symbol? any/c)]) 337 | (or/c sxml:element? #f)]{ 338 | 339 | Like @racket[sxml:set-attr], but returns @racket[#f] unless 340 | @racket[elem] already contains an attribute with @racket[attr]'s 341 | key. 342 | } 343 | 344 | @defproc[(sxml:squeeze [elem sxml:element?]) 345 | sxml:element?]{ 346 | 347 | Eliminates empty attribute lists and auxiliary lists. 348 | } 349 | 350 | @defproc[(sxml:clean [elem sxml:element?]) 351 | sxml:element?]{ 352 | 353 | Eliminates empty attribute lists and removes all auxilary lists. 354 | } 355 | 356 | @;{ 357 | sxml:add-aux 358 | } 359 | 360 | @;{ -- } 361 | 362 | @;{ 363 | sxml:node-parent 364 | sxml:lookup 365 | sxml:attr->xml 366 | sxml:string->xml 367 | sxml:sxml->xml 368 | sxml:attr->html 369 | sxml:string->html 370 | sxml:sxml->html 371 | } 372 | -------------------------------------------------------------------------------- /sxml/scribblings/sxml.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/doc 2 | @(require scribble/manual 3 | "util.rkt" 4 | (for-label racket/base 5 | sxml)) 6 | 7 | @title{SXML: S-Expression Representation of XML} 8 | @;{@author[(author+email "John Clements" "clements@racket-lang.org")]} 9 | 10 | @defmodule[sxml]{ 11 | 12 | This library contains Oleg Kiselyov's SXML 13 | libraries in a Racket-friendly format. It is a direct descendant of 14 | Dmitry Lizorkin's PLaneT package. It's different from that package in that 15 | 16 | @itemize[#:style 'ordered 17 | @item{It contains some documentation (here it is!),} 18 | @item{it contains some tests that run in Racket,} 19 | @item{it raises racket exceptions rather than printing to stderr and raising "-1",} 20 | @item{it has only one require point (ssax & sxml are both included), and} 21 | @item{it doesn't depend on schemeunit:3, so it compiles quickly.}] 22 | 23 | This documentation is scraped together from various sources; the bulk of it 24 | (currently) is pulled from in-source comments. 25 | 26 | I'm hoping that this will become a Racket community project, with various 27 | people contributing documentation and test cases and maybe even bug fixes. 28 | 29 | To that end, this code currently lives in a github repository which should 30 | be fairly easy to find. Patches gratefully accepted. 31 | 32 | It's perhaps also worth noting that this project currently has less than 23% 33 | test case coverage. Bleh. 34 | 35 | For Heaven's sake, report lots of bugs! 36 | 37 | --John Clements, 2011-02-17 38 | } 39 | 40 | @include-section["sxml-rep.scrbl"] 41 | @include-section["sax-parsing.scrbl"] 42 | @include-section["serialization.scrbl"] 43 | @include-section["sxpath.scrbl"] 44 | @include-section["sxslt.scrbl"] 45 | 46 | @include-section["extracted-sperber.scrbl"] 47 | @include-section["all-exported.scrbl"] 48 | 49 | @(close-eval the-eval) 50 | -------------------------------------------------------------------------------- /sxml/scribblings/sxpath.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/manual 2 | @(require scribble/core 3 | "util.rkt" 4 | scribble/racket 5 | (for-syntax racket/base) 6 | (for-label sxml)) 7 | 8 | @title[#:tag "sxpath"]{Search (SXPath)} 9 | 10 | The @hyperlink["https://www.w3.org/TR/xpath/"]{W3C "XPath" standard} describes a standardized way to perform 11 | searches in XML documents. For instance, the XPath string 12 | @racket["/A/B/C"] (thank you, Wikipedia) describes a search for 13 | a @tt{C} element whose parent is a @tt{B} element whose parent 14 | is an @tt{A} element that is the root of the document. 15 | 16 | The @racket[sxpath] function performs a similar search over SXML data, 17 | using either the standard XPath strings or a list of Racket values. 18 | 19 | @interaction[#:eval (make-base-eval) 20 | (require sxml/sxpath) 21 | ((sxpath "/A/B/C") 22 | '(*TOP* (A (B (C))))) 23 | ((sxpath '(A B C)) 24 | '(*TOP* (A (B (C))))) 25 | ((sxpath "//p[contains(@class, 'blue')]/text()") 26 | '(*TOP* 27 | (body 28 | (p (|@| (class "blue")) "P1") 29 | (p (|@| (class "blue green")) "P2") 30 | (p (|@| (class "red")) "P3"))))] 31 | 32 | (This documentation desperately needs more examples.) 33 | 34 | Let's consider the following XML document: 35 | @verbatim{ 36 | " 37 | 38 | 39 | www content 40 | 41 | 42 | 43 | content in ccc 44 | 45 | 46 | "} 47 | 48 | If we use Neil Van Dyke's html parser, we might parse this into 49 | the following sxml document: 50 | 51 | @interaction[#:eval the-eval 52 | (define example-doc 53 | '(*TOP* 54 | (aaa "\n" " " 55 | (bbb "\n" " " 56 | (ccc) "\n" " " 57 | (www " www content " (xxx) (www "\n" " " (zzz) "\n" " "))) 58 | "\n" " " 59 | (xxx "\n" " " (ddd " content in ccc \n" " ") "\n" " ") 60 | "\n")))] 61 | 62 | @defproc[(sxpath [path (or/c list? string?)] 63 | [ns-bindings (listof (cons/c symbol? string?)) '()]) 64 | (-> (or/c _node nodeset?) nodeset?)]{ 65 | 66 | Given a representation of a @racket[path], produces a procedure that 67 | accepts an SXML document and returns a list of matches. Path 68 | representations are interpreted according to the following rewrite 69 | rules. 70 | 71 | @;{ 72 | docs previously said: 73 | Note that the @racket[*TOP*] node of the document is required. 74 | But it isn't! 75 | } 76 | 77 | @(let () 78 | (define-syntax-rule (rewrite-table line ...) 79 | (tabular #:style (let ([cs (style #f (list 'top))]) 80 | (style #f (list (table-columns (list cs cs cs))))) 81 | (append line ...))) 82 | (define-syntax-rule (line lhs rhs) 83 | (list (list @racketblock[lhs] @elem{⇒} @racketblock[rhs]))) 84 | (define-syntax-rule (LINE lhs rhs) 85 | (list (list @racketblock[lhs] 'cont 'cont) 86 | (list @elem{} @elem{⇒} @racketblock[rhs]))) 87 | (define-syntax-rule (BREAK) 88 | (list (list @elem{} @elem{@~} @elem{}))) 89 | (define-syntax-rule (defmetas id ...) 90 | (begin (define-syntax id 91 | (make-element-id-transformer 92 | (lambda _ 93 | #'(racketvarfont (symbol->string 'id))))) 94 | ...)) 95 | (defmetas path-component0 path-components x p path symbol reducer) 96 | (rewrite-table 97 | (line (sxpath '()) (node-join)) 98 | (LINE (sxpath (cons path-component0 path-components)) 99 | (node-join (sxpath1 path-component0) 100 | (sxpath path-components))) 101 | (BREAK) 102 | (line (sxpath1 '//) 103 | (sxml:descendant-or-self sxml:node?)) 104 | (line (sxpath1 `(equal? ,x)) 105 | (select-kids (node-equal? x))) 106 | (line (sxpath1 `(eq? ,x)) 107 | (select-kids (node-eq? x))) 108 | (line (sxpath1 `(*or* ,p ...)) 109 | (select-kids (ntype-names?? `(,p ...)))) 110 | (line (sxpath1 `(*not* ,p ...)) 111 | (select-kids 112 | (sxml:complement 113 | (ntype-names?? `(,p ...))))) 114 | (line (sxpath1 `(ns-id:* ,x)) 115 | (select-kids (ntype-namespace-id?? x))) 116 | (line (sxpath1 _symbol) 117 | (select-kids (ntype?? _symbol))) 118 | (line (sxpath1 _string) 119 | (txpath _string)) 120 | (line (sxpath1 _procedure) 121 | _procedure) 122 | ;; ryanc: symbol has to be followed by reducer, right? 123 | ;; original comments just have ellipses, unclear 124 | (LINE (sxpath1 `(,symbol ,reducer ...)) 125 | (sxpath1 `((,symbol) ,reducer ...))) 126 | (LINE (sxpath1 `(,path ,reducer ...)) 127 | (node-reduce (sxpath path) 128 | (sxpathr reducer) ...)) 129 | (BREAK) 130 | (line (sxpathr _number) 131 | (node-pos _number)) 132 | (line (sxpathr _path) 133 | (sxml:filter (sxpath _path))))) 134 | 135 | To extract the @tt{xxx}'s inside the @tt{aaa} from the example document: 136 | 137 | @interaction[#:eval the-eval 138 | ((sxpath '(aaa xxx)) example-doc)] 139 | 140 | To extract all cells from an HTML table: 141 | 142 | @interaction[#:eval the-eval 143 | (define table 144 | `(*TOP* 145 | (table 146 | (tr (td "a") (td "b")) 147 | (tr (td "c") (td "d"))))) 148 | ((sxpath '(table tr td)) table) 149 | #| should produce '((td "a") (td "b") (td "c") (td "d")) |# 150 | ] 151 | 152 | To extract all cells anywhere in a document: 153 | 154 | @interaction[#:eval the-eval 155 | (define table 156 | `(*TOP* 157 | (div 158 | (p (table 159 | (tr (td "a") (td "b")) 160 | (tr (td "c") (td "d")))) 161 | (table 162 | (tr (td "e")))))) 163 | ((sxpath '(// td)) table) 164 | #| should produce '((td "a") (td "b") (td "c") (td "d") (td "e")) |# 165 | ] 166 | 167 | One result may be nested in another one: 168 | 169 | @interaction[#:eval the-eval 170 | (define doc 171 | `(*TOP* 172 | (div 173 | (p (div "3") 174 | (div (div "4")))))) 175 | ((sxpath '(// div)) doc) 176 | #| should produce 177 | '((div (p (div "3") (div (div "4")))) (div "3") (div (div "4")) (div "4")) 178 | |# 179 | ] 180 | 181 | There's also a string-based syntax, @racket[txpath]. As shown in the grammar above, 182 | @racket[sxpath] assumes that any strings in the path are expressed using the 183 | @racket[txpath] syntax. 184 | 185 | So, for instance, the prior example could be rewritten using a string: 186 | 187 | @interaction[#:eval the-eval 188 | (define doc 189 | `(*TOP* 190 | (div 191 | (p (div "3") 192 | (div (div "4")))))) 193 | ((sxpath "//div") doc) 194 | #| should produce 195 | '((div (p (div "3") (div (div "4")))) (div "3") (div (div "4")) (div "4")) 196 | |# 197 | ] 198 | 199 | More generally, lists in the s-expression syntax correspond to string 200 | concatenation in the txpath syntax. 201 | 202 | So, to find all italics that appear at top level within a paragraph: 203 | 204 | @interaction[#:eval the-eval 205 | (define doc 206 | `(*TOP* 207 | (div 208 | (p (i "3") 209 | (froogy (i "4")))))) 210 | ((sxpath "//p/i") doc) 211 | #| should produce '((i "3")) |# 212 | ] 213 | 214 | Handling of namespaces in @racket[sxpath] is a bit surprising. In particular, 215 | it appears to me that sxpath's model is that namespaces must appear fully expanded 216 | in the matched source. For instance: 217 | 218 | @interaction[#:eval the-eval 219 | ((sxpath "//ns:p" `((ns . "http://example.com"))) 220 | '(*TOP* (html (|http://example.com:body| 221 | (|http://example.com:p| "first para") 222 | (|http://example.com:p| 223 | "second para containing" 224 | (|http://example.com:p| "third para") "inside it"))))) 225 | #| should produce 226 | '((|http://example.com:p| "first para") 227 | (|http://example.com:p| "second para containing" 228 | (|http://example.com:p| "third para") "inside it") 229 | (|http://example.com:p| "third para")) 230 | |# 231 | ] 232 | 233 | But the corresponding example where the source document contains a namespace shortcut does 234 | not match in the same way. That is: 235 | 236 | @interaction[#:eval the-eval 237 | ((sxpath "//ns:p" `((ns . "http://example.com"))) 238 | '(*TOP* (|@| (*NAMESPACES* (ns "http://example.com"))) 239 | (html (ns:body (ns:p "first para") 240 | (ns:p "second para containing" 241 | (ns:p "third para") "inside it"))))) 242 | ] 243 | 244 | It produces the empty list. Instead, you must pretend that the 245 | shortcut is actually the namespace. Thus: 246 | 247 | @interaction[#:eval the-eval 248 | ((sxpath "//ns:p" `((ns . "ns"))) 249 | '(*TOP* (|@| (*NAMESPACES* (ns "http://example.com"))) 250 | (html (ns:body (ns:p "first para") 251 | (ns:p "second para containing" 252 | (ns:p "third para") "inside it"))))) 253 | #| should produce 254 | '((ns:p "first para") 255 | (ns:p "second para containing" 256 | (ns:p "third para") "inside it") 257 | (ns:p "third para")) 258 | |# 259 | ] 260 | 261 | Ah well. 262 | 263 | @section{Filtering} 264 | 265 | It may sometimes be the case that you're looking for @racket[abc] 266 | nodes that contain @racket[def] nodes. You could try 267 | 268 | @racketblock[(sxpath '(// abc def))] 269 | 270 | ... but that would give you the inner @racket[def] nodes, not the 271 | @racket[abc] parent nodes that contain the @racket[def] nodes. 272 | If you pore over the expansion above, you will discover that you can 273 | do this, using ... well, essentially, using a pair of nested lists: 274 | 275 | @racketblock[ 276 | ((sxpath '(// (abc (def)))) 277 | '(x (x (abc (x) 278 | (def "1"))) 279 | (abc (x) "2") 280 | (abc (def) "3")))] 281 | 282 | Note that this is also the right way to go if for instance you're looking 283 | for a @racket[div] with a particular id: 284 | 285 | @codeblock|{ 286 | ((sxpath '(// (div (@ id (equal? "wanted"))))) 287 | '(body (foo (div (@ (id "a")) 288 | (div (@ (id "b")) "abc") 289 | "def") 290 | (div (@ (id "wanted")) 291 | (div (@ (id "c")) "qq") 292 | "ghi")))) 293 | }| 294 | 295 | But what if you want to check not that the string is equal to a fixed value, but 296 | rather that it contains a given value? This is common in the case of the "class" 297 | attribute, which often has a space-separated list of tokens. It turns out that 298 | SXML's combinator library can handle this just fine, but in order to use it, you'll 299 | need to unzip the sxpath to allow you to use the combinators. To see this, let's 300 | amend the earlier example to find a class containing the token @racket["wanted"]. 301 | 302 | Our first step is to "unzip" the sxpath syntax. The following query produces exactly 303 | the same result as the previous one: 304 | 305 | @codeblock|{ 306 | ((node-join (sxpath '(//)) 307 | (node-reduce 308 | (sxpath '(div)) 309 | (sxml:filter (node-join (sxpath '(@ id)) 310 | (select-kids 311 | (node-equal? "wanted")))))) 312 | '(body (foo (div (@ (id "a")) 313 | (div (@ (id "b")) "abc") 314 | "def") 315 | (div (@ (id "wanted")) 316 | (div (@ (id "c")) "qq") 317 | "ghi")))) 318 | }| 319 | 320 | At this point, we can replace @racket[node-equal?] with any predicate on strings. In the following 321 | example we generalize it to look for occurrences of the token, and then we add a bunch of 322 | other junk to the specified id, to show that it still works: 323 | 324 | @codeblock|{ 325 | ;; does the given string occur as a "word" in the text? 326 | (define (str-includes? str) 327 | (λ (text) (member str (string-split text)))) 328 | 329 | ((node-join (sxpath '(//)) 330 | (node-reduce 331 | (sxpath '(div)) 332 | (sxml:filter (node-join (sxpath '(@ id)) 333 | (select-kids 334 | (str-includes? "wanted")))))) 335 | '(body (foo (div (@ (id "a")) 336 | (div (@ (id "b")) "abc") 337 | "def") 338 | (div (@ (id "wanted bar baz")) 339 | (div (@ (id "c")) "qq") 340 | "ghi")))) 341 | }| 342 | 343 | 344 | } 345 | @defproc[(txpath [xpath-location-path string?] 346 | [ns-bindings (listof (cons/c symbol? string?)) '()]) 347 | (-> (or/c _node nodeset?) nodeset?)]{ 348 | 349 | Like @racket[sxpath], but only accepts an XPath query in string 350 | form, using the standard XPath syntax. 351 | 352 | Deprecated; use @racket[sxpath] instead. 353 | } 354 | 355 | @;{ ============================================================ } 356 | 357 | A @deftech{sxml-converter} is a function 358 | @racketblock[(-> (or/c _node nodeset?) 359 | nodeset?)] 360 | that is, it takes nodes or nodesets to nodesets. A 361 | @deftech{sxml-converter-as-predicate} is an @tech{sxml-converter} used 362 | as a predicate; a return value of @racket['()] indicates false. 363 | 364 | @defproc[(nodeset? [v any/c]) boolean?]{ 365 | Returns @racket[#t] if @racket[v] is a list of nodes (that is, a 366 | list that does not start with a symbol). 367 | 368 | @examples[#:eval the-eval 369 | (nodeset? '(p "blah")) 370 | (nodeset? '((p "blah") (br) "more")) 371 | ] 372 | } 373 | 374 | @defproc[(as-nodeset [v any/c]) 375 | nodeset?]{ 376 | 377 | If @racket[v] is a nodeset, returns @racket[v], otherwise returns 378 | @racket[(list v)]. 379 | 380 | @examples[#:eval the-eval 381 | (as-nodeset '(p "blah")) 382 | (as-nodeset '((p "blah") (br) "more")) 383 | ] 384 | } 385 | 386 | @; ---- 387 | 388 | @defproc[(node-eq? [v any/c]) 389 | (-> any/c boolean?)]{ 390 | Curried @racket[eq?]. 391 | } 392 | @defproc[(node-equal? [v any/c]) 393 | (-> any/c boolean?)]{ 394 | Curried @racket[equal?]. 395 | } 396 | 397 | @defproc[(node-pos [n (or/c exact-positive-integer? exact-negative-integer?)]) 398 | @#,tech{sxml-converter}]{ 399 | 400 | Returns a converter that selects the @racket[n]th element (counting 401 | from 1, not 0) of a nodelist and returns it as a singleton nodelist. If 402 | @racket[n] is negative, it selects from the right: @racket[-1] 403 | selects the last node, and so forth. 404 | 405 | @examples[#:eval the-eval 406 | ((node-pos 2) '((a) (b) (c) (d) (e))) 407 | ((node-pos -1) '((a) (b) (c))) 408 | ] 409 | } 410 | 411 | @; ---- 412 | 413 | @defproc[(sxml:filter [pred @#,tech{sxml-converter-as-predicate}]) 414 | @#,tech{sxml-converter}] 415 | 416 | @defproc[(sxml:complement [pred @#,tech{sxml-converter-as-predicate}]) 417 | @#,tech{sxml-converter-as-predicate}] 418 | 419 | @;{ 420 | take-until 421 | take-after 422 | map-union 423 | node-reverse 424 | node-trace 425 | } 426 | 427 | @defproc[(select-kids [pred @#,tech{sxml-converter-as-predicate}]) 428 | @#,tech{sxml-converter}]{ 429 | 430 | Returns a converter that selects an (ordered) subset of the children 431 | of the given node (or the children of the members of the given 432 | nodelist) satisfying @racket[pred]. 433 | 434 | @examples[#:eval the-eval 435 | ((select-kids (ntype?? 'p)) '(p "blah")) 436 | ((select-kids (ntype?? '*text*)) '(p "blah")) 437 | ((select-kids (ntype?? 'p)) (list '(p "blah") '(br) '(p "blahblah"))) 438 | ] 439 | } 440 | 441 | @defproc[(select-first-kid [pred @#,tech{sxml-converter-as-predicate}]) 442 | (-> (or/c _node nodeset?) (or/c _node #f))]{ 443 | 444 | Like @racket[select-kids] but returns only the first one, or 445 | @racket[#f] if none. 446 | } 447 | 448 | @defproc[(node-self [pred @#,tech{sxml-converter-as-predicate}]) 449 | @#,tech{sxml-converter}]{ 450 | 451 | Returns a function that when applied to @racket[_node], returns 452 | @racket[(list _node)] if @racket[(pred _node)] is neither @racket[#f] 453 | nor @racket['()], otherwise returns @racket['()]. 454 | 455 | @examples[#:eval the-eval 456 | ((node-self (ntype?? 'p)) '(p "blah")) 457 | ((node-self (ntype?? 'p)) '(br)) 458 | ] 459 | } 460 | 461 | @defproc[(node-join [selector @#,tech{sxml-converter}] ...) 462 | @#,tech{sxml-converter}]{ 463 | Forms a new sxml-converter that is the sequential composition of 464 | the sxml-converters in the list.} 465 | 466 | @defproc[(node-reduce [converter @#,tech{sxml-converter}] ...) 467 | @#,tech{sxml-converter}] 468 | 469 | @defproc[(node-or [converter @#,tech{sxml-converter}] ...) 470 | @#,tech{sxml-converter}] 471 | 472 | @defproc[(node-closure [converter @#,tech{sxml-converter}]) 473 | @#,tech{sxml-converter}] 474 | 475 | @deftogether[[ 476 | @defproc[(sxml:attribute [pred @#,tech{sxml-converter-as-predicate}]) 477 | @#,tech{sxml-converter}] 478 | @defproc[(sxml:child [pred @#,tech{sxml-converter-as-predicate}]) 479 | @#,tech{sxml-converter}] 480 | @defthing[sxml:child-nodes @#,tech{sxml-converter}] 481 | @defthing[sxml:child-elements @#,tech{sxml-converter}] 482 | @defproc[(sxml:descendant [pred @#,tech{sxml-converter-as-predicate}]) 483 | @#,tech{sxml-converter}] 484 | @defproc[(sxml:descendant-or-self [pred @#,tech{sxml-converter-as-predicate}]) 485 | @#,tech{sxml-converter}] 486 | ]]{ 487 | 488 | XPath axes and accessors. 489 | } 490 | 491 | The following procedures depend explicitly on the root node. 492 | 493 | @deftogether[[ 494 | @defproc[((sxml:parent [pred @#,tech{sxml-converter-as-predicate}]) 495 | [root _node]) 496 | @#,tech{sxml-converter}] 497 | @defproc[(node-parent [root _node]) 498 | @#,tech{sxml-converter}] 499 | @defproc[((sxml:ancestor [pred @#,tech{sxml-converter-as-predicate}]) 500 | [root _node]) 501 | @#,tech{sxml-converter}] 502 | @defproc[((sxml:ancestor-or-self [pred @#,tech{sxml-converter-as-predicate}]) 503 | [root _node]) 504 | @#,tech{sxml-converter}] 505 | @defproc[((sxml:following [pred @#,tech{sxml-converter-as-predicate}]) 506 | [root _node]) 507 | @#,tech{sxml-converter}] 508 | @defproc[((sxml:following-sibling [pred @#,tech{sxml-converter-as-predicate}]) 509 | [root _node]) 510 | @#,tech{sxml-converter}] 511 | ]]{ 512 | Gosh, I wish these functions were documented. 513 | } 514 | 515 | @defproc[((sxml:preceding [pred @#,tech{sxml-converter-as-predicate}]) 516 | [root _node]) 517 | @#,tech{sxml-converter}]{ 518 | given a predicate and a root node, returns a procedure that accepts a 519 | nodeset and returns all nodes that appear before the given nodes in 520 | document order, filtered using the predicate. 521 | 522 | Here's an example: 523 | 524 | @interaction[#:eval the-eval 525 | (((sxml:preceding (ntype?? 'www)) example-doc) ((sxpath `(aaa xxx)) example-doc))] 526 | 527 | } 528 | 529 | @defproc[((sxml:preceding-sibling [pred @#,tech{sxml-converter-as-predicate}]) 530 | [root _node]) 531 | @#,tech{sxml-converter}]{ 532 | given a predicate and a root node, returns a procedure that accepts a 533 | nodeset and returns all dones that are preceding siblings (in document 534 | order) of the given nodes. 535 | 536 | @interaction[#:eval the-eval 537 | (define doc '(*TOP* (div (p "foo") (p "bar") 538 | (img "baz") (p "quux")))) 539 | (((sxml:preceding-sibling (ntype?? 'p)) doc) ((sxpath '(// img)) doc))] 540 | } 541 | 542 | -------------------------------------------------------------------------------- /sxml/scribblings/sxslt.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/doc 2 | @(require scribble/manual 3 | "util.rkt" 4 | (for-label sxml)) 5 | 6 | @title[#:tag "sxslt"]{SXML Transformation (SXSLT)} 7 | 8 | This library provides a means to transform XML elements. It is 9 | the descendant of the system described by Kiselyov and Krishnamurthi 10 | in their 2003 PADL paper, 11 | @link["http://cs.brown.edu/~sk/Publications/Papers/Published/kk-sxslt/"]{"SXSLT: Manipulation Language for SXML"}. 12 | 13 | @defproc[(sxml:modify [updater _update-spec] ...) 14 | (-> _node _node)]{ 15 | 16 | Returns a procedure that applies the given @racket[updater]s to an 17 | SXML document, producing a new SXML document. Each @racket[updater] 18 | has the following form: 19 | 20 | @racketgrammar*[ 21 | #:literals (list quote) 22 | [update-spec (list xpath-location-path action action-param ...)] 23 | [action 'delete 24 | 'delete-undeep 25 | 'insert-into 26 | 'insert-following 27 | 'insert-preceding 28 | 'replace 29 | 'move-into 30 | 'move-following 31 | 'move-preceding 32 | handler-proc]] 33 | 34 | The @racket[xpath-location-path] describes the nodes to be transformed 35 | (see also @racket[sxpath]). The @racket[xpath-location-path] is 36 | interpreted with respect to some base node. If the location path is 37 | absolute, the base node is the root of the document being 38 | transformed. If the location path is relative, the base node is the 39 | node selected by the previous @racket[updater]. 40 | 41 | Note! @racket[sxml:modify] depends on the uniqueness of all subtrees; 42 | if an @racket[eq?] subtree occurs more than once, @racket['delete] 43 | may fail to delete all instances, for example. 44 | 45 | The following combinations of @racket[action]s and 46 | @racket[action-param]s are supported: 47 | 48 | @specsubform[#:literals (quote) 'delete]{ 49 | Deletes the selected nodes. 50 | } 51 | @specsubform[#:literals (quote) 'delete-undeep]{ 52 | Deletes the selected nodes, but keeps all of their contents, which 53 | thus move one level upwards in the document tree. 54 | } 55 | @specsubform[#:literals (quote) (code:line 'insert-into new-node ...)]{ 56 | Inserts the @racket[new-node]s as the last children of the selected 57 | nodes. 58 | } 59 | @specsubform[#:literals (quote) (code:line 'insert-following new-node ...)]{ 60 | Inserts the @racket[new-node]s after the selected nodes. 61 | } 62 | @specsubform[#:literals (quote) (code:line 'insert-preceding new-node ...)]{ 63 | Inserts the @racket[new-node]s before the selected nodes. 64 | } 65 | @specsubform[#:literals (quote) (code:line 'replace new-node ...)]{ 66 | Replaces the selected nodes with the @racket[new-node]s. 67 | } 68 | @specsubform[#:literals (quote) (code:line 'rename new-tag)]{ 69 | Renames the selected nodes, replacing its element tag with 70 | @racket[new-tag]. 71 | } 72 | @specsubform[#:literals (quote) (code:line 'move-into new-location-path)]{ 73 | Moves the selected nodes to a new location. The selected nodes become 74 | the last children of the nodes selected by @racket[new-location-path]. 75 | } 76 | @specsubform[#:literals (quote) (code:line 'move-following new-location-path)]{ 77 | Moves the selected nodes to a new location. The selected nodes are 78 | placed immediately after the nodes selected by 79 | @racket[new-location-path]. 80 | } 81 | @specsubform[#:literals (quote) (code:line 'move-preceding new-location-path)]{ 82 | Moves the selected nodes to a new location. The selected nodes are 83 | placed immediately before the nodes selected by 84 | @racket[new-location-path]. 85 | } 86 | @specsubform[handler-proc]{ 87 | Applies @racket[handler-proc] to three arguments: the selected node, 88 | a context (?), and the base node. The procedure must return a node 89 | or nodeset, which replaces the selected node. 90 | } 91 | 92 | @examples[#:eval the-eval 93 | (define sample-doc 94 | `(*TOP* 95 | (html (title "the title") 96 | (body (p "paragraph 1") 97 | (p "paragraph 2"))))) 98 | ((sxml:modify (list "//title" 'delete)) sample-doc) 99 | ((sxml:modify (list "//body" 'delete-undeep)) sample-doc) 100 | ((sxml:modify (list "//body" 'rename 'table) 101 | (list "p" (lambda (node ctx root) 102 | `(tr (td ,node))))) 103 | sample-doc) 104 | ] 105 | } 106 | 107 | 108 | @defproc[(pre-post-order [tree _node] 109 | [bindings (listof _binding)]) 110 | _node]{ 111 | 112 | Traverses @racket[tree], applying the transformation rules specified 113 | by @racket[bindings] to each node. 114 | 115 | @racketgrammar*[ 116 | #:literals (list* quote *preorder* *macro* *text* *default*) 117 | [binding (list* trigger-symbol '*preorder* . handler-proc) 118 | (list* trigger-symbol '*macro* . handler-proc) 119 | (list* trigger-symbol new-bindings . handler-proc) 120 | (list* trigger-symbol . handler-proc)] 121 | [trigger-symbol XMLname 122 | '*text* 123 | '*default*] 124 | ] 125 | 126 | The @racket[pre-post-order] function visits the nodes and nodelists 127 | pre-post-order (depth-first). For each node of the form 128 | @racket[(_name _node ...)] it looks up an association with the given 129 | @racket[_name] among its @racket[bindings]. If it fails, 130 | @racket[pre-post-order] tries to locate a default binding. It's an 131 | error if the latter attempt fails as well. 132 | 133 | The following types of @racket[_binding] are supported: 134 | 135 | @specsubform[#:literals (list* quote) 136 | (list* trigger-symbol '*preorder* . handler-proc)]{ 137 | 138 | The @racket[handler-proc] is applied to each node matching 139 | @racket[trigger-symbol] without first processing the node's 140 | contents. The result is not traversed by @racket[pre-post-order]. 141 | } 142 | 143 | @specsubform[#:literals (list* quote) 144 | (list* trigger-symbol '*macro* . handler-proc)]{ 145 | 146 | This is equivalent to @racket['*preorder*] described above. However, 147 | the result is re-processed again, with the current stylesheet. 148 | } 149 | 150 | @specsubform[#:literals (list* quote) 151 | (list* trigger-symbol new-bindings . handler-proc)] 152 | @specsubform[#:literals (list* quote) 153 | (list* trigger-symbol . handler-proc)]{ 154 | 155 | The @racket[handler-proc] is applied to each node matching 156 | @racket[trigger-symbol], but only after the node's contents have been 157 | recursively processed, using the additional @racket[_new-bindings], if 158 | present. 159 | 160 | To be more precise, the handler is applied to the head of the current 161 | node and its processed children. The result of the handler, which 162 | should also be a tree, replaces the current node. If the current node 163 | is a text string or other atom, a special binding with a symbol 164 | @racket[*text*] is looked up. 165 | } 166 | 167 | @examples[#:eval the-eval 168 | (define sample-doc 169 | `(*TOP* 170 | (html (title "the title") 171 | (body (p "paragraph 1") 172 | (p "paragraph 2"))))) 173 | (define italicizer 174 | `((p . ,(lambda (tag . content) 175 | (cons tag (cons "PARAGRAPH BEGINS: " content)))) 176 | (*text* . ,(lambda (tag content) 177 | `(i ,content))) 178 | (*default* . ,(lambda args args)))) 179 | (pre-post-order sample-doc italicizer) 180 | #| should produce 181 | '(*TOP* 182 | (html 183 | (title (i "the title")) 184 | (body 185 | (p "PARAGRAPH BEGINS: " (i "paragraph 1")) 186 | (p "PARAGRAPH BEGINS: " (i "paragraph 2"))))) 187 | |# 188 | ] 189 | } 190 | -------------------------------------------------------------------------------- /sxml/scribblings/util.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require scribble/manual 3 | scribble/eval 4 | (for-label racket/base 5 | racket/contract)) 6 | (provide (all-from-out scribble/manual) 7 | (all-from-out scribble/eval) 8 | (for-label 9 | (all-from-out racket/base) 10 | (all-from-out racket/contract)) 11 | the-eval) 12 | 13 | (define the-eval (make-base-eval)) 14 | (the-eval `(begin (require sxml 15 | racket/pretty) 16 | (current-print pretty-print-handler))) 17 | -------------------------------------------------------------------------------- /sxml/ssax/SXML-tree-trans.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require "myenv.ss") 3 | (provide post-order 4 | pre-post-order 5 | SRV:send-reply 6 | replace-range) 7 | 8 | 9 | ; XML/HTML processing in Scheme 10 | ; SXML expression tree transformers 11 | ; 12 | ; IMPORT 13 | ; A prelude appropriate for your Scheme system 14 | ; (myenv-bigloo.scm, myenv-mit.scm, etc.) 15 | ; 16 | ; EXPORT 17 | ; (provide SRV:send-reply 18 | ; post-order pre-post-order replace-range) 19 | ; 20 | ; See vSXML-tree-trans.scm for the validation code, which also 21 | ; serves as usage examples. 22 | ; 23 | ; $Id: SXML-tree-trans.scm,v 1.7 2004/11/09 20:22:26 sperber Exp $ 24 | 25 | 26 | ; procedure: SRV:send-reply FRAGMENT ... 27 | ; 28 | ; Output the 'fragments' 29 | ; The fragments are a list of strings, characters, 30 | ; numbers, thunks, #f, #t -- and other fragments. 31 | ; The function traverses the tree depth-first, writes out 32 | ; strings and characters, executes thunks, and ignores 33 | ; #f and '(). 34 | ; The function returns #t if anything was written at all; 35 | ; otherwise the result is #f 36 | ; If #t occurs among the fragments, it is not written out 37 | ; but causes the result of SRV:send-reply to be #t 38 | 39 | (define (SRV:send-reply . fragments) 40 | (let loop ((fragments fragments) (result #f)) 41 | (cond 42 | ((null? fragments) result) 43 | ((not (car fragments)) (loop (cdr fragments) result)) 44 | ((null? (car fragments)) (loop (cdr fragments) result)) 45 | ((eq? #t (car fragments)) (loop (cdr fragments) #t)) 46 | ((pair? (car fragments)) 47 | (loop (cdr fragments) (loop (car fragments) result))) 48 | ((procedure? (car fragments)) 49 | ((car fragments)) 50 | (loop (cdr fragments) #t)) 51 | (else 52 | (display (car fragments)) 53 | (loop (cdr fragments) #t))))) 54 | 55 | 56 | 57 | ; procedure: pre-post-order TREE BINDINGS 58 | ; 59 | ; Traversal of an SXML tree or a grove: 60 | ; a or a 61 | ; 62 | ; A and a are mutually-recursive datatypes that 63 | ; underlie the SXML tree: 64 | ; ::= (name . ) | "text string" 65 | ; An (ordered) set of nodes is just a list of the constituent nodes: 66 | ; ::= ( ...) 67 | ; Nodelists, and Nodes other than text strings are both lists. A 68 | ; however is either an empty list, or a list whose head is 69 | ; not a symbol (an atom in general). A symbol at the head of a node is 70 | ; either an XML name (in which case it's a tag of an XML element), or 71 | ; an administrative name such as '@'. 72 | ; See SXPath.scm and SSAX.scm for more information on SXML. 73 | ; 74 | ; 75 | ; Pre-Post-order traversal of a tree and creation of a new tree: 76 | ; pre-post-order:: x -> 77 | ; where 78 | ; ::= ( ...) 79 | ; ::= ( *preorder* . ) | 80 | ; ( *macro* . ) | 81 | ; ( . ) | 82 | ; ( . ) 83 | ; ::= XMLname | *text* | *default* 84 | ; :: x [] -> 85 | ; 86 | ; The pre-post-order function visits the nodes and nodelists 87 | ; pre-post-order (depth-first). For each of the form (name 88 | ; ...) it looks up an association with the given 'name' among 89 | ; its . If failed, pre-post-order tries to locate a 90 | ; *default* binding. It's an error if the latter attempt fails as 91 | ; well. Having found a binding, the pre-post-order function first 92 | ; checks to see if the binding is of the form 93 | ; ( *preorder* . ) 94 | ; If it is, the handler is 'applied' to the current node. Otherwise, 95 | ; the pre-post-order function first calls itself recursively for each 96 | ; child of the current node, with prepended to the 97 | ; in effect. The result of these calls is passed to the 98 | ; (along with the head of the current ). To be more 99 | ; precise, the handler is _applied_ to the head of the current node 100 | ; and its processed children. The result of the handler, which should 101 | ; also be a , replaces the current . If the current 102 | ; is a text string or other atom, a special binding with a symbol 103 | ; *text* is looked up. 104 | ; 105 | ; A binding can also be of a form 106 | ; ( *macro* . ) 107 | ; This is equivalent to *preorder* described above. However, the result 108 | ; is re-processed again, with the current stylesheet. 109 | ; 110 | (define (pre-post-order tree bindings) 111 | (let* ((default-binding (assq '*default* bindings)) 112 | (text-binding (or (assq '*text* bindings) default-binding)) 113 | (text-handler ; Cache default and text bindings 114 | (and text-binding 115 | (if (procedure? (cdr text-binding)) 116 | (cdr text-binding) (cddr text-binding))))) 117 | (let loop ((tree tree)) 118 | (cond 119 | ((null? tree) '()) 120 | ((not (pair? tree)) 121 | (let ((trigger '*text*)) 122 | (if text-handler (text-handler trigger tree) 123 | (myenv:error "Unknown binding for " trigger " and no default")))) 124 | ((not (symbol? (car tree))) (map loop tree)) ; tree is a nodelist 125 | (else ; tree is an SXML node 126 | (let* ((trigger (car tree)) 127 | (binding (or (assq trigger bindings) default-binding))) 128 | (cond 129 | ((not binding) 130 | (myenv:error "Unknown binding for " trigger " and no default")) 131 | ((not (pair? (cdr binding))) ; must be a procedure: handler 132 | (apply (cdr binding) trigger (map loop (cdr tree)))) 133 | ((eq? '*preorder* (cadr binding)) 134 | (apply (cddr binding) tree)) 135 | ((eq? '*macro* (cadr binding)) 136 | (loop (apply (cddr binding) tree))) 137 | (else ; (cadr binding) is a local binding 138 | (apply (cddr binding) trigger 139 | (pre-post-order (cdr tree) (append (cadr binding) bindings))) 140 | )))))))) 141 | 142 | ; procedure: post-order TREE BINDINGS 143 | ; post-order is a strict subset of pre-post-order without *preorder* 144 | ; (let alone *macro*) traversals. 145 | ; Now pre-post-order is actually faster than the old post-order. 146 | ; The function post-order is deprecated and is aliased below for 147 | ; backward compatibility. 148 | (define post-order pre-post-order) 149 | 150 | ;------------------------------------------------------------------------ 151 | ; Extended tree fold 152 | ; tree = atom | (node-name tree ...) 153 | ; 154 | ; foldts fdown fup fhere seed (Leaf str) = fhere seed str 155 | ; foldts fdown fup fhere seed (Nd kids) = 156 | ; fup seed $ foldl (foldts fdown fup fhere) (fdown seed) kids 157 | 158 | ; procedure fhere: seed -> atom -> seed 159 | ; procedure fdown: seed -> node -> seed 160 | ; procedure fup: parent-seed -> last-kid-seed -> node -> seed 161 | ; foldts returns the final seed 162 | 163 | (define (foldts fdown fup fhere seed tree) 164 | (cond 165 | ((null? tree) seed) 166 | ((not (pair? tree)) ; An atom 167 | (fhere seed tree)) 168 | (else 169 | (let loop ((kid-seed (fdown seed tree)) (kids (cdr tree))) 170 | (if (null? kids) 171 | (fup seed kid-seed tree) 172 | (loop (foldts fdown fup fhere kid-seed (car kids)) 173 | (cdr kids))))))) 174 | 175 | ; procedure: replace-range:: BEG-PRED x END-PRED x FOREST -> FOREST 176 | ; Traverse a forest depth-first and cut/replace ranges of nodes. 177 | ; 178 | ; The nodes that define a range don't have to have the same immediate 179 | ; parent, don't have to be on the same level, and the end node of a 180 | ; range doesn't even have to exist. A replace-range procedure removes 181 | ; nodes from the beginning node of the range up to (but not including) 182 | ; the end node of the range. In addition, the beginning node of the 183 | ; range can be replaced by a node or a list of nodes. The range of 184 | ; nodes is cut while depth-first traversing the forest. If all 185 | ; branches of the node are cut a node is cut as well. The procedure 186 | ; can cut several non-overlapping ranges from a forest. 187 | 188 | ; replace-range:: BEG-PRED x END-PRED x FOREST -> FOREST 189 | ; where 190 | ; type FOREST = (NODE ...) 191 | ; type NODE = Atom | (Name . FOREST) | FOREST 192 | ; 193 | ; The range of nodes is specified by two predicates, beg-pred and end-pred. 194 | ; beg-pred:: NODE -> #f | FOREST 195 | ; end-pred:: NODE -> #f | FOREST 196 | ; The beg-pred predicate decides on the beginning of the range. The node 197 | ; for which the predicate yields non-#f marks the beginning of the range 198 | ; The non-#f value of the predicate replaces the node. The value can be a 199 | ; list of nodes. The replace-range procedure then traverses the tree and skips 200 | ; all the nodes, until the end-pred yields non-#f. The value of the end-pred 201 | ; replaces the end-range node. The new end node and its brothers will be 202 | ; re-scanned. 203 | ; The predicates are evaluated pre-order. We do not descend into a node that 204 | ; is marked as the beginning of the range. 205 | 206 | (define (replace-range beg-pred end-pred forest) 207 | 208 | ; loop forest keep? new-forest 209 | ; forest is the forest to traverse 210 | ; new-forest accumulates the nodes we will keep, in the reverse 211 | ; order 212 | ; If keep? is #t, keep the curr node if atomic. If the node is not atomic, 213 | ; traverse its children and keep those that are not in the skip range. 214 | ; If keep? is #f, skip the current node if atomic. Otherwise, 215 | ; traverse its children. If all children are skipped, skip the node 216 | ; as well. 217 | 218 | (define (loop forest keep? new-forest) 219 | (if (null? forest) (values (reverse new-forest) keep?) 220 | (let ((node (car forest))) 221 | (if keep? 222 | (cond ; accumulate mode 223 | ((beg-pred node) => ; see if the node starts the skip range 224 | (lambda (repl-branches) ; if so, skip/replace the node 225 | (loop (cdr forest) #f 226 | (append (reverse repl-branches) new-forest)))) 227 | ((not (pair? node)) ; it's an atom, keep it 228 | (loop (cdr forest) keep? (cons node new-forest))) 229 | (else 230 | (let ((node? 231 | (symbol? (car node)))) ; or is it a nodelist? 232 | (call-with-values 233 | ; traverse its children 234 | (lambda () (loop (if node? (cdr node) node) #t '())) 235 | (lambda (new-kids keep?) 236 | (loop (cdr forest) keep? 237 | (cons 238 | (if node? (cons (car node) new-kids) new-kids) 239 | new-forest))))))) 240 | ; skip mode 241 | (cond 242 | ((end-pred node) => ; end the skip range 243 | (lambda (repl-branches) ; repl-branches will be re-scanned 244 | (loop (append repl-branches (cdr forest)) #t 245 | new-forest))) 246 | ((not (pair? node)) ; it's an atom, skip it 247 | (loop (cdr forest) keep? new-forest)) 248 | (else 249 | (let ((node? 250 | (symbol? (car node)))) ; or is it a nodelist? 251 | ; traverse its children 252 | (call-with-values 253 | (lambda () (loop (if node? (cdr node) node) #f '())) 254 | (lambda (new-kids keep?) 255 | (loop 256 | (cdr forest) keep? 257 | (if (or keep? (pair? new-kids)) 258 | (cons 259 | (if node? (cons (car node) new-kids) new-kids) 260 | new-forest) 261 | new-forest) ; if all kids are skipped 262 | )))))))))) ; skip the node too 263 | 264 | (call-with-values 265 | (lambda () (loop forest #t '())) 266 | (lambda (new-forest keep?) 267 | new-forest))) 268 | -------------------------------------------------------------------------------- /sxml/ssax/access-remote.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require net/url 3 | net/head 4 | racket/path 5 | srfi/13/string 6 | "errors-and-warnings.rkt") 7 | (provide open-input-resource 8 | ar:resolve-uri-according-base 9 | ar:resource-type) 10 | 11 | ;; Uniform access to local and remote resources 12 | ;; Resolution for relative URIs in accordance with RFC 2396 13 | ; 14 | ; This software is in Public Domain. 15 | ; IT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. 16 | ; 17 | ; Please send bug reports and comments to: 18 | ; lizorkin@hotbox.ru Dmitry Lizorkin 19 | 20 | ;========================================================================= 21 | ; Accessing (remote) resources 22 | 23 | ; Opens an input port for a resource 24 | ; REQ-URI - a string representing a URI of the resource 25 | ; An input port is returned if there were no errors. In case of an error, 26 | ; the function returns #f and displays an error message as a side effect. 27 | ; Doesn't raise any exceptions. 28 | ;; ryanc: Why not?! 29 | (define (open-input-resource req-uri) 30 | (with-handlers ([exn:fail? 31 | (lambda (e) 32 | (sxml:warn 'open-input-resource "~a: ~a" 33 | req-uri (exn-message e)) 34 | #f)]) 35 | (get-pure-port (string->url req-uri)))) 36 | 37 | ;========================================================================= 38 | ; Determining resource type 39 | 40 | ; Determines the type of a resource 41 | ; REQ-URI - a string representing a URI of the resource 42 | ; For a local resource, its type is determined by its file extension 43 | ; One of the following is returned: 44 | ; #f - if the requested resource doesn't exist 45 | ; 'xml - for a resource that is an XML document 46 | ; 'html - for a resource that is an HTML document 47 | ; 'unknown - for any other resource type 48 | (define (ar:resource-type req-uri) 49 | (cond [(string-prefix? "http://" req-uri) ; HTTP scheme is used in REQ-URI 50 | (with-handlers ([exn:fail? (lambda (exn) #f)]) 51 | (call/input-url (string->url req-uri) head-impure-port 52 | (lambda (port) 53 | (let* ([headers (purify-port port)] 54 | [content-type (extract-field "content-type" headers)]) 55 | (cond [(not content-type) ;; no content-type specified 56 | 'unknown] 57 | [(string-prefix? "text/xml" content-type) 58 | 'xml] 59 | [(string-prefix? "text/html" content-type) 60 | 'html] 61 | [(string-prefix? "text/plain" content-type) 62 | 'plain] 63 | [else 'unknown])))))] 64 | [(string-prefix? "file://" req-uri) 65 | (define filename (substring req-uri 7)) 66 | (cond [(not (file-exists? filename)) ; file doesn't exist 67 | #f] 68 | [(assoc (filename-extension filename) 69 | '((#"xml" . xml) (#"html" . html) (#"htm" . html))) 70 | => cdr] 71 | [else 'unknown])] 72 | [else 'unknown])) 73 | 74 | ;========================================================================= 75 | ; Working on absolute/relative URIs 76 | ; This section is based on RFC 2396 77 | 78 | ;------------------------------------------------- 79 | ; Resolves a relative URI with respect to the base URI 80 | 81 | ; base-uri - base URI for the requiested one 82 | ; Returns the resolved URI 83 | (define (ar:resolve-uri-according-base base-uri req-uri) 84 | (url->string (combine-url/relative (string->url base-uri) req-uri))) 85 | -------------------------------------------------------------------------------- /sxml/ssax/doc.txt: -------------------------------------------------------------------------------- 1 | SSAX Package 2 | ============ 3 | 4 | A SSAX functional XML parsing framework consists of a DOM/SXML parser, a SAX 5 | parser, and a supporting library of lexing and parsing procedures. The 6 | procedures in the package can be used separately to tokenize or parse various 7 | pieces of XML documents. The framework supports XML Namespaces, character, 8 | internal and external parsed entities, attribute value normalization, 9 | processing instructions and CDATA sections. The package includes a 10 | semi-validating SXML parser: a DOM-mode parser that is an instantiation of 11 | a SAX parser (called SSAX). 12 | 13 | SSAX is a full-featured, algorithmically optimal, pure-functional parser, 14 | which can act as a stream processor. SSAX is an efficient SAX parser that is 15 | easy to use. SSAX minimizes the amount of application-specific state that has 16 | to be shared among user-supplied event handlers. SSAX makes the maintenance 17 | of an application-specific element stack unnecessary, which eliminates several 18 | classes of common bugs. SSAX is written in a pure-functional subset of Scheme. 19 | Therefore, the event handlers are referentially transparent, which makes them 20 | easier for a programmer to write and to reason about. The more expressive, 21 | reliable and easier to use application interface for the event-driven XML 22 | parsing is the outcome of implementing the parsing engine as an enhanced tree 23 | fold combinator, which fully captures the control pattern of the depth-first 24 | tree traversal. 25 | 26 | ------------------------------------------------- 27 | 28 | Quick start 29 | 30 | ; procedure: ssax:xml->sxml PORT NAMESPACE-PREFIX-ASSIG 31 | ; 32 | ; This is an instance of a SSAX parser that returns an SXML 33 | ; representation of the XML document to be read from PORT. 34 | ; NAMESPACE-PREFIX-ASSIG is a list of (USER-PREFIX . URI-STRING) 35 | ; that assigns USER-PREFIXes to certain namespaces identified by 36 | ; particular URI-STRINGs. It may be an empty list. 37 | ; The procedure returns an SXML tree. The port points out to the 38 | ; first character after the root element. 39 | (define (ssax:xml->sxml port namespace-prefix-assig) ...) 40 | 41 | ; procedure: pre-post-order TREE BINDINGS 42 | ; 43 | ; Traversal of an SXML tree or a grove: 44 | ; a or a 45 | ; 46 | ; A and a are mutually-recursive datatypes that 47 | ; underlie the SXML tree: 48 | ; ::= (name . ) | "text string" 49 | ; An (ordered) set of nodes is just a list of the constituent nodes: 50 | ; ::= ( ...) 51 | ; Nodelists, and Nodes other than text strings are both lists. A 52 | ; however is either an empty list, or a list whose head is 53 | ; not a symbol (an atom in general). A symbol at the head of a node is 54 | ; either an XML name (in which case it's a tag of an XML element), or 55 | ; an administrative name such as '@'. 56 | ; See SXPath.scm and SSAX.scm for more information on SXML. 57 | ; 58 | ; 59 | ; Pre-Post-order traversal of a tree and creation of a new tree: 60 | ; pre-post-order:: x -> 61 | ; where 62 | ; ::= ( ...) 63 | ; ::= ( *preorder* . ) | 64 | ; ( *macro* . ) | 65 | ; ( . ) | 66 | ; ( . ) 67 | ; ::= XMLname | *text* | *default* 68 | ; :: x [] -> 69 | ; 70 | ; The pre-post-order function visits the nodes and nodelists 71 | ; pre-post-order (depth-first). For each of the form (name 72 | ; ...) it looks up an association with the given 'name' among 73 | ; its . If failed, pre-post-order tries to locate a 74 | ; *default* binding. It's an error if the latter attempt fails as 75 | ; well. Having found a binding, the pre-post-order function first 76 | ; checks to see if the binding is of the form 77 | ; ( *preorder* . ) 78 | ; If it is, the handler is 'applied' to the current node. Otherwise, 79 | ; the pre-post-order function first calls itself recursively for each 80 | ; child of the current node, with prepended to the 81 | ; in effect. The result of these calls is passed to the 82 | ; (along with the head of the current ). To be more 83 | ; precise, the handler is _applied_ to the head of the current node 84 | ; and its processed children. The result of the handler, which should 85 | ; also be a , replaces the current . If the current 86 | ; is a text string or other atom, a special binding with a symbol 87 | ; *text* is looked up. 88 | ; 89 | ; A binding can also be of a form 90 | ; ( *macro* . ) 91 | ; This is equivalent to *preorder* described above. However, the result 92 | ; is re-processed again, with the current stylesheet. 93 | ; 94 | (define (pre-post-order tree bindings) ...) 95 | 96 | ------------------------------------------------- 97 | 98 | Additional tools included into the package 99 | 100 | 1. "access-remote.ss" 101 | Uniform access to local and remote resources 102 | Resolution for relative URIs in accordance with RFC 2396 103 | 104 | 2. "id.ss" 105 | Creation and manipulation of the ID-index for a faster access to SXML elements 106 | by their unique ID 107 | Provides the DTD parser for extracting ID attribute declarations 108 | 109 | 3. "xlink-parser.ss" 110 | Parser for XML documents that contain XLink elements 111 | 112 | 4. "multi-parser.ss" 113 | SSAX multi parser: combines several specialized parsers into one 114 | Provides creation of parent pointers to SXML document constructed 115 | -------------------------------------------------------------------------------- /sxml/ssax/errors-and-warnings.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require racket/contract/base) 3 | (provide/contract 4 | [current-sxml-warning-handler 5 | (parameter/c 6 | (->* (symbol? string?) () #:rest list? any))] 7 | [make-warning-handler 8 | (-> (or/c output-port? (-> output-port?)) 9 | (->* (symbol? string?) () #:rest list? any))] 10 | [sxml:warn 11 | (->* (symbol? string?) () #:rest list? void?)] 12 | [sxml:warn/concat 13 | (->* (symbol?) () #:rest list? void?)]) 14 | 15 | ;; Goal: replace all uses of 'cerr' etc with 'sxml:warn' 16 | 17 | ;; warning-handler adds newline; fmt doesn't need to 18 | (define (make-warning-handler out) 19 | (lambda (who fmt . args) 20 | (let ([out (if (procedure? out) (out) out)]) 21 | (apply fprintf out (string-append "~a: " fmt "\n") who args)))) 22 | 23 | (define current-sxml-warning-handler 24 | (make-parameter (make-warning-handler current-error-port))) 25 | 26 | (define (sxml:warn who fmt . args) 27 | (apply (current-sxml-warning-handler) who fmt args) 28 | (void)) 29 | 30 | (define (sxml:warn/concat who . args) 31 | ((current-sxml-warning-handler) 32 | who 33 | (apply string-append (map (lambda (x) (format "~a" x)) args)))) 34 | -------------------------------------------------------------------------------- /sxml/ssax/id.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require "errors-and-warnings.rkt" 3 | "myenv.ss" 4 | "access-remote.ss" 5 | "sxpathlib.ss") 6 | (provide id:doctype-handler 7 | id:ending-action 8 | id:finish-element-handler 9 | id:make-seed 10 | id:new-level-seed-handler) 11 | 12 | ;; Creation and manipulation of the ID-index 13 | ;; Provides the DTD parser for extracting ID attribute declarations 14 | ; 15 | ; This software is in Public Domain. 16 | ; IT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. 17 | ; 18 | ; Please send bug reports and comments to: 19 | ; lizorkin@hotbox.ru Dmitry Lizorkin 20 | ; 21 | ; ID-index provides fast access to XML elements by their unique ID. 22 | ; ID-index has the following structure: 23 | ; id-index = ( (id . element) (id . element) ... ) 24 | ; i.e. 25 | ; id-index = (list 26 | ; (cons id element) 27 | ; (cons id element) 28 | ; ...) 29 | ; where 30 | ; id - (a string) element's unique ID 31 | ; element - an SXML presentation of an element 32 | ; 33 | ; Creation of an id-index generally consists of two steps. 34 | ; On the first step, a document declaration (internal and external DTD) 35 | ; is read and information of ID attributes is extracted. 36 | ; This is presented in a following form: 37 | ; id-attrs = ( (elem-name attr-name attr-name attr-name ...) 38 | ; (elem-name attr-name attr-name attr-name ...) ... ) 39 | ; i.e. 40 | ; id-attrs = (list 41 | ; (cons 42 | ; elem-name 43 | ; (list attr-name attr-name attr-name ...) 44 | ; (cons 45 | ; elem-name 46 | ; (list attr-name attr-name attr-name ...) 47 | ; ...) 48 | ; where 49 | ; elem-name - (a symbol) a name of the element 50 | ; attr-name - (a symbol) element's attribute having an ID type 51 | ; 52 | ; On the second step, if an SXML presentation of the document is available, 53 | ; 'id-attrs' are used for forming an 'id-index'. 54 | ; If there is no SXML presentation for a document yet, both steps are 55 | ; performed as a single function call - to a specialized SSAX parser. 56 | ; This parser constructs an SXML presentation and an 'id-index' 57 | ; in a single pass 58 | ; 59 | ; ATTENTION: 60 | ; 1. Only non-qualified 'elem-name' and 'attr-name' are correctly supported 61 | ; 2. Parameter entity reference (PEReference) is NOT supported 62 | 63 | ;========================================================================= 64 | ; Functions which read XML document declaration 65 | 66 | ;------------------------------------------------ 67 | ; Trivial functions that ignore symbols 68 | 69 | ; Function reads a whitespace (S production) 70 | (define (id:process-s port) 71 | (let ((symb (peek-char port))) 72 | (cond((eof-object? symb) symb) 73 | ((char=? symb #\space) (read-char port) 74 | (id:process-s port)) 75 | ((char=? symb #\return) (read-char port) 76 | (id:process-s port)) 77 | ((char=? symb #\newline)(read-char port) 78 | (id:process-s port)) 79 | ((char=? symb #\tab)(read-char port) 80 | (id:process-s port)) 81 | (else symb)))) 82 | 83 | 84 | ; Ignores all symbols until template-symbol 85 | (define (id:ignore-until templ-sym port) 86 | (let loop ((symb (peek-char port))) 87 | (cond((eof-object? symb) symb) 88 | ((equal? symb templ-sym) (read-char port) 89 | symb) 90 | (else (read-char port) 91 | (loop (peek-char port)))))) 92 | 93 | 94 | ;------------------------------------------------ 95 | ; These functions perform reading from a file 96 | 97 | ; Read N symbols from a port 98 | ;; If fewer than N are available before EOF, return eof 99 | (define (id:read-n num port) 100 | (id:process-s port) 101 | (let ([result (read-string num port)]) 102 | (cond [(string? result) 103 | (if (< (string-length result) num) eof result)] 104 | [else eof]))) 105 | 106 | ; This function reads a name - a sequence of characters ending with 107 | ; a whitespace or '<'. '>', '(', ')', '[', ']', '|' 108 | ;; ryanc: FIXME: use buffer from other file? (next-token??) 109 | (define (id:read-name port) 110 | (id:process-s port) 111 | (let ([out (open-output-string)]) 112 | (let loop () 113 | (let ([symb (peek-char port)]) 114 | (cond [(or (eof-object? symb) 115 | (member symb '(#\space #\tab #\return #\newline 116 | #\< #\> #\( #\) #\[ #\] #\|))) 117 | (get-output-string out)] 118 | [else 119 | (write-char (read-char port) out) 120 | (loop)]))))) 121 | 122 | ; This function reads a literal 123 | ; literal ::= ('"' [^"]* '"') | ("'" [^']* "'") 124 | ; A string is returned 125 | (define (id:process-literal port) 126 | (id:process-s port) 127 | (let ([quot (read-char port)] 128 | [out (open-output-string)]) 129 | ;; ryanc: FIXME, check is either ' or " 130 | (let loop () 131 | (let ([next (read-char port)]) 132 | (cond [(or (eof-object? next) 133 | (eqv? next quot)) 134 | (get-output-string out)] 135 | [else 136 | (write-char next out) 137 | (loop)]))))) 138 | 139 | ;------------------------------------------------ 140 | ; Miscellaneous 141 | 142 | ; Converts a string into small letters 143 | (define (id:to-small str) (string-downcase str)) 144 | 145 | ; Takes an 'id-attrs' which can contain equal element names 146 | ; Returns a new 'id-attrs' where all element names are unique 147 | (define (id:unite-id-attrs id-attrs) 148 | (let loop ((id-attrs id-attrs) 149 | (new '())) 150 | (if (null? id-attrs) 151 | new 152 | (let rpt ((elem-name (caar id-attrs)) 153 | (atts (cdar id-attrs)) 154 | (rest (cdr id-attrs)) 155 | (id-attrs '())) 156 | (cond 157 | ((null? rest) 158 | (loop id-attrs (cons (cons elem-name atts) new))) 159 | ((equal? (caar rest) elem-name) 160 | (rpt elem-name 161 | (append atts (cdar rest)) 162 | (cdr rest) 163 | id-attrs)) 164 | (else 165 | (rpt elem-name atts (cdr rest) (cons (car rest) id-attrs)))))))) 166 | 167 | 168 | ;------------------------------------------------ 169 | ; Parsing XML productions concerning document declaration 170 | ; These functions are not intendes for error detection, they assume that 171 | ; the document is correct 172 | 173 | ; This function ignores information related to a PI production [16] 174 | ; [16] PI ::= '' Char*)))? '?>' 175 | ; It looks for an ending '?>' template 176 | (define (id:ignore-PI port) 177 | (id:ignore-until #\? port) 178 | (let ((symb (peek-char port))) 179 | (cond((eof-object? symb) symb) 180 | ((equal? symb #\>) (read-char port) 181 | symb) 182 | (else (id:ignore-PI port))))) 183 | 184 | 185 | ; This function ignores information related to a Comment production [15] 186 | ; [15] Comment ::= '' 187 | ; The starting '' template 189 | (define (id:ignore-comment port) 190 | (read-char port) ; it is '-' 191 | (read-char port) ; it is '-' 192 | (id:ignore-until #\- port) 193 | (let((sym1 (peek-char port))) 194 | (cond((eof-object? sym1) sym1) 195 | ((char=? sym1 #\-) (read-char port) 196 | (let((sym2 (read-char port))) ; must be '>' 197 | sym2)) 198 | (else (id:ignore-comment port))))) 199 | 200 | 201 | ; This function processes AttType production ([54]-[59] in XML specification) 202 | ; [54] AttType ::= StringType | TokenizedType | EnumeratedType 203 | ; [55] StringType ::= 'CDATA' 204 | ; [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' 205 | ; | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' 206 | ; [57] EnumeratedType ::= NotationType | Enumeration 207 | ; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' 208 | ; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' 209 | ; The function returnd #t if the attribute has an ID type and #f otherwise 210 | (define (id:AttType-ID? port) 211 | (let((type (id:to-small (id:read-name port)))) 212 | (cond((string=? type "id") #t) 213 | ((string=? type "notation") 214 | (id:process-s port) 215 | (read-char port) ; it is #\( 216 | (id:ignore-until #\) port) 217 | #f) 218 | ((and (string=? type "") (char=? (peek-char port) #\()) ; see [59] 219 | (id:ignore-until #\) port) 220 | #f) 221 | (else #f)))) 222 | 223 | 224 | ; This function processes DefaultDecl production ([60] in XML specification) 225 | ; [60] DefaultDecl ::= '#REQUIRED' 226 | ; | '#IMPLIED' 227 | ; | (('#FIXED' S)? AttValue) 228 | ; The result is always #t 229 | (define (id:process-DefaultDecl port) 230 | (let((type (id:to-small (id:read-name port)))) 231 | (cond((string=? type "#fixed") 232 | (id:read-name port) ; reads a default value 233 | #t) 234 | (else #t)))) 235 | 236 | 237 | ; This function processes AttDef production ([53] in XML specification) 238 | ; [53] AttDef ::= S Name S AttType S DefaultDecl 239 | ; If an attribute has an ID type, (list attribule-name) is returned 240 | ; (a list of one element). Otherwise, function returns an empty list 241 | (define (id:process-AttDef port) 242 | (let((att-name (string->symbol (id:read-name port)))) 243 | (let((bool (id:AttType-ID? port))) 244 | (id:process-DefaultDecl port) 245 | (if bool (list att-name) '())))) 246 | 247 | 248 | ; The function processes AttlistDecl production ([52] in XML specification) 249 | ; [52] AttlistDecl ::= '' 250 | ; The starting 'symbol (id:read-name port)))) 254 | (let loop ((atts '())) 255 | (id:process-s port) 256 | (cond((char=? (peek-char port) #\>) ; no more attributes will be declared 257 | (read-char port) 258 | (if(null? atts) 259 | '() 260 | (list (cons element-name atts)))) 261 | (else 262 | (loop (append (id:process-AttDef port) atts))))))) 263 | 264 | 265 | ; This function processes a multiple markupdecl production [29] 266 | ; [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl 267 | ; | NotationDecl | PI | Comment 268 | ; 'id-attrs' are returned as a result 269 | (define (id:process-markupdecl* port) 270 | (let loop ((id-attrs '())) 271 | (let((beg (id:read-n 2 port))) 272 | (cond((eof-object? beg) id-attrs) ; the file is over 273 | ((string=? beg "]>") id-attrs) ; the end of the markupdecl 274 | ((string=? beg " port) 286 | (loop id-attrs))))) 287 | (else ; an error condition 288 | (sxml:warn 'id:process-markupdecl 289 | "error in markupdecl production: unexpected ~a" beg) 290 | (id:ignore-until #\> port) 291 | id-attrs))))) 292 | 293 | 294 | ; This function processes a doctypedecl production ([75] in XML specification) 295 | ; [75] ExternalID ::= 'SYSTEM' S SystemLiteral 296 | ; | 'PUBLIC' S PubidLiteral S SystemLiteral 297 | ; The function ignores a PubidLiteral 298 | ; 'id-attrs' are returned as a result 299 | (define (id:process-ExternalID port) 300 | (let((system-literal 301 | (let((name (id:to-small (id:read-name port)))) 302 | (cond 303 | ((string=? name "system") 304 | (id:process-literal port)) 305 | ((string=? name "public") 306 | (id:process-literal port) 307 | (id:process-literal port)) 308 | (else #f))))) 309 | (if(not system-literal) 310 | '() ; an incorrect situation 311 | (let((external-port (open-input-resource system-literal))) 312 | (if(not external-port) 313 | '() ; a failure 314 | (let((id-attrs (id:process-markupdecl* external-port))) 315 | (close-input-port external-port) 316 | id-attrs)))))) 317 | 318 | 319 | ; This function processes a doctypedecl production ([28] in XML specification) 320 | ; [28] doctypedecl ::= '' 322 | ; The function doesn't process a DeclSep (this is a PEReference which 323 | ; this programme doesn't support) 324 | ; The starting '' 352 | ; [27] Misc ::= Comment | PI | S 353 | ; 'id-attrs' are returned as a result 354 | (define (id:process-prolog port) 355 | (let((beg (id:read-n 2 port))) 356 | (cond((eof-object? beg) '()) ; a file is over - strange... 357 | ((string=? beg "SXML+id document id-attrs) 420 | (let((aux-subtrees 421 | (let((aux ((select-kids (ntype?? '@@)) document))) 422 | (if(null? aux) 423 | '() 424 | (let rpt ((res '()) 425 | (to-see (cdar aux))) 426 | (cond 427 | ((null? to-see) (reverse res)) 428 | ((equal? (caar to-see) 'id-index) (rpt res (cdr to-see))) 429 | (else (rpt (cons (car to-see) res) 430 | (cdr to-see))))))))) 431 | (let loop ((nodeset (list document)) 432 | (id-index '())) 433 | (if(null? nodeset) 434 | (let((kids ((select-kids 435 | (lambda (node) 436 | (not (and (pair? node) (equal? (car node) '@@))))) 437 | document))) 438 | (cons* '*TOP* 439 | (cons* '@@ 440 | (cons 'id-index id-index) 441 | aux-subtrees) 442 | kids)) 443 | (let((cur-node (car nodeset))) 444 | (cond 445 | ((not (pair? cur-node)) ; a text node 446 | (loop (cdr nodeset) id-index)) 447 | ((assoc (car cur-node) id-attrs) 448 | => 449 | (lambda (lst) 450 | (let((id-values 451 | ((select-kids (lambda (x) #t)) 452 | ((sxml:filter (lambda (x) (member (car x) (cdr lst)))) 453 | ((select-kids (lambda (x) #t)) 454 | ((select-kids (ntype?? '@)) cur-node)))))) 455 | (loop 456 | (append 457 | ((select-kids (ntype?? '*)) (car nodeset)) 458 | (cdr nodeset)) 459 | (append 460 | id-index 461 | (map 462 | (lambda (x) (cons x cur-node)) 463 | id-values)))))) 464 | (else 465 | (loop 466 | (append ((select-kids (ntype?? '*)) (car nodeset)) (cdr nodeset)) 467 | id-index)))))))) 468 | 469 | 470 | 471 | ;========================================================================= 472 | ; Some stuff for a SSAX multi parser 473 | 474 | ;------------------------------------------------ 475 | ; Id-related part of the seed 476 | ; id:seed = (list id-attrs id-index) 477 | ; id-attrs, id-index - see a head comment 478 | 479 | ; Mutator 480 | (define (id:make-seed id-attrs id-index) 481 | (list id-attrs id-index)) 482 | 483 | 484 | ; Accessors 485 | (define (id:seed-attrs id:seed) 486 | (car id:seed)) 487 | 488 | (define (id:seed-index id:seed) 489 | (cadr id:seed)) 490 | 491 | 492 | ;------------------------------------------------ 493 | ; Handler units 494 | 495 | ; This function is called by the NEW-LEVEL-SEED handler 496 | ; A new 'id:seed' is returned 497 | (define (id:new-level-seed-handler id:seed) 498 | id:seed) 499 | 500 | 501 | ; This function is called by the FINISH-ELEMENT handler 502 | ; A new 'id:seed' is returned 503 | (define (id:finish-element-handler elem-gi attributes id:seed element) 504 | (cond 505 | ((assoc elem-gi (id:seed-attrs id:seed)) 506 | => 507 | (lambda (lst) 508 | (let loop ((atts attributes) 509 | (id-index (id:seed-index id:seed))) 510 | (if 511 | (null? atts) 512 | (id:make-seed (id:seed-attrs id:seed) id-index) 513 | (let((att (car atts))) 514 | (cond 515 | ((pair? (car att)) ; namespace aware 516 | (loop (cdr atts) id-index)) 517 | ((member (car att) (cdr lst)) 518 | (loop (cdr atts) 519 | (cons (cons (cdr att) element) id-index))) 520 | (else 521 | (loop (cdr atts) id-index)))))))) 522 | (else 523 | id:seed))) 524 | 525 | 526 | ; This function is called by the DOCTYPE handler 527 | ; A new 'id:seed' is returned 528 | (define (id:doctype-handler port systemid internal-subset?) 529 | (let((id-attrs 530 | (if 531 | (not systemid) 532 | '() ; systemid not supplied 533 | (let((external-port (open-input-resource systemid))) 534 | (if 535 | (not external-port) 536 | '() ; a failure 537 | (let((id-attrs (id:process-markupdecl* external-port))) 538 | (close-input-port external-port) 539 | id-attrs)))))) 540 | (let((id-attrs 541 | (if 542 | internal-subset? 543 | (id:unite-id-attrs 544 | (append id-attrs (id:process-markupdecl* port))) 545 | (id:unite-id-attrs id-attrs)))) 546 | (id:make-seed id-attrs '())))) 547 | 548 | 549 | ; This function constructs the member of an axuiliary list 550 | (define (id:ending-action id:seed) 551 | (let((id-index (id:seed-index id:seed))) 552 | (cons 'id-index id-index))) 553 | -------------------------------------------------------------------------------- /sxml/ssax/info.rkt: -------------------------------------------------------------------------------- 1 | (module info (lib "infotab.ss" "setup") 2 | (define name "ssax") 3 | (define blurb 4 | (list "SSAX functional XML parsing framework " 5 | "to inter-convert between an angular-bracket and " 6 | "an S-expression-based notations for markup documents")) 7 | (define primary-file "ssax.ss") 8 | (define doc.txt "doc.txt") 9 | (define categories '(xml)) 10 | ) 11 | -------------------------------------------------------------------------------- /sxml/ssax/input-parse.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require racket/port 3 | "myenv.ss" 4 | "parse-error.ss") 5 | (provide (all-defined-out)) 6 | 7 | ;**************************************************************************** 8 | ; Simple Parsing of input 9 | ; 10 | ; The following simple functions surprisingly often suffice to parse 11 | ; an input stream. They either skip, or build and return tokens, 12 | ; according to inclusion or delimiting semantics. The list of 13 | ; characters to expect, include, or to break at may vary from one 14 | ; invocation of a function to another. This allows the functions to 15 | ; easily parse even context-sensitive languages. 16 | ; 17 | ; EOF is generally frowned on, and thrown up upon if encountered. 18 | ; Exceptions are mentioned specifically. The list of expected characters 19 | ; (characters to skip until, or break-characters) may include an EOF 20 | ; "character", which is to be coded as symbol *eof* 21 | ; 22 | ; The input stream to parse is specified as a PORT, which is usually 23 | ; the last (and optional) argument. It defaults to the current input 24 | ; port if omitted. 25 | ; 26 | ; IMPORT 27 | ; This package relies on a function parser-error, which must be defined 28 | ; by a user of the package. The function has the following signature: 29 | ; parser-error PORT MESSAGE SPECIALISING-MSG* 30 | ; Many procedures of this package call parser-error to report a parsing 31 | ; error. The first argument is a port, which typically points to the 32 | ; offending character or its neighborhood. Most of the Scheme systems 33 | ; let the user query a PORT for the current position. MESSAGE is the 34 | ; description of the error. Other arguments supply more details about 35 | ; the problem. 36 | ; 37 | ; $Id: input-parse.scm,v 1.1.1.1 2001/07/11 19:33:43 oleg Exp $ 38 | 39 | ;------------------------------------------------------------------------ 40 | ; Preparation and tuning section 41 | 42 | ; This package is heavily used. Therefore, we take time to tune it in, 43 | ; in particular for Gambit. 44 | 45 | ;------------------------------------------------------------------------ 46 | 47 | ; -- procedure+: peek-next-char PORT 48 | ; advances to the next character in the PORT and peeks at it. 49 | ; This function is useful when parsing LR(1)-type languages 50 | ; (one-char-read-ahead). 51 | 52 | (define (peek-next-char port) 53 | (read-char port) 54 | (peek-char port)) 55 | 56 | 57 | ;------------------------------------------------------------------------ 58 | 59 | ; -- procedure+: assert-curr-char CHAR-LIST STRING PORT 60 | ; Reads a character from the PORT and looks it up 61 | ; in the CHAR-LIST of expected characters 62 | ; If the read character was found among expected, it is returned 63 | ; Otherwise, the procedure writes a nasty message using STRING 64 | ; as a comment, and quits. 65 | 66 | (define (assert-curr-char expected-chars comment port) 67 | (let ((c (read-char port))) 68 | (if (memq c expected-chars) c 69 | (parser-error port "Wrong character " c 70 | " (0x" (if (eof-object? c) "*eof*" 71 | (number->string (char->integer c) 16)) ") " 72 | comment ". " expected-chars " expected")))) 73 | 74 | 75 | ; -- procedure+: skip-until CHAR PORT 76 | ; Reads and skips characters from the PORT until the break 77 | ; character is encountered. This break character is returned. 78 | 79 | (define (skip-until-char stop-char port) 80 | (let loop ((c (read-char port))) 81 | (cond 82 | ((eqv? c stop-char) c) 83 | ((eof-object? c) 84 | (parser-error port "Unexpected EOF while skipping until " stop-char)) 85 | (else (loop (read-char port)))))) 86 | 87 | ; -- procedure+: skip-while CHAR-LIST PORT 88 | ; Reads characters from the PORT and disregards them, 89 | ; as long as they are mentioned in the CHAR-LIST. 90 | ; The first character (which may be EOF) peeked from the stream 91 | ; that is NOT a member of the CHAR-LIST is returned. This character 92 | ; is left on the stream. 93 | 94 | (define (skip-while skip-chars port) 95 | (do ((c (peek-char port) (peek-char port))) 96 | ((not (memv c skip-chars)) c) 97 | (read-char port))) 98 | 99 | ; whitespace const 100 | 101 | ;------------------------------------------------------------------------ 102 | ; Stream tokenizers 103 | 104 | 105 | ; -- procedure+: 106 | ; next-token PREFIX-CHAR-LIST BREAK-CHAR-LIST [COMMENT-STRING] [PORT] 107 | ; skips any number of the prefix characters (members of the 108 | ; PREFIX-CHAR-LIST), if any, and reads the sequence of characters 109 | ; up to (but not including) a break character, one of the 110 | ; BREAK-CHAR-LIST. 111 | ; The string of characters thus read is returned. 112 | ; The break character is left on the input stream 113 | ; The list of break characters may include EOF, which is to be coded as 114 | ; a symbol *eof*. Otherwise, EOF is fatal, generating an error message 115 | ; including a specified COMMENT-STRING (if any) 116 | ; 117 | ; Note: since we can't tell offhand how large the token being read is 118 | ; going to be, we make a guess, pre-allocate a string, and grow it by 119 | ; quanta if necessary. The quantum is always the length of the string 120 | ; before it was extended the last time. Thus the algorithm does 121 | ; a Fibonacci-type extension, which has been proven optimal. 122 | ; Note, explicit port specification in read-char, peek-char helps. 123 | 124 | ; Procedure input-parse:init-buffer 125 | ; returns an initial buffer for next-token* procedures. 126 | ; The input-parse:init-buffer may allocate a new buffer per each invocation: 127 | ; (define (input-parse:init-buffer) (make-string 32)) 128 | ; Size 32 turns out to be fairly good, on average. 129 | ; That policy is good only when a Scheme system is multi-threaded with 130 | ; preemptive scheduling, or when a Scheme system supports shared substrings. 131 | ; In all the other cases, it's better for input-parse:init-buffer to 132 | ; return the same static buffer. next-token* functions return a copy 133 | ; (a substring) of accumulated data, so the same buffer can be reused. 134 | ; We shouldn't worry about new token being too large: next-token will use 135 | ; a larger buffer automatically. Still, the best size for the static buffer 136 | ; is to allow most of the tokens to fit in. 137 | ; Using a static buffer _dramatically_ reduces the amount of produced garbage 138 | ; (e.g., during XML parsing). 139 | ;; ryanc: Unfortunately, single static buffer not safe in Racket 140 | ;; FIXME: tune size, see if thread-cell cache is worth it 141 | (define input-parse:init-buffer 142 | (let ([buffers (make-thread-cell #f)]) 143 | (lambda () 144 | (let ([buffer (thread-cell-ref buffers)]) 145 | (or buffer 146 | (let ([buffer (make-string 512)]) 147 | (thread-cell-set! buffers buffer) 148 | buffer)))))) 149 | 150 | (define (next-token prefix-skipped-chars break-chars comment port) 151 | (let* ((buffer (input-parse:init-buffer)) 152 | (curr-buf-len (string-length buffer)) (quantum 16)) 153 | (let loop ((i 0) (c (skip-while prefix-skipped-chars port))) 154 | (cond 155 | ((memq c break-chars) (substring buffer 0 i)) 156 | ((eof-object? c) 157 | (if (memq '*eof* break-chars) 158 | (substring buffer 0 i) ; was EOF expected? 159 | (parser-error port "EOF while reading a token " comment))) 160 | (else 161 | (when (>= i curr-buf-len) ; make space for i-th char in buffer 162 | (begin ; -> grow the buffer by the quantum 163 | (set! buffer (string-append buffer (make-string quantum))) 164 | (set! quantum curr-buf-len) 165 | (set! curr-buf-len (string-length buffer)))) 166 | (string-set! buffer i c) 167 | (read-char port) ; move to the next char 168 | (loop (++ i) (peek-char port)) 169 | ))))) 170 | 171 | 172 | ; -- procedure+: next-token-of INC-CHARSET PORT 173 | ; Reads characters from the PORT that belong to the list of characters 174 | ; INC-CHARSET. The reading stops at the first character which is not 175 | ; a member of the set. This character is left on the stream. 176 | ; All the read characters are returned in a string. 177 | ; 178 | ; -- procedure+: next-token-of PRED PORT 179 | ; Reads characters from the PORT for which PRED (a procedure of one 180 | ; argument) returns non-#f. The reading stops at the first character 181 | ; for which PRED returns #f. That character is left on the stream. 182 | ; All the results of evaluating of PRED up to #f are returned in a 183 | ; string. 184 | ; 185 | ; PRED is a procedure that takes one argument (a character 186 | ; or the EOF object) and returns a character or #f. The returned 187 | ; character does not have to be the same as the input argument 188 | ; to the PRED. For example, 189 | ; (next-token-of (lambda (c) 190 | ; (cond ((eof-object? c) #f) 191 | ; ((char-alphabetic? c) (char-downcase c)) 192 | ; (else #f)))) 193 | ; will try to read an alphabetic token from the current 194 | ; input port, and return it in lower case. 195 | ; 196 | ; Note: since we can't tell offhand how large the token being read is 197 | ; going to be, we make a guess, pre-allocate a string, and grow it by 198 | ; quanta if necessary. The quantum is always the length of the string 199 | ; before it was extended the last time. Thus the algorithm does 200 | ; a Fibonacci-type extension, which has been proven optimal. 201 | ; 202 | ; This procedure is similar to next-token but only it implements 203 | ; an inclusion rather than delimiting semantics. 204 | 205 | (define (next-token-of incl-list/pred port) 206 | (let* ((buffer (input-parse:init-buffer)) 207 | (curr-buf-len (string-length buffer)) 208 | (quantum 16)) 209 | (if (procedure? incl-list/pred) 210 | (let loop ((i 0) (c (peek-char port))) 211 | (cond 212 | ((incl-list/pred c) => 213 | (lambda (c) 214 | (when (>= i curr-buf-len) ; make space for i-th char in buffer 215 | (begin ; -> grow the buffer by the quantum 216 | (set! buffer (string-append buffer (make-string quantum))) 217 | (set! quantum curr-buf-len) 218 | (set! curr-buf-len (string-length buffer)))) 219 | (string-set! buffer i c) 220 | (read-char port) ; move to the next char 221 | (loop (++ i) (peek-char port)))) 222 | (else (substring buffer 0 i)))) 223 | ; incl-list/pred is a list of allowed characters 224 | (let loop ((i 0) (c (peek-char port))) 225 | (cond 226 | ((not (memq c incl-list/pred)) (substring buffer 0 i)) 227 | (else 228 | (when (>= i curr-buf-len) ; make space for i-th char in buffer 229 | (begin ; -> grow the buffer by the quantum 230 | (set! buffer (string-append buffer (make-string quantum))) 231 | (set! quantum curr-buf-len) 232 | (set! curr-buf-len (string-length buffer)))) 233 | (string-set! buffer i c) 234 | (read-char port) ; move to the next char 235 | (loop (++ i) (peek-char port)) 236 | )))))) 237 | 238 | ;; ============================================================ 239 | 240 | ; -- Function: find-string-from-port? STR IN-PORT 241 | ; Looks for a string STR within the input port IN-PORT 242 | ; When the STR is found, the function returns the number of 243 | ; characters it has read from the port, and the port is set 244 | ; to read the first char after that (that is, after the STR) 245 | ; The function returns #f when the string wasn't found 246 | ; Note the function reads the port *STRICTLY* sequentially, and does not 247 | ; perform any buffering. So the function can be used even if the port is open 248 | ; on a pipe or other communication channel. 249 | 250 | (define (find-string-from-port? str input-port) 251 | (let ([output-counting-port (open-output-nowhere)]) 252 | ;; Could just use file-position, but that returns byte count, not char count 253 | ;; Could use output string port, but don't need contents (memory use) 254 | (port-count-lines-enabled output-counting-port) 255 | (let ([result 256 | (regexp-match? (regexp-quote str) 257 | input-port 258 | 0 #f 259 | output-counting-port)]) 260 | (and result 261 | (let-values ([(_line _col pos) (port-next-location output-counting-port)]) 262 | (+ (string-length str) 263 | (sub1 pos))))))) 264 | 265 | ;; ============================================================ 266 | 267 | ; Character-encoding 268 | ; 269 | ; This module deals with particular character-encoding issues such as 270 | ; conversions between characters and their ASCII or UCS2 codes, Scheme 271 | ; representations of "Carriage Return" (CR), "tabulation" (TAB) and 272 | ; other control characters. 273 | ; 274 | ; This module by necessity is platform-specific as character encoding 275 | ; issues are hardly addressed in R5RS. For example, the result of 276 | ; char->integer is generally not an ASCII code of an integer (although 277 | ; it is, on many Scheme systems, with the important exception of 278 | ; Scheme48 and SCSH). The level of support for character sets other 279 | ; than ASCII varies widely among Scheme systems. 280 | ; 281 | ; This file collects various character-encoding functions that are 282 | ; necessary for the SSAX XML parser. The functions are of general use 283 | ; and scope. 284 | ; 285 | ; $Id: char-encoding.scm,v 1.1 2003/04/09 20:34:28 oleg Exp $ 286 | 287 | 288 | ; ascii->char INT -> CHAR 289 | ; return a character whose ASCII code is INT 290 | ; Note, because ascii->char is injective (there are more characters than 291 | ; ASCII characters), the inverse transformation is not defined. 292 | (define ascii->char integer->char) 293 | 294 | 295 | ; ucscode->char INT -> CHAR 296 | ; Return a character whose UCS (ISO/IEC 10646) code is INT 297 | ; Note 298 | ; This function is required for processing of XML character entities: 299 | ; According to Section "4.1 Character and Entity References" 300 | ; of the XML Recommendation: 301 | ; "[Definition: A character reference refers to a specific character 302 | ; in the ISO/IEC 10646 character set, for example one not directly 303 | ; accessible from available input devices.]" 304 | 305 | (define (ucscode->char code) 306 | (integer->char code)) 307 | 308 | ; Commonly used control characters 309 | 310 | (define char-return (ascii->char 13)) 311 | (define char-tab (ascii->char 9)) 312 | (define char-newline (ascii->char 10)) ; a.k.a. #\newline, per R5RS 313 | (define char-space (ascii->char 32)) 314 | -------------------------------------------------------------------------------- /sxml/ssax/multi-parser.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require "myenv.ss" 3 | srfi/13/string 4 | "parse-error.ss" 5 | "SSAX-code.ss" 6 | "ssax-prim.ss" 7 | "id.ss" 8 | "xlink-parser.ss") 9 | (provide parent:new-level-seed-handler 10 | parent:construct-element 11 | ssax:multi-parser) 12 | 13 | ;; SSAX multi parser 14 | ;; Provides ID-index creation, SXML parent pointers and XLink grammar parsing 15 | ; 16 | ; This software is in Public Domain. 17 | ; IT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. 18 | ; 19 | ; Please send bug reports and comments to: 20 | ; lisovsky@acm.org Kirill Lisovsky 21 | ; lizorkin@hotbox.ru Dmitry Lizorkin 22 | ; 23 | ; Primary features: 24 | ; '() 25 | ; '(parent) 26 | ; '(id) 27 | ; '(parent id) 28 | ; '(id xlink) 29 | ; '(parent id xlink) 30 | 31 | ;========================================================================= 32 | ; Parent seed 33 | 34 | ;------------------------------------------------ 35 | ; Parent-related part of the seed 36 | ; It is a list of one element: 37 | ; a function of no arguments which returns a pointer to element's parent 38 | ; or '*TOP-PTR* symbol for a root SXML element 39 | ; Duuring an element construction it may be just a pointer to parents head, 40 | ; because a parent itself may be under construction at the moment. 41 | 42 | ; This function is called by the NEW-LEVEL-SEED handler 43 | ; elem-name = (if(symbol? elem-gi) elem-gi (RES-NAME->SXML elem-gi) 44 | ; A new 'parent:seed' is returned 45 | (define (parent:new-level-seed-handler elem-name) 46 | (let 47 | ((head (list elem-name))) 48 | (list (lambda () head)))) 49 | 50 | ; A function which constructs an element from its attributes, children 51 | ; and delayed parent information 52 | ; parent:seed - contains a delayed pointer to element's parent 53 | ; attrs - element's attributes 54 | ; children - a list of child elements 55 | (define (parent:construct-element parent:parent-seed parent:seed 56 | attrs children) 57 | (let ((head ((car parent:seed)))) 58 | (append head 59 | (list (cons '@ attrs)) 60 | children))) 61 | 62 | ;========================================================================= 63 | ; A seed 64 | ; seed = (list original-seed parent:seed id:seed xlink:seed) 65 | ; original-seed - the seed of the original 'SSAX:XML->SXML' function. It 66 | ; contains an SXML tree being constructed. 67 | ; parent:seed - parent-related part 68 | ; id:seed - id-related part 69 | ; xlink:seed - xlink-related part 70 | 71 | ;------------------------------------------------------------------------------ 72 | ; Accessors 73 | 74 | ; (mul:seed-original seed) 75 | (define get-sxml-seed car) 76 | 77 | ; Renamed: 78 | ; mul:seed-parent get-pptr-seed 79 | ; mul:seed-id get-id-seed 80 | ; mul:seed-xlink get-xlink-seed 81 | ; Handler for attempts to access an absent seed. 82 | (define (bad-accessor type) 83 | (lambda args 84 | (error 'ssax:multi-parser "bad accessor; called with ~a" 85 | (string-join (map (lambda (x) (format "~e" x)) args) " ")))) 86 | 87 | ; Seed constructor. #f seeds will be omitted. 88 | (define (make-seed . seeds) 89 | (let rpt 90 | ((s (cdr seeds)) (rzt (list (car seeds)))) 91 | (cond 92 | ((null? s) (reverse rzt)) 93 | ((car s) (rpt (cdr s) 94 | (cons (car s) rzt))) 95 | (else (rpt (cdr s) rzt))))) 96 | 97 | ;========================================================================= 98 | ; This is a multi parser constructor function 99 | 100 | ; parent, id, xlink - boolean parameters. #t means that we construct the 101 | ; corresponding feature, #f - otherwise 102 | ; ns - for future development. Is not used anywhere in the function 103 | (define (ssax:multi-parser . req-features) 104 | (let ((ns-assig '()) 105 | (with-parent? (memq 'parent req-features)) 106 | (with-id? (memq 'id req-features)) 107 | (with-xlink? (memq 'xlink req-features))) 108 | (call-with-values 109 | (lambda () (values 110 | (if with-parent? 111 | cadr (bad-accessor 'par)) 112 | (if with-id? 113 | (if with-parent? caddr cadr) 114 | (bad-accessor 'id)) 115 | (if with-xlink? 116 | (cond 117 | ((and with-parent? with-id?) 118 | cadddr) 119 | ((or with-parent? with-id?) 120 | caddr) 121 | (else cadr)) 122 | (bad-accessor 'xlink)))) 123 | (lambda (get-pptr-seed get-id-seed get-xlink-seed) 124 | (let ((initial-seed ; Initial values for specialized seeds 125 | (make-seed 126 | '() 127 | (and with-parent? (list '*TOP-PTR*)) 128 | (and with-id? (id:make-seed '() '())) 129 | (and with-xlink? 130 | (xlink:make-small-seed 'general '() '(1) '()))))) 131 | (letrec 132 | ( 133 | ; Making a special function, which, if applyed to the final seed, 134 | ; will construct a document 135 | (ending-actions 136 | (cond 137 | ((not (or with-id? with-xlink?)) 138 | (lambda (seed) 139 | (let ((result (reverse (get-sxml-seed seed)))) 140 | (cons '*TOP* result)))) 141 | ((and with-id? (not with-xlink?)) ; with-id? 142 | (lambda (seed) 143 | (let((result (reverse (get-sxml-seed seed))) 144 | (aux (list (id:ending-action (get-id-seed seed))))) 145 | (cons* '*TOP* 146 | (cons '@@ aux) 147 | result)))) 148 | ((and with-id? with-xlink?) ; with-id, with-xlink 149 | (lambda (seed) 150 | (let((result (reverse (get-sxml-seed seed))) 151 | (aux (list (xlink:ending-action (get-xlink-seed seed)) 152 | (id:ending-action (get-id-seed seed))))) 153 | (cons* '*TOP* 154 | (cons '@@ aux) 155 | result)))) 156 | (else 157 | (error 'ssax:multi-parser 158 | "ending-actions NIY: ~a ~a ~a" 159 | with-parent? with-id? with-xlink?)))) 160 | 161 | 162 | ;------------------------------------ 163 | ; Some handlers 164 | 165 | ; A special function 166 | ; When given an input port, it becomes a handler for a NEW-LEVEL-SEED 167 | (new-level-seed-handler 168 | (cond 169 | ((not (or with-parent? with-id? with-xlink?)) 170 | (lambda(port) 171 | (lambda (elem-gi attributes namespaces expected-content seed) 172 | (list '())))) 173 | ((and with-parent? (not (or with-id? with-xlink?))) ; with-parent 174 | (lambda(port) 175 | (lambda (elem-gi attributes namespaces expected-content seed) 176 | (make-seed 177 | '() 178 | (and with-parent? 179 | (parent:new-level-seed-handler 180 | (if (symbol? elem-gi) 181 | elem-gi 182 | (RES-NAME->SXML elem-gi)))) 183 | )))) 184 | ((and with-id? (not (or with-parent? with-xlink?))) ; with-id 185 | (lambda(port) 186 | (lambda (elem-gi attributes namespaces expected-content seed) 187 | (list ; make-seed 188 | '() 189 | (id:new-level-seed-handler (get-id-seed seed)))))) 190 | ((and with-parent? with-id? (not with-xlink?)) ; parent, id 191 | (lambda(port) 192 | (lambda (elem-gi attributes namespaces expected-content seed) 193 | (list ; make-seed 194 | '() 195 | (parent:new-level-seed-handler 196 | (if(symbol? elem-gi) elem-gi (RES-NAME->SXML elem-gi))) 197 | (id:new-level-seed-handler (get-id-seed seed)))))) 198 | ((and with-id? with-xlink? (not with-parent?)) ; id, xlink 199 | (lambda(port) 200 | (lambda (elem-gi attributes namespaces expected-content seed) 201 | (list ; make-seed 202 | '() 203 | (id:new-level-seed-handler (get-id-seed seed)) 204 | (xlink:new-level-seed-handler 205 | port attributes namespaces (get-xlink-seed seed)))))) 206 | ((and with-parent? with-id? with-xlink?) ; parent, id, xlink 207 | (lambda(port) 208 | (lambda (elem-gi attributes namespaces expected-content seed) 209 | (list ; make-seed 210 | '() 211 | (parent:new-level-seed-handler 212 | (if(symbol? elem-gi) elem-gi (RES-NAME->SXML elem-gi))) 213 | (id:new-level-seed-handler (get-id-seed seed)) 214 | (xlink:new-level-seed-handler 215 | port attributes namespaces (get-xlink-seed seed)))))) 216 | (else 217 | (error 'ssax:multi-parser 218 | "new-level NIY: ~s ~s ~s" 219 | with-parent? with-id? with-xlink?)))) 220 | 221 | 222 | ; A special handler function for a FINISH-ELEMENT 223 | (finish-element-handler 224 | (cond 225 | ((not (or with-parent? with-id? with-xlink?)) 226 | (lambda (elem-gi attributes namespaces parent-seed seed) 227 | (let ((children (reverse-collect-str-drop-ws (get-sxml-seed seed))) 228 | (attrs 229 | (attlist-fold 230 | (lambda (attr accum) 231 | (cons (list 232 | (if (symbol? (car attr)) (car attr) 233 | (RES-NAME->SXML (car attr))) 234 | (cdr attr)) accum)) 235 | '() attributes))) 236 | (list ; make-seed 237 | (cons 238 | (cons 239 | (if (symbol? elem-gi) elem-gi 240 | (RES-NAME->SXML elem-gi)) 241 | (if (null? attrs) children 242 | (cons (cons '@ attrs) children))) 243 | (get-sxml-seed parent-seed)))))) 244 | ((and with-parent? (not (or with-id? with-xlink?))) ; parent 245 | (lambda (elem-gi attributes namespaces parent-seed seed) 246 | (let((children (reverse-collect-str-drop-ws (get-sxml-seed seed))) 247 | (attrs 248 | (attlist-fold 249 | (lambda (attr accum) 250 | (cons (list 251 | (if (symbol? (car attr)) (car attr) 252 | (RES-NAME->SXML (car attr))) 253 | (cdr attr)) accum)) 254 | '() attributes))) 255 | (list ; make-seed 256 | (cons 257 | (parent:construct-element 258 | (get-pptr-seed parent-seed) 259 | (get-pptr-seed seed) 260 | attrs children) 261 | (get-sxml-seed parent-seed)) 262 | ; pptr- seed from parent seed is not modified: 263 | (get-pptr-seed parent-seed) 264 | )))) 265 | ((and with-id? (not (or with-parent? with-xlink?))) ; id 266 | (lambda (elem-gi attributes namespaces parent-seed seed) 267 | (let((children (reverse-collect-str-drop-ws (get-sxml-seed seed))) 268 | (attrs 269 | (attlist-fold 270 | (lambda (attr accum) 271 | (cons (list 272 | (if (symbol? (car attr)) (car attr) 273 | (RES-NAME->SXML (car attr))) 274 | (cdr attr)) accum)) 275 | '() attributes))) 276 | (let((element 277 | (cons 278 | (if(symbol? elem-gi) 279 | elem-gi 280 | (RES-NAME->SXML elem-gi)) 281 | (if(null? attrs) 282 | children 283 | (cons (cons '@ attrs) children))))) 284 | (list ; make-seed 285 | (cons element (get-sxml-seed parent-seed)) 286 | (id:finish-element-handler 287 | elem-gi attributes (get-id-seed seed) element)))))) 288 | ((and with-parent? with-id? (not with-xlink?)) ; parent, id 289 | (lambda (elem-gi attributes namespaces parent-seed seed) 290 | (let((children (reverse-collect-str-drop-ws (get-sxml-seed seed))) 291 | (attrs 292 | (attlist-fold 293 | (lambda (attr accum) 294 | (cons (list 295 | (if (symbol? (car attr)) (car attr) 296 | (RES-NAME->SXML (car attr))) 297 | (cdr attr)) accum)) 298 | '() attributes))) 299 | (let((element 300 | (parent:construct-element 301 | (get-pptr-seed parent-seed) (get-pptr-seed seed) 302 | attrs children))) 303 | (list ; make-seed 304 | (cons element (get-sxml-seed parent-seed)) 305 | ; pptr- seed from parent seed is not modified: 306 | (get-pptr-seed parent-seed) 307 | (id:finish-element-handler 308 | elem-gi attributes (get-id-seed seed) element)))))) 309 | ((and with-id? with-xlink? (not with-parent?)) ; id, xlink 310 | (lambda (elem-gi attributes namespaces parent-seed seed) 311 | (let((children (reverse-collect-str-drop-ws (get-sxml-seed seed))) 312 | (attrs 313 | (attlist-fold 314 | (lambda (attr accum) 315 | (cons (list 316 | (if (symbol? (car attr)) (car attr) 317 | (RES-NAME->SXML (car attr))) 318 | (cdr attr)) accum)) 319 | '() attributes))) 320 | (let((element 321 | (cons 322 | (if(symbol? elem-gi) 323 | elem-gi 324 | (RES-NAME->SXML elem-gi)) 325 | (if(null? attrs) 326 | children 327 | (cons (cons '@ attrs) children))))) 328 | (list ; make-seed 329 | (cons element (get-sxml-seed parent-seed)) 330 | (id:finish-element-handler 331 | elem-gi attributes (get-id-seed seed) element) 332 | (xlink:finish-element-handler 333 | (get-xlink-seed parent-seed) 334 | (get-xlink-seed seed) element)))))) 335 | ((and with-parent? with-id? with-xlink?) ; parent, id, xlink 336 | (lambda (elem-gi attributes namespaces parent-seed seed) 337 | (let((children (reverse-collect-str-drop-ws (get-sxml-seed seed))) 338 | (attrs 339 | (attlist-fold 340 | (lambda (attr accum) 341 | (cons (list 342 | (if (symbol? (car attr)) (car attr) 343 | (RES-NAME->SXML (car attr))) 344 | (cdr attr)) accum)) 345 | '() attributes))) 346 | (let((element 347 | (parent:construct-element 348 | (get-pptr-seed parent-seed) (get-pptr-seed seed) 349 | attrs children))) 350 | (list ; make-seed 351 | (cons element (get-sxml-seed parent-seed)) 352 | ; pptr- seed from parent seed is not modified: 353 | (get-pptr-seed parent-seed) 354 | (id:finish-element-handler 355 | elem-gi attributes (get-id-seed seed) element) 356 | (xlink:finish-element-handler 357 | (get-xlink-seed parent-seed) 358 | (get-xlink-seed seed) element)))))) 359 | (else 360 | (error 'ssax:multi-parser 361 | "finish-element: NIY")))) 362 | 363 | 364 | ; A special function 365 | ; Given 'namespaces', it becomes a handler for a DOCTYPE 366 | (doctype-handler 367 | (if 368 | (not with-id?) 369 | (lambda (namespaces) 370 | (lambda (port docname systemid internal-subset? seed) 371 | (when internal-subset? 372 | (ssax:warn port 373 | "Internal DTD subset is not currently handled ") 374 | (ssax:skip-internal-dtd port)) 375 | (ssax:warn port "DOCTYPE DECL " docname " " 376 | systemid " found and skipped") 377 | (values #f '() namespaces seed))) 378 | (cond 379 | ((not (or with-parent? with-xlink?)) ; with-id 380 | (lambda (namespaces) 381 | (lambda (port docname systemid internal-subset? seed) 382 | (values 383 | #f '() namespaces 384 | (list ; make-seed 385 | (get-sxml-seed seed) 386 | (id:doctype-handler port systemid internal-subset?)))))) 387 | ((and with-parent? (not with-xlink?)) ; with-parent, with-id 388 | (lambda (namespaces) 389 | (lambda (port docname systemid internal-subset? seed) 390 | (values 391 | #f '() namespaces 392 | (list ; make-seed 393 | (get-sxml-seed seed) 394 | (get-pptr-seed seed) 395 | (id:doctype-handler port systemid internal-subset?)))))) 396 | ((and (not with-parent?) with-xlink?) ; with-id, with-xlink 397 | (lambda (namespaces) 398 | (lambda (port docname systemid internal-subset? seed) 399 | (values 400 | #f '() namespaces 401 | (list ; make-seed 402 | (get-sxml-seed seed) 403 | (id:doctype-handler port systemid internal-subset?) 404 | (get-xlink-seed seed)))))) 405 | (else ; with-parent, with-id, with-xlink 406 | (lambda (namespaces) 407 | (lambda (port docname systemid internal-subset? seed) 408 | (values 409 | #f '() namespaces 410 | (list ; make-seed 411 | (get-sxml-seed seed) 412 | (get-pptr-seed seed) 413 | (id:doctype-handler port systemid internal-subset?) 414 | (get-xlink-seed seed))))))))) 415 | 416 | ) ; end of letrec 417 | 418 | ; Constructing a special parser function 419 | (lambda (port) 420 | (let 421 | ((namespaces 422 | (map (lambda (el) 423 | (cons* #f (car el) (ssax:uri-string->symbol (cdr el)))) 424 | ns-assig))) 425 | (ending-actions 426 | ((ssax:make-parser 427 | 428 | NEW-LEVEL-SEED 429 | (new-level-seed-handler port) 430 | 431 | FINISH-ELEMENT 432 | finish-element-handler 433 | 434 | CHAR-DATA-HANDLER 435 | (lambda (string1 string2 seed) 436 | (cons 437 | (if(string-null? string2) 438 | (cons string1 (car seed)) 439 | (cons* string2 string1 (car seed))) 440 | (cdr seed))) 441 | 442 | DOCTYPE 443 | (doctype-handler namespaces) 444 | 445 | UNDECL-ROOT 446 | (lambda (elem-gi seed) 447 | (values #f '() namespaces seed)) 448 | 449 | PI 450 | ((*DEFAULT* . (lambda (port pi-tag seed) 451 | (cons 452 | (cons 453 | (list '*PI* pi-tag 454 | (ssax:read-pi-body-as-string port)) 455 | (car seed)) 456 | (cdr seed))))) 457 | ) 458 | port 459 | initial-seed)))))) 460 | )))) 461 | -------------------------------------------------------------------------------- /sxml/ssax/myenv.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base syntax/stx)) 3 | (provide myenv:error 4 | assert 5 | cout 6 | cerr 7 | nl 8 | ++ 9 | ++! 10 | -- 11 | --! 12 | whennot 13 | push! 14 | cond-expand 15 | inc 16 | dec 17 | cons*) 18 | 19 | ;; $Id: myenv.ss,v 1.14 2002/03/28 22:23:06 nwv Exp $ 20 | ;; $Source: /home/nwv/cvsroot/projects/ssax-plt/myenv.ss,v $ 21 | ;; [ssax-plt] This is a modified version of "official/lib/myenv.scm". 22 | ;(module myenv mzscheme 23 | ; (require (lib "defmacro.ss")) 24 | ; (require (rename (lib "pretty.ss") pp pretty-print)) 25 | 26 | ; My Standard Scheme "Prelude" 27 | ; 28 | ; This version of the prelude contains several forms and procedures 29 | ; that are specific to a Gambit-C 3.0 system. 30 | ; See myenv-scm.scm, myenv-bigloo.scm, etc. for versions 31 | ; of this prelude that are tuned to other Scheme systems. 32 | ; 33 | ; Id: myenv.scm,v 1.2 2001/09/21 19:53:30 oleg Exp 34 | 35 | (define myenv:error error) 36 | 37 | ; assert the truth of an expression (or of a sequence of expressions) 38 | ; 39 | ; syntax: assert ?expr ?expr ... 40 | ; 41 | ; If (and ?expr ?expr ...) evaluates to anything but #f, the result 42 | ; is the value of that expression. 43 | ; If (and ?expr ?expr ...) evaluates to #f, an error is reported. 44 | ; The error message will show the failed expressions, as well 45 | ; as the values of selected variables (or expressions, in general). 46 | 47 | ;; ryanc: The original assert macro here was broken. It sometimes 48 | ;; calls cerr on procedures expecting to print them out, but (cerr 49 | ;; proc) applies proc to stderr port! Also, report: was only used by 50 | ;; assure macro, which was unused, so both assure and report: deleted. 51 | 52 | (define-syntax-rule (assert e ...) 53 | (let ([v (and e ...)]) 54 | (unless v 55 | (fprintf (current-error-port) 56 | "assertion failure: ~s\n" ' 57 | '(assert e ...))) 58 | v)) 59 | 60 | ; like cout << arguments << args 61 | ; where argument can be any Scheme object. If it's a procedure 62 | ; (without args) it's executed rather than printed (like newline) 63 | 64 | (define (cout . args) 65 | (for-each (lambda (x) 66 | (if (procedure? x) (x) (display x))) 67 | args)) 68 | 69 | ;; [ssax-plt] In `cerr', `##stderr' replaced with `(current-error-port)'. 70 | 71 | (define (cerr . args) 72 | (for-each (lambda (x) 73 | (if (procedure? x) 74 | (x (current-error-port)) 75 | (display x (current-error-port)))) 76 | args)) 77 | 78 | ;(##define-macro (nl) '(newline)) 79 | (define nl (string #\newline)) 80 | 81 | ;; [ssax-plt] `##fixnum.' prefix removed. 82 | 83 | ; Some useful increment/decrement operators 84 | ; Note, ##fixnum prefix is Gambit-specific, it means that the 85 | ; operands assumed FIXNUM (as they ought to be anyway). 86 | ; This perfix could be safely removed: it'll leave the code just as 87 | ; correct, but more portable (and less efficient) 88 | 89 | (define-syntax-rule (++! x) (set! x (add1 x))) 90 | (define-syntax-rule (++ x) (add1 x)) 91 | (define-syntax-rule (--! x) (set! x (sub1 x))) 92 | (define-syntax-rule (-- x) (sub1 x)) 93 | 94 | ; Some useful control operators 95 | 96 | ; if condition is false execute stmts in turn 97 | ; and return the result of the last statement 98 | ; otherwise, return #t 99 | ; This primitive is often called 'unless' 100 | (define-syntax-rule (whennot condition . stmts) 101 | (or condition (begin . stmts))) 102 | 103 | ; Prepend an ITEM to a LIST, like a Lisp macro PUSH 104 | ; an ITEM can be an expression, but ls must be a VAR 105 | (define-syntax-rule (push! item ls) 106 | (set! ls (cons item ls))) 107 | 108 | ; Implementation of SRFI-0 109 | ;; ryanc: rewrote, not very robust 110 | (define-syntax (cond-expand stx) 111 | (define (feature-req-satisfied? freq) 112 | (cond [(memq freq '(plt srfi-0 else)) #t] 113 | [(not (pair? freq)) #f] 114 | [(eq? 'and (car freq)) 115 | (andmap feature-req-satisfied? (cdr freq))] 116 | [(eq? 'or (car freq)) 117 | (ormap feature-req-satisfied? (cdr freq))] 118 | [(eq? 'not (car freq)) 119 | (not (feature-req-satisfied? (cadr freq)))] 120 | [else #f])) 121 | (syntax-case stx () 122 | [(ce [freq . body] ...) 123 | (or (for/or ([clause (in-list (syntax->list #'([freq . body] ...)))] 124 | #:when (feature-req-satisfied? (syntax->datum (stx-car clause)))) 125 | #`(begin . #,(stx-cdr clause))) 126 | (raise-syntax-error #f "unsatisfied" stx))])) 127 | 128 | ;============================================================================== 129 | ; DL: this piece of code is taken from the previous version of "myenv.scm" 130 | ; Stubs 131 | 132 | (define-syntax-rule (inc x) (add1 x)) 133 | (define-syntax-rule (dec x) (sub1 x)) 134 | 135 | (define cons* list*) 136 | -------------------------------------------------------------------------------- /sxml/ssax/parse-error.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require racket/contract/base 3 | syntax/readerr 4 | "errors-and-warnings.rkt") 5 | (provide/contract 6 | [parser-error 7 | (->* (port?) () #:rest list? any)] 8 | [ssax:warn 9 | (->* (port?) () #:rest list? any)]) 10 | 11 | ; This code provides informative error messages 12 | ; for SSAX (S)XML parser. 13 | 14 | ;============================================================================== 15 | ; Error handler 16 | 17 | ; According to the SSAX convention this function 18 | ; accepts the port as its first argument which is used for 19 | ; location of the error in input file. 20 | ; Other parameters are considered as error messages, 21 | ; they are printed to stderr as is. 22 | 23 | ;; NB : updated to signal a racket error rather than printing to stdout. 24 | (define (parser-error p . args) 25 | (let-values ([(line col pos) (port-next-location p)]) 26 | (raise-read-error (format "SXML parser error: ~a" (args->display-string args)) 27 | (object-name p) 28 | line col pos #f))) 29 | 30 | ;; map args to their display representations, glue them together: 31 | (define (args->display-string args) 32 | (apply string-append (map (lambda (x) (format "~a" x)) args))) 33 | 34 | (define (ssax:warn p . args) 35 | (sxml:warn 'ssax:warn 36 | "warning at position ~a: ~a" 37 | (file-position p) 38 | (args->display-string args))) 39 | -------------------------------------------------------------------------------- /sxml/ssax/ssax-prim.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require "SSAX-code.ss") 3 | (provide RES-NAME->SXML 4 | reverse-collect-str 5 | reverse-collect-str-drop-ws) 6 | 7 | ;========================================================================= 8 | ; This is a multi parser constructor function 9 | 10 | ;------------------------------------------------ 11 | ; Some Oleg Kiselyov's features from SSAX:XML->SXML 12 | 13 | ; Returns 14 | (define (RES-NAME->SXML res-name) 15 | (string->symbol 16 | (string-append 17 | (symbol->string (car res-name)) 18 | ":" 19 | (symbol->string (cdr res-name))))) 20 | 21 | 22 | ; given the list of fragments (some of which are text strings) 23 | ; reverse the list and concatenate adjacent text strings 24 | (define (reverse-collect-str fragments) 25 | (if (null? fragments) '() ; a shortcut 26 | (let loop ((fragments fragments) (result '()) (strs '())) 27 | (cond 28 | ((null? fragments) 29 | (if (null? strs) result 30 | (cons (apply string-append strs) result))) 31 | ((string? (car fragments)) 32 | (loop (cdr fragments) result (cons (car fragments) strs))) 33 | (else 34 | (loop (cdr fragments) 35 | (cons 36 | (car fragments) 37 | (if (null? strs) result 38 | (cons (apply string-append strs) result))) 39 | '())))))) 40 | 41 | 42 | ; given the list of fragments (some of which are text strings) 43 | ; reverse the list and concatenate adjacent text strings 44 | ; We also drop "unsignificant" whitespace, that is, whitespace 45 | ; in front, behind and between elements. The whitespace that 46 | ; is included in character data is not affected. 47 | (define (reverse-collect-str-drop-ws fragments) 48 | (cond 49 | ((null? fragments) '()) ; a shortcut 50 | ((and (string? (car fragments)) ; another shortcut 51 | (null? (cdr fragments)) ; remove trailing ws 52 | (string-whitespace? (car fragments))) '()) 53 | (else 54 | (let loop ((fragments fragments) (result '()) (strs '()) 55 | (all-whitespace? #t)) 56 | (cond 57 | ((null? fragments) 58 | (if all-whitespace? result ; remove leading ws 59 | (cons (apply string-append strs) result))) 60 | ((string? (car fragments)) 61 | (loop (cdr fragments) result (cons (car fragments) strs) 62 | (and all-whitespace? 63 | (string-whitespace? (car fragments))))) 64 | (else 65 | (loop (cdr fragments) 66 | (cons 67 | (car fragments) 68 | (if all-whitespace? result 69 | (cons (apply string-append strs) result))) 70 | '() #t))))))) 71 | -------------------------------------------------------------------------------- /sxml/ssax/ssax.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | 3 | (require "myenv.rkt") 4 | (require "util.rkt") 5 | (require "parse-error.rkt") 6 | (require "input-parse.rkt") 7 | (require "SSAX-code.rkt") 8 | (require "SXML-tree-trans.rkt") 9 | (require "sxpathlib.rkt") 10 | (require "access-remote.rkt") 11 | (require "id.rkt") 12 | (require "xlink-parser.rkt") 13 | (require "ssax-prim.rkt") 14 | (require "multi-parser.rkt") 15 | (provide (all-from-out "myenv.rkt")) 16 | (provide (all-from-out "util.rkt")) 17 | (provide (all-from-out "parse-error.rkt")) 18 | (provide (all-from-out "input-parse.rkt")) 19 | (provide (all-from-out "SSAX-code.rkt")) 20 | (provide (all-from-out "SXML-tree-trans.rkt")) 21 | (provide (all-from-out "sxpathlib.rkt")) 22 | (provide (all-from-out "access-remote.rkt")) 23 | (provide (all-from-out "id.rkt")) 24 | (provide (all-from-out "xlink-parser.rkt")) 25 | (provide (all-from-out "ssax-prim.rkt")) 26 | (provide (all-from-out "multi-parser.rkt")) 27 | -------------------------------------------------------------------------------- /sxml/ssax/tests/vinput-parse.scm: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require srfi/13/string 4 | "../myenv.rkt" 5 | "../util.rkt" 6 | "../input-parse.rkt" 7 | #;"../look-for-str.rkt" 8 | #;"../catch-error.rkt" 9 | rackunit) 10 | 11 | ;; copied from ssax-tests: ryanc, does this belong here? 12 | ;; ryanc: catch exceptions (simplified from original lib/catch-error.scm) 13 | (define-syntax failed? 14 | (syntax-rules () 15 | ((failed? . stmts) 16 | (with-handlers ([exn? (lambda (e) #t)]) 17 | (let () . stmts) 18 | #f)))) 19 | 20 | ;----------------------------------------------------------------------------- 21 | ; This is a test driver for input parsing functions, to make sure they 22 | ; really work as intended 23 | ; 24 | ; IMPORT 25 | ; appropriate prelude: myenv.scm, myenv-bigloo.scm, myenv-scm.scm 26 | ; depending on your system 27 | ; catch-error.scm -- for procedure, for-syntax 28 | ; util.scm 29 | ; srfi-13-local.scm if no native SRFI-13 support is available. 30 | ; look-for-str.scm 31 | ; input-parse.scm 32 | ; 33 | ; See the Makefile in this directory, which shows how to run 34 | ; this code on a variery of Scheme systems. 35 | ; 36 | ; $Id: vinput-parse.scm,v 1.8 2004/07/08 21:53:33 oleg Exp $ 37 | 38 | 39 | ; This function is imported into the input-parse.scm module 40 | (define (parser-error port msg . specializing-msgs) 41 | (apply error (cons msg specializing-msgs))) 42 | 43 | ; make sure that the 'FORM' gave upon evaluation the 44 | ; EXPECTED-RESULT 45 | 46 | (define-syntax expect 47 | (syntax-rules () 48 | ((expect form expected-result) 49 | (check-equal? form expected-result)))) 50 | 51 | 52 | ; apply FORM to parse the input from the STR 53 | ; and compare the result with the EXPECTED-RESULT 54 | ; EXPECTED-RESULT is a pair: expected result from the 55 | ; form and the expected next character from the stream 56 | (define-syntax expect-parse-result 57 | (syntax-rules () 58 | ((expect-parse-result str form expected-result) 59 | (with-input-from-string 60 | str 61 | (lambda () 62 | (let* ((real-result form) (real-next-char 63 | (read-char (current-input-port)))) 64 | (check-equal? (cons real-result real-next-char) 65 | expected-result))))))) 66 | 67 | ; Build a string out of components 68 | ; A component can be a string, a character, a number 69 | ; (converted into a character), symbols cr and lf 70 | ; We could've used a notation like "abc\n\t" 71 | ; Unfortunately, not all Scheme systems support C-like notation 72 | ; of Scheme strings 73 | (define (s . components) 74 | (apply string-append 75 | (map (lambda (component) 76 | (cond 77 | ((string? component) component) 78 | ((char? component) (string component)) 79 | ((number? component) (string (integer->char component))) 80 | ((eq? 'lf component) (string #\newline)) 81 | ((eq? 'cr component) (string (integer->char 13))) 82 | (else (error "bad component: " component)))) 83 | components))) 84 | 85 | 86 | (test-case 87 | "string->integer" 88 | (let () 89 | (expect (string->integer "" 0 0) #f) 90 | (expect (string->integer "" 0 1) #f) 91 | (expect (string->integer "" 1 0) #f) 92 | (expect (string->integer "1" 0 0) #f) 93 | (expect (string->integer "1" 0 1) 1) 94 | (expect (string->integer "1" 0 2) #f) 95 | (expect (string->integer "1" 1 1) #f) 96 | (expect (string->integer "1" 1 0) #f) 97 | (expect (string->integer "81234" 0 5) 81234) 98 | (expect (string->integer "81234" 1 5) 1234) 99 | (expect (string->integer "81234" -1 5) #f) 100 | (expect (string->integer "81234" 1 6) #f) 101 | (expect (string->integer "81234" 1 4) 123) 102 | (expect (string->integer "81234" 5 4) #f) 103 | (expect (string->integer "81234" 4 4) #f) 104 | (expect (string->integer "81234" 4 5) 4) 105 | (expect (string->integer "-1234" 4 5) 4) 106 | (expect (string->integer "-1234" 1 5) 1234) 107 | (expect (string->integer "-1234" 0 5) #f) 108 | (expect (string->integer "x12+4" 0 5) #f) 109 | (expect (string->integer "x12+4" 0 3) #f) 110 | (expect (string->integer "x12+4" 1 3) 12) 111 | (expect (string->integer "x12+4" 1 4) #f) 112 | )) 113 | 114 | (test-case 115 | "string-split" 116 | (let ((tab " ")) ; This is a string of one tab character 117 | (expect (string-split "") '()) 118 | (expect (string-split "" '()) '()) 119 | (expect (string-split "" '() 0) '()) 120 | (expect (string-split "" '() 10) '()) 121 | (expect (string-split " " '() 0) '()) 122 | (expect (string-split " ") '()) 123 | (expect (string-split (string-append tab " " tab) '() 10) '()) 124 | (expect (string-split "abcd" '() 10) '("abcd")) 125 | (expect (string-split "abcd") '("abcd")) 126 | (expect (string-split " abcd ") '("abcd")) 127 | (expect (string-split " abcd " '() -5) '()) 128 | (expect (string-split " abcd " '() 1) '("abcd ")) 129 | (expect (string-split (string-append " ab" tab "cd ")) '("ab" "cd")) 130 | (expect (string-split (string-append " ab" tab " cd ")) '("ab" "cd")) 131 | (expect (string-split (string-append " ab" tab " cd ") '() 1) 132 | (list (string-append "ab" tab " cd "))) 133 | (expect (string-split (string-append " ab" tab " cd ") '() 2) 134 | '("ab" "cd ")) 135 | (expect (string-split (string-append " ab" tab " cd ") '() 3) 136 | '("ab" "cd")) 137 | (expect (string-split " abc d e f ") '("abc" "d" "e" "f")) 138 | (expect (string-split " abc d e f " '() 1) '("abc d e f ")) 139 | (expect (string-split " abc d e f " '() 3) '("abc" "d" "e f ")) 140 | 141 | (expect (string-split "" '(#\: #\+)) '()) 142 | (expect (string-split "" '(#\: #\+) 0) '()) 143 | (expect (string-split "" '(#\: #\+) 10) '()) 144 | (expect (string-split " " '(#\: #\+)) '(" ")) 145 | (expect (string-split " " '(#\: #\+) 1) '(" ")) 146 | (expect (string-split " " '(#\: #\+) 0) '()) 147 | (expect (string-split ":" '(#\: #\+)) '("" "")) 148 | (expect (string-split "a:" '(#\: #\+)) '("a" "")) 149 | (expect (string-split "a:" '(#\: #\+) 1) '("a:")) 150 | (expect (string-split ":a" '(#\: #\+)) '("" "a")) 151 | (expect (string-split ":a" '(#\: #\+) 1) '(":a")) 152 | (expect (string-split ":" '(#\: #\+) 1) '(":")) 153 | (expect (string-split ":+" '(#\: #\+)) '("" "" "")) 154 | (expect (string-split ":+" '(#\: #\+) 1) '(":+")) 155 | (expect (string-split ":+" '(#\: #\+) 2) '("" "+")) 156 | (expect (string-split ":+" '(#\: #\+) 3) '("" "" "")) 157 | (expect (string-split ":+" '(#\: #\+) 4) '("" "" "")) 158 | (expect (string-split ":abc:d:e:f:" '(#\:)) '("" "abc" "d" "e" "f" "")) 159 | (expect (string-split ":abc:d:e::f:" '(#\:)) '("" "abc" "d" "e" "" "f" "")) 160 | (expect (string-split "root:x:0:0:Lord" '(#\:) 2) '("root" "x:0:0:Lord")) 161 | (expect (string-split "/usr/local/bin:/usr/bin:/usr/ucb/bin" '(#\:)) 162 | '("/usr/local/bin" "/usr/bin" "/usr/ucb/bin")) 163 | (expect (string-split "/usr/local/bin" '(#\/)) '("" "usr" "local" "bin")) 164 | )) 165 | 166 | 167 | (test-case 168 | "make-char-quotator" 169 | (let ((string->goodHTML 170 | (make-char-quotator 171 | '((#\< . "<") (#\> . ">") (#\& . "&") (#\" . """))))) 172 | (expect (string->goodHTML "abc!def ") "abc!def ") 173 | (expect (string->goodHTML "") "") 174 | (expect (string->goodHTML "<") '("<")) 175 | (expect (string->goodHTML "goodHTML "a&b") '("a" "&" "b")) 177 | (expect (string->goodHTML "a b>") '("a b" ">")) 178 | (expect (string->goodHTML "<>&\"") '("<" ">" "&" """)) 179 | (expect (string->goodHTML " <>&\\\"") 180 | '(" " "<" ">" "&" "\\" """)) 181 | (expect (string->goodHTML "&") '("&" "amp;")) 182 | )) 183 | 184 | 185 | (test-case 186 | "assert-curr-char" 187 | (let () 188 | (define (test-assert-curr-char str char-list) 189 | (with-input-from-string str 190 | (lambda () 191 | (assert-curr-char char-list "assert curr char" (current-input-port)) 192 | ))) 193 | 194 | (expect (test-assert-curr-char " abcd" '(#\a #\space)) #\space) 195 | (expect (test-assert-curr-char "a bcd" '(#\a #\space)) #\a) 196 | (assert (failed? (expect (test-assert-curr-char "bacd" '(#\a #\space)) #\a))) 197 | )) 198 | 199 | ;; wow, can't see how to fix these either 200 | #;((test-case 201 | "skipping of characters" 202 | (let ( 203 | (eof (with-input-from-string "" read))) 204 | 205 | ;; calls skip-until-char with a number? 206 | #;(expect-parse-result " abcd" (skip-until-char 1) '(#f . #\a)) 207 | #;(assert (failed? (expect-parse-result " abcd" (skip-until-char 10) '(#f . #f)))) 208 | #;(expect-parse-result " abcd" (skip-until-char 5) `(#f . ,eof)) 209 | (expect-parse-result " abcd" (skip-until-char '(#\a #\space)) '(#\space . #\a)) 210 | (expect-parse-result "xxxc bcd" (skip-until-char '(#\a #\space #\c)) 211 | '(#\c . #\space)) 212 | (expect-parse-result "xxxc" (skip-until-char '(#\a #\space #\c) (current-input-port)) 213 | `(#\c . ,eof)) 214 | (assert (failed? (expect-parse-result "xxxd" 215 | (skip-until-char '(#\a #\space #\c)) '(#f . #f)))) 216 | (expect-parse-result "xxxd" (skip-until-char '(#\a #\space #\c *eof*)) 217 | `(,eof . ,eof)) 218 | (expect-parse-result "xxxc" (skip-until-char '(#\a #\space #\c *eof*)) 219 | `(#\c . ,eof)) 220 | 221 | (expect-parse-result "xxxd" (skip-while '(#\a #\space #\x)) 222 | '(#\d . #\d)) 223 | (expect-parse-result "yxxxd" (skip-while '(#\a #\space #\x)) 224 | '(#\y . #\y)) 225 | (expect-parse-result "xxx" (skip-while '(#\a #\space #\x) (current-input-port)) 226 | `(,eof . ,eof)) 227 | (expect-parse-result "xxa x" (skip-while '(#\a #\space #\x)) 228 | `(,eof . ,eof)) 229 | ))) 230 | 231 | (test-case 232 | "tokenizing of the input stream" 233 | (let ((eof (with-input-from-string "" read))) 234 | ;; apparently there's supposed to be a 2-arg version of next-token? 235 | (define (next-token/2 a b) 236 | (next-token a b "no comment..." (current-input-port))) 237 | 238 | (expect-parse-result "xxxd" 239 | (next-token '(#\a #\space #\x) '(#\d) "next token" (current-input-port)) 240 | '("" . #\d)) 241 | (expect-parse-result "xxx xa cccxd" 242 | (next-token/2 '(#\a #\space #\x) '(#\d)) 243 | '("cccx" . #\d)) 244 | (expect-parse-result "xxx xa cccxdaa" 245 | (next-token/2 '() '(#\d)) 246 | '("xxx xa cccx" . #\d)) 247 | (expect-parse-result "xxx xa cccxdaa" 248 | (next-token/2 '() '(#\d #\a)) 249 | '("xxx x" . #\a)) 250 | (expect-parse-result "cccxd" 251 | (next-token/2 '(#\a #\space #\x) '(#\d)) 252 | '("cccx" . #\d)) 253 | (assert (failed? (expect-parse-result 254 | "cccx" 255 | (next-token '(#\a #\space #\x) '(#\d) "next token" 256 | (current-input-port)) 257 | '(#f . #f)))) 258 | (assert (failed? (expect-parse-result 259 | "cccx" 260 | (next-token/2 '(#\a #\space #\x) '(#\d)) 261 | '(#f . #f)))) 262 | (expect-parse-result "cccx" 263 | (next-token '(#\a #\space #\x) '(#\d *eof*) "" (current-input-port)) 264 | `("cccx" . ,eof)) 265 | (assert (failed? (expect-parse-result 266 | "cccx" 267 | (next-token/2 '(#\c #\space #\x) '(#\d)) 268 | '(#f . #f)))) 269 | )) 270 | 271 | (test-case 272 | "tokenizing, big tokens" 273 | (let* ((eof (with-input-from-string "" read)) 274 | (big-token 275 | (call-with-output-string 276 | (lambda (port) 277 | (do ((i 0 (inc i))) ((>= i 512)) 278 | (display (modulo i 10) port))))) 279 | (big-token1 (string-append 280 | big-token big-token big-token 281 | (substring big-token 0 511))) 282 | (term-list '(#\space #\newline *eof*))) 283 | 284 | (call-with-input-string big-token 285 | (lambda (port) 286 | (let ((token (next-token '(#\space) term-list "" port))) 287 | (assert (equal? token big-token) (eof-object? (peek-char port)))))) 288 | 289 | (call-with-input-string big-token1 290 | (lambda (port) 291 | (let ((token (next-token '() term-list "" port))) 292 | (assert (equal? token big-token1) (eof-object? (read-char port)))))) 293 | 294 | (call-with-input-string (string-append " " big-token " ") 295 | (lambda (port) 296 | (let ((token (next-token '(#\space) term-list "comment" port))) 297 | (assert (equal? token big-token) 298 | (memv (peek-char port) term-list))))) 299 | 300 | (call-with-input-string (string-append big-token1 (string #\newline)) 301 | (lambda (port) 302 | (let ((token (next-token '(#\space) term-list "" port))) 303 | (assert (equal? token big-token1) 304 | (memv (peek-char port) term-list))))) 305 | 306 | (call-with-input-string (string-append big-token) 307 | (lambda (port) 308 | (let ((token (next-token-of 309 | (lambda (c) (and (not (eof-object? c)) c)) port))) 310 | (assert (equal? token big-token) 311 | (eof-object? (peek-char port)))))) 312 | 313 | (call-with-input-string (string-append big-token1 (string #\newline)) 314 | (lambda (port) 315 | (let ((token (next-token-of (string->list "a0123456789") port))) 316 | (assert (equal? token big-token1) 317 | (memv (peek-char port) term-list))))) 318 | )) 319 | 320 | ;; wrong # of args to next-token of, no idea how to fix 321 | #;( 322 | (test-case 323 | "next-token-of" 324 | (let ((eof (with-input-from-string "" read))) 325 | 326 | (expect-parse-result "" (next-token-of '(#\a #\space #\x)) 327 | `("" . ,eof)) 328 | (expect-parse-result "d" (next-token-of '(#\a #\space #\x)) 329 | '("" . #\d)) 330 | (expect-parse-result "a xx " (next-token-of '(#\a #\space #\x)) 331 | `("a xx " . ,eof)) 332 | (expect-parse-result (s "a xx " 'lf) 333 | (next-token-of '(#\a #\space #\x) (current-input-port)) 334 | '("a xx " . #\newline)) 335 | (expect-parse-result (s "a " 'cr " xx ") (next-token-of '(#\a #\space #\x)) 336 | (cons "a " (integer->char 13))) 337 | (expect-parse-result (s 'lf "a " 'cr " xx ") 338 | (next-token-of '(#\a #\space #\x)) 339 | '("" . #\newline)) 340 | 341 | (expect-parse-result "" 342 | (next-token-of (lambda (c) (and (not (eof-object? c)) c))) 343 | `("" . ,eof)) 344 | (expect-parse-result (s "123" 'lf 'cr 0 "!") 345 | (next-token-of (lambda (c) (and (not (eof-object? c)) c))) 346 | `(,(s "123" 'lf 'cr 0 "!") . ,eof)) 347 | 348 | (let ((down-pred 349 | (lambda (c) 350 | (cond ((eof-object? c) #f) 351 | ((char-alphabetic? c) (char-downcase c)) 352 | (else #f))))) 353 | 354 | (expect-parse-result "" (next-token-of down-pred) 355 | `("" . ,eof)) 356 | (expect-parse-result "12abc" (next-token-of down-pred) 357 | '("" . #\1)) 358 | (expect-parse-result "abc12" 359 | (next-token-of down-pred (current-input-port)) 360 | '("abc" . #\1)) 361 | (expect-parse-result "aB c12" (next-token-of down-pred) 362 | '("ab" . #\space)) 363 | (expect-parse-result "XYZ" (next-token-of down-pred) 364 | `("xyz" . ,eof)) 365 | ) 366 | ))) 367 | 368 | ;; really can't find any definition or reference to read-text-line... 369 | #;((test-case 370 | "read-text-line" 371 | (let ((eof (with-input-from-string "" read))) 372 | 373 | (expect-parse-result "" (read-text-line) 374 | `(,eof . ,eof)) 375 | (expect-parse-result "a 1 % xx" (read-text-line) 376 | `("a 1 % xx" . ,eof)) 377 | (expect-parse-result (s 'lf) (read-text-line) 378 | `("" . ,eof)) 379 | (expect-parse-result (s 'cr) (read-text-line) 380 | `("" . ,eof)) 381 | (expect-parse-result (s 'cr 'lf) (read-text-line) 382 | `("" . ,eof)) 383 | (expect-parse-result (s 'cr 'cr 'lf) (read-text-line (current-input-port)) 384 | (cons "" (integer->char 13))) 385 | (expect-parse-result (s 'lf 'lf) (read-text-line) 386 | '("" . #\newline)) 387 | (expect-parse-result (s #\space 'lf 'cr 'lf) (read-text-line) 388 | (cons " " (integer->char 13))) 389 | (expect-parse-result (s " 12" 'lf "3" 'cr 'lf) (read-text-line) 390 | '(" 12" . #\3)) 391 | (expect-parse-result (s " 12 " 'cr "3" 'cr 'lf) (read-text-line) 392 | '(" 12 " . #\3)) 393 | (expect-parse-result (s " 12 " 'cr 'lf " 4" 'cr 'lf) 394 | (read-text-line (current-input-port)) 395 | '(" 12 " . #\space)) 396 | (expect-parse-result (s " 12 " 'cr 'lf 'cr 'lf) (read-text-line) 397 | (cons " 12 " (integer->char 13))) 398 | ))) 399 | 400 | ;; tests fail. looks like now parse returns # on parse of empty 401 | ;; string? 402 | #;((test-case 403 | "read-string" 404 | (let ((eof (with-input-from-string "" read))) 405 | 406 | (expect-parse-result "" (read-string 1) 407 | `("" . ,eof)) 408 | (expect-parse-result "" (read-string 0) 409 | `("" . ,eof)) 410 | (expect-parse-result "1234" (read-string 0) 411 | '("" . #\1)) 412 | (expect-parse-result "1234" (read-string -10) 413 | '("" . #\1)) 414 | (expect-parse-result (s 'lf "1234 " 'cr) 415 | (read-string 1 (current-input-port)) 416 | (cons (s 'lf) #\1)) 417 | (expect-parse-result (s 'lf "1234 " 'cr) (read-string 3) 418 | (cons (s 'lf "12") #\3)) 419 | (expect-parse-result (s 'lf "1234 " 'cr) (read-string 7) 420 | (cons (s 'lf "1234 " 'cr) eof)) 421 | (expect-parse-result (s 'lf "1234 " 'cr) 422 | (read-string 8 (current-input-port)) 423 | (cons (s 'lf "1234 " 'cr) eof)) 424 | (expect-parse-result (s 'lf "1234 " 'cr) (read-string 100) 425 | (cons (s 'lf "1234 " 'cr) eof)) 426 | ))) 427 | 428 | 429 | (test-case 430 | "find-string-from-port" 431 | (let ((eof (with-input-from-string "" read))) 432 | 433 | ;; these tests expect a version of "find-string-from-port?" 434 | ;; that accepts an extra argument. Perhaps a limit on chars 435 | ;; read? function doesn't actually seem to be used other than 436 | ;; as a predicate in the main code. 437 | #;(expect-parse-result "bacacabd" 438 | (find-string-from-port? "acab" (current-input-port) 100) 439 | '(7 . #\d)) 440 | #;(expect-parse-result "bacacabd" 441 | (find-string-from-port? "acad" (current-input-port) 100) 442 | `(#f . ,eof)) 443 | #;(expect-parse-result "bacacabd" 444 | (find-string-from-port? "bd" (current-input-port) 8) 445 | `(8 . ,eof)) 446 | #;(expect-parse-result "bacacabd" 447 | (find-string-from-port? "be" (current-input-port) 20) 448 | `(#f . ,eof)) 449 | 450 | #;(expect-parse-result "bacacabd" 451 | (find-string-from-port? "bd" (current-input-port) 5) 452 | '(#f . #\a)) 453 | #;(expect-parse-result "bacacabd" 454 | (find-string-from-port? "bd" (current-input-port) 9) 455 | `(8 . ,eof)) 456 | 457 | (expect-parse-result "bacacabd" 458 | (find-string-from-port? "acab" (current-input-port)) 459 | '(7 . #\d)) 460 | 461 | (expect-parse-result "bacacabd" 462 | (find-string-from-port? "acad" (current-input-port)) 463 | `(#f . ,eof)) 464 | 465 | (expect-parse-result "bacacabd" 466 | (find-string-from-port? "bd" (current-input-port)) 467 | `(8 . ,eof)) 468 | 469 | )) 470 | -------------------------------------------------------------------------------- /sxml/ssax/util.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require racket/list 3 | srfi/13/string 4 | "myenv.ss") 5 | (provide (all-defined-out)) 6 | 7 | ;**************************************************************************** 8 | ; My Scheme misc utility functions 9 | ; (mainly dealing with string and list manipulations) 10 | ; 11 | ; myenv.scm, myenv-bigloo.scm or similar prelude is assumed. 12 | ; From SRFI-13, import many functions 13 | ; If a particular implementation lacks SRFI-13 support, please 14 | ; include the file srfi-13-local.scm 15 | ; 16 | ; $Id: util.scm,v 1.5 2004/07/07 16:02:31 sperber Exp $ 17 | 18 | ;------------------------------------------------------------------------ 19 | ; Iterator ANY? 20 | ; 21 | ; -- procedure+: any? PRED COLLECTION 22 | ; Searches for the first element in the collection satisfying a 23 | ; given predicate 24 | ; That is, the procedure applies PRED to every element of the 25 | ; COLLECTION in turn. 26 | ; The first element for which PRED returns non-#f stops the iteration; 27 | ; the value of the predicate is returned. 28 | ; If none of the elements of the COLLECTION satisfy the predicate, 29 | ; the return value from the procedure is #f 30 | ; COLLECTION can be a list, a vector, a string, or an input port. 31 | ; See vmyenv.scm for validation tests. 32 | (define (any? pred? coll) 33 | (cond [(list? coll) 34 | (ormap pred? coll)] 35 | [(vector? coll) 36 | (for/or ([x (in-vector coll)]) (pred? x))] 37 | [(string? coll) 38 | (for/or ([x (in-string coll)]) (pred? x))] 39 | [(input-port? coll) 40 | (for/or ([x (in-port read-char coll)]) (pred? x))] 41 | [else (error 'any? "invalid collection")])) 42 | 43 | 44 | ;------------------------------------------------------------------------ 45 | ; Some list manipulation functions 46 | 47 | ; -- procedure+: list-intersperse SRC-L ELEM 48 | ; inserts ELEM between elements of the SRC-L, returning a freshly allocated 49 | ; list (cells, that is) 50 | 51 | (define (list-intersperse src-l elem) 52 | (add-between src-l elem)) 53 | 54 | 55 | ; List-tail-difference: given two lists, list1 and list2 where 56 | ; list2 is presumably a tail of list1, return 57 | ; a (freshly allocated) list which is a difference between list1 58 | ; and list2. If list2 is *not* a tail of list1, the entire list1 59 | ; is returned. 60 | (define (list-tail-diff list1 list2) 61 | (let loop ((l1-curr list1) (difference '())) 62 | (cond 63 | ((eq? l1-curr list2) (reverse difference)) 64 | ((null? l1-curr) (reverse difference)) 65 | (else (loop (cdr l1-curr) (cons (car l1-curr) difference)))))) 66 | 67 | 68 | ;------------------------------------------------------------------------ 69 | ; String utilities 70 | ; See SRFI-13 or srfi-13-local.scm 71 | 72 | 73 | ; Return the index of the last occurence of a-char in str, or #f 74 | ; See SRFI-13 75 | (define string-rindex string-index-right) 76 | 77 | ; -- procedure+: substring? PATTERN STRING 78 | ; Searches STRING to see if it contains the substring PATTERN. 79 | ; Returns the index of the first substring of STRING that is equal 80 | ; to PATTERN; or `#f' if STRING does not contain PATTERN. 81 | ; 82 | ; (substring? "rat" "pirate") => 2 83 | ; (substring? "rat" "outrage") => #f 84 | ; (substring? "" any-string) => 0 85 | (define (substring? pattern str) (string-contains str pattern)) 86 | 87 | 88 | ; -- procedure+: string->integer STR START END 89 | ; 90 | ; Makes sure a substring of the STR from START (inclusive) till END 91 | ; (exclusive) is a representation of a non-negative integer in decimal 92 | ; notation. If so, this integer is returned. Otherwise -- when the 93 | ; substring contains non-decimal characters, or when the range from 94 | ; START till END is not within STR, the result is #f. 95 | ; 96 | ; This procedure is a simplification of the standard string->number. 97 | ; The latter is far more generic: for example, it will try to read 98 | ; strings like "1/2" "1S2" "1.34" and even "1/0" (the latter causing 99 | ; a zero-divide error). Note that to string->number, "1S2" is a valid 100 | ; representation of an _inexact_ integer (100 to be precise). 101 | ; Oftentimes we want to be more restrictive about what we consider a 102 | ; number; we want merely to read an integral label. 103 | (define (string->integer str start end) 104 | (and (< -1 start end (inc (string-length str))) 105 | (let loop ((pos start) (accum 0)) 106 | (cond 107 | ((>= pos end) accum) 108 | ((char-numeric? (string-ref str pos)) 109 | (loop (inc pos) (+ (char->integer (string-ref str pos)) 110 | (- (char->integer #\0)) (* 10 accum)))) 111 | (else #f))))) 112 | 113 | 114 | ; -- procedure+: string-split STRING 115 | ; -- procedure+: string-split STRING '() 116 | ; -- procedure+: string-split STRING '() MAXSPLIT 117 | ; 118 | ; Returns a list of whitespace delimited words in STRING. 119 | ; If STRING is empty or contains only whitespace, then the empty list 120 | ; is returned. Leading and trailing whitespaces are trimmed. 121 | ; If MAXSPLIT is specified and positive, the resulting list will 122 | ; contain at most MAXSPLIT elements, the last of which is the string 123 | ; remaining after (MAXSPLIT - 1) splits. If MAXSPLIT is specified and 124 | ; non-positive, the empty list is returned. "In time critical 125 | ; applications it behooves you not to split into more fields than you 126 | ; really need." 127 | ; 128 | ; -- procedure+: string-split STRING CHARSET 129 | ; -- procedure+: string-split STRING CHARSET MAXSPLIT 130 | ; 131 | ; Returns a list of words delimited by the characters in CHARSET in 132 | ; STRING. CHARSET is a list of characters that are treated as delimiters. 133 | ; Leading or trailing delimeters are NOT trimmed. That is, the resulting 134 | ; list will have as many initial empty string elements as there are 135 | ; leading delimiters in STRING. 136 | ; 137 | ; If MAXSPLIT is specified and positive, the resulting list will 138 | ; contain at most MAXSPLIT elements, the last of which is the string 139 | ; remaining after (MAXSPLIT - 1) splits. If MAXSPLIT is specified and 140 | ; non-positive, the empty list is returned. "In time critical 141 | ; applications it behooves you not to split into more fields than you 142 | ; really need." 143 | ; 144 | ; This is based on the split function in Python/Perl 145 | ; 146 | ; (string-split " abc d e f ") ==> ("abc" "d" "e" "f") 147 | ; (string-split " abc d e f " '() 1) ==> ("abc d e f ") 148 | ; (string-split " abc d e f " '() 0) ==> () 149 | ; (string-split ":abc:d:e::f:" '(#\:)) ==> ("" "abc" "d" "e" "" "f" "") 150 | ; (string-split ":" '(#\:)) ==> ("" "") 151 | ; (string-split "root:x:0:0:Lord" '(#\:) 2) ==> ("root" "x:0:0:Lord") 152 | ; (string-split "/usr/local/bin:/usr/bin:/usr/ucb/bin" '(#\:)) 153 | ; ==> ("/usr/local/bin" "/usr/bin" "/usr/ucb/bin") 154 | ; (string-split "/usr/local/bin" '(#\/)) ==> ("" "usr" "local" "bin") 155 | 156 | (define (string-split str . rest) 157 | ; maxsplit is a positive number 158 | (define (split-by-whitespace str maxsplit) 159 | (define (skip-ws i yet-to-split-count) 160 | (cond 161 | ((>= i (string-length str)) '()) 162 | ((char-whitespace? (string-ref str i)) 163 | (skip-ws (inc i) yet-to-split-count)) 164 | (else (scan-beg-word (inc i) i yet-to-split-count)))) 165 | (define (scan-beg-word i from yet-to-split-count) 166 | (cond 167 | ((zero? yet-to-split-count) 168 | (cons (substring str from (string-length str)) '())) 169 | (else (scan-word i from yet-to-split-count)))) 170 | (define (scan-word i from yet-to-split-count) 171 | (cond 172 | ((>= i (string-length str)) 173 | (cons (substring str from i) '())) 174 | ((char-whitespace? (string-ref str i)) 175 | (cons (substring str from i) 176 | (skip-ws (inc i) (- yet-to-split-count 1)))) 177 | (else (scan-word (inc i) from yet-to-split-count)))) 178 | (skip-ws 0 (- maxsplit 1))) 179 | 180 | ; maxsplit is a positive number 181 | ; str is not empty 182 | (define (split-by-charset str delimeters maxsplit) 183 | (define (scan-beg-word from yet-to-split-count) 184 | (cond 185 | ((>= from (string-length str)) '("")) 186 | ((zero? yet-to-split-count) 187 | (cons (substring str from (string-length str)) '())) 188 | (else (scan-word from from yet-to-split-count)))) 189 | (define (scan-word i from yet-to-split-count) 190 | (cond 191 | ((>= i (string-length str)) 192 | (cons (substring str from i) '())) 193 | ((memq (string-ref str i) delimeters) 194 | (cons (substring str from i) 195 | (scan-beg-word (inc i) (- yet-to-split-count 1)))) 196 | (else (scan-word (inc i) from yet-to-split-count)))) 197 | (scan-beg-word 0 (- maxsplit 1))) 198 | 199 | ; resolver of overloading... 200 | ; if omitted, maxsplit defaults to 201 | ; (inc (string-length str)) 202 | (if (string-null? str) '() 203 | (if (null? rest) 204 | (split-by-whitespace str (inc (string-length str))) 205 | (let ((charset (car rest)) 206 | (maxsplit 207 | (if (pair? (cdr rest)) (cadr rest) (inc (string-length str))))) 208 | (cond 209 | ((not (positive? maxsplit)) '()) 210 | ((null? charset) (split-by-whitespace str maxsplit)) 211 | (else (split-by-charset str charset maxsplit))))))) 212 | 213 | 214 | ; make-char-quotator QUOT-RULES 215 | ; 216 | ; Given QUOT-RULES, an assoc list of (char . string) pairs, return 217 | ; a quotation procedure. The returned quotation procedure takes a string 218 | ; and returns either a string or a list of strings. The quotation procedure 219 | ; check to see if its argument string contains any instance of a character 220 | ; that needs to be encoded (quoted). If the argument string is "clean", 221 | ; it is returned unchanged. Otherwise, the quotation procedure will 222 | ; return a list of string fragments. The input straing will be broken 223 | ; at the places where the special characters occur. The special character 224 | ; will be replaced by the corresponding encoding strings. 225 | ; 226 | ; For example, to make a procedure that quotes special HTML characters, 227 | ; do 228 | ; (make-char-quotator 229 | ; '((#\< . "<") (#\> . ">") (#\& . "&") (#\" . """))) 230 | 231 | (define (make-char-quotator char-encoding) 232 | (let ((bad-chars (map car char-encoding))) 233 | 234 | ; Check to see if str contains one of the characters in charset, 235 | ; from the position i onward. If so, return that character's index. 236 | ; otherwise, return #f 237 | (define (index-cset str i charset) 238 | (let loop ((i i)) 239 | (and (< i (string-length str)) 240 | (if (memv (string-ref str i) charset) i 241 | (loop (inc i)))))) 242 | 243 | ; The body of the function 244 | (lambda (str) 245 | (let ((bad-pos (index-cset str 0 bad-chars))) 246 | (if (not bad-pos) str ; str had all good chars 247 | (let loop ((from 0) (to bad-pos)) 248 | (cond 249 | ((>= from (string-length str)) '()) 250 | ((not to) 251 | (cons (substring str from (string-length str)) '())) 252 | (else 253 | (let ((quoted-char 254 | (cdr (assv (string-ref str to) char-encoding))) 255 | (new-to 256 | (index-cset str (inc to) bad-chars))) 257 | (if (< from to) 258 | (cons 259 | (substring str from to) 260 | (cons quoted-char (loop (inc to) new-to))) 261 | (cons quoted-char (loop (inc to) new-to)))))))))))) 262 | -------------------------------------------------------------------------------- /sxml/sxpath.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require "sxml-tools.rkt" 3 | "ssax/sxpathlib.rkt" 4 | srfi/2 5 | "ssax/errors-and-warnings.rkt" 6 | "sxpath-ext.rkt" 7 | "txpath.rkt") 8 | (provide (all-defined-out)) 9 | 10 | ;; $Id: sxpath.scm,v 1.5 2005/09/07 09:27:34 lizorkin Exp $ 11 | ;; Highghest level SXPath 12 | ;; Refactored from sxml-tools.scm and sxpathlib.scm 13 | 14 | ;============================================================================== 15 | ; Abbreviated SXPath 16 | 17 | ; Evaluate an abbreviated SXPath 18 | ; sxpath:: AbbrPath -> Converter, or 19 | ; sxpath:: AbbrPath -> Node|Nodeset -> Nodeset 20 | ; AbbrPath is a list. It is translated to the full SXPath according 21 | ; to the following rewriting rules 22 | ; (sxpath '()) -> (node-join) 23 | ; (sxpath '(path-component ...)) -> 24 | ; (node-join (sxpath1 path-component) (sxpath '(...))) 25 | ; (sxpath1 '//) -> (sxml:descendant-or-self sxml:node?) 26 | ; (sxpath1 '(equal? x)) -> (select-kids (node-equal? x)) 27 | ; (sxpath1 '(eq? x)) -> (select-kids (node-eq? x)) 28 | ; (sxpath1 '(*or* ...)) -> (select-kids (ntype-names?? 29 | ; (cdr '(*or* ...)))) 30 | ; (sxpath1 '(*not* ...)) -> (select-kids (sxml:complement 31 | ; (ntype-names?? 32 | ; (cdr '(*not* ...))))) 33 | ; (sxpath1 '(ns-id:* x)) -> (select-kids 34 | ; (ntype-namespace-id?? x)) 35 | ; (sxpath1 ?symbol) -> (select-kids (ntype?? ?symbol)) 36 | ; (sxpath1 ?string) -> (txpath ?string) 37 | ; (sxpath1 procedure) -> procedure 38 | ; (sxpath1 '(?symbol ...)) -> (sxpath1 '((?symbol) ...)) 39 | ; (sxpath1 '(path reducer ...)) -> 40 | ; (node-reduce (sxpath path) (sxpathr reducer) ...) 41 | ; (sxpathr number) -> (node-pos number) 42 | ; (sxpathr path-filter) -> (filter (sxpath path-filter)) 43 | (define (sxpath path [ns-binding null]) 44 | (let loop ((converters '()) 45 | (root-vars '()) ; a list of booleans, one per location step: 46 | ; #t - location step function is binary 47 | ; #f - location step function is unary 48 | (path (if (string? path) (list path) path))) 49 | (cond 50 | ((null? path) ; parsing is finished 51 | (lambda (node [var-binding null]) 52 | (let () 53 | (let rpt ((nodeset (as-nodeset node)) 54 | (conv (reverse converters)) 55 | (r-v (reverse root-vars))) 56 | (if 57 | (null? conv) ; the path is over 58 | nodeset 59 | (rpt 60 | (if (car r-v) ; the current converter consumes 2 arguments 61 | ((car conv) nodeset var-binding) 62 | ((car conv) nodeset)) 63 | (cdr conv) 64 | (cdr r-v))))))) 65 | ; *or* handler 66 | ((and (pair? (car path)) 67 | (not (null? (car path))) 68 | (eq? '*or* (caar path))) 69 | (loop (cons (select-kids (ntype-names?? (cdar path))) converters) 70 | (cons #f root-vars) 71 | (cdr path))) 72 | ; *not* handler 73 | ((and (pair? (car path)) 74 | (not (null? (car path))) 75 | (eq? '*not* (caar path))) 76 | (loop (cons 77 | (select-kids (sxml:complement (ntype-names?? (cdar path)))) 78 | converters) 79 | (cons #f root-vars) 80 | (cdr path))) 81 | ((procedure? (car path)) 82 | (loop (cons (car path) converters) 83 | (cons #t root-vars) 84 | (cdr path))) 85 | ((eq? '// (car path)) 86 | (if (or (null? (cdr path)) 87 | (not (symbol? (cadr path))) 88 | (eq? (cadr path) '@)) 89 | (loop (cons (sxml:descendant-or-self sxml:node?) 90 | converters) 91 | (cons #f root-vars) 92 | (cdr path)) 93 | (loop (cons (sxml:descendant (ntype?? (cadr path))) 94 | converters) 95 | (cons #f root-vars) 96 | (cddr path)))) 97 | ((symbol? (car path)) 98 | (loop (cons (select-kids (ntype?? (car path))) converters) 99 | (cons #f root-vars) 100 | (cdr path))) 101 | ((string? (car path)) 102 | (and-let* 103 | ((f (sxml:xpath-expr (car path) ns-binding))) ; DL: was: txpath 104 | (loop (cons f converters) 105 | (cons #t root-vars) 106 | (cdr path)))) 107 | ((and (pair? (car path)) (eq? 'equal? (caar path))) 108 | (loop (cons (select-kids (apply node-equal? (cdar path))) converters) 109 | (cons #f root-vars) 110 | (cdr path))) 111 | ; ns-id:* handler 112 | ((and (pair? (car path)) (eq? 'ns-id:* (caar path))) 113 | (loop 114 | (cons (select-kids (ntype-namespace-id?? (cadar path))) converters) 115 | (cons #f root-vars) 116 | (cdr path))) 117 | ((and (pair? (car path)) (eq? 'eq? (caar path))) 118 | (loop (cons (select-kids (apply node-eq? (cdar path))) converters) 119 | (cons #f root-vars) 120 | (cdr path))) 121 | ((pair? (car path)) 122 | (and-let* 123 | ((select 124 | (if 125 | (symbol? (caar path)) 126 | (lambda (node [var-binding null]) ;; ryanc: unused!! 127 | ((select-kids (ntype?? (caar path))) node)) 128 | (sxpath (caar path) ns-binding)))) 129 | (let reducer ((reducing-path (cdar path)) 130 | (filters '())) 131 | (cond 132 | ((null? reducing-path) 133 | (loop 134 | (cons 135 | (lambda (node var-binding) 136 | (map-union 137 | (lambda (node) 138 | (let label ((nodeset (select node var-binding)) 139 | (fs (reverse filters))) 140 | (if 141 | (null? fs) 142 | nodeset 143 | (label 144 | ((car fs) nodeset var-binding) 145 | (cdr fs))))) 146 | (if (nodeset? node) node (list node)))) 147 | converters) 148 | (cons #t root-vars) 149 | (cdr path))) 150 | ((number? (car reducing-path)) 151 | (reducer 152 | (cdr reducing-path) 153 | (cons 154 | (lambda (node var-binding) 155 | ((node-pos (car reducing-path)) node)) 156 | filters))) 157 | (else 158 | (and-let* 159 | ((func (sxpath (car reducing-path) ns-binding))) 160 | (reducer 161 | (cdr reducing-path) 162 | (cons 163 | (lambda (node var-binding) 164 | ((sxml:filter 165 | (lambda (n) (func n var-binding))) 166 | node)) 167 | filters)))))))) 168 | (else 169 | (sxml:warn 'sxpath "invalid path step: ~e" (car path)) 170 | #f)))) 171 | 172 | 173 | ;============================================================================== 174 | ; Wrappers 175 | 176 | ; sxpath always returns a list, which is #t in Scheme 177 | ; if-sxpath returns #f instead of empty list 178 | (define (if-sxpath path) 179 | (lambda (obj) 180 | (let ((x ((sxpath path) obj))) 181 | (if (null? x) #f x)))) 182 | 183 | ; Returns first node found, if any. 184 | ; Otherwise returns #f. 185 | (define (if-car-sxpath path) 186 | (lambda (obj) 187 | (let ((x ((sxpath path) obj))) 188 | (if (null? x) #f (car x))))) 189 | 190 | ; Returns first node found, if any. 191 | ; Otherwise returns empty list. 192 | (define (car-sxpath path) 193 | (lambda (obj) 194 | (let ((x ((sxpath path) obj))) 195 | (if (null? x) '() (car x))))) 196 | 197 | ;============================================================================== 198 | ; lookup by a value of ID type attribute 199 | ; See also sxml:lookup in sxml-tools 200 | 201 | ; Built an index as a list of (ID_value . element) pairs for given 202 | ; node. lpaths are location paths for attributes of type ID. 203 | (define (sxml:id-alist node . lpaths) 204 | (apply 205 | append 206 | (map 207 | (lambda(lp) 208 | (let ((lpr (reverse lp))) 209 | (map 210 | (lambda (nd) 211 | (cons (sxml:attr nd (car lpr)) 212 | nd)) 213 | ; Selects elements with ID attributes 214 | ; using (lpath ,(node-self (sxpath '(@ attrname)))) 215 | ((sxpath (reverse (cons 216 | (lambda(n r+v) 217 | ((node-self (sxpath `(@ ,(car lpr)))) n)) 218 | (cddr lpr)))) node)) 219 | )) 220 | lpaths))) 221 | -------------------------------------------------------------------------------- /sxml/tests/README: -------------------------------------------------------------------------------- 1 | This directory is for tests. 2 | 3 | tests.rkt -- a few tests, all pass. 4 | ssax-test.rkt -- a very noisy test suite. All pass. 5 | 6 | other files -- random test-related stuff pulled in from the main tree. -------------------------------------------------------------------------------- /sxml/tests/tests.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (module+ test 4 | (require "../main.rkt" 5 | rackunit 6 | rackunit/text-ui) 7 | ;; this set of tests is wildly inadequate. 8 | 9 | ;; call xml->sxml with a port created from a string 10 | (define (read-from-string str [ns '()]) 11 | (ssax:xml->sxml (open-input-string str) ns)) 12 | 13 | (define basic-tests 14 | (test-suite "basic ssax tests" 15 | (test-equal? "a plain tag" 16 | (read-from-string "") 17 | `(*TOP* (a))) 18 | (test-equal? "a default namespace" 19 | (read-from-string "") 20 | `(*TOP* (gooble:a))) 21 | (test-equal? "a default namespace applies to sub-elements" 22 | (read-from-string "") 23 | `(*TOP* (gooble:a (gooble:b)))) 24 | (test-equal? "a non-default namespace has to be triggered explicitly" 25 | (read-from-string "") 26 | `(*TOP* (a (b)))) 27 | (test-equal? "here's how you use a non-default namespace" 28 | (read-from-string "") 29 | `(*TOP* (gooble:a (gooble:b)))) 30 | (test-equal? "non-default namespace is non-default" 31 | (read-from-string "") 32 | `(*TOP* (gooble:a (b)))) 33 | (test-equal? "you can use the namespace argument to shorten the prefix tag" 34 | (read-from-string "" 35 | `((g . "gooble"))) 36 | `(*TOP* (@ (*NAMESPACES* (g "gooble"))) 37 | (g:a (b)))) 38 | (test-equal? "empty tags are indistinguishable from tags with empty strings" 39 | (read-from-string "") 40 | `(*TOP* (a))) 41 | (test-equal? "empty tag 2" 42 | (read-from-string "") 43 | `(*TOP* (a))) 44 | (test-equal? "by default, the SSAX reader discards ... whitespace-only strings?" 45 | (read-from-string "\n") 46 | `(*TOP* (a))) 47 | (test-equal? "discard whitespace 2" 48 | (read-from-string "\nt\n") 49 | `(*TOP* (a "\nt\n"))) 50 | (test-exn "parsing empty string fails" 51 | #rx"unexpected EOF" 52 | (lambda () (ssax:xml->sxml (open-input-string "") '()))) 53 | (test-equal? "serialization 1" 54 | (srl:sxml->xml `(*TOP* (p))) 55 | "

") 56 | (test-exn "serialization bad" 57 | (lambda (e) #t) 58 | (lambda () (srl:sxml->xml '(foo (@ (bar (13))))))) 59 | (let ([temp (make-temporary-file)]) 60 | (test-not-exn "serialization accepts paths" 61 | (lambda () (and (path? temp) 62 | (not (string? temp)) 63 | (delete-file temp) 64 | (srl:sxml->xml `(*TOP* (p)) temp))))))) 65 | 66 | (run-tests basic-tests)) 67 | -------------------------------------------------------------------------------- /sxml/tests/vSXML-to-HTML.rkt: -------------------------------------------------------------------------------- 1 | ; Validation code for SXML-to-HTML.scm 2 | ; 3 | ; IMPORT 4 | ; SXML-to-HTML.scm and all of its imports 5 | ; ,open sxml-to-html sxml-tree-trans coutputs assertions with-output-to-string srfi-23 6 | ; 7 | ; $Id: vSXML-to-HTML.scm,v 1.3 2004/07/07 16:02:31 sperber Exp $ 8 | 9 | ; equal-strs? LIST-OF-PRINTABLES STRING 10 | ; Check to make sure that the result of writing out LIST-OF-PRINTABLES 11 | ; is the same as STRING 12 | ; LIST-OF-PRINTABLES can include strings, characters and numbers 13 | 14 | #lang racket/base 15 | 16 | (module+ test 17 | (require racket/port 18 | rackunit 19 | rackunit/text-ui 20 | "../ssax/SXML-tree-trans.rkt" 21 | (only-in "../sxml-tools.rkt" 22 | [sxml:string->html string->goodHTML]) 23 | (only-in "../serializer.rkt" 24 | [srl:sxml->html sxml->html])) 25 | 26 | ;; STATUS: tests mostly fail 27 | ;; 2016: tests still mostly fail... but could be fixed, by 28 | ;; someone more confident about what sxml->html should produce 29 | 30 | (define (display-to-str strs) 31 | (with-output-to-string 32 | (lambda () 33 | (for-each display strs)))) 34 | 35 | (run-tests 36 | (test-suite "SXML-to-HTML" 37 | (test-case "test 1" 38 | (letrec ((gen (lambda (test-val) 39 | (sxml->html 40 | `(p "par1" "par2" 41 | ,@(if test-val (list "par3" "par4") '())))))) 42 | (check-equal? (gen #t) "

par1par2par3par4

") 43 | (check-equal? (gen #t) "

par1par2par3par4

") 44 | (check-equal? (gen #f) "

par1par2

"))) 45 | (test-case 46 | "test 2" 47 | (check-equal? (sxml->html '(p "&")) "

&

") 48 | (check-equal? (sxml->html '(p (@ (align "center")) "bad chars:" "<>&\"")) 49 | ;; changed to match provided... double-quote char looks 50 | ;; okay to me. 51 | "

bad chars:<>&\"

") 52 | ;; wait... you don't need quoting there, do you? 53 | #;(check-equal? (sxml->html '(p (@ (align "center") (atr "")) 54 | "bad chars:" (em "<>&\""))) 55 | "

bad chars: 56 | <>&"

") 57 | (check-equal? (sxml->html '(p (@ (align "center") (atr "\"text\"")) (br) 58 | (ul (@ (compact)) (li "item " 1)))) 59 | "

60 |
61 |

    62 |
  • item 1
  • 63 |
64 |

") 65 | (check-equal? (sxml->html '(p (@) (br) (ul (@ (compact)) (li "item " 1)))) 66 | "

67 |
68 |

    69 |
  • item 1
  • 70 |
71 |

") 72 | ;; yikes, looks like this has changed dramatically... 73 | #;(check-equal? (sxml->html 74 | '(html:begin "my title" 75 | (body (@ (bgcolor "#ffffff")) (p "par1")))) 76 | "Content-type: text/html 77 | 78 | my title 79 | "" 80 |

par1

")) 81 | ;; this test is doing goofy stuff with numbers inline. Bleh. 82 | #;(test-case "test 3" 83 | (let () 84 | (define (print-slide n max-count) 85 | (sxml->html 86 | `((h2 "Slide number:" ,n) ; Note n is used in its native form 87 | ,@(cond [(positive? n) 88 | `((a (@ (href "base-url&slide=" ,(- n 1))) "prev"))] 89 | [else null]) 90 | ,@(cond [(< (+ n 1) max-count) 91 | `((a (@ (href "base-url&slide=" ,(+ n 1))) "next"))] 92 | [else null]) 93 | (p "the text of the slide")))) 94 | (check-equal? (print-slide 0 1) 95 | (display-to-str 96 | '("

Slide number:0

" 97 | #\newline "

the text of the slide

"))) 98 | (check-equal? (print-slide 0 3) 99 | (display-to-str 100 | '("

Slide number:0

" 101 | #\newline "next" 102 | #\newline "

the text of the slide

"))) 103 | (check-equal? (print-slide 1 3) 104 | (display-to-str 105 | '("

Slide number:1

" 106 | #\newline "prev" 107 | #\newline "next" 108 | #\newline "

the text of the slide

"))) 109 | (check-equal? (print-slide 2 3) 110 | (display-to-str 111 | '(#\newline "

Slide number:2

" 112 | #\newline "prev" 113 | #\newline "

the text of the slide

"))))) 114 | (test-case "test 4" 115 | (void 116 | (sxml->html 117 | `(ul 118 | ,@(map (lambda (filename-title) 119 | `(li (a (@ (href ,(car filename-title)))) 120 | ,(cdr filename-title))) 121 | '(("slides/slide0001.gif" . "Introduction") 122 | ("slides/slide0010.gif" . "Summary"))))))) 123 | 124 | ;; nope, this test cases doesn't look right to me either. 125 | #;(test-case "preorder and macro rules" 126 | (let () 127 | (define enattr list) ;; ?? 128 | (define entag cons) ;; ?? 129 | (define (custom-sxml->html tree) 130 | (with-output-to-string 131 | (lambda () 132 | (SRV:send-reply 133 | (pre-post-order tree 134 | ;; Universal transformation rules. Work for every HTML, 135 | ;; present and future 136 | `((@ 137 | ((*default* ; local override for attributes 138 | . ,(lambda (attr-key . value) (enattr attr-key value)))) 139 | . ,(lambda (trigger . value) (cons '@ value))) 140 | (*default* . ,(lambda (tag . elems) (entag tag elems))) 141 | (*text* . ,(lambda (trigger str) 142 | (if (string? str) (string->goodHTML str) str))) 143 | (link 144 | *macro* 145 | . ,(lambda (tag url body) 146 | `(a (@ (href ,url)) ,body))) 147 | (vspace ; (vspace flag) 148 | *preorder* ; where flag is a symbol: small, large 149 | . ,(lambda (tag flag) 150 | (case flag 151 | ((large) (list "


 

")) 152 | ((small) (list "
 
")) 153 | (else (error "wrong flag:" flag)))))) 154 | ))))) 155 | 6(check-equal? (custom-sxml->html '(p "text" (link "url" "") "text1")) 156 | (display-to-str 157 | '("

text" 158 | #\newline "<body>text1

"))) 159 | (check-equal? (custom-sxml->html '(p "text" (vspace small) "text1")) 160 | (display-to-str 161 | '( "

text
 
text1

"))) 162 | (check-equal? (custom-sxml->html '(p "text" (vspace large) "text1")) 163 | (display-to-str 164 | '("

text


 

text1

"))))) 165 | ))) 166 | -------------------------------------------------------------------------------- /sxml/tests/vSXML-tree-trans.rkt: -------------------------------------------------------------------------------- 1 | ; Validation code for SXML-tree-trans.scm 2 | ; 3 | ; IMPORT 4 | ; SXML-tree-trans.scm and all of its imports 5 | ; Pretty-printer of trees (named pp) 6 | ; ,open sxml-tree-trans ppretty-prints coutputs assertions 7 | ; 8 | ; $Id: vSXML-tree-trans.scm,v 1.2 2004/07/07 16:02:31 sperber Exp $ 9 | 10 | #lang racket/base 11 | 12 | (module+ test 13 | (require racket/port 14 | rackunit 15 | rackunit/text-ui 16 | "../ssax/SXML-tree-trans.rkt") 17 | 18 | ;; STATUS: all tests pass 19 | 20 | (run-tests 21 | (test-suite "SXML-tree-trans" 22 | (let* ((tree 23 | '(root (n1 (n11) "s12" (n13)) 24 | "s2" 25 | (n2 (n21) "s22") 26 | (n3 27 | (n31 (n311)) 28 | "s32" 29 | (n33 (n331) "s332" (n333)) 30 | "s34"))) 31 | (test 32 | (lambda (pred-begin pred-end expected) 33 | (let ((computed 34 | (car (replace-range pred-begin pred-end (list tree))))) 35 | (check-equal? computed expected))))) 36 | (test-case "Remove one node, s2" 37 | (test 38 | (lambda (node) 39 | (and (equal? node "s2") '())) 40 | (lambda (node) (list node)) 41 | '(root (n1 (n11) "s12" (n13)) 42 | (n2 (n21) "s22") 43 | (n3 (n31 (n311)) "s32" (n33 (n331) "s332" (n333)) "s34")))) 44 | 45 | (test-case "Replace one node, s2 with s2-new" 46 | (test 47 | (lambda (node) 48 | (and (equal? node "s2") '("s2-new"))) 49 | (lambda (node) (list node)) 50 | '(root (n1 (n11) "s12" (n13)) 51 | "s2-new" 52 | (n2 (n21) "s22") 53 | (n3 (n31 (n311)) "s32" (n33 (n331) "s332" (n333)) "s34")))) 54 | 55 | (test-case "Replace one node, s2 with s2-new and its brother (n-new s)" 56 | (test 57 | (lambda (node) 58 | (and (equal? node "s2") '("s2-new" (n-new "s")))) 59 | (lambda (node) (list node)) 60 | '(root (n1 (n11) "s12" (n13)) 61 | "s2-new" (n-new "s") 62 | (n2 (n21) "s22") 63 | (n3 (n31 (n311)) "s32" (n33 (n331) "s332" (n333)) "s34")))) 64 | 65 | (test-case "Remove everything from s2 onward" 66 | (test 67 | (lambda (node) 68 | (and (equal? node "s2") '())) 69 | (lambda (node) #f) 70 | '(root (n1 (n11) "s12" (n13))))) 71 | 72 | (test-case "Remove everything from n1 onward" 73 | (test 74 | (lambda (node) 75 | (and (pair? node) (eq? 'n1 (car node)) '())) 76 | (lambda (node) #f) 77 | '(root))) 78 | 79 | (test-case "Replace from n1 through n33" 80 | (test 81 | (lambda (node) 82 | (and (pair? node) 83 | (eq? 'n1 (car node)) 84 | (list node '(n1* "s12*")))) 85 | (lambda (node) 86 | (and (pair? node) 87 | (eq? 'n33 (car node)) 88 | (list node))) 89 | '(root 90 | (n1 (n11) "s12" (n13)) 91 | (n1* "s12*") 92 | (n3 93 | (n33 (n331) "s332" (n333)) 94 | "s34")))) 95 | )))) 96 | -------------------------------------------------------------------------------- /sxml/xpath-ast.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require srfi/2 3 | "ssax/errors-and-warnings.rkt" 4 | "xpath-parser.rkt") 5 | (provide (all-defined-out)) 6 | 7 | ;; XPath/XPointer -> Abstract Syntax Tree parser 8 | ; 9 | ; This software is in Public Domain. 10 | ; IT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. 11 | ; 12 | ; Please send bug reports and comments to: 13 | ; lisovsky@acm.org Kirill Lisovsky 14 | ; lizorkin@hotbox.ru Dmitry Lizorkin 15 | 16 | 17 | ;========================================================================== 18 | ; W3C textual XPath/XPointer -> AST 19 | 20 | ; Writing operations as an S-expression in an infix notation 21 | (define (txp:ast-operation-helper expr-lst op-lst add-on) 22 | (let ((rev-expr-lst (reverse expr-lst))) 23 | (let loop ((exprs (cdr rev-expr-lst)) 24 | (ops (reverse op-lst)) 25 | (res (car rev-expr-lst))) 26 | (if (null? ops) 27 | res 28 | (loop (cdr exprs) (cdr ops) 29 | (list (car ops) (car exprs) res)))))) 30 | 31 | ;------------------------------------------------- 32 | ; Parameters for TXPath -> AST implementation 33 | 34 | (define txp:ast-params 35 | `( 36 | ; Axes 37 | (axis 38 | ((ancestor 39 | ,(lambda (add-on) 'ancestor)) 40 | (ancestor-or-self 41 | ,(lambda (add-on) 'ancestor-or-self)) 42 | (attribute 43 | ,(lambda (add-on) 'attribute)) 44 | (child 45 | ,(lambda (add-on) 'child)) 46 | (descendant 47 | ,(lambda (add-on) 'descendant)) 48 | (descendant-or-self 49 | ,(lambda (add-on) 'descendant-or-self)) 50 | (following 51 | ,(lambda (add-on) 'following)) 52 | (following-sibling 53 | ,(lambda (add-on) 'following-sibling)) 54 | (namespace 55 | ,(lambda (add-on) 'namespace)) 56 | (parent 57 | ,(lambda (add-on) 'parent)) 58 | (preceding 59 | ,(lambda (add-on) 'preceding)) 60 | (preceding-sibling 61 | ,(lambda (add-on) 'preceding-sibling)) 62 | (self 63 | ,(lambda (add-on) 'self)) 64 | ; Addition by XLink 65 | (arc 66 | ,(lambda (add-on) 'arc)) 67 | (traverse 68 | ,(lambda (add-on) 'traverse)) 69 | (traverse-arc 70 | ,(lambda (add-on) 'traverse-arc)))) 71 | 72 | ; Node test 73 | (node-test 74 | ((star 75 | ,(lambda (add-on) '((*)))) 76 | (uri+star 77 | ,(lambda (uri add-on) 78 | `((namespace-uri ,uri)))) 79 | (qname 80 | ,(lambda (uri local-name add-on) 81 | (if (not uri) 82 | `((local-name ,local-name)) 83 | `((namespace-uri ,uri) (local-name ,local-name))))) 84 | (comment 85 | ,(lambda (add-on) '((comment)))) 86 | (text 87 | ,(lambda (add-on) '((text)))) 88 | (processing-instruction 89 | ,(lambda (literal-string add-on) 90 | (if (not literal-string) ; no literal provided 91 | '((pi)) 92 | `((pi ,literal-string))))) 93 | (node 94 | ,(lambda (add-on) '((node)))) 95 | (point 96 | ,(lambda (add-on) '((point)))) 97 | (range 98 | ,(lambda (add-on) '((range)))))) 99 | 100 | ; Location step 101 | (step 102 | ((common 103 | ,(lambda (axis-res node-test-res predicate-res-lst add-on) 104 | `(step 105 | (axis-specifier (,axis-res)) 106 | (node-test ,@node-test-res) 107 | ,@predicate-res-lst))) 108 | (range-to 109 | ,(lambda (expr-res predicate-res-lst add-on) 110 | `(range-to 111 | (expr ,expr-res) 112 | ,@predicate-res-lst))))) 113 | 114 | ; Relative location path 115 | (relative-lpath 116 | ,(lambda (step-res-lst add-on) 117 | (cons 'relative-location-path step-res-lst))) 118 | 119 | ; Location path 120 | (location-path 121 | ((bare-slash 122 | ,(lambda (add-on) '(absolute-location-path))) 123 | (slash 124 | ,(lambda (relative-lpath-res add-on) 125 | (cons 'absolute-location-path (cdr relative-lpath-res)))) 126 | (double-slash 127 | ,(lambda (relative-lpath-res add-on) 128 | `(absolute-location-path 129 | (step 130 | (axis-specifier (descendant-or-self)) 131 | (node-test (node))) 132 | ,@(cdr relative-lpath-res)))))) 133 | 134 | ; Predicate 135 | (predicate 136 | ,(lambda (expr-res add-on) 137 | (list 'predicate expr-res))) 138 | 139 | ; Variable reference 140 | (variable-ref 141 | ,(lambda (var-name-string add-on) 142 | `(variable-reference ,var-name-string))) 143 | 144 | ; Function call 145 | (function-call 146 | ,(lambda (fun-name-string arg-res-lst add-on) 147 | `(function-call 148 | (function-name ,fun-name-string) 149 | ,@(map 150 | (lambda (arg-res) `(argument ,arg-res)) 151 | arg-res-lst)))) 152 | 153 | ; Primary expression 154 | (primary-expr 155 | ((literal 156 | ,(lambda (literal add-on) 157 | `(literal ,literal))) 158 | (number 159 | ,(lambda (number add-on) 160 | `(number ,number))))) 161 | 162 | ; Filter expression 163 | (filter-expr 164 | ,(lambda (primary-expr-res predicate-res-lst add-on) 165 | `(filter-expr 166 | (primary-expr ,primary-expr-res) 167 | ,@predicate-res-lst))) 168 | 169 | ; Path expression 170 | (path-expr 171 | ((slash 172 | ,(lambda (filter-expr-res relative-lpath-res add-on) 173 | `(path-expr 174 | ,(if (eq? (car filter-expr-res) 'filter-expr) 175 | filter-expr-res 176 | `(filter-expr (primary-expr ,filter-expr-res))) 177 | ,@(cdr relative-lpath-res)))) 178 | (double-slash 179 | ,(lambda (filter-expr-res relative-lpath-res add-on) 180 | `(path-expr 181 | ,(if (eq? (car filter-expr-res) 'filter-expr) 182 | filter-expr-res 183 | `(filter-expr (primary-expr ,filter-expr-res))) 184 | (step 185 | (axis-specifier (descendant-or-self)) 186 | (node-test (node))) 187 | ,@(cdr relative-lpath-res)))))) 188 | 189 | ; Union expression 190 | (union-expr 191 | ,(lambda (path-expr-res-lst add-on) 192 | (cons 'union-expr path-expr-res-lst))) 193 | 194 | ; Unary expression 195 | (unary-expr 196 | ,(lambda (union-expr-res num-minuses add-on) 197 | (let loop ((n num-minuses) 198 | (res union-expr-res)) 199 | (if (= n 0) res 200 | (loop (- n 1) (list '- res)))))) 201 | 202 | ; Different operations 203 | (operations 204 | ((* ,(lambda (add-on) '*)) 205 | (div ,(lambda (add-on) 'div)) 206 | (mod ,(lambda (add-on) 'mod)) 207 | 208 | (+ ,(lambda (add-on) '+)) 209 | (- ,(lambda (add-on) '-)) 210 | (< ,(lambda (add-on) '<)) 211 | (> ,(lambda (add-on) '>)) 212 | (<= ,(lambda (add-on) '<=)) 213 | (>= ,(lambda (add-on) '>=)) 214 | (= ,(lambda (add-on) '=)) 215 | (!= ,(lambda (add-on) '!=)))) 216 | 217 | ; Additive and multiplicative expressions 218 | (mul-expr ,txp:ast-operation-helper) 219 | (add-expr ,txp:ast-operation-helper) 220 | 221 | ; Relational expression 222 | (relational-expr ,txp:ast-operation-helper) 223 | 224 | ; Equality expression 225 | (equality-expr ,txp:ast-operation-helper) 226 | 227 | ; And-expression 228 | (and-expr 229 | ,(lambda (equality-expr-res-lst add-on) 230 | (cons 'and equality-expr-res-lst))) 231 | 232 | ; Or-expression 233 | (or-expr 234 | ,(lambda (and-expr-res-lst add-on) 235 | (cons 'or and-expr-res-lst))) 236 | 237 | ; Full XPointer 238 | (full-xptr 239 | ,(lambda (expr-res-lst add-on) 240 | (cons 'full-xptr expr-res-lst))) 241 | 242 | ; XPointer child sequence 243 | (child-seq 244 | ((with-name 245 | ,(lambda (name-string number-lst add-on) 246 | `(child-seq 247 | (name ,name-string) 248 | ,@(map 249 | (lambda (num) (list 'number num)) 250 | number-lst)))) 251 | (without-name 252 | ,(lambda (number-lst add-on) 253 | (cons 'child-seq 254 | (map 255 | (lambda (num) (list 'number num)) 256 | number-lst)))))) 257 | )) 258 | 259 | (define txp:ast-res (txp:parameterize-parser txp:ast-params)) 260 | 261 | ;------------------------------------------------- 262 | ; Highest level API functions 263 | ; 264 | ; xpath-string - an XPath location path (a string) 265 | ; ns-binding - declared namespace prefixes (an optional argument) 266 | ; ns-binding = (list (prefix . uri) 267 | ; (prefix . uri) 268 | ; ...) 269 | ; prefix - a symbol 270 | ; uri - a string 271 | ; 272 | ; The returned result: abstract-syntax-tree or #f 273 | ; abstract-syntax-tree - an S-expression 274 | ; #f - signals of a parse error (an error message is printed as a side effect 275 | ; during parsing) 276 | 277 | (define (txp:ast-api-helper parse-proc) 278 | (lambda (xpath-string . ns-binding) 279 | (let ((res (parse-proc 280 | xpath-string 281 | (if (null? ns-binding) ns-binding (car ns-binding)) 282 | '()))) 283 | (if (txp:error? res) ; error detected 284 | #f res)))) 285 | 286 | (define txp:xpath->ast 287 | (txp:ast-api-helper (cadr (assq 'xpath txp:ast-res)))) 288 | (define txp:xpointer->ast 289 | (txp:ast-api-helper (cadr (assq 'xpointer txp:ast-res)))) 290 | (define txp:expr->ast 291 | (txp:ast-api-helper (cadr (assq 'expr txp:ast-res)))) 292 | 293 | 294 | ;========================================================================== 295 | ; SXPath native syntax -> AST 296 | ; Additional features added to AST by native SXPath 297 | ; Operator += below denotes additional alternatives to AST grammar rules 298 | ; {7} += (node-test (equal? )) 299 | ; | (node-test (eq? )) 300 | ; | (node-test (names + )) 301 | ; | (node-test (not-names + )) 302 | ; {4} += (lambda-step ) 303 | ; | 304 | 305 | (define (txp:sxpath->ast path . ns-binding) 306 | (let ((ns-binding (if (null? ns-binding) ns-binding (car ns-binding)))) 307 | (if 308 | (string? path) ; Just a textual XPath 309 | (txp:expr->ast path ns-binding) 310 | (let loop ((ast-steps '()) 311 | (path path)) 312 | (cond 313 | ((null? path) ; parsing is finished 314 | (if (null? ast-steps) ; empty path 315 | '(absolute-location-path) 316 | (let ((forward-steps (reverse ast-steps))) 317 | (cons 318 | (if (eq? (caar forward-steps) 'filter-expr) 319 | 'path-expr 'relative-location-path) 320 | forward-steps)))) 321 | ((procedure? (car path)) 322 | (loop (cons (list 'lambda-step (car path)) 323 | ast-steps) 324 | (cdr path))) 325 | ((assq (car path) '((// . descendant-or-self) (.. . parent))) 326 | => (lambda (pair) 327 | (loop (cons 328 | `(step (axis-specifier (,(cdr pair))) 329 | (node-test (node))) 330 | ast-steps) 331 | (cdr path)))) 332 | ((symbol? (car path)) 333 | (loop (cons 334 | `(step (axis-specifier (child)) 335 | (node-test 336 | ,(cond 337 | ((assq (car path) '((* . (*)) (*text* . (text)))) 338 | => cdr) 339 | (else 340 | `(local-name ,(symbol->string (car path))))))) 341 | ast-steps) 342 | (cdr path))) 343 | ((string? (car path)) 344 | (and-let* ; only for the location path for the moment 345 | ((txt-ast (txp:expr->ast (car path) ns-binding))) 346 | (loop (if (eq? (car txt-ast) 'relative-location-path) 347 | (append (reverse (cdr txt-ast)) ast-steps) 348 | (cons 349 | `(filter-expr (primary-expr ,txt-ast)) 350 | ast-steps)) 351 | (cdr path)))) 352 | ((and (pair? (car path)) (not (null? (car path)))) 353 | (cond 354 | ((assq (caar path) '((*or* . names) (*not* . not-names))) 355 | => (lambda (pair) 356 | (loop 357 | (cons 358 | `(step (axis-specifier (child)) 359 | (node-test 360 | ,(cons (cdr pair) 361 | (map symbol->string (cdar path))))) 362 | ast-steps) 363 | (cdr path)))) 364 | ((assq (caar path) '((equal? . equal?) (eq? . eq?) 365 | (ns-id:* . namespace-uri))) 366 | => (lambda (pair) 367 | (loop 368 | (cons `(step (axis-specifier (child)) 369 | (node-test ,(list (cdr pair) (cadar path)))) 370 | ast-steps) 371 | (cdr path)))) 372 | (else 373 | (let reducer ((reducing-path (cdar path)) 374 | (filters '())) 375 | (cond 376 | ((null? reducing-path) 377 | (if 378 | (symbol? (caar path)) ; just a child axis 379 | (loop 380 | (cons 381 | `(step 382 | (axis-specifier (child)) 383 | (node-test (local-name ,(symbol->string (caar path)))) 384 | ,@(reverse filters)) 385 | ast-steps) 386 | (cdr path)) 387 | (and-let* 388 | ((select (txp:sxpath->ast (caar path) ns-binding))) 389 | (loop 390 | (cons `(filter-expr 391 | (primary-expr ,select) 392 | ,@(reverse filters)) 393 | ast-steps) 394 | (cdr path))))) 395 | ((number? (car reducing-path)) 396 | (reducer 397 | (cdr reducing-path) 398 | (cons 399 | `(predicate 400 | ,(if 401 | (negative? (car reducing-path)) ; from end of nodeset 402 | `(- (function-call (function-name "last")) 403 | (number ,(- -1 (car reducing-path)))) 404 | `(number ,(car reducing-path)))) 405 | filters))) 406 | (else 407 | (and-let* 408 | ((pred-ast 409 | (txp:sxpath->ast (car reducing-path) ns-binding))) 410 | (reducer 411 | (cdr reducing-path) 412 | (cons `(predicate ,pred-ast) filters))))))))) 413 | (else 414 | (sxml:warn 'tsp:sxpath->ast "invalid path step: ~e" (car path)) 415 | #f)))))) 416 | 417 | 418 | ;========================================================================== 419 | ; Several popular accessors and constructors for AST steps 420 | 421 | ; Whether a representation for location step 422 | (define (txp:step? op) 423 | (and (pair? op) (eq? (car op) 'step))) 424 | 425 | ; Returns the axis specifier of the location step 426 | ; Argument: the AST representation of a location step 427 | ; Result: either '(child) and the like, or #f if the AST contains syntactic 428 | ; error 429 | (define (txp:step-axis op) 430 | (and (txp:step? op) 431 | (not (null? (cdr op))) 432 | (pair? (cadr op)) (eq? (caadr op) 'axis-specifier) 433 | (cadadr op))) 434 | 435 | ; Returns the node test of the location step 436 | ; Argument: the AST representation of a location step 437 | ; Result: either '(*) and the like, or #f if the AST contains syntactic 438 | ; error 439 | (define (txp:step-node-test op) 440 | (and (txp:step? op) 441 | (not (null? (cdr op))) (not (null? (cddr op))) 442 | (pair? (caddr op)) (eq? (caaddr op) 'node-test) 443 | (cadr (caddr op)))) 444 | 445 | ; Returns predicate expressions of the location step 446 | ; Argument: the AST representation of a location step 447 | ; Result: either (listof ast-expr) 448 | ; or #f if syntactic error detected in a location step AST 449 | (define (txp:step-preds op) 450 | (and (txp:step? op) 451 | (not (null? (cdr op))) (not (null? (cddr op))) 452 | (null? (filter 453 | (lambda (sub) ; not a predicate representation 454 | (not (and (pair? sub) (eq? (car sub) 'predicate)))) 455 | (cdddr op))) 456 | (map cadr (cdddr op)))) 457 | 458 | ; Constructs the AST representation for a given axis, node-test and 459 | ; a list of predicate expressions 460 | ; axis ::= '(child) and the like 461 | ; node-test ::= '(*) and the like 462 | ; pred-expr-list ::= (listof ast-expr) 463 | (define (txp:construct-step axis node-test . pred-expr-list) 464 | `(step (axis-specifier ,axis) 465 | (node-test ,node-test) 466 | ,@(map 467 | (lambda (pred-expr) `(predicate ,pred-expr)) 468 | pred-expr-list))) 469 | --------------------------------------------------------------------------------