├── .travis.yml ├── deps.edn ├── .gitignore ├── .github └── workflows │ └── clojure.yml ├── project.clj ├── CHANGELOG.md ├── test └── riveted │ └── core_test.clj ├── README.md └── src └── riveted └── core.clj /.travis.yml: -------------------------------------------------------------------------------- 1 | language: clojure 2 | script: lein all midje 3 | -------------------------------------------------------------------------------- /deps.edn: -------------------------------------------------------------------------------- 1 | {:deps 2 | {com.ximpleware/vtd-xml {:mvn/version "2.13.4"}}} 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /lib 3 | /classes 4 | /checkouts 5 | pom.xml 6 | pom.xml.asc 7 | *.jar 8 | *.class 9 | .lein-deps-sum 10 | .lein-failures 11 | .lein-plugins 12 | .lein-repl-history 13 | -------------------------------------------------------------------------------- /.github/workflows/clojure.yml: -------------------------------------------------------------------------------- 1 | name: Clojure CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions/setup-java@v2 17 | with: 18 | distribution: 'temurin' 19 | java-version: '8' 20 | - name: Install dependencies 21 | run: lein deps 22 | - name: Run tests 23 | run: lein all midje 24 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject riveted "0.2.0" 2 | :description "A Clojure library for the fast processing of XML with VTD-XML." 
3 | :url "https://github.com/mudge/riveted" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[com.ximpleware/vtd-xml "2.13.4"]] 7 | :profiles {:dev {:dependencies [[midje "1.10.5"]] 8 | :plugins [[lein-midje "3.2.2"] 9 | [lein-codox "0.10.8"]]} 10 | :1.3 {:dependencies [[midje "1.6.3"] [org.clojure/clojure "1.3.0"]]} 11 | :1.4 {:dependencies [[midje "1.6.3"] [org.clojure/clojure "1.4.0"]]} 12 | :1.5.1 {:dependencies [[midje "1.6.3"] [org.clojure/clojure "1.5.1"]]} 13 | :1.6 {:dependencies [[midje "1.6.3"] [org.clojure/clojure "1.6.0"]]} 14 | :1.7 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.7.0"]]} 15 | :1.8 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.8.0"]]} 16 | :1.9 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.9.0"]]} 17 | :1.10.0 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.10.0"]]} 18 | :1.10.1 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.10.1"]]} 19 | :1.10.2 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.10.2"]]} 20 | :1.10.3 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.10.3"]]}} 21 | :aliases {"all" ["with-profile" "dev,1.3:dev,1.4:dev,1.5.1:dev,1.6:dev,1.7:dev,1.8:dev,1.9:dev,1.10.0:dev,1.10.1:dev,1.10.2:dev,1.10.3"]} 22 | :codox {:src-dir-uri "https://github.com/mudge/riveted/blob/main" 23 | :src-linenum-anchor-prefix "L"}) 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. This 3 | project adheres to [Semantic Versioning](http://semver.org/). 
4 | 5 | ## [0.2.0] - 2022-02-15 6 | ### Changed 7 | - Added support for passing a byte array directly to `navigator` rather than only 8 | accepting UTF-8 strings (thanks to Eugen Stan for the suggestion) 9 | - Passing an invalid value to `navigator` (e.g. `nil`) will now throw an 10 | `IllegalArgumentException` rather than a `NullPointerException` 11 | 12 | ## [0.1.2] - 2022-02-11 13 | ### Changed 14 | - Upgrade underlying VTD-XML dependency to 2.13.4 (thanks to Eugen Stan) 15 | 16 | ## [0.1.1] - 2018-01-29 17 | ### Fixed 18 | - Fixed exception when calling fragment on a self-closing tag 19 | 20 | ## [0.1.0] - 2017-06-27 21 | ### Fixed 22 | - Explicitly set the character set to UTF-8 when reading XML 23 | 24 | ### Changed 25 | - Upgrade underlying VTD-XML dependency to 2.13 26 | 27 | ## [0.0.9] - 2013-09-05 28 | ### Added 29 | - Add ability to return the text of an attribute using `text` 30 | - Add ability to check if the navigator is pointed at an attribute with `attribute?` 31 | 32 | ## [0.0.8] - 2013-05-02 33 | ### Added 34 | - Gracefully handle `nil` across the library to make threading easier 35 | 36 | ## [0.0.7] - 2013-03-29 37 | ### Added 38 | - Enrich navigators so that they are now sequential, seqable and countable data structures 39 | 40 | ## [0.0.6] - 2013-03-29 41 | ### Added 42 | - Added `select` for selecting elements by name or wildcard 43 | 44 | ## [0.0.5] - 2013-03-28 45 | ### Added 46 | - Added transient interface via `root!`, `parent!`, `next-sibling!`, `previous-sibling!`, `first-child!` and `last-child!` 47 | 48 | ## [0.0.4] - 2013-03-25 49 | ### Added 50 | - Added `attr?` for testing the existence of attributes 51 | - Added ability to pass optional element names to `first-child`, `last-child`, 52 | `next-sibling`, `previous-sibling`, `siblings` and `children` 53 | 54 | ## [0.0.3] - 2013-03-24 55 | ### Added 56 | - Added support for XML namespaces when searching 57 | 58 | ### Changed 59 | - Replaced public `token-type` with `element?` and 
`document?` functions 60 | 61 | ## [0.0.2] - 2013-03-24 62 | ### Added 63 | - Add ability to fetch previous sibling 64 | - Add ability to fetch token type for an element 65 | 66 | ### Fixed 67 | - Fix fetching all siblings for a navigator, both previous and next 68 | 69 | ## [0.0.1] - 2013-03-24 70 | ### Added 71 | - First stable version of riveted 72 | 73 | [0.2.0]: https://github.com/mudge/riveted/releases/tag/v0.2.0 74 | [0.1.2]: https://github.com/mudge/riveted/releases/tag/v0.1.2 75 | [0.1.1]: https://github.com/mudge/riveted/releases/tag/v0.1.1 76 | [0.1.0]: https://github.com/mudge/riveted/releases/tag/v0.1.0 77 | [0.0.9]: https://github.com/mudge/riveted/releases/tag/v0.0.9 78 | [0.0.8]: https://github.com/mudge/riveted/releases/tag/v0.0.8 79 | [0.0.7]: https://github.com/mudge/riveted/releases/tag/v0.0.7 80 | [0.0.6]: https://github.com/mudge/riveted/releases/tag/v0.0.6 81 | [0.0.5]: https://github.com/mudge/riveted/releases/tag/v0.0.5 82 | [0.0.4]: https://github.com/mudge/riveted/releases/tag/v0.0.4 83 | [0.0.3]: https://github.com/mudge/riveted/releases/tag/v0.0.3 84 | [0.0.2]: https://github.com/mudge/riveted/releases/tag/v0.0.2 85 | [0.0.1]: https://github.com/mudge/riveted/releases/tag/v0.0.1 86 | -------------------------------------------------------------------------------- /test/riveted/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns riveted.core-test 2 | (:use midje.sweet riveted.core) 3 | (:import com.ximpleware.VTDNav)) 4 | 5 | ;;; Test data. 6 | 7 | (def xml "FooFoo woo mooBar") 8 | (def ns-xml "Bob") 9 | 10 | (def nav (navigator xml false)) 11 | (def ns-nav (navigator ns-xml true)) 12 | 13 | ;;; Custom checkers to simplify testing. 14 | 15 | (defn nav? [actual] (instance? riveted.core.Navigator actual)) 16 | (defn tag? [tag-name] (fn [actual] (= tag-name (tag actual)))) 17 | (defn tags? [& tag-names] (fn [actual] (= tag-names (map tag actual)))) 18 | (def root? (tag? 
"root")) 19 | 20 | (fact "navigator returns a VTD navigator for a byte array." 21 | (navigator (.getBytes "" "UTF-8")) => nav? 22 | (navigator (.getBytes "" "ISO-8859-1")) => nav?) 23 | 24 | (fact "navigator returns a VTD navigator for a UTF-8 string." 25 | (navigator "") => nav?) 26 | 27 | (fact "navigator raises an IllegalArgumentException nil if given nil." 28 | (navigator nil) => (throws IllegalArgumentException)) 29 | 30 | (fact "search returns a sequence of matching navigators for a given XPath." 31 | (search nav "/root/basic-title") => (one-of nav?) 32 | (search nav "//i") => (two-of nav?) 33 | (search nav "/missing") => empty? 34 | (search ns-nav "/root/foo:name" "foo" "http://purl.org/dc/elements/1.1/") => (one-of nav?) 35 | (search nil "/foo") => empty?) 36 | 37 | (fact "at returns the first matching navigator for a given XPath." 38 | (at nav "/root/basic-title") => nav? 39 | (at nav "/missing") => nil? 40 | (at ns-nav "/root/foo:name" "foo" "http://purl.org/dc/elements/1.1/") => nav? 41 | (at nil "/foo") => nil?) 42 | 43 | (fact "select returns navigators for all matching elements." 44 | (select nav "*") => (tags? "root" "basic-title" "complex-title" "i" "b" "i" "foo") 45 | (select nav "i") => (two-of (tag? "i")) 46 | (select nav :i) => (two-of (tag? "i")) 47 | (select (at nav "/root/complex-title") "*") => (tags? "complex-title" "i" "b") 48 | (select nav "missing") => empty? 49 | (select nil "foo") => empty?) 50 | 51 | (fact "text returns the text nodes descending from a navigator." 52 | (text (at nav "/root/basic-title")) => "Foo" 53 | (text (at nav "/root/complex-title")) => "Foo woo moo" 54 | (text (at nav "/root/foo")) => nil? 55 | (text nil) => nil?) 56 | 57 | (fact "fragment returns the content of the given navigator as an XML fragment." 58 | (fragment (at nav "/root/complex-title")) => "Foo woo moo" 59 | (fragment (at nav "/root/basic-title")) => "Foo" 60 | (fragment (at nav "/root/foo")) => "" 61 | (fragment nil) => nil?) 
62 | 63 | (fact "attr returns the value of the given attribute" 64 | (attr (at nav "/root/complex-title") :id) => "42" 65 | (attr (at nav "/root/complex-title") "id") => "42" 66 | (attr (at nav "/root/complex-title") :missing) => nil? 67 | (attr nil :foo) => nil?) 68 | 69 | (fact "tag returns the current element name of the given navigator." 70 | (tag (root nav)) => "root" 71 | (tag (at nav "/root/complex-title")) => "complex-title" 72 | (tag nil) => nil?) 73 | 74 | (fact "document? returns true if the navigator is set to the document." 75 | (parent (root nav)) => document? 76 | (root nav) =not=> document? 77 | (document? nil) => false) 78 | 79 | (fact "element? returns true if the navigator is set to an element." 80 | (root nav) => element? 81 | (parent (root nav)) =not=> element? 82 | (element? nil) => false) 83 | 84 | (fact "parent returns a navigator for the parent of the current navigator." 85 | (parent (at nav "/root/basic-title")) => nav? 86 | (parent nil) => nil?) 87 | 88 | (fact "parent returns the document as the parent of the root." 89 | (parent (root nav)) => document?) 90 | 91 | (fact "parent returns nil for the parent of the document." 92 | (parent (parent (root nav))) => nil?) 93 | 94 | (fact "root returns a navigator for the root element." 95 | (root (at nav "/root/complex-title/i")) => nav? 96 | (root (at nav "/root/complex-title/i")) => root?) 97 | 98 | (fact "first-child returns a navigator for the first child element." 99 | (first-child (at nav "/root/complex-title")) => nav? 100 | (first-child (at nav "/root/complex-title")) => (tag? "i") 101 | (first-child (at nav "/root/foo")) => nil?) 102 | 103 | (fact "first-child takes an optional element name." 104 | (first-child (root nav) :complex-title) => (tag? "complex-title") 105 | (first-child (root nav) "complex-title") => (tag? "complex-title") 106 | (first-child (root nav) :missing) => nil?) 107 | 108 | (fact "last-child returns a navigator for the last child element." 
109 | (last-child (at nav "/root/complex-title")) => (tag? "b") 110 | (last-child (at nav "/root/foo")) => nil?) 111 | 112 | (fact "last-child takes an optional element name." 113 | (last-child (root nav) :complex-title) => (tag? "complex-title") 114 | (last-child (root nav) "complex-title") => (tag? "complex-title") 115 | (last-child (root nav) :missing) => nil?) 116 | 117 | (fact "next-sibling returns a navigator for the next sibling element." 118 | (next-sibling (at nav "/root/basic-title")) => (tag? "complex-title") 119 | (next-sibling (at nav "/root/complex-title")) => (tag? "i") 120 | (next-sibling (at nav "/root/foo")) => nil?) 121 | 122 | (fact "next-sibling takes an optional element name." 123 | (next-sibling (at nav "/root/basic-title") :i) => (tag? "i") 124 | (next-sibling (at nav "/root/basic-title") "i") => (tag? "i") 125 | (next-sibling (at nav "/root/basic-title") :missing) => nil?) 126 | 127 | (fact "previous-sibling returns a navigator for the previous sibling element." 128 | (previous-sibling (at nav "/root/complex-title")) => (tag? "basic-title") 129 | (previous-sibling (at nav "/root/foo")) => (tag? "i") 130 | (previous-sibling (at nav "/root/basic-title")) => nil?) 131 | 132 | (fact "previous-sibling takes an optional element name." 133 | (previous-sibling (at nav "/root/i") :basic-title) => (tag? "basic-title") 134 | (previous-sibling (at nav "/root/i") "basic-title") => (tag? "basic-title") 135 | (previous-sibling (at nav "/root/i") :missing) => nil?) 136 | 137 | (fact "next-siblings returns navigators for all next sibling elements." 138 | (next-siblings (at nav "/root/basic-title")) => (tags? "complex-title" "i" 139 | "foo") 140 | (next-siblings (at nav "/root/foo")) => empty?) 141 | 142 | (fact "next-siblings takes an optional element name." 143 | (next-siblings (at nav "/root/basic-title") :i) => (tags? "i") 144 | (next-siblings (at nav "/root/basic-title") "i") => (tags? 
"i") 145 | (next-siblings (at nav "/root/basic-title") :missing) => empty?) 146 | 147 | (fact "previous-siblings returns navigators for all previous sibling elements." 148 | (previous-siblings (at nav "/root/foo")) => (tags? "i" "complex-title" 149 | "basic-title") 150 | (previous-siblings (at nav "/root/basic-title")) => empty?) 151 | 152 | (fact "previous-siblings takes an optional element name." 153 | (previous-siblings (at nav "/root/foo") :i) => (tags? "i") 154 | (previous-siblings (at nav "/root/foo") "i") => (tags? "i") 155 | (previous-siblings (at nav "/root/foo") :missing) => empty?) 156 | 157 | (fact "siblings returns navigators for all sibling elements." 158 | (siblings (at nav "/root/basic-title")) => (tags? "complex-title" "i" "foo") 159 | (siblings (at nav "/root/complex-title")) => (tags? "basic-title" "i" "foo") 160 | (siblings (at nav "/root/i")) => (tags? "basic-title" "complex-title" "foo") 161 | (siblings (root nav)) => empty?) 162 | 163 | (fact "siblings takes an optional element name." 164 | (siblings (at nav "/root/basic-title") 165 | :complex-title) => (tags? "complex-title") 166 | (siblings (at nav "/root/basic-title") 167 | "complex-title") => (tags? "complex-title") 168 | (siblings (at nav "/root/basic-title") :missing) => empty?) 169 | 170 | (fact "children returns navigators for all children elements." 171 | (children (root nav)) => (tags? "basic-title" "complex-title" "i" "foo") 172 | (children (at nav "/root/foo")) => empty?) 173 | 174 | (fact "children takes an optional element name." 175 | (children (root nav) :complex-title) => (tags? "complex-title") 176 | (children (root nav) "complex-title") => (tags? "complex-title") 177 | (children (root nav) :missing) => empty?) 178 | 179 | (fact "attr? returns whether or not a given attribute exists." 180 | (attr? (at nav "/root/complex-title") :id) => true 181 | (attr? (at nav "/root/complex-title") "id") => true 182 | (attr? 
(at nav "/root/complex-title") :missing) => false) 183 | 184 | ;;; Mutable tests. Note that the order of these facts is critical. 185 | 186 | (def nav! (navigator "Foo42Bar" false)) 187 | 188 | (fact "root! moves the given navigator to the root." 189 | (root! nav!) => (tag? "root")) 190 | 191 | (fact "first-child! moves the given navigator to the first child." 192 | (first-child! nav!) => (tag? "child")) 193 | 194 | (fact "first-child! takes an optional element name." 195 | (first-child! nav! :name) => (tag? "name") 196 | (first-child! nav! :missing) => nil?) 197 | 198 | (fact "parent! moves the given navigator to the parent element." 199 | (parent! nav!) => (tag? "child")) 200 | 201 | (fact "next-sibling! moves to the given navigator to the next sibling element." 202 | (next-sibling! nav!) => (tag? "bro") 203 | (next-sibling! nav!) => nil?) 204 | 205 | (fact "next-sibling! takes an optional element name." 206 | (next-sibling! (previous-sibling! nav!) :bro) => (tag? "bro") 207 | (next-sibling! (previous-sibling! nav!) "bro") => (tag? "bro") 208 | (next-sibling! nav! :missing) => nil?) 209 | 210 | (fact "previous-sibling! moves the given navigator to the previous sibling." 211 | (previous-sibling! nav!) => (tag? "child") 212 | (previous-sibling! nav!) => nil?) 213 | 214 | (fact "previous-sibling! takes an optional element name." 215 | (previous-sibling! (next-sibling! nav!) :child) => (tag? "child") 216 | (previous-sibling! (next-sibling! nav!) "child") => (tag? "child") 217 | (previous-sibling! nav! :missing) => nil?) 218 | 219 | (fact "last-child! moves the given navigator to the last child." 220 | (last-child! nav!) => (tag? "age")) 221 | 222 | (fact "navigators are sequential." 223 | nav => sequential?) 224 | 225 | (fact "navigators are counted." 226 | nav => counted? 227 | (count nav) => 18) 228 | 229 | (fact "navigators expose all internal tokens as a seq." 
230 | (first nav) => {:type :start-tag, :value "root"} 231 | (second nav) => {:type :comment, :value "Hello"} 232 | (nth nav 2) => {:type :start-tag, :value "basic-title"} 233 | (nth nav 3) => {:type :character-data, :value "Foo"} 234 | (nth nav 5) => {:type :attribute-name, :value "id"} 235 | (nth nav 6) => {:type :attribute-value, :value "42"}) 236 | 237 | (fact "navigators not at the root, seq the remaining nodes." 238 | (first (first-child nav :complex-title)) => {:type :start-tag, 239 | :value "complex-title"} 240 | (last (first-child nav :complex-title)) => {:type :start-tag, 241 | :value "foo"}) 242 | 243 | (fact "navigators can safely be threaded even with nils." 244 | (-> nav (first-child :missing) (last-child :missing) text) => nil?) 245 | 246 | (fact "attribute? returns true if the navigator is set to an attribute." 247 | (at nav "/root/complex-title/@id") => attribute? 248 | (at nav "/root/complex-title") =not=> attribute? 249 | (root nav) =not=> attribute?) 250 | 251 | (fact "attribute values can be retrieved with text." 252 | (text (at nav "/root/complex-title/@id")) => "42" 253 | (text (at nav "/root/complex-title/@empty")) => "") 254 | 255 | (fact "attribute names can be retrieved with tag." 256 | (tag (at nav "/root/complex-title/@id")) => "id" 257 | (tag (at nav "/root/complex-title/@empty")) => "empty") 258 | 259 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # riveted [![Clojure CI](https://github.com/mudge/riveted/actions/workflows/clojure.yml/badge.svg)](https://github.com/mudge/riveted/actions/workflows/clojure.yml) 2 | 3 | A Clojure library for the 4 | [fast](http://vtd-xml.sourceforge.net/benchmark1.html) processing of XML with 5 | [VTD-XML](http://vtd-xml.sourceforge.net), a [Virtual Token 6 | Descriptor](http://vtd-xml.sf.net/VTD.html) XML parser. 
7 | 8 | It provides a more Clojure-like abstraction over VTD while still exposing the 9 | power of its low-level interface. 10 | 11 | ## Installation 12 | 13 | As riveted is available on [Clojars](https://clojars.org/riveted), add the 14 | following to your [Leiningen](https://github.com/technomancy/leiningen) 15 | dependencies: 16 | 17 | ```clojure 18 | [riveted "0.2.0"] 19 | ``` 20 | 21 | ## Compatibility 22 | 23 | riveted is tested against Clojure 1.3, 1.4, 1.5.1, 1.6, 1.7, 1.8, 1.9, 1.10.0, 24 | 1.10.1, 1.10.2 and 1.10.3. 25 | 26 | ## API Documentation 27 | 28 | The latest [riveted API documentation](http://mudge.name/riveted/) is 29 | automatically generated with [Codox](https://github.com/weavejester/codox). 30 | 31 | ## Quick Start 32 | 33 | For more details, see [Usage](#usage) below. 34 | 35 | ```clojure 36 | (ns foo 37 | (:require [riveted.core :as vtd])) 38 | 39 | (def nav (vtd/navigator (slurp "foo.xml"))) 40 | 41 | ;; Navigating by direction and returning text content. 42 | (-> nav vtd/first-child vtd/next-sibling vtd/text) ;=> "Foo" 43 | 44 | ;; Navigating by direction, restricted by element and returning attribute 45 | ;; value. 46 | (-> nav (vtd/first-child :p) (vtd/attr :id)) ;=> "42" 47 | 48 | ;; Return the tag names of all children elements. 49 | (->> nav vtd/children (map vtd/tag)) ;=> ("p" "a" "b") 50 | 51 | ;; Navigating by element name, regardless of location. 52 | (-> nav (vtd/select :p) first vtd/text) 53 | 54 | ;; Navigating by XPath, returning all matches. 55 | (map vtd/text (vtd/search nav "//author")) 56 | 57 | ;; Navigating by XPath, returning the first match. 58 | (vtd/text (vtd/at nav "/article/title")) 59 | 60 | ;; Calling seq (or any function that uses seq such as first, second, nth, 61 | ;; last, etc.) on the navigator yields a sequence of all parsed tokens as 62 | ;; simple maps with a type and value entry. 
63 | (first nav) ;=> {:type :start-tag, :value "a"} 64 | ``` 65 | 66 | ## Usage 67 | 68 | Once installed, you can include riveted into your desired namespace by 69 | requiring `riveted.core` like so: 70 | 71 | ```clojure 72 | (ns foo 73 | (:require [riveted.core :as vtd])) 74 | ``` 75 | 76 | The core data structure in riveted is the navigator: this represents both your 77 | XML document and your current location within it. It can be interrogated for 78 | the tag name, attributes and text value of any given element and also provides 79 | the ability to move around the document. 80 | 81 | Let's say we have a file called `foo.xml` with the following content: 82 | 83 | ```xml 84 |
85 | Foo bar 86 | 87 | Robert Paulson 88 | Joe Bloggs 89 | 90 | 91 | A great article all about things. 92 | 93 |
94 | ``` 95 | 96 | Let's load this into an initial navigator with the `navigator` function, 97 | passing it a UTF-8 encoded string of XML and then storing the result in the 98 | [var](http://clojure.org/vars) `nav`: 99 | 100 | ```clojure 101 | (def nav (vtd/navigator (slurp "foo.xml"))) 102 | ``` 103 | 104 | If you already have your XML in a byte array, you can pass this directly to `navigator` instead of a UTF-8 string: 105 | 106 | ```clojure 107 | (def nav (vtd/navigator my-byte-array)) 108 | ``` 109 | 110 | `navigator` also takes an optional second argument to enable XML namespace 111 | support which is disabled by default. We'll look at this 112 | [later](#namespace-support) but, for now, we can process this document without 113 | using namespaces. 114 | 115 | Now that we have a navigator, we can navigate the document in several ways 116 | (c.f. [VTD-XML's explanation of its different 117 | views](http://vtd-xml.sourceforge.net/userGuide/3.html)): 118 | 119 | * As a [cursor-based hierarchical view](#traversing-by-direction); 120 | * Using [element selectors](#traversing-by-element-name); 121 | * Using [XPath](#traversing-by-xpath); 122 | * As a [flat view of tokens](#flat-view-of-tokens). 123 | 124 | There is also a [mutable interface](#mutable-interface) for more constrained 125 | memory usage. 126 | 127 | ### Traversing by direction 128 | 129 | After parsing a document, the navigator's cursor is always at the root element 130 | of our XML: for `foo.xml`, this means the `article` element. If we want to 131 | retrieve the `title` and we know it's the first child of the article we can 132 | simply use riveted's `first-child` function: 133 | 134 | ```clojure 135 | (vtd/first-child nav) 136 | ``` 137 | 138 | This returns a new navigator with its cursor set to the `title` element. 
We 139 | can check this by using the `text` and `tag` functions to return the text 140 | content and tag name of the current cursor respectively: 141 | 142 | ```clojure 143 | (vtd/text (vtd/first-child nav)) ;=> "Foo bar" 144 | (vtd/tag (vtd/first-child nav)) ;=> "title" 145 | ``` 146 | 147 | If we then want to move to the `author` element, we can use the `next-sibling` 148 | function in a similar way: 149 | 150 | ```clojure 151 | (vtd/next-sibling (vtd/first-child nav)) 152 | ``` 153 | 154 | It may be more readable to use Clojure's [threading macro, 155 | `->`](http://clojuredocs.org/clojure_core/clojure.core/-%3E) when traversing 156 | in multiple directions: 157 | 158 | ```clojure 159 | (-> nav vtd/first-child vtd/next-sibling) 160 | ``` 161 | 162 | If we want to test an element for its attributes, we can use `attr?` like so: 163 | 164 | ```clojure 165 | (-> nav vtd/first-child vtd/next-sibling (vtd/attr? :id)) ;=> true 166 | ``` 167 | 168 | We can then fetch the value of the attribute with `attr`: 169 | 170 | ```clojure 171 | (-> nav vtd/first-child vtd/next-sibling (vtd/attr :id)) ;=> "1" 172 | 173 | ;; equivalent to: 174 | (vtd/attr (vtd/next-sibling (vtd/first-child nav)) :id) 175 | ``` 176 | 177 | As well as `first-child` and `next-sibling`, you can move in one direction 178 | with the following functions: 179 | 180 | ```clojure 181 | (vtd/previous-sibling nav) ;=> move to the previous sibling element 182 | (vtd/last-child nav) ;=> move to the last child element 183 | (vtd/parent nav) ;=> move to the parent element 184 | (vtd/root nav) ;=> move to the root element 185 | ``` 186 | 187 | We can also test navigators to distinguish elements from the entire document: 188 | 189 | ```clojure 190 | (-> nav vtd/first-child vtd/element?) ;=> true 191 | (-> nav vtd/parent vtd/document?) ;=> true 192 | (-> nav vtd/first-child vtd/attribute?) 
;=> false 193 | ``` 194 | 195 | As we are positioned on the `author` element, we might now want to collect the 196 | text values of the `name` elements within it. We could do this using the 197 | directional functions above but riveted provides a `children` function to do 198 | this for us: 199 | 200 | ```clojure 201 | (->> nav vtd/first-child vtd/next-sibling vtd/children (map vtd/text)) 202 | ;=> ("Robert Paulson" "Joe Bloggs") 203 | 204 | ;; or if you prefer not to use the threading macro: 205 | (map vtd/text (vtd/children (vtd/next-sibling (vtd/first-child nav)))) 206 | ``` 207 | 208 | Note that `children`, along with `next-siblings` and `previous-siblings`, 209 | returns a lazy sequence of matching elements. They also take an optional 210 | second argument which allows you to specify an element name which will 211 | restrict results further. 212 | 213 | For example, if you wanted to return the `author` element directly from the 214 | original navigator, you could ask for the first `author` child like so: 215 | 216 | ```clojure 217 | (-> nav (vtd/first-child :author)) 218 | ``` 219 | 220 | Or ask the root for all child `author` elements: 221 | 222 | ```clojure 223 | (-> nav (vtd/children :author)) ;=> a sequence of all author child elements 224 | ``` 225 | 226 | You can also get the full text content of a mixed-content node with `text` 227 | which would be perfect for our `abstract` element: 228 | 229 | ```clojure 230 | (-> nav (vtd/first-child :abstract) vtd/text) 231 | ;=> "A great article all about things." 232 | ``` 233 | 234 | If you want to retrieve the raw XML contents of a node, you can use `fragment` 235 | to do so: 236 | 237 | ```clojure 238 | (-> nav (vtd/first-child :abstract) vtd/fragment) 239 | ;=> "A great article all about things." 240 | ``` 241 | 242 | ### Traversing by element name 243 | 244 | If we'd rather not navigate a document in terms of directions, riveted also 245 | provides a way to traverse XML by element names with `select`. 
246 | 247 | To continue our example from above, if we wanted to pull the `title` text, we 248 | could ask the navigator for all `title` elements (regardless of location) like 249 | so: 250 | 251 | ```clojure 252 | (vtd/select nav :title) 253 | ``` 254 | 255 | As this is a lazy sequence, we can ask for the text of the first item like so: 256 | 257 | ```clojure 258 | (-> nav (vtd/select :title) first vtd/text) ;=> "Foo bar" 259 | ``` 260 | 261 | Similarly, we can ask for the text value of all `name` elements like so: 262 | 263 | ```clojure 264 | (map vtd/text (vtd/select nav :name)) ;=> ("Robert Paulson" "Joe Bloggs") 265 | ``` 266 | 267 | Note that this will return `name` elements *anywhere* in the document but we 268 | could restrict its search by moving the navigator, perhaps using some of the 269 | direction functions from above: 270 | 271 | ```clojure 272 | (map vtd/text (-> nav (vtd/first-child :author) (vtd/select :name))) 273 | ;=> ("Robert Paulson" "Joe Bloggs") 274 | ``` 275 | 276 | Or perhaps with `select` itself: 277 | 278 | ```clojure 279 | (map vtd/text (-> nav (vtd/select :author) first (vtd/select :name))) 280 | ;=> ("Robert Paulson" "Joe Bloggs") 281 | ``` 282 | 283 | Finally, we can return a lazy sequence of *all* elements by simply using a 284 | wildcard match: 285 | 286 | ```clojure 287 | (vtd/select nav "*") 288 | ``` 289 | 290 | ### Traversing by XPath 291 | 292 | The last way to traverse a document is to use XPath 1.0 with the `search` 293 | function. Note that this is only used to navigate to elements (so it's not 294 | possible to directly return attribute values with an XPath expression). 
295 | 296 | For example, to select all `name` elements: 297 | 298 | ```clojure 299 | (vtd/search nav "//name") 300 | ``` 301 | 302 | If you are expecting only one match then you can use the `at` function to 303 | return only one result: 304 | 305 | ```clojure 306 | (vtd/at nav "/article/title") 307 | ``` 308 | 309 | If accessing attributes via XPath, you can use `text` to return the value of 310 | the attribute: 311 | 312 | ```clojure 313 | (vtd/text (vtd/at nav "/article/@id")) 314 | ``` 315 | 316 | ### Namespace support 317 | 318 | If you wish to use namespace-aware features, you will need to enable namespace 319 | support when creating the initial navigator like so: 320 | 321 | ```clojure 322 | (def ns-nav (vtd/navigator (slurp "namespaced.xml") true)) 323 | ``` 324 | 325 | You can then pass a prefix and URL when using `search` and `at` like so: 326 | 327 | ```clojure 328 | (vtd/search ns-nav "//ns1:name" "ns1" "http://purl.org/dc/elements/1.1/") 329 | ``` 330 | 331 | ### Flat view of tokens 332 | 333 | If you need lower level access to the parsed document, you can exploit the 334 | fact that navigators implement [Clojure's `Seqable` 335 | interface](http://clojure.org/sequences) and can be traversed as a flat 336 | sequence much like a list or vector: 337 | 338 | ```clojure 339 | (first nav) ;=> {:type :start-tag, :value "article"} 340 | (second nav) ;=> {:type :start-tag, :value "title"} 341 | (nth nav 2) ;=> {:type :character-data, :value "Foo bar"} 342 | (nth nav 4) ;=> {:type :attribute-name, :value "id"} 343 | (seq nav) ;=> the full sequence of tokens 344 | 345 | ;; Return all comments from a document. 346 | (filter (comp #{:comment} :type) nav) 347 | ``` 348 | 349 | This gives you access to *all* tokens in the document including XML 350 | declarations, doctypes, comments, processing instructions, etc. 
However, it is 351 | a very low level of abstraction and if you only care about navigating 352 | elements, it might be better to use a cursor-based view instead. 353 | 354 | ### Mutable interface 355 | 356 | riveted also provides a mutable interface to 357 | [VTDNav](http://vtd-xml.sourceforge.net/javadoc/com/ximpleware/VTDNav.html) 358 | (much like Clojure's [transient](http://clojure.org/transients) data 359 | structures) for lower-memory usage (at the cost of immutability): 360 | 361 | ```clojure 362 | ;; Create an initial navigator as per usual. 363 | (def nav (vtd/navigator "<root><a>Foo</a><b>Bar</b></root>")) 364 | 365 | ;; Mutate nav to point to the a element. 366 | (vtd/first-child! nav) 367 | 368 | (vtd/text nav) 369 | ;=> "Foo" 370 | 371 | ;; Mutate nav to point to the b element. 372 | (vtd/next-sibling! nav) 373 | 374 | (vtd/text nav) 375 | ;=> "Bar" 376 | 377 | ;; Mutate nav to point to the a element again. 378 | (vtd/previous-sibling! nav) 379 | 380 | ;; Mutate nav to point to the root element. 381 | (vtd/parent! nav) 382 | 383 | ;; Mutate nav to point to the root of the document (regardless of location). 384 | (vtd/root! nav) 385 | ``` 386 | 387 | In order to mitigate the problems with mutable state, it might be best to use 388 | the above functions much like you would `transient`; viz. within the confines 389 | of a function like so: 390 | 391 | ```clojure 392 | (defn title [nav] 393 | (-> (vtd/root nav) ; Create a new navigator to the root 394 | (vtd/first-child! :front) ; for mutation. 395 | (vtd/first-child! :article-meta) 396 | (vtd/first-child! :title-group) 397 | (vtd/first-child! :article-title) 398 | vtd/text)) 399 | ``` 400 | 401 | In this way, only one extra navigator is created. 402 | 403 | ## Acknowledgements 404 | 405 | [Andrew Diamond's `clj-vtd-xml`](https://github.com/diamondap/clj-vtd-xml) and 406 | [Tim Williams' gist](https://gist.github.com/willtim/822769) are existing 407 | interfaces to VTD-XML from Clojure that were great sources of inspiration. 
408 | 409 | [Dave Ray's `seesaw`](https://github.com/daveray/seesaw) set the standard for 410 | helpful docstrings. 411 | 412 | Clojure's 413 | [`core.clj`](https://github.com/clojure/clojure/blob/master/src/clj/clojure/core.clj) 414 | provided fascinating reading, particularly regarding the use of `:inline` 415 | metadata. 416 | 417 | Thanks to [Heikki Hämäläinen](https://github.com/hjhamala) for contributing a 418 | character encoding fix for Windows users. 419 | 420 | Thanks to [Eugen Stan](https://github.com/ieugen) for suggesting that 421 | `navigator` should also accept byte arrays as well as UTF-8 strings. 422 | 423 | ## License 424 | 425 | Copyright © 2013-2022 Paul Mucur. 426 | 427 | Distributed under the Eclipse Public License, the same as Clojure. 428 | -------------------------------------------------------------------------------- /src/riveted/core.clj: --------------------------------------------------------------------------------
(ns ^{:doc "A Clojure library for the fast processing of XML with VTD-XML."
      :author "Paul Mucur"}
  riveted.core
  (:require [clojure.string :as s])
  (:import [com.ximpleware VTDGen VTDNav AutoPilot TextIter]))

(set! *warn-on-reflection* true)

(defn- token-type-name
  "Private. Return a keyword representing the token type of the given
  VTDNav.

  Possible values are:

  * :document
  * :start-tag
  * :end-tag
  * :attribute-name
  * :attribute-value
  * :namespace
  * :character-data
  * :comment
  * :processing-instruction-name
  * :processing-instruction-value
  * :declaration-attribute-name
  * :declaration-attribute-value
  * :cdata
  * :doctype"
  [^VTDNav nav index]
  ;; condp has no default clause here, so an unrecognised token type raises
  ;; an IllegalArgumentException rather than returning nil.
  (condp = (.getTokenType nav index)
    VTDNav/TOKEN_DOCUMENT      :document
    VTDNav/TOKEN_STARTING_TAG  :start-tag
    VTDNav/TOKEN_ENDING_TAG    :end-tag
    VTDNav/TOKEN_ATTR_NAME     :attribute-name
    VTDNav/TOKEN_ATTR_NS       :namespace
    VTDNav/TOKEN_ATTR_VAL      :attribute-value
    VTDNav/TOKEN_CHARACTER_DATA :character-data
    VTDNav/TOKEN_COMMENT       :comment
    VTDNav/TOKEN_PI_NAME       :processing-instruction-name
    VTDNav/TOKEN_PI_VAL        :processing-instruction-value
    VTDNav/TOKEN_DEC_ATTR_NAME :declaration-attribute-name
    VTDNav/TOKEN_DEC_ATTR_VAL  :declaration-attribute-value
    VTDNav/TOKEN_CDATA_VAL     :cdata
    VTDNav/TOKEN_DTD_VAL       :doctype))

(defn- index-seq
  "Private. Return a lazy sequence of all tokens from the given VTDNav and
  index onwards.

  Tokens are represented as maps with a :type and :value entry."
  [^VTDNav nav index]
  (lazy-seq
    (when (< index (.getTokenCount nav))
      (cons {:type  (token-type-name nav index)
             :value (.toNormalizedString nav index)}
            (index-seq nav (inc index))))))

;;; Wrapper type for the VTDNav class in order to implement Clojure's
;;; Sequential, Seqable and Counted interfaces.
;;;
;;; Note that seq starts at the navigator's current index whereas count
;;; reports the total number of tokens held by the underlying VTDNav.

(deftype Navigator [^VTDNav nav]
  clojure.lang.Sequential
  clojure.lang.Seqable
  (seq [this]
    (index-seq nav (.getCurrentIndex nav)))
  clojure.lang.Counted
  (count [this]
    (.getTokenCount nav)))

(defn- vtd-nav
  "Private. Return the VTDNav for a given Navigator or nil if not applicable.

  The use of vary-meta in the inline version of this function is in order to
  type hint the navigator and return type.

  See:
    http://stackoverflow.com/questions/7754429/clojure-defmacro-loses-metadata"
  {:inline (fn [navigator]
             `(when (instance? Navigator ~navigator)
                ~(vary-meta `(.nav ~(vary-meta navigator assoc :tag `Navigator))
                            assoc :tag `VTDNav)))}
  ^VTDNav
  [^Navigator navigator]
  (when (instance? Navigator navigator)
    (.nav navigator)))

(defn- index
  "Private. Return the current index of the given navigator."
  {:inline (fn [navigator]
             `(when-let [nav# (vtd-nav ~navigator)]
                (.getCurrentIndex nav#)))}
  [navigator]
  (when-let [nav (vtd-nav navigator)]
    (.getCurrentIndex nav)))

(defn- clone
  "Private. Returns a new navigator cloned from the given one."
  {:inline (fn [navigator]
             `(when-let [nav# (vtd-nav ~navigator)]
                (Navigator. (.cloneNav nav#))))}
  [navigator]
  (when-let [nav (vtd-nav navigator)]
    (-> nav .cloneNav Navigator.)))

(defprotocol Navigable
  "Protocol for types that can be used to generate a VTD navigator."
  (navigator
    [xml]
    [xml namespace-aware]
    "Return a VTD navigator for a given byte array or UTF-8 string of XML with
    optional namespace support. If called with only a byte array or string,
    namespace support is disabled.

    Examples:

      ; Return a navigator for the given byte array with no namespace support.
      (navigator my-byte-array)

      ; Return a navigator for the given UTF-8 string with no namespace support.
      (navigator \"<root><a>Bar</a></root>\")

      ; Return a navigator for the given UTF-8 string with namespace support.
      (navigator \"<root><a>Bar</a></root>\" true)"))

(extend-protocol Navigable
  ;; "[B" is the JVM class name for byte arrays.
  (Class/forName "[B")
  (navigator
    ([xml] (navigator xml false))
    ([xml namespace-aware]
     (let [vg (doto (VTDGen.)
                (.setDoc xml)
                (.parse namespace-aware))]
       (Navigator. (.getNav vg)))))

  java.lang.String
  (navigator
    ;; Strings are always encoded as UTF-8 before being handed to the byte
    ;; array implementation, regardless of the platform default charset.
    ([xml] (navigator (.getBytes xml "UTF-8") false))
    ([xml namespace-aware] (navigator (.getBytes xml "UTF-8") namespace-aware))))

(defn tag
  "Return the tag name for the element under the given VTD navigator as a
  string. If positioned on an attribute (e.g. with an XPath like /@foo), return
  the name of the attribute.

  Examples:

    (tag (root nav))
    ;=> \"root\"

    (tag (at nav \"/channel/@id\"))
    ;=> \"id\""
  [navigator]
  (when-let [nav (vtd-nav navigator)]
    (.toString nav (index navigator))))

(defn- index->text
  "Private. Returns the text value of a node identified by the given index in
  the given navigator. Returns nil when the index is -1."
  {:inline (fn [navigator index]
             `(when-let [nav# (vtd-nav ~navigator)]
                (when-not (= ~index -1)
                  (.toNormalizedString nav# ~index))))}
  [navigator index]
  (when-let [nav (vtd-nav navigator)]
    (when-not (= index -1)
      (.toNormalizedString nav index))))

(defn attr
  "Return the value of the named attribute for the given navigator.
  Attributes can be specified with either a keyword or string name.

  Examples:

    (attr (root nav) :lang)
    ;=> \"en\""
  [navigator attr-name]
  (when-let [nav (vtd-nav navigator)]
    (let [index (.getAttrVal nav (name attr-name))]
      (index->text navigator index))))

(defn attr?
  "Test whether the given attribute exists on the current element.
  Attributes can be specified with either a keyword or string name.

  Examples:

    (attr? (root nav) :lang)
    ;=> true"
  [navigator attr-name]
  (when-let [nav (vtd-nav navigator)]
    (.hasAttr nav (name attr-name))))

(defn fragment
  "Return a string XML fragment for all nodes under the given navigator.
  Returns an empty string when getContentFragment reports -1 (no content).

  Examples:

    (fragment nav)
    ;=> \"<b>Some</b> XML as a raw string\""
  [navigator]
  (when-let [nav (vtd-nav navigator)]
    (let [r (.getContentFragment nav)]
      (if (= r -1)
        ""
        ;; getContentFragment packs two ints into one long: the offset in the
        ;; lower 32 bits and the length in the upper 32 bits. The full 32-bit
        ;; mask is required; a narrower mask truncates offsets of 16 MiB and
        ;; above and yields the wrong fragment for large documents.
        (.toString nav (bit-and r 16rFFFFFFFF) (bit-shift-right r 32))))))

;;; Transient interface for navigation.

(defn- navigate!
  "Private. Low level interface to move the given navigator in the given
  direction (optionally restricting moving to the given element type), mutating
  it in place.

  Note that this changes the internal state of the navigator (thereby suffering
  from the usual problems of mutability including concurrency woes) but saves
  duplicating the navigator's state on every move.

  Direction should be one of the standard VTDNav constants, namely:

  * VTDNav/ROOT;
  * VTDNav/FIRST_CHILD;
  * VTDNav/LAST_CHILD;
  * VTDNav/NEXT_SIBLING;
  * VTDNav/PREV_SIBLING;
  * VTDNav/PARENT.

  Returns the (mutated) navigator if the move succeeded, nil otherwise.

  Examples:

    ; Move the navigator to the document root.
    (navigate! nav VTDNav/ROOT)

    ; Move the navigator to the first p child tag.
    (navigate! nav VTDNav/FIRST_CHILD :p)

  See:
    http://vtd-xml.sourceforge.net/javadoc/com/ximpleware/VTDNav.html"
  ([navigator direction]
   {:pre [(>= direction 0) (<= direction 5)]}
   (when-let [nav (vtd-nav navigator)]
     (when (.toElement nav direction)
       navigator)))
  ([navigator direction element]
   {:pre [(>= direction 0) (<= direction 5)]}
   (when-let [nav (vtd-nav navigator)]
     (when (.toElement nav direction (name element))
       navigator))))

(defn root!
  "Move the given navigator to the document root, mutating it in place."
  [navigator]
  (navigate! navigator VTDNav/ROOT))

(defn parent!
  "Move the given navigator to the current element's parent element, mutating
  it in place."
  [navigator]
  (navigate! navigator VTDNav/PARENT))

(defn next-sibling!
  "Move the given navigator to the current element's next sibling element
  (restricted by an optional element type), mutating it in place.

  Examples:

    ; Move nav to the next sibling element.
    (next-sibling! nav)

    ; Move nav to the next sibling b element.
    (next-sibling! nav :b)"
  ([navigator] (navigate! navigator VTDNav/NEXT_SIBLING))
  ([navigator element] (navigate! navigator VTDNav/NEXT_SIBLING element)))

(defn previous-sibling!
  "Move the given navigator to the current element's previous sibling element
  (restricted by an optional element type), mutating it in place.

  Examples:

    ; Move nav to the previous sibling element.
    (previous-sibling! nav)

    ; Move nav to the previous sibling b element.
    (previous-sibling! nav :b)"
  ([navigator] (navigate! navigator VTDNav/PREV_SIBLING))
  ([navigator element] (navigate! navigator VTDNav/PREV_SIBLING element)))

(defn first-child!
  "Move the given navigator to the current element's first child element
  (restricted by an optional element type), mutating it in place.

  Examples:

    ; Move nav to the first child element.
    (first-child! nav)

    ; Move nav to the first child b element.
    (first-child! nav :b)"
  ([navigator] (navigate! navigator VTDNav/FIRST_CHILD))
  ([navigator element] (navigate! navigator VTDNav/FIRST_CHILD element)))

(defn last-child!
  "Move the given navigator to the current element's last child element
  (restricted by an optional element type), mutating it in place.

  Examples:

    ; Move nav to the last child element.
    (last-child! nav)

    ; Move nav to the last child b element.
    (last-child! nav :b)"
  ([navigator] (navigate! navigator VTDNav/LAST_CHILD))
  ([navigator element] (navigate! navigator VTDNav/LAST_CHILD element)))

;;; Immutable interface to navigation.

(defn- navigate
  "Private. Low level interface to return a new navigator based on moving the
  given one in the given direction (optionally restricting movement to the given
  element type). Note that this *does not* mutate the existing navigator unlike
  (riveted.core/navigate!).

  This relies on cloning the given navigator before moving and therefore will
  use more memory than (riveted.core/navigate!) but provides the benefits of an
  immutable interface.

  Direction should be one of the standard VTDNav constants, namely:

  * VTDNav/ROOT;
  * VTDNav/FIRST_CHILD;
  * VTDNav/LAST_CHILD;
  * VTDNav/NEXT_SIBLING;
  * VTDNav/PREV_SIBLING;
  * VTDNav/PARENT.

  Examples:

    ; Return a new navigator pointing at the document root.
    (navigate nav VTDNav/ROOT)

    ; Return a new navigator pointing at the first p child tag.
    (navigate nav VTDNav/FIRST_CHILD :p)

  See:
    http://vtd-xml.sourceforge.net/javadoc/com/ximpleware/VTDNav.html
    (riveted.core/navigate!)"
  ([navigator direction]
   (when-let [navigator' (clone navigator)]
     (navigate! navigator' direction)))
  ([navigator direction element]
   (when-let [navigator' (clone navigator)]
     (navigate! navigator' direction element))))

(defn root
  "Return a new navigator pointing to the document root."
  [navigator]
  (navigate navigator VTDNav/ROOT))

(defn parent
  "Return a new navigator pointing to the parent element of the given
  navigator."
  [navigator]
  (navigate navigator VTDNav/PARENT))

(defn next-sibling
  "Return a new navigator pointing to the current element's next sibling element
  (restricted by an optional element type).

  Examples:

    ; Return a new navigator pointing to the next sibling element of nav.
    (next-sibling nav)

    ; Return a new navigator pointing to the next sibling b element of nav.
    (next-sibling nav :b)"
  ([navigator] (navigate navigator VTDNav/NEXT_SIBLING))
  ([navigator element] (navigate navigator VTDNav/NEXT_SIBLING element)))

(defn previous-sibling
  "Return a new navigator pointing to the current element's previous sibling
  element (restricted by an optional element type).

  Examples:

    ; Return a new navigator pointing to the previous sibling element of nav.
    (previous-sibling nav)

    ; Return a new navigator pointing to the previous sibling b element of nav.
    (previous-sibling nav :b)"
  ([navigator] (navigate navigator VTDNav/PREV_SIBLING))
  ([navigator element] (navigate navigator VTDNav/PREV_SIBLING element)))

(defn first-child
  "Return a new navigator pointing to the current element's first child element
  (restricted by an optional element type).

  Examples:

    ; Return a new navigator pointing to the first child element of nav.
    (first-child nav)

    ; Return a new navigator pointing to the first child b element of nav.
    (first-child nav :b)"
  ([navigator] (navigate navigator VTDNav/FIRST_CHILD))
  ([navigator element] (navigate navigator VTDNav/FIRST_CHILD element)))

(defn last-child
  "Return a new navigator pointing to the current element's last child element
  (restricted by an optional element type).

  Examples:

    ; Return a new navigator pointing to the last child element of nav.
    (last-child nav)

    ; Return a new navigator pointing to the last child b element of nav.
    (last-child nav :b)"
  ([navigator] (navigate navigator VTDNav/LAST_CHILD))
  ([navigator element] (navigate navigator VTDNav/LAST_CHILD element)))

(defn next-siblings
  "Return a lazy sequence of navigators representing all siblings next to the
  given navigator (optionally restricted by a given element type).

  Examples:

    ; Return navigators for every next sibling element to nav.
    (next-siblings nav)

    ; Return navigators for every next sibling p element to nav.
    (next-siblings nav :p)"
  ([navigator]
   (lazy-seq
     (when-let [sibling (next-sibling navigator)]
       (cons sibling (next-siblings sibling)))))
  ([navigator element]
   (lazy-seq
     (when-let [sibling (next-sibling navigator element)]
       (cons sibling (next-siblings sibling element))))))

(defn previous-siblings
  "Return a lazy sequence of navigators representing all siblings previous to
  the given navigator (optionally restricted by a given element type).

  Note that this is lazily evaluated right-to-left so the final sequence will
  be in reverse order to the actual nodes in the document.

  Examples:

    ; Return navigators for every previous sibling element to nav.
    (previous-siblings nav)

    ; Return navigators for every previous sibling p element to nav.
    (previous-siblings nav :p)"
  ([navigator]
   (lazy-seq
     (when-let [sibling (previous-sibling navigator)]
       (cons sibling (previous-siblings sibling)))))
  ([navigator element]
   (lazy-seq
     (when-let [sibling (previous-sibling navigator element)]
       (cons sibling (previous-siblings sibling element))))))

(defn siblings
  "Return navigators for all siblings to the given navigator (optionally
  restricted by a given element type). Returns nil if there are no siblings.

  Note that this is not lazy in order to preserve the correct order of nodes
  and previous siblings need to be fully realised for sorting.

  Examples:

    ; Return navigators for all siblings to nav.
    (siblings nav)

    ; Return navigators for all sibling p elements to nav.
    (siblings nav :p)"
  {:inline (fn [navigator & args]
             `(let [left#  (reverse (previous-siblings ~navigator ~@args))
                    right# (next-siblings ~navigator ~@args)]
                (when (or (seq left#) (seq right#))
                  (concat left# right#))))
   :inline-arities #{1 2}}
  ([navigator]
   (let [left  (reverse (previous-siblings navigator))
         right (next-siblings navigator)]
     (when (or (seq left) (seq right))
       (concat left right))))
  ([navigator element]
   (let [left  (reverse (previous-siblings navigator element))
         right (next-siblings navigator element)]
     (when (or (seq left) (seq right))
       (concat left right)))))

(defn children
  "Return a lazy sequence of navigators for all child nodes of the given
  navigator (optionally restricted by a given element type).

  Examples:

    ; Return navigators for all children of nav.
    (children nav)

    ; Return navigators for all child p elements of nav.
    (children nav :p)"
  {:inline (fn [navigator & args]
             `(lazy-seq
                (when-let [child# (first-child ~navigator ~@args)]
                  (cons child# (next-siblings child# ~@args)))))
   :inline-arities #{1 2}}
  ([navigator]
   (lazy-seq
     (when-let [child (first-child navigator)]
       (cons child (next-siblings child)))))
  ([navigator element]
   (lazy-seq
     (when-let [child (first-child navigator element)]
       (cons child (next-siblings child element))))))

(defn- text-seq
  "Private. Returns a lazy sequence of all text nodes for a given TextIter."
  [^TextIter text-iter]
  (lazy-seq
    (let [index (.getNext text-iter)]
      ;; TextIter signals exhaustion with -1.
      (when-not (= index -1)
        (cons index (text-seq text-iter))))))

(defn- text-indices
  "Private. Creates a TextIter for the given navigator and returns a sequence of
  indices for all text nodes associated with it."
  [navigator]
  (when-let [nav (vtd-nav navigator)]
    (let [iter (doto (TextIter.) (.touch nav))]
      (text-seq iter))))

(defn- text-descendant-indices
  "Private. Returns an ordered sequence of the indices of all text nodes that
  are descendants of the given navigator."
  [navigator]
  ;; Sorting the indices interleaves this node's own text with that of its
  ;; descendants in ascending token-index order.
  (sort (concat (text-indices navigator)
                (mapcat text-descendant-indices (children navigator)))))

(defn- text-descendants
  "Private. Returns a sequence of all text descending from the given navigator."
  [navigator]
  (map (partial index->text navigator) (text-descendant-indices navigator)))

(defn- token-type
  "Private. Returns the token type of the given navigator."
  ([navigator] (token-type navigator (index navigator)))
  ([navigator index]
   (when-let [nav (vtd-nav navigator)]
     (.getTokenType nav index))))

(defn element?
  "Tests whether the given navigator is currently positioned on an element."
  [navigator]
  (= VTDNav/TOKEN_STARTING_TAG (token-type navigator)))

(defn document?
  "Tests whether the given navigator is currently positioned on the document."
  [navigator]
  (= VTDNav/TOKEN_DOCUMENT (token-type navigator)))

(defn attribute?
  "Tests whether the given navigator is currently positioned on an attribute."
  [navigator]
  (= VTDNav/TOKEN_ATTR_NAME (token-type navigator)))

(defn text
  "Return all descendant text content below the given navigator as one string.
  This means both the value of a simple text node and also the resulting text
  value of a mixed content node such as <p><b>Foo</b> bar</p>. If the navigator
  is currently positioned on an attribute (e.g. by using an XPath like /@foo),
  return the value of the attribute.

  Examples:

    ; Returns \"Foo\" given nav points to <p>Foo</p>
    (text nav)

    ; Returns \"Foo bar\" given nav points to <p><b>Foo</b> bar</p>
    (text nav)

    ; Returns \"123\" given nav points to @src of <img src=\"123\"/>
    (text nav)"
  [navigator]
  ;; When positioned on an attribute name, the second token in the
  ;; navigator's seq is read as that attribute's value.
  (if (attribute? navigator) (:value (second navigator))
    (when-let [texts (seq (text-descendants navigator))]
      (s/join " " texts))))

(defn- xpath-seq
  "Private. Returns a lazy sequence of navigators exhaustively evaluating XPath
  with the given navigator and AutoPilot."
  [navigator ^AutoPilot autopilot]
  (lazy-seq
    (let [index (.evalXPath autopilot)]
      (when-not (= index -1)
        ;; A clone is emitted for every match; the given navigator itself is
        ;; the one the AutoPilot was created from (see search).
        (cons (clone navigator)
              (xpath-seq navigator autopilot))))))

(defn search
  "Search for the given XPath in the navigator, returning a lazy sequence of all
  matching navigators. If used with a namespace aware navigator, also takes
  a namespace prefix and URL for use in the XPath.

  Examples:

    ; Returns navigators for all matching elements.
    (search nav \"/article/title\")

    ; Returns navigators for all matching elements providing ns-nav is
    ; namespace aware.
    (search ns-nav \"//ns1:title\" \"ns1\" \"http://example.com/ns\")"
  ([navigator xpath]
   (when-let [navigator' (clone navigator)]
     (let [autopilot (doto (AutoPilot. (vtd-nav navigator'))
                       (.selectXPath xpath))]
       (xpath-seq navigator' autopilot))))
  ([navigator xpath prefix url]
   (when-let [navigator' (clone navigator)]
     (let [autopilot (doto (AutoPilot. (vtd-nav navigator'))
                       (.declareXPathNameSpace prefix url)
                       (.selectXPath xpath))]
       (xpath-seq navigator' autopilot)))))

(defn- select-seq
  "Private. Returns a lazy sequence of navigators exhaustively iterating through
  nodes with the given navigator and AutoPilot."
  [navigator ^AutoPilot autopilot]
  (lazy-seq
    (when (.iterate autopilot)
      (cons (clone navigator)
            (select-seq navigator autopilot)))))

(defn select
  "Return a lazy sequence of navigators matching the given element name, * can
  be used to match all elements.

  Examples:

    ; Returns navigators for each element in nav.
    (select nav \"*\")

    ; Returns navigators for all b elements in nav.
    (select nav \"b\")"
  [navigator element]
  (when-let [navigator' (clone navigator)]
    (let [autopilot (doto (AutoPilot. (vtd-nav navigator'))
                      (.selectElement (name element)))]
      (select-seq navigator' autopilot))))

(defn at
  "Search for the given XPath in the navigator, returning the first matching
  navigator. If used with a namespace aware navigator, also takes a namespace
  prefix and URL for use in the XPath.

  Examples:

    ; Returns a single navigator for the first matching element.
    (at nav \"/article/title\")

    ; Returns a single navigator for the first matching element providing
    ; ns-nav is namespace aware.
    (at ns-nav \"//ns1:title\" \"ns1\" \"http://example.com/ns\")"
  {:inline (fn [& args] `(first (search ~@args)))
   :inline-arities #{2 4}}
  ([navigator xpath] (first (search navigator xpath)))
  ([navigator xpath prefix url] (first (search navigator xpath prefix url))))

--------------------------------------------------------------------------------