├── .travis.yml
├── deps.edn
├── .gitignore
├── .github
│   └── workflows
│       └── clojure.yml
├── project.clj
├── CHANGELOG.md
├── test
│   └── riveted
│       └── core_test.clj
├── README.md
└── src
    └── riveted
        └── core.clj
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: clojure
2 | script: lein all midje  # "all" is the alias defined in project.clj; it runs midje under each Clojure profile
3 |
--------------------------------------------------------------------------------
/deps.edn:
--------------------------------------------------------------------------------
1 | {:deps ; keep this VTD-XML version in step with :dependencies in project.clj
2 | {com.ximpleware/vtd-xml {:mvn/version "2.13.4"}}}
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /lib
3 | /classes
4 | /checkouts
5 | pom.xml
6 | pom.xml.asc
7 | *.jar
8 | *.class
9 | .lein-deps-sum
10 | .lein-failures
11 | .lein-plugins
12 | .lein-repl-history
13 |
--------------------------------------------------------------------------------
/.github/workflows/clojure.yml:
--------------------------------------------------------------------------------
1 | name: Clojure CI
2 |
3 | on:  # build on pushes and pull requests that target main
4 |   push:
5 |     branches: [ main ]
6 |   pull_request:
7 |     branches: [ main ]
8 |
9 | jobs:
10 |   build:
11 |
12 |     runs-on: ubuntu-latest
13 |
14 |     steps:
15 |     - uses: actions/checkout@v2
16 |     - uses: actions/setup-java@v2
17 |       with:
18 |         distribution: 'temurin'
19 |         java-version: '8'
20 |     - name: Install dependencies
21 |       run: lein deps
22 |     - name: Run tests
23 |       run: lein all midje  # "all" alias from project.clj: midje once per Clojure profile
24 |
--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
1 | (defproject riveted "0.2.0"
2 | :description "A Clojure library for the fast processing of XML with VTD-XML."
3 | :url "https://github.com/mudge/riveted"
4 | :license {:name "Eclipse Public License"
5 | :url "http://www.eclipse.org/legal/epl-v10.html"}
6 | :dependencies [[com.ximpleware/vtd-xml "2.13.4"]] ; the only runtime dependency; deps.edn declares the same version
7 | :profiles {:dev {:dependencies [[midje "1.10.5"]]
8 | :plugins [[lein-midje "3.2.2"]
9 | [lein-codox "0.10.8"]]}
10 | :1.3 {:dependencies [[midje "1.6.3"] [org.clojure/clojure "1.3.0"]]} ; one profile per tested Clojure release; 1.3 through 1.6 use midje 1.6.3, later ones 1.10.5
11 | :1.4 {:dependencies [[midje "1.6.3"] [org.clojure/clojure "1.4.0"]]}
12 | :1.5.1 {:dependencies [[midje "1.6.3"] [org.clojure/clojure "1.5.1"]]}
13 | :1.6 {:dependencies [[midje "1.6.3"] [org.clojure/clojure "1.6.0"]]}
14 | :1.7 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.7.0"]]}
15 | :1.8 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.8.0"]]}
16 | :1.9 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.9.0"]]}
17 | :1.10.0 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.10.0"]]}
18 | :1.10.1 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.10.1"]]}
19 | :1.10.2 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.10.2"]]}
20 | :1.10.3 {:dependencies [[midje "1.10.5"] [org.clojure/clojure "1.10.3"]]}}
21 | :aliases {"all" ["with-profile" "dev,1.3:dev,1.4:dev,1.5.1:dev,1.6:dev,1.7:dev,1.8:dev,1.9:dev,1.10.0:dev,1.10.1:dev,1.10.2:dev,1.10.3"]} ; "lein all <task>" runs <task> once per dev+version combination (comma merges profiles, colon separates runs)
22 | :codox {:src-dir-uri "https://github.com/mudge/riveted/blob/main"
23 | :src-linenum-anchor-prefix "L"}) ; NOTE(review): presumably makes source links end in #L<line> as GitHub expects; confirm against the codox docs
24 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 | All notable changes to this project will be documented in this file. This
3 | project adheres to [Semantic Versioning](http://semver.org/).
4 |
5 | ## [0.2.0] - 2022-02-15
6 | ### Changed
7 | - Added support for passing a byte array directly to `navigator` rather than only
8 | accepting UTF-8 strings (thanks to Eugen Stan for the suggestion)
9 | - Passing an invalid value to `navigator` (e.g. `nil`) will now throw an
10 | `IllegalArgumentException` rather than a `NullPointerException`
11 |
12 | ## [0.1.2] - 2022-02-11
13 | ### Changed
14 | - Upgrade underlying VTD-XML dependency to 2.13.4 (thanks to Eugen Stan)
15 |
16 | ## [0.1.1] - 2018-01-29
17 | ### Fixed
18 | - Fixed exception when calling fragment on a self-closing tag
19 |
20 | ## [0.1.0] - 2017-06-27
21 | ### Fixed
22 | - Explicitly set the character set to UTF-8 when reading XML
23 |
24 | ### Changed
25 | - Upgrade underlying VTD-XML dependency to 2.13
26 |
27 | ## [0.0.9] - 2013-09-05
28 | ### Added
29 | - Add ability to return the text of an attribute using `text`
30 | - Add ability to check if the navigator is pointed at an attribute with `attribute?`
31 |
32 | ## [0.0.8] - 2013-05-02
33 | ### Added
34 | - Gracefully handle `nil` across the library to make threading easier
35 |
36 | ## [0.0.7] - 2013-03-29
37 | ### Added
38 | - Enrich navigators so that they are now sequential, seqable and countable data structures
39 |
40 | ## [0.0.6] - 2013-03-29
41 | ### Added
42 | - Added `select` for selecting elements by name or wildcard
43 |
44 | ## [0.0.5] - 2013-03-28
45 | ### Added
46 | - Added transient interface via `root!`, `parent!`, `next-sibling!`, `previous-sibling!`, `first-child!` and `last-child!`
47 |
48 | ## [0.0.4] - 2013-03-25
49 | ### Added
50 | - Added `attr?` for testing the existence of attributes
51 | - Added ability to pass optional element names to `first-child`, `last-child`,
52 | `next-sibling`, `previous-sibling`, `siblings` and `children`
53 |
54 | ## [0.0.3] - 2013-03-24
55 | ### Added
56 | - Added support for XML namespaces when searching
57 |
58 | ### Changed
59 | - Replaced public `token-type` with `element?` and `document?` functions
60 |
61 | ## [0.0.2] - 2013-03-24
62 | ### Added
63 | - Add ability to fetch previous sibling
64 | - Add ability to fetch token type for an element
65 |
66 | ### Fixed
67 | - Fix fetching all siblings for a navigator, both previous and next
68 |
69 | ## [0.0.1] - 2013-03-24
70 | ### Added
71 | - First stable version of riveted
72 |
73 | [0.2.0]: https://github.com/mudge/riveted/releases/tag/v0.2.0
74 | [0.1.2]: https://github.com/mudge/riveted/releases/tag/v0.1.2
75 | [0.1.1]: https://github.com/mudge/riveted/releases/tag/v0.1.1
76 | [0.1.0]: https://github.com/mudge/riveted/releases/tag/v0.1.0
77 | [0.0.9]: https://github.com/mudge/riveted/releases/tag/v0.0.9
78 | [0.0.8]: https://github.com/mudge/riveted/releases/tag/v0.0.8
79 | [0.0.7]: https://github.com/mudge/riveted/releases/tag/v0.0.7
80 | [0.0.6]: https://github.com/mudge/riveted/releases/tag/v0.0.6
81 | [0.0.5]: https://github.com/mudge/riveted/releases/tag/v0.0.5
82 | [0.0.4]: https://github.com/mudge/riveted/releases/tag/v0.0.4
83 | [0.0.3]: https://github.com/mudge/riveted/releases/tag/v0.0.3
84 | [0.0.2]: https://github.com/mudge/riveted/releases/tag/v0.0.2
85 | [0.0.1]: https://github.com/mudge/riveted/releases/tag/v0.0.1
86 |
--------------------------------------------------------------------------------
/test/riveted/core_test.clj:
--------------------------------------------------------------------------------
1 | (ns riveted.core-test
2 | (:use midje.sweet riveted.core)
3 | (:import com.ximpleware.VTDNav))
4 |
5 | ;;; Test data. The facts below rely on this exact markup: element order, the "Hello" comment and the id/empty attributes.
6 |
7 | (def xml "<root><!--Hello--><basic-title>Foo</basic-title><complex-title id=\"42\" empty=\"\">Foo <i>woo</i> <b>moo</b></complex-title><i>Bar</i><foo/></root>")
8 | (def ns-xml "<root xmlns:dc=\"http://purl.org/dc/elements/1.1/\"><dc:name>Bob</dc:name></root>") ; facts bind their own prefix ("foo") to this URI
9 |
10 | (def nav (navigator xml false))
11 | (def ns-nav (navigator ns-xml true)) ; second argument enables namespace awareness
12 |
13 | ;;; Midje checker helpers shared by the facts below.
14 |
15 | (defn nav? [actual] (instance? riveted.core.Navigator actual)) ; true when actual is a riveted Navigator
16 | (defn tag? [tag-name] #(= tag-name (tag %))) ; builds a checker: the actual navigator's tag equals tag-name
17 | (defn tags? [& tag-names] #(= tag-names (map tag %))) ; builds a checker: tags of a seq of navigators, in order
18 | (def root? (tag? "root")) ; checker for a navigator sitting on the root element
19 |
20 | (fact "navigator returns a VTD navigator for a byte array."
21 |   (navigator (.getBytes "<root/>" "UTF-8")) => nav?
22 |   (navigator (.getBytes "<root/>" "ISO-8859-1")) => nav?) ; an empty string is not a parseable document, so a minimal one is used
23 |
24 | (fact "navigator returns a VTD navigator for a UTF-8 string."
25 |   (navigator "<root/>") => nav?)
26 |
27 | (fact "navigator raises an IllegalArgumentException if given nil."
28 |   (navigator nil) => (throws IllegalArgumentException)) ; invalid input is rejected up front rather than surfacing as a NullPointerException
29 |
30 | (fact "search returns a sequence of matching navigators for a given XPath."
31 | (search nav "/root/basic-title") => (one-of nav?)
32 | (search nav "//i") => (two-of nav?)
33 | (search nav "/missing") => empty?
34 | (search ns-nav "/root/foo:name" "foo" "http://purl.org/dc/elements/1.1/") => (one-of nav?) ; prefix + URI arity, run against the namespace-aware navigator
35 | (search nil "/foo") => empty?) ; a nil navigator gives an empty result instead of throwing
36 |
37 | (fact "at returns the first matching navigator for a given XPath."
38 | (at nav "/root/basic-title") => nav?
39 | (at nav "/missing") => nil?
40 | (at ns-nav "/root/foo:name" "foo" "http://purl.org/dc/elements/1.1/") => nav?
41 | (at nil "/foo") => nil?)
42 |
43 | (fact "select returns navigators for all matching elements."
44 | (select nav "*") => (tags? "root" "basic-title" "complex-title" "i" "b" "i" "foo")
45 | (select nav "i") => (two-of (tag? "i"))
46 | (select nav :i) => (two-of (tag? "i")) ; element names may be given as keywords or strings
47 | (select (at nav "/root/complex-title") "*") => (tags? "complex-title" "i" "b") ; scoped to the current element, which is itself included
48 | (select nav "missing") => empty?
49 | (select nil "foo") => empty?)
50 |
51 | (fact "text returns the text nodes descending from a navigator."
52 | (text (at nav "/root/basic-title")) => "Foo"
53 | (text (at nav "/root/complex-title")) => "Foo woo moo"
54 | (text (at nav "/root/foo")) => nil? ; an element without text yields nil rather than ""
55 | (text nil) => nil?)
56 |
57 | (fact "fragment returns the content of the given navigator as an XML fragment."
58 |   (fragment (at nav "/root/complex-title")) => "Foo <i>woo</i> <b>moo</b>" ; raw inner markup, unlike text which flattens it
59 |   (fragment (at nav "/root/basic-title")) => "Foo"
60 |   (fragment (at nav "/root/foo")) => "" ; an element with no content gives an empty fragment
61 |   (fragment nil) => nil?)
62 |
63 | (fact "attr returns the value of the given attribute"
64 | (attr (at nav "/root/complex-title") :id) => "42"
65 | (attr (at nav "/root/complex-title") "id") => "42" ; attribute names may be keywords or strings
66 | (attr (at nav "/root/complex-title") :missing) => nil?
67 | (attr nil :foo) => nil?)
68 |
69 | (fact "tag returns the current element name of the given navigator."
70 | (tag (root nav)) => "root"
71 | (tag (at nav "/root/complex-title")) => "complex-title"
72 | (tag nil) => nil?)
73 |
74 | (fact "document? returns true if the navigator is set to the document."
75 | (parent (root nav)) => document? ; the document node sits above the root element
76 | (root nav) =not=> document?
77 | (document? nil) => false) ; predicates answer nil with false, not nil
78 |
79 | (fact "element? returns true if the navigator is set to an element."
80 | (root nav) => element?
81 | (parent (root nav)) =not=> element?
82 | (element? nil) => false)
83 |
84 | (fact "parent returns a navigator for the parent of the current navigator."
85 | (parent (at nav "/root/basic-title")) => nav?
86 | (parent nil) => nil?)
87 |
88 | (fact "parent returns the document as the parent of the root."
89 | (parent (root nav)) => document?)
90 |
91 | (fact "parent returns nil for the parent of the document."
92 | (parent (parent (root nav))) => nil?) ; nothing exists above the document node
93 |
94 | (fact "root returns a navigator for the root element."
95 | (root (at nav "/root/complex-title/i")) => nav?
96 | (root (at nav "/root/complex-title/i")) => root?) ; works from a nested element, not only from direct children of the root
97 |
98 | (fact "first-child returns a navigator for the first child element."
99 | (first-child (at nav "/root/complex-title")) => nav?
100 | (first-child (at nav "/root/complex-title")) => (tag? "i")
101 | (first-child (at nav "/root/foo")) => nil?) ; foo has no child elements
102 |
103 | (fact "first-child takes an optional element name."
104 | (first-child (root nav) :complex-title) => (tag? "complex-title")
105 | (first-child (root nav) "complex-title") => (tag? "complex-title")
106 | (first-child (root nav) :missing) => nil?)
107 |
108 | (fact "last-child returns a navigator for the last child element."
109 | (last-child (at nav "/root/complex-title")) => (tag? "b")
110 | (last-child (at nav "/root/foo")) => nil?)
111 |
112 | (fact "last-child takes an optional element name."
113 | (last-child (root nav) :complex-title) => (tag? "complex-title")
114 | (last-child (root nav) "complex-title") => (tag? "complex-title")
115 | (last-child (root nav) :missing) => nil?)
116 |
117 | (fact "next-sibling returns a navigator for the next sibling element."
118 | (next-sibling (at nav "/root/basic-title")) => (tag? "complex-title")
119 | (next-sibling (at nav "/root/complex-title")) => (tag? "i") ; the i that is a child of root, not the one nested in complex-title
120 | (next-sibling (at nav "/root/foo")) => nil?)
121 |
122 | (fact "next-sibling takes an optional element name."
123 | (next-sibling (at nav "/root/basic-title") :i) => (tag? "i") ; a name skips the non-matching complex-title sibling
124 | (next-sibling (at nav "/root/basic-title") "i") => (tag? "i")
125 | (next-sibling (at nav "/root/basic-title") :missing) => nil?)
126 |
127 | (fact "previous-sibling returns a navigator for the previous sibling element."
128 | (previous-sibling (at nav "/root/complex-title")) => (tag? "basic-title")
129 | (previous-sibling (at nav "/root/foo")) => (tag? "i")
130 | (previous-sibling (at nav "/root/basic-title")) => nil?)
131 |
132 | (fact "previous-sibling takes an optional element name."
133 | (previous-sibling (at nav "/root/i") :basic-title) => (tag? "basic-title") ; a name skips the non-matching complex-title sibling
134 | (previous-sibling (at nav "/root/i") "basic-title") => (tag? "basic-title")
135 | (previous-sibling (at nav "/root/i") :missing) => nil?)
136 |
137 | (fact "next-siblings returns navigators for all next sibling elements."
138 | (next-siblings (at nav "/root/basic-title")) => (tags? "complex-title" "i"
139 | "foo")
140 | (next-siblings (at nav "/root/foo")) => empty?)
141 |
142 | (fact "next-siblings takes an optional element name."
143 | (next-siblings (at nav "/root/basic-title") :i) => (tags? "i")
144 | (next-siblings (at nav "/root/basic-title") "i") => (tags? "i")
145 | (next-siblings (at nav "/root/basic-title") :missing) => empty?)
146 |
147 | (fact "previous-siblings returns navigators for all previous sibling elements."
148 | (previous-siblings (at nav "/root/foo")) => (tags? "i" "complex-title"
149 | "basic-title") ; nearest sibling first, i.e. reverse document order
150 | (previous-siblings (at nav "/root/basic-title")) => empty?)
151 |
152 | (fact "previous-siblings takes an optional element name."
153 | (previous-siblings (at nav "/root/foo") :i) => (tags? "i")
154 | (previous-siblings (at nav "/root/foo") "i") => (tags? "i")
155 | (previous-siblings (at nav "/root/foo") :missing) => empty?)
156 |
157 | (fact "siblings returns navigators for all sibling elements."
158 | (siblings (at nav "/root/basic-title")) => (tags? "complex-title" "i" "foo")
159 | (siblings (at nav "/root/complex-title")) => (tags? "basic-title" "i" "foo") ; the element itself is left out; the rest stay in document order
160 | (siblings (at nav "/root/i")) => (tags? "basic-title" "complex-title" "foo")
161 | (siblings (root nav)) => empty?)
162 |
163 | (fact "siblings takes an optional element name."
164 | (siblings (at nav "/root/basic-title")
165 | :complex-title) => (tags? "complex-title")
166 | (siblings (at nav "/root/basic-title")
167 | "complex-title") => (tags? "complex-title")
168 | (siblings (at nav "/root/basic-title") :missing) => empty?)
169 |
170 | (fact "children returns navigators for all children elements."
171 | (children (root nav)) => (tags? "basic-title" "complex-title" "i" "foo") ; direct children only: the i and b nested in complex-title are not listed
172 | (children (at nav "/root/foo")) => empty?)
173 |
174 | (fact "children takes an optional element name."
175 | (children (root nav) :complex-title) => (tags? "complex-title")
176 | (children (root nav) "complex-title") => (tags? "complex-title")
177 | (children (root nav) :missing) => empty?)
178 |
179 | (fact "attr? returns whether or not a given attribute exists."
180 | (attr? (at nav "/root/complex-title") :id) => true
181 | (attr? (at nav "/root/complex-title") "id") => true
182 | (attr? (at nav "/root/complex-title") :missing) => false)
183 |
184 | ;;; Mutable tests. Note that the order of these facts is critical: each one starts from wherever the previous fact left nav!.
185 |
186 | (def nav! (navigator "<root><child><name>Foo</name><age>42</age></child><bro>Bar</bro></root>" false))
187 |
188 | (fact "root! moves the given navigator to the root."
189 |   (root! nav!) => (tag? "root"))
190 |
191 | (fact "first-child! moves the given navigator to the first child."
192 |   (first-child! nav!) => (tag? "child"))
193 |
194 | (fact "first-child! takes an optional element name."
195 |   (first-child! nav! :name) => (tag? "name")
196 |   (first-child! nav! :missing) => nil?) ; the next fact's parent! still finds child, so a failed move leaves nav! on name
197 |
198 | (fact "parent! moves the given navigator to the parent element."
199 |   (parent! nav!) => (tag? "child"))
200 |
201 | (fact "next-sibling! moves the given navigator to the next sibling element."
202 |   (next-sibling! nav!) => (tag? "bro")
203 |   (next-sibling! nav!) => nil?) ; bro is the last sibling
204 |
205 | (fact "next-sibling! takes an optional element name."
206 |   (next-sibling! (previous-sibling! nav!) :bro) => (tag? "bro") ; step back to child first so there is a sibling to move to
207 |   (next-sibling! (previous-sibling! nav!) "bro") => (tag? "bro")
208 |   (next-sibling! nav! :missing) => nil?)
209 |
210 | (fact "previous-sibling! moves the given navigator to the previous sibling."
211 |   (previous-sibling! nav!) => (tag? "child")
212 |   (previous-sibling! nav!) => nil?) ; child is the first sibling
213 |
214 | (fact "previous-sibling! takes an optional element name."
215 |   (previous-sibling! (next-sibling! nav!) :child) => (tag? "child")
216 |   (previous-sibling! (next-sibling! nav!) "child") => (tag? "child")
217 |   (previous-sibling! nav! :missing) => nil?)
218 |
219 | (fact "last-child! moves the given navigator to the last child."
220 |   (last-child! nav!) => (tag? "age")) ; nav! is on child at this point, whose last child is age
221 |
222 | (fact "navigators are sequential."
223 | nav => sequential?)
224 |
225 | (fact "navigators are counted."
226 | nav => counted?
227 | (count nav) => 18) ; counts tokens, not elements: the next fact shows comments, text and attributes in the seq
228 |
229 | (fact "navigators expose all internal tokens as a seq."
230 | (first nav) => {:type :start-tag, :value "root"}
231 | (second nav) => {:type :comment, :value "Hello"}
232 | (nth nav 2) => {:type :start-tag, :value "basic-title"}
233 | (nth nav 3) => {:type :character-data, :value "Foo"}
234 | (nth nav 5) => {:type :attribute-name, :value "id"}
235 | (nth nav 6) => {:type :attribute-value, :value "42"})
236 |
237 | (fact "navigators not at the root, seq the remaining nodes."
238 | (first (first-child nav :complex-title)) => {:type :start-tag,
239 | :value "complex-title"}
240 | (last (first-child nav :complex-title)) => {:type :start-tag,
241 | :value "foo"}) ; the seq runs on to the end of the document (foo lies outside complex-title), not just the subtree
242 |
243 | (fact "navigators can safely be threaded even with nils."
244 | (-> nav (first-child :missing) (last-child :missing) text) => nil?)
245 |
246 | (fact "attribute? returns true if the navigator is set to an attribute."
247 | (at nav "/root/complex-title/@id") => attribute?
248 | (at nav "/root/complex-title") =not=> attribute?
249 | (root nav) =not=> attribute?)
250 |
251 | (fact "attribute values can be retrieved with text."
252 | (text (at nav "/root/complex-title/@id")) => "42"
253 | (text (at nav "/root/complex-title/@empty")) => "") ; an empty attribute value is "", not nil
254 |
255 | (fact "attribute names can be retrieved with tag."
256 | (tag (at nav "/root/complex-title/@id")) => "id"
257 | (tag (at nav "/root/complex-title/@empty")) => "empty")
258 |
259 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # riveted [![Clojure CI](https://github.com/mudge/riveted/actions/workflows/clojure.yml/badge.svg)](https://github.com/mudge/riveted/actions/workflows/clojure.yml)
2 |
3 | A Clojure library for the
4 | [fast](http://vtd-xml.sourceforge.net/benchmark1.html) processing of XML with
5 | [VTD-XML](http://vtd-xml.sourceforge.net), a [Virtual Token
6 | Descriptor](http://vtd-xml.sf.net/VTD.html) XML parser.
7 |
8 | It provides a more Clojure-like abstraction over VTD while still exposing the
9 | power of its low-level interface.
10 |
11 | ## Installation
12 |
13 | As riveted is available on [Clojars](https://clojars.org/riveted), add the
14 | following to your [Leiningen](https://github.com/technomancy/leiningen)
15 | dependencies:
16 |
17 | ```clojure
18 | [riveted "0.2.0"]
19 | ```
20 |
21 | ## Compatibility
22 |
23 | riveted is tested against Clojure 1.3, 1.4, 1.5.1, 1.6, 1.7, 1.8, 1.9, 1.10.0,
24 | 1.10.1, 1.10.2 and 1.10.3.
25 |
26 | ## API Documentation
27 |
28 | The latest [riveted API documentation](http://mudge.name/riveted/) is
29 | automatically generated with [Codox](https://github.com/weavejester/codox).
30 |
31 | ## Quick Start
32 |
33 | For more details, see [Usage](#usage) below.
34 |
35 | ```clojure
36 | (ns foo
37 | (:require [riveted.core :as vtd]))
38 |
39 | (def nav (vtd/navigator (slurp "foo.xml")))
40 |
41 | ;; Navigating by direction and returning text content.
42 | (-> nav vtd/first-child vtd/next-sibling vtd/text) ;=> "Foo"
43 |
44 | ;; Navigating by direction, restricted by element and returning attribute
45 | ;; value.
46 | (-> nav (vtd/first-child :p) (vtd/attr :id)) ;=> "42"
47 |
48 | ;; Return the tag names of all children elements.
49 | (->> nav vtd/children (map vtd/tag)) ;=> ("p" "a" "b")
50 |
51 | ;; Navigating by element name, regardless of location.
52 | (-> nav (vtd/select :p) first vtd/text)
53 |
54 | ;; Navigating by XPath, returning all matches.
55 | (map vtd/text (vtd/search nav "//author"))
56 |
57 | ;; Navigating by XPath, returning the first match.
58 | (vtd/text (vtd/at nav "/article/title"))
59 |
60 | ;; Calling seq (or any function that uses seq such as first, second, nth,
61 | ;; last, etc.) on the navigator yields a sequence of all parsed tokens as
62 | ;; simple maps with a type and value entry.
63 | (first nav) ;=> {:type :start-tag, :value "a"}
64 | ```
65 |
66 | ## Usage
67 |
68 | Once installed, you can include riveted into your desired namespace by
69 | requiring `riveted.core` like so:
70 |
71 | ```clojure
72 | (ns foo
73 | (:require [riveted.core :as vtd]))
74 | ```
75 |
76 | The core data structure in riveted is the navigator: this represents both your
77 | XML document and your current location within it. It can be interrogated for
78 | the tag name, attributes and text value of any given element and also provides
79 | the ability to move around the document.
80 |
81 | Let's say we have a file called `foo.xml` with the following content:
82 |
83 | ```xml
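84 | <article>
85 |   <title>Foo bar</title>
86 |   <author id="1">
87 |     <name>Robert Paulson</name>
88 |     <name>Joe Bloggs</name>
89 |   </author>
90 |   <abstract>
91 |     A <i>great</i> article all about <b>things</b>.
92 |   </abstract>
93 | </article>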
94 | ```
95 |
96 | Let's load this into an initial navigator with the `navigator` function,
97 | passing it a UTF-8 encoded string of XML and then storing the result in the
98 | [var](http://clojure.org/vars) `nav`:
99 |
100 | ```clojure
101 | (def nav (vtd/navigator (slurp "foo.xml")))
102 | ```
103 |
104 | If you already have your XML in a byte array, you can pass this directly to `navigator` instead of a UTF-8 string:
105 |
106 | ```clojure
107 | (def nav (vtd/navigator my-byte-array))
108 | ```
109 |
110 | `navigator` also takes an optional second argument to enable XML namespace
111 | support which is disabled by default. We'll look at this
112 | [later](#namespace-support) but, for now, we can process this document without
113 | using namespaces.
114 |
115 | Now that we have a navigator, we can navigate the document in several ways
116 | (c.f. [VTD-XML's explanation of its different
117 | views](http://vtd-xml.sourceforge.net/userGuide/3.html)):
118 |
119 | * As a [cursor-based hierarchical view](#traversing-by-direction);
120 | * Using [element selectors](#traversing-by-element-name);
121 | * Using [XPath](#traversing-by-xpath);
122 | * As a [flat view of tokens](#flat-view-of-tokens).
123 |
124 | There is also a [mutable interface](#mutable-interface) for more constrained
125 | memory usage.
126 |
127 | ### Traversing by direction
128 |
129 | After parsing a document, the navigator's cursor is always at the root element
130 | of our XML: for `foo.xml`, this means the `article` element. If we want to
131 | retrieve the `title` and we know it's the first child of the article we can
132 | simply use riveted's `first-child` function:
133 |
134 | ```clojure
135 | (vtd/first-child nav)
136 | ```
137 |
138 | This returns a new navigator with its cursor set to the `title` element. We
139 | can check this by using the `text` and `tag` functions to return the text
140 | content and tag name of the current cursor respectively:
141 |
142 | ```clojure
143 | (vtd/text (vtd/first-child nav)) ;=> "Foo bar"
144 | (vtd/tag (vtd/first-child nav)) ;=> "title"
145 | ```
146 |
147 | If we then want to move to the `author` element, we can use the `next-sibling`
148 | function in a similar way:
149 |
150 | ```clojure
151 | (vtd/next-sibling (vtd/first-child nav))
152 | ```
153 |
154 | It may be more readable to use Clojure's [threading macro,
155 | `->`](http://clojuredocs.org/clojure_core/clojure.core/-%3E) when traversing
156 | in multiple directions:
157 |
158 | ```clojure
159 | (-> nav vtd/first-child vtd/next-sibling)
160 | ```
161 |
162 | If we want to test an element for its attributes, we can use `attr?` like so:
163 |
164 | ```clojure
165 | (-> nav vtd/first-child vtd/next-sibling (vtd/attr? :id)) ;=> true
166 | ```
167 |
168 | We can then fetch the value of the attribute with `attr`:
169 |
170 | ```clojure
171 | (-> nav vtd/first-child vtd/next-sibling (vtd/attr :id)) ;=> "1"
172 |
173 | ;; equivalent to:
174 | (vtd/attr (vtd/next-sibling (vtd/first-child nav)) :id)
175 | ```
176 |
177 | As well as `first-child` and `next-sibling`, you can move in one direction
178 | with the following functions:
179 |
180 | ```clojure
181 | (vtd/previous-sibling nav) ;=> move to the previous sibling element
182 | (vtd/last-child nav) ;=> move to the last child element
183 | (vtd/parent nav) ;=> move to the parent element
184 | (vtd/root nav) ;=> move to the root element
185 | ```
186 |
187 | We can also test navigators to distinguish elements from the entire document:
188 |
189 | ```clojure
190 | (-> nav vtd/first-child vtd/element?) ;=> true
191 | (-> nav vtd/parent vtd/document?) ;=> true
192 | (-> nav vtd/first-child vtd/attribute?) ;=> false
193 | ```
194 |
195 | As we are positioned on the `author` element, we might now want to collect the
196 | text values of the `name` elements within it. We could do this using the
197 | directional functions above but riveted provides a `children` function to do
198 | this for us:
199 |
200 | ```clojure
201 | (->> nav vtd/first-child vtd/next-sibling vtd/children (map vtd/text))
202 | ;=> ("Robert Paulson" "Joe Bloggs")
203 |
204 | ;; or if you prefer not to use the threading macro:
205 | (map vtd/text (vtd/children (vtd/next-sibling (vtd/first-child nav))))
206 | ```
207 |
208 | Note that `children`, along with `next-siblings` and `previous-siblings`,
209 | returns a lazy sequence of matching elements. They also take an optional
210 | second argument which allows you to specify an element name which will
211 | restrict results further.
212 |
213 | For example, if you wanted to return the `author` element directly from the
214 | original navigator, you could ask for the first `author` child like so:
215 |
216 | ```clojure
217 | (-> nav (vtd/first-child :author))
218 | ```
219 |
220 | Or ask the root for all child `author` elements:
221 |
222 | ```clojure
223 | (-> nav (vtd/children :author)) ;=> a sequence of all author child elements
224 | ```
225 |
226 | You can also get the full text content of a mixed-content node with `text`
227 | which would be perfect for our `abstract` element:
228 |
229 | ```clojure
230 | (-> nav (vtd/first-child :abstract) vtd/text)
231 | ;=> "A great article all about things."
232 | ```
233 |
234 | If you want to retrieve the raw XML contents of a node, you can use `fragment`
235 | to do so:
236 |
237 | ```clojure
238 | (-> nav (vtd/first-child :abstract) vtd/fragment)
239 | ;=> "A <i>great</i> article all about <b>things</b>."
240 | ```
241 |
242 | ### Traversing by element name
243 |
244 | If we'd rather not navigate a document in terms of directions, riveted also
245 | provides a way to traverse XML by element names with `select`.
246 |
247 | To continue our example from above, if we wanted to pull the `title` text, we
248 | could ask the navigator for all `title` elements (regardless of location) like
249 | so:
250 |
251 | ```clojure
252 | (vtd/select nav :title)
253 | ```
254 |
255 | As this is a lazy sequence, we can ask for the text of the first item like so:
256 |
257 | ```clojure
258 | (-> nav (vtd/select :title) first vtd/text) ;=> "Foo bar"
259 | ```
260 |
261 | Similarly, we can ask for the text value of all `name` elements like so:
262 |
263 | ```clojure
264 | (map vtd/text (vtd/select nav :name)) ;=> ("Robert Paulson" "Joe Bloggs")
265 | ```
266 |
267 | Note that this will return `name` elements *anywhere* in the document but we
268 | could restrict its search by moving the navigator, perhaps using some of the
269 | direction functions from above:
270 |
271 | ```clojure
272 | (map vtd/text (-> nav (vtd/first-child :author) (vtd/select :name)))
273 | ;=> ("Robert Paulson" "Joe Bloggs")
274 | ```
275 |
276 | Or perhaps with `select` itself:
277 |
278 | ```clojure
279 | (map vtd/text (-> nav (vtd/select :author) first (vtd/select :name)))
280 | ;=> ("Robert Paulson" "Joe Bloggs")
281 | ```
282 |
283 | Finally, we can return a lazy sequence of *all* elements by simply using a
284 | wildcard match:
285 |
286 | ```clojure
287 | (vtd/select nav "*")
288 | ```
289 |
290 | ### Traversing by XPath
291 |
292 | The last way to traverse a document is to use XPath 1.0 with the `search`
293 | function. Note that this is only used to navigate to elements (so it's not
294 | possible to directly return attribute values with an XPath expression).
295 |
296 | For example, to select all `name` elements:
297 |
298 | ```clojure
299 | (vtd/search nav "//name")
300 | ```
301 |
302 | If you are expecting only one match then you can use the `at` function to
303 | return only one result:
304 |
305 | ```clojure
306 | (vtd/at nav "/article/title")
307 | ```
308 |
309 | If accessing attributes via XPath, you can use `text` to return the value of
310 | the attribute:
311 |
312 | ```clojure
313 | (vtd/text (vtd/at nav "/article/author/@id")) ;=> "1"
314 | ```
315 |
316 | ### Namespace support
317 |
318 | If you wish to use namespace-aware features, you will need to enable namespace
319 | support when creating the initial navigator like so:
320 |
321 | ```clojure
322 | (def ns-nav (vtd/navigator (slurp "namespaced.xml") true))
323 | ```
324 |
325 | You can then pass a prefix and URL when using `search` and `at` like so:
326 |
327 | ```clojure
328 | (vtd/search ns-nav "//ns1:name" "ns1" "http://purl.org/dc/elements/1.1/")
329 | ```
330 |
331 | ### Flat view of tokens
332 |
333 | If you need lower level access to the parsed document, you can exploit the
334 | fact that navigators implement [Clojure's `Seqable`
335 | interface](http://clojure.org/sequences) and can be traversed as a flat
336 | sequence much like a list or vector:
337 |
338 | ```clojure
339 | (first nav) ;=> {:type :start-tag, :value "article"}
340 | (second nav) ;=> {:type :start-tag, :value "title"}
341 | (nth nav 2) ;=> {:type :character-data, :value "Foo bar"}
342 | (nth nav 4) ;=> {:type :attribute-name, :value "id"}
343 | (seq nav) ;=> the full sequence of tokens
344 |
345 | ;; Return all comments from a document.
346 | (filter (comp #{:comment} :type) nav)
347 | ```
348 |
349 | This gives you access to *all* tokens in the document including XML
350 | declarations, doctypes, comments, processing instructions, etc. However, it is
351 | a very low level of abstraction and if you only care about navigating
352 | elements, it might be better to use a cursor-based view instead.
353 |
354 | ### Mutable interface
355 |
356 | riveted also provides a mutable interface to
357 | [VTDNav](http://vtd-xml.sourceforge.net/javadoc/com/ximpleware/VTDNav.html)
358 | (much like Clojure's [transient](http://clojure.org/transients) data
359 | structures) for lower-memory usage (at the cost of immutability):
360 |
361 | ```clojure
362 | ;; Create an initial navigator as per usual.
363 | (def nav (vtd/navigator "<root><a>Foo</a><b>Bar</b></root>"))
364 |
365 | ;; Mutate nav to point to the a element.
366 | (vtd/first-child! nav)
367 |
368 | (vtd/text nav)
369 | ;=> "Foo"
370 |
371 | ;; Mutate nav to point to the b element.
372 | (vtd/next-sibling! nav)
373 |
374 | (vtd/text nav)
375 | ;=> "Bar"
376 |
377 | ;; Mutate nav to point to the a element again.
378 | (vtd/previous-sibling! nav)
379 |
380 | ;; Mutate nav to point to the root element.
381 | (vtd/parent! nav)
382 |
383 | ;; Mutate nav to point to the root of the document (regardless of location).
384 | (vtd/root! nav)
385 | ```
386 |
387 | In order to mitigate the problems with mutable state, it might be best to use
388 | the above functions much like you would `transient`; viz. within the confines
389 | of a function like so:
390 |
391 | ```clojure
392 | (defn title [nav]
393 | (-> (vtd/root nav) ; Create a new navigator to the root
394 | (vtd/first-child! :front) ; for mutation.
395 | (vtd/first-child! :article-meta)
396 | (vtd/first-child! :title-group)
397 | (vtd/first-child! :article-title)
398 | vtd/text))
399 | ```
400 |
401 | In this way, only one extra navigator is created.
402 |
403 | ## Acknowledgements
404 |
405 | [Andrew Diamond's `clj-vtd-xml`](https://github.com/diamondap/clj-vtd-xml) and
406 | [Tim Williams' gist](https://gist.github.com/willtim/822769) are existing
407 | interfaces to VTD-XML from Clojure that were great sources of inspiration.
408 |
409 | [Dave Ray's `seesaw`](https://github.com/daveray/seesaw) set the standard for
410 | helpful docstrings.
411 |
412 | Clojure's
413 | [`core.clj`](https://github.com/clojure/clojure/blob/master/src/clj/clojure/core.clj)
414 | provided fascinating reading, particularly regarding the use of `:inline`
415 | metadata.
416 |
417 | Thanks to [Heikki Hämäläinen](https://github.com/hjhamala) for contributing a
418 | character encoding fix for Windows users.
419 |
420 | Thanks to [Eugen Stan](https://github.com/ieugen) for suggesting that
421 | `navigator` should also accept byte arrays as well as UTF-8 strings.
422 |
423 | ## License
424 |
425 | Copyright © 2013-2022 Paul Mucur.
426 |
427 | Distributed under the Eclipse Public License, the same as Clojure.
428 |
--------------------------------------------------------------------------------
/src/riveted/core.clj:
--------------------------------------------------------------------------------
1 | (ns ^{:doc "A Clojure library for the fast processing of XML with VTD-XML."
2 | :author "Paul Mucur"}
3 | riveted.core
4 | (:require [clojure.string :as s])
5 | (:import [com.ximpleware VTDGen VTDNav AutoPilot TextIter]))
6 |
7 | (set! *warn-on-reflection* true)
8 |
9 | (defn- token-type-name
10 | "Private. Return a keyword naming the type of the token at index in nav.
11 |
12 | Possible values are:
13 |
14 | * :document
15 | * :start-tag
16 | * :end-tag
17 | * :attribute-name
18 | * :attribute-value
19 | * :namespace
20 | * :character-data
21 | * :comment
22 | * :processing-instruction-name
23 | * :processing-instruction-value
24 | * :declaration-attribute-name
25 | * :declaration-attribute-value
26 | * :cdata
27 | * :doctype"
28 | [^VTDNav nav index]
29 | (condp = (.getTokenType nav index) ; no default clause: an unmapped type throws IllegalArgumentException
30 | VTDNav/TOKEN_DOCUMENT :document
31 | VTDNav/TOKEN_STARTING_TAG :start-tag
32 | VTDNav/TOKEN_ENDING_TAG :end-tag
33 | VTDNav/TOKEN_ATTR_NAME :attribute-name
34 | VTDNav/TOKEN_ATTR_NS :namespace
35 | VTDNav/TOKEN_ATTR_VAL :attribute-value
36 | VTDNav/TOKEN_CHARACTER_DATA :character-data
37 | VTDNav/TOKEN_COMMENT :comment
38 | VTDNav/TOKEN_PI_NAME :processing-instruction-name
39 | VTDNav/TOKEN_PI_VAL :processing-instruction-value
40 | VTDNav/TOKEN_DEC_ATTR_NAME :declaration-attribute-name
41 | VTDNav/TOKEN_DEC_ATTR_VAL :declaration-attribute-value
42 | VTDNav/TOKEN_CDATA_VAL :cdata
43 | VTDNav/TOKEN_DTD_VAL :doctype))
44 |
45 | (defn- index-seq
46 |   "Private. Lazily walk the tokens of the given VTDNav, starting at index and
47 |   stopping once the token count is reached.
48 |
49 |   Each token is a map holding a :type and a :value entry."
50 |   [^VTDNav nav index]
51 |   (lazy-seq
52 |     (when (> (.getTokenCount nav) index)
53 |       (let [token {:type  (token-type-name nav index)
54 |                    :value (.toNormalizedString nav index)}]
55 |         (cons token (index-seq nav (inc index)))))))
56 |
57 | ;;; Wrapper type for the VTDNav class in order to implement Clojure's
58 | ;;; Sequential, Seqable and Counted interfaces.
59 |
60 | (deftype Navigator [^VTDNav nav]
61 | clojure.lang.Sequential
62 | clojure.lang.Seqable
63 | (seq [this] ; lazy token maps, starting at the navigator's current index
64 | (index-seq nav (.getCurrentIndex nav)))
65 | clojure.lang.Counted
66 | (count [this] ; NOTE(review): total token count of the VTDNav, which can differ from (count (seq this))
67 | (.getTokenCount nav)))
68 |
69 | (defn- vtd-nav
70 | "Private. Return the VTDNav for a given Navigator or nil if not applicable.
71 |
72 | The use of vary-meta in the inline version of this function is in order to
73 | type hint the navigator and return type.
74 |
75 | See:
76 | http://stackoverflow.com/questions/7754429/clojure-defmacro-loses-metadata"
77 | {:inline (fn [navigator] ; NOTE: the argument form is spliced into the expansion twice
78 | `(when (instance? Navigator ~navigator)
79 | ~(vary-meta `(.nav ~(vary-meta navigator assoc :tag `Navigator))
80 | assoc :tag `VTDNav)))}
81 | ^VTDNav
82 | [^Navigator navigator]
83 | (when (instance? Navigator navigator) ; anything else (including nil) yields nil
84 | (.nav navigator)))
85 |
86 | (defn- index
87 | "Private. Return the current index of the given navigator (nil if it is not a Navigator)."
88 | {:inline (fn [navigator] ; same logic as the function body below
89 | `(when-let [nav# (vtd-nav ~navigator)]
90 | (.getCurrentIndex nav#)))}
91 | [navigator]
92 | (when-let [nav (vtd-nav navigator)]
93 | (.getCurrentIndex nav)))
94 |
95 | (defn- clone
96 |   "Private. Copy the given navigator, returning a fresh Navigator (or nil)."
97 |   {:inline (fn [navigator]
98 |              `(when-let [nav# (vtd-nav ~navigator)]
99 |                 (Navigator. (.cloneNav nav#))))}
100 |   [navigator]
101 |   (when-let [source (vtd-nav navigator)]
102 |     (Navigator. (.cloneNav source))))
103 |
104 | (defprotocol Navigable
105 | "Protocol for types that can be used to generate a VTD navigator."
106 | (navigator
107 | [xml]
108 | [xml namespace-aware]
109 | "Return a VTD navigator for a given byte array or UTF-8 string of XML with
110 | optional namespace support. If called with only a byte array or string,
111 | namespace support is disabled.
112 |
113 | Examples:
114 |
115 | ; Return a navigator for the given byte array with no namespace support.
116 | (navigator my-byte-array)
117 |
118 | ; Return a navigator for the given UTF-8 string with no namespace support.
119 | (navigator \"<foo>Bar</foo>\")
120 |
121 | ; Return a navigator for the given UTF-8 string with namespace support.
122 | (navigator \"<foo>Bar</foo>\" true)"))
123 |
124 | (extend-protocol Navigable
125 |   (Class/forName "[B") ; the class of byte arrays
126 |   (navigator
127 |     ([xml] (navigator xml false))
128 |     ([xml namespace-aware]
129 |      (-> (doto (VTDGen.) (.setDoc xml) (.parse namespace-aware))
130 |          .getNav
131 |          Navigator.)))
132 |
133 |   java.lang.String ; strings are turned into UTF-8 bytes and handled as byte arrays
134 |   (navigator
135 |     ([xml] (navigator xml false))
136 |     ([xml namespace-aware] (navigator (.getBytes xml "UTF-8") namespace-aware))))
137 |
138 | (defn tag
139 |   "Return, as a string, the name of the element the given VTD navigator is
140 |   positioned on. When positioned on an attribute (e.g. via an XPath such as
141 |   /@foo), the attribute's name is returned instead.
142 |
143 |   Examples:
144 |
145 |     (tag (root nav))
146 |     ;=> \"root\"
147 |
148 |     (tag (at nav \"/channel/@id\"))
149 |     ;=> \"id\""
150 |   [navigator]
151 |   (when-let [nav (vtd-nav navigator)]
152 |     (.toString nav (.getCurrentIndex nav))))
153 |
154 | (defn- index->text
155 | "Private. Returns the normalized text of the token at the given index in
156 | the given navigator, or nil when the index is -1."
157 | {:inline (fn [navigator index] ; NOTE: the index form is spliced into the expansion twice
158 | `(when-let [nav# (vtd-nav ~navigator)]
159 | (when-not (= ~index -1)
160 | (.toNormalizedString nav# ~index))))}
161 | [navigator index]
162 | (when-let [nav (vtd-nav navigator)]
163 | (when-not (= index -1)
164 | (.toNormalizedString nav index))))
165 |
166 | (defn attr
167 |   "Look up the named attribute on the given navigator and return its value.
168 |   The attribute name may be given as either a keyword or a string.
169 |
170 |   Examples:
171 |
172 |     (attr (root nav) :lang)
173 |     ;=> \"en\""
174 |   [navigator attr-name]
175 |   (when-let [nav (vtd-nav navigator)]
176 |     (let [value-index (.getAttrVal nav (name attr-name))]
177 |       (index->text navigator value-index))))
178 |
179 | (defn attr?
180 | "Test whether the given attribute exists on the current element.
181 | Attributes can be specified with either a keyword or string name.
182 |
183 | Examples:
184 |
185 | (attr? (root nav) :lang)
186 | ;=> true"
187 | [navigator attr-name]
188 | (when-let [nav (vtd-nav navigator)] ; nil (rather than false) when not given a Navigator
189 | (.hasAttr nav (name attr-name))))
190 |
191 | (defn fragment
192 |   "Return a string XML fragment for all nodes under the given navigator.
193 |
194 |   Examples:
195 |
196 |     (fragment nav)
197 |     ;=> \"Some XML as a raw string\""
198 |   [navigator]
199 |   (when-let [nav (vtd-nav navigator)]
200 |     (let [r (.getContentFragment nav)] ; long: offset in the low 32 bits, length in the high 32 bits
201 |       (if (= r -1)
202 |         "" ; -1 signals that there is no content fragment
203 |         (.toString nav (bit-and r 16rFFFFFFFF) (bit-shift-right r 32))))))
204 |
205 | ;;; Transient interface for navigation.
206 |
207 | (defn- navigate!
208 |   "Private. Low level primitive that moves the given navigator in the given
209 |   direction (optionally only onto an element with the given name), mutating
210 |   it in place.
211 |
212 |   Because the navigator's internal state changes, the usual hazards of
213 |   mutability (concurrency woes included) apply; in exchange, no copy of the
214 |   navigator's state is made on each move.
215 |
216 |   Direction must be one of the standard VTDNav constants:
217 |
218 |   * VTDNav/ROOT;
219 |   * VTDNav/FIRST_CHILD;
220 |   * VTDNav/LAST_CHILD;
221 |   * VTDNav/NEXT_SIBLING;
222 |   * VTDNav/PREV_SIBLING;
223 |   * VTDNav/PARENT.
224 |
225 |   Examples:
226 |
227 |     ; Move the navigator to the document root.
228 |     (navigate! nav VTDNav/ROOT)
229 |
230 |     ; Move the navigator to the first p child tag.
231 |     (navigate! nav VTDNav/FIRST_CHILD :p)
232 |
233 |   See:
234 |     http://vtd-xml.sourceforge.net/javadoc/com/ximpleware/VTDNav.html"
235 |   ([navigator direction]
236 |    {:pre [(>= direction 0) (<= direction 5)]}
237 |    (let [nav (vtd-nav navigator)]
238 |      (when (and nav (.toElement nav direction))
239 |        navigator)))
240 |   ([navigator direction element]
241 |    {:pre [(>= direction 0) (<= direction 5)]}
242 |    (let [nav (vtd-nav navigator)]
243 |      (when (and nav (.toElement nav direction (name element)))
244 |        navigator))))
245 |
246 | (defn root!
247 | "Move the given navigator to the document root, mutating it in place."
248 | [navigator]
249 | (navigate! navigator VTDNav/ROOT)) ; yields the navigator itself, or nil if the move fails
250 |
251 | (defn parent!
252 | "Move the given navigator to the current element's parent element, mutating
253 | it in place. Returns the navigator, or nil if the move fails."
254 | [navigator]
255 | (navigate! navigator VTDNav/PARENT))
256 |
257 | (defn next-sibling!
258 | "Move the given navigator to the current element's next sibling element
259 | (restricted by an optional element type), mutating it in place. Returns the navigator, or nil if the move fails.
260 |
261 | Examples:
262 |
263 | ; Move nav to the next sibling element.
264 | (next-sibling! nav)
265 |
266 | ; Move nav to the next sibling b element.
267 | (next-sibling! nav :b)"
268 | ([navigator] (navigate! navigator VTDNav/NEXT_SIBLING))
269 | ([navigator element] (navigate! navigator VTDNav/NEXT_SIBLING element)))
270 |
271 | (defn previous-sibling!
272 | "Move the given navigator to the current element's previous sibling element
273 | (restricted by an optional element type), mutating it in place. Returns the navigator, or nil if the move fails.
274 |
275 | Examples:
276 |
277 | ; Move nav to the previous sibling element.
278 | (previous-sibling! nav)
279 |
280 | ; Move nav to the previous sibling b element.
281 | (previous-sibling! nav :b)"
282 | ([navigator] (navigate! navigator VTDNav/PREV_SIBLING))
283 | ([navigator element] (navigate! navigator VTDNav/PREV_SIBLING element)))
284 |
285 | (defn first-child!
286 | "Move the given navigator to the current element's first child element
287 | (restricted by an optional element type), mutating it in place. Returns the navigator, or nil if the move fails.
288 |
289 | Examples:
290 |
291 | ; Move nav to the first child element.
292 | (first-child! nav)
293 |
294 | ; Move nav to the first child b element.
295 | (first-child! nav :b)"
296 | ([navigator] (navigate! navigator VTDNav/FIRST_CHILD))
297 | ([navigator element] (navigate! navigator VTDNav/FIRST_CHILD element)))
298 |
299 | (defn last-child!
300 | "Move the given navigator to the current element's last child element
301 | (restricted by an optional element type), mutating it in place. Returns the navigator, or nil if the move fails.
302 |
303 | Examples:
304 |
305 | ; Move nav to the last child element.
306 | (last-child! nav)
307 |
308 | ; Move nav to the last child b element.
309 | (last-child! nav :b)"
310 | ([navigator] (navigate! navigator VTDNav/LAST_CHILD))
311 | ([navigator element] (navigate! navigator VTDNav/LAST_CHILD element)))
312 |
313 | ;;; Immutable interface to navigation.
314 |
315 | (defn- navigate
316 |   "Private. Low level primitive that returns a new navigator produced by
317 |   moving a copy of the given one in the given direction (optionally only onto
318 |   an element with the given name). Unlike (riveted.core/navigate!), the
319 |   navigator passed in is *not* mutated.
320 |
321 |   The given navigator is cloned before the move, so this uses more memory
322 |   than (riveted.core/navigate!) in exchange for the benefits of an immutable
323 |   interface.
324 |
325 |   Direction must be one of the standard VTDNav constants:
326 |
327 |   * VTDNav/ROOT;
328 |   * VTDNav/FIRST_CHILD;
329 |   * VTDNav/LAST_CHILD;
330 |   * VTDNav/NEXT_SIBLING;
331 |   * VTDNav/PREV_SIBLING;
332 |   * VTDNav/PARENT.
333 |
334 |   Examples:
335 |
336 |     ; Return a new navigator pointing at the document root.
337 |     (navigate nav VTDNav/ROOT)
338 |
339 |     ; Return a new navigator pointing at the first p child tag.
340 |     (navigate nav VTDNav/FIRST_CHILD :p)
341 |
342 |   See:
343 |     http://vtd-xml.sourceforge.net/javadoc/com/ximpleware/VTDNav.html
344 |     (riveted.core/navigate!)"
345 |   ([navigator direction]
346 |    (if-let [copy (clone navigator)]
347 |      (navigate! copy direction)))
348 |   ([navigator direction element]
349 |    (if-let [copy (clone navigator)]
350 |      (navigate! copy direction element))))
351 |
352 | (defn root
353 | "Return a new navigator pointing to the document root."
354 | [navigator]
355 | (navigate navigator VTDNav/ROOT)) ; works on a clone, so the given navigator is not moved
356 |
357 | (defn parent
358 | "Return a new navigator pointing to the parent element of the given
359 | navigator, or nil if the move fails."
360 | [navigator]
361 | (navigate navigator VTDNav/PARENT))
362 |
363 | (defn next-sibling
364 | "Return a new navigator pointing to the current element's next sibling element
365 | (restricted by an optional element type), or nil if the move fails.
366 |
367 | Examples:
368 |
369 | ; Return a new navigator pointing to the next sibling element of nav.
370 | (next-sibling nav)
371 |
372 | ; Return a new navigator pointing to the next sibling b element of nav.
373 | (next-sibling nav :b)"
374 | ([navigator] (navigate navigator VTDNav/NEXT_SIBLING))
375 | ([navigator element] (navigate navigator VTDNav/NEXT_SIBLING element)))
376 |
377 | (defn previous-sibling
378 | "Return a new navigator pointing to the current element's previous sibling
379 | element (restricted by an optional element type), or nil if the move fails.
380 |
381 | Examples:
382 |
383 | ; Return a new navigator pointing to the previous sibling element of nav.
384 | (previous-sibling nav)
385 |
386 | ; Return a new navigator pointing to the previous sibling b element of nav.
387 | (previous-sibling nav :b)"
388 | ([navigator] (navigate navigator VTDNav/PREV_SIBLING))
389 | ([navigator element] (navigate navigator VTDNav/PREV_SIBLING element)))
390 |
391 | (defn first-child
392 | "Return a new navigator pointing to the current element's first child element
393 | (restricted by an optional element type), or nil if the move fails.
394 |
395 | Examples:
396 |
397 | ; Return a new navigator pointing to the first child element of nav.
398 | (first-child nav)
399 |
400 | ; Return a new navigator pointing to the first child b element of nav.
401 | (first-child nav :b)"
402 | ([navigator] (navigate navigator VTDNav/FIRST_CHILD))
403 | ([navigator element] (navigate navigator VTDNav/FIRST_CHILD element)))
404 |
405 | (defn last-child
406 | "Return a new navigator pointing to the current element's last child element
407 | (restricted by an optional element type), or nil if the move fails.
408 |
409 | Examples:
410 |
411 | ; Return a new navigator pointing to the last child element of nav.
412 | (last-child nav)
413 |
414 | ; Return a new navigator pointing to the last child b element of nav.
415 | (last-child nav :b)"
416 | ([navigator] (navigate navigator VTDNav/LAST_CHILD))
417 | ([navigator element] (navigate navigator VTDNav/LAST_CHILD element)))
418 |
419 | (defn next-siblings
420 |   "Lazily list a navigator for each sibling element that follows the given
421 |   navigator (optionally keeping only elements of the given type).
422 |
423 |   Examples:
424 |
425 |     ; Return navigators for every next sibling element to nav.
426 |     (next-siblings nav)
427 |
428 |     ; Return navigators for every next sibling p element to nav.
429 |     (next-siblings nav :p)"
430 |   ([navigator]
431 |    (lazy-seq
432 |      (when-let [following (next-sibling navigator)]
433 |        (cons following (next-siblings following)))))
434 |   ([navigator element]
435 |    (lazy-seq
436 |      (when-let [following (next-sibling navigator element)]
437 |        (cons following (next-siblings following element))))))
438 |
439 | (defn previous-siblings
440 |   "Lazily list a navigator for each sibling element that precedes the given
441 |   navigator (optionally keeping only elements of the given type).
442 |
443 |   The walk proceeds right-to-left from the given navigator, so the resulting
444 |   sequence is in the reverse of document order.
445 |
446 |   Examples:
447 |
448 |     ; Return navigators for every previous sibling element to nav.
449 |     (previous-siblings nav)
450 |
451 |     ; Return navigators for every previous sibling p element to nav.
452 |     (previous-siblings nav :p)"
453 |   ([navigator]
454 |    (lazy-seq
455 |      (when-let [preceding (previous-sibling navigator)]
456 |        (cons preceding (previous-siblings preceding)))))
457 |   ([navigator element]
458 |    (lazy-seq
459 |      (when-let [preceding (previous-sibling navigator element)]
460 |        (cons preceding (previous-siblings preceding element))))))
461 |
462 | (defn siblings
463 | "Return navigators for all siblings to the given navigator (optionally
464 | restricted by a given element type), or nil when there are none.
465 |
466 | Note that this is not lazy in order to preserve the correct order of nodes
467 | and previous siblings need to be fully realised for sorting.
468 |
469 | Examples:
470 |
471 | ; Return navigators for all siblings to nav.
472 | (siblings nav)
473 |
474 | ; Return navigators for all sibling p elements to nav.
475 | (siblings nav :p)"
476 | {:inline (fn [navigator & args] ; mirrors the two function bodies below
477 | `(let [left# (reverse (previous-siblings ~navigator ~@args))
478 | right# (next-siblings ~navigator ~@args)]
479 | (when (or (seq left#) (seq right#))
480 | (concat left# right#))))
481 | :inline-arities #{1 2}}
482 | ([navigator]
483 | (let [left (reverse (previous-siblings navigator)) ; reversed back into document order
484 | right (next-siblings navigator)]
485 | (when (or (seq left) (seq right))
486 | (concat left right))))
487 | ([navigator element]
488 | (let [left (reverse (previous-siblings navigator element)) ; reversed back into document order
489 | right (next-siblings navigator element)]
490 | (when (or (seq left) (seq right))
491 | (concat left right)))))
492 |
493 | (defn children
494 | "Return a lazy sequence of navigators for all child nodes of the given
495 | navigator (optionally restricted by a given element type).
496 |
497 | Examples:
498 |
499 | ; Return navigators for all children of nav.
500 | (children nav)
501 |
502 | ; Return navigators for all child p elements of nav.
503 | (children nav :p)"
504 | {:inline (fn [navigator & args] ; mirrors the two function bodies below
505 | `(lazy-seq
506 | (when-let [child# (first-child ~navigator ~@args)]
507 | (cons child# (next-siblings child# ~@args)))))
508 | :inline-arities #{1 2}}
509 | ([navigator]
510 | (lazy-seq
511 | (when-let [child (first-child navigator)] ; the first child, then each of its next siblings
512 | (cons child (next-siblings child)))))
513 | ([navigator element]
514 | (lazy-seq
515 | (when-let [child (first-child navigator element)] ; the first matching child, then its matching next siblings
516 | (cons child (next-siblings child element))))))
517 |
518 | (defn- text-seq
519 |   "Private. Lazily yields each text node index produced by the given TextIter."
520 |   [^TextIter text-iter]
521 |   (lazy-seq
522 |     (let [next-index (.getNext text-iter)]
523 |       (when (not= next-index -1)
524 |         (cons next-index (text-seq text-iter))))))
525 |
526 | (defn- text-indices
527 |   "Private. Builds a TextIter touching the given navigator and returns the
528 |   sequence of indices of every text node it yields."
529 |   [navigator]
530 |   (when-let [nav (vtd-nav navigator)]
531 |     (text-seq (doto (TextIter.)
532 |                 (.touch nav)))))
533 |
534 | (defn- text-descendant-indices
535 |   "Private. Returns an ordered sequence of the indices of all text nodes that
536 |   are descendants of the given navigator."
537 |   [navigator]
538 |   (sort (mapcat text-indices ; gather the whole subtree first, then sort once
539 |                 (tree-seq (constantly true) children navigator))))
540 |
541 | (defn- text-descendants
542 |   "Private. Returns the text of every text node descending from the given navigator."
543 |   [navigator]
544 |   (for [i (text-descendant-indices navigator)] (index->text navigator i)))
545 |
546 | (defn- token-type
547 |   "Private. Returns the token type at the navigator's current index, or at the given index."
548 |   ([navigator] (token-type navigator (index navigator)))
549 |   ([navigator token-index]
550 |    (when-let [nav (vtd-nav navigator)]
551 |      (.getTokenType nav token-index))))
552 |
553 | (defn element?
554 | "Tests whether the given navigator is currently positioned on an element."
555 | [navigator]
556 | (= VTDNav/TOKEN_STARTING_TAG (token-type navigator))) ; an element is identified by its start tag token
557 |
558 | (defn document?
559 | "Tests whether the given navigator is currently positioned on the document."
560 | [navigator]
561 | (= VTDNav/TOKEN_DOCUMENT (token-type navigator)))
562 |
563 | (defn attribute?
564 | "Tests whether the given navigator is currently positioned on an attribute."
565 | [navigator]
566 | (= VTDNav/TOKEN_ATTR_NAME (token-type navigator))) ; i.e. the current token is an attribute name
567 |
568 | (defn text
569 | "Return all descendant text content below the given navigator as one string.
570 | This means both the value of a simple text node and also the resulting text
571 | value of a mixed content node such as <p><b>Foo</b> bar</p>. If the navigator
572 | is currently positioned on an attribute (e.g. by using an XPath like /@foo),
573 | return the value of the attribute.
574 |
575 | Examples:
576 |
577 | ; Returns \"Foo\" given nav points to <p>Foo</p>
578 | (text nav)
579 |
580 | ; Returns \"Foo bar\" given nav points to <p><b>Foo</b> bar</p>
581 | (text nav)
582 |
583 | ; Returns \"123\" given nav points to @src of <img src=\"123\"/>
584 | (text nav)"
585 | [navigator]
586 | (if (attribute? navigator) (:value (second navigator)) ; the token after the attribute name
587 | (when-let [texts (seq (text-descendants navigator))] ; nil when there is no text at all
588 | (s/join " " texts))))
589 |
590 | (defn- xpath-seq
591 |   "Private. Lazily evaluates the AutoPilot's XPath until it is exhausted,
592 |   yielding a navigator for each match."
593 |   [navigator ^AutoPilot autopilot]
594 |   (lazy-seq
595 |     (when (not= -1 (.evalXPath autopilot))
596 |       ;; Snapshot the navigator after each successful evaluation step.
597 |       (cons (clone navigator)
598 |             (xpath-seq navigator autopilot)))))
599 |
600 | (defn search
601 |   "Evaluate the given XPath against the navigator and return a lazy sequence
602 |   of navigators, one per match. With a namespace aware navigator, a namespace
603 |   prefix and URL may also be supplied for use in the XPath.
604 |
605 |   Examples:
606 |
607 |     ; Returns navigators for all matching elements.
608 |     (search nav \"/article/title\")
609 |
610 |     ; Returns navigators for all matching elements providing ns-nav is
611 |     ; namespace aware.
612 |     (search ns-nav \"//ns1:title\" \"ns1\" \"http://example.com/ns\")"
613 |   ([navigator xpath]
614 |    (when-let [cursor (clone navigator)]
615 |      (let [autopilot (AutoPilot. (vtd-nav cursor))]
616 |        (.selectXPath autopilot xpath)
617 |        (xpath-seq cursor autopilot))))
618 |   ([navigator xpath prefix url]
619 |    (when-let [cursor (clone navigator)]
620 |      (let [autopilot (AutoPilot. (vtd-nav cursor))]
621 |        (.declareXPathNameSpace autopilot prefix url)
622 |        (.selectXPath autopilot xpath)
623 |        (xpath-seq cursor autopilot)))))
624 |
625 | (defn- select-seq
626 | "Private. Returns a lazy sequence of navigators exhaustively iterating through
627 | nodes with the given navigator and AutoPilot."
628 | [navigator ^AutoPilot autopilot]
629 | (lazy-seq
630 | (when (.iterate autopilot) ; stop once the AutoPilot reports no further node
631 | (cons (clone navigator) ; snapshot the navigator after each successful step
632 | (select-seq navigator autopilot)))))
633 |
634 | (defn select
635 |   "Lazily list navigators for every element with the given name; pass * to
636 |   match all elements.
637 |
638 |   Examples:
639 |
640 |     ; Returns navigators for each element in nav.
641 |     (select nav \"*\")
642 |
643 |     ; Returns navigators for all b elements in nav.
644 |     (select nav \"b\")"
645 |   [navigator element]
646 |   (when-let [cursor (clone navigator)]
647 |     (let [autopilot (AutoPilot. (vtd-nav cursor))]
648 |       (.selectElement autopilot (name element))
649 |       (select-seq cursor autopilot))))
650 |
651 | (defn at
652 | "Search for the given XPath in the navigator, returning the first matching
653 | navigator or nil when nothing matches. If used with a namespace aware
654 | navigator, also takes a namespace prefix and URL for use in the XPath.
655 |
656 | Examples:
657 |
658 | ; Returns a single navigator for the first matching element.
659 | (at nav \"/article/title\")
660 |
661 | ; Returns a single navigator for the first matching element providing
662 | ; ns-nav is namespace aware.
663 | (at ns-nav \"//ns1:title\" \"ns1\" \"http://example.com/ns\")"
664 | {:inline (fn [& args] `(first (search ~@args))) ; mirrors the two function bodies below
665 | :inline-arities #{2 4}}
666 | ([navigator xpath] (first (search navigator xpath)))
667 | ([navigator xpath prefix url] (first (search navigator xpath prefix url))))
668 |
669 |
--------------------------------------------------------------------------------