├── LICENSE ├── README.md ├── project.clj ├── src └── pl │ └── danieljanus │ └── tagsoup.clj └── test └── pl └── danieljanus └── tagsoup_test.clj /LICENSE: -------------------------------------------------------------------------------- 1 | clj-tagsoup --- Copyright (c) 2010, Daniel Janus 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Clojars Project](http://clojars.org/clj-tagsoup/clj-tagsoup/latest-version.svg)](http://clojars.org/clj-tagsoup/clj-tagsoup) 2 | 3 | clj-tagsoup 4 | =========== 5 | 6 | This is a HTML parser for Clojure, somewhat akin to Common Lisp's 7 | [cl-html-parse]. It is a wrapper around the [TagSoup] Java SAX 8 | parser, but has a DOM interface. It is buildable by [Leiningen]. 
9 | 10 | Usage 11 | ----- 12 | 13 | The two main functions defined by clj-tagsoup are `parse` and `parse-string`. 14 | The first one can take anything accepted by clojure.java.io's [reader] function 15 | except for a `Reader`, 16 | while the second can parse HTML from a string. 17 | 18 | The resulting HTML tree is a vector, consisting of: 19 | 20 | 1. a keyword representing the tag name, 21 | 2. a map of tag attributes (mapping keywords to strings), 22 | 3. children nodes (strings or vectors of the same format). 23 | 24 | This is the same format as used by [hiccup], thus the output of `parse` is 25 | appropriate to pass to hiccup. 26 | 27 | There are also utility accessors (`tag`, `attributes`, `children`). 28 | 29 | clj-tagsoup will automatically use the correct encoding to parse the file if 30 | one is specified in either the HTTP headers (if the argument to `parse` is a 31 | URL object or a string representing one) or a `<meta http-equiv="Content-Type">` tag. 32 | 33 | clj-tagsoup is meant to parse HTML tag soup, but, in practice, nothing 34 | prevents you from using it to parse arbitrary (potentially malformed) 35 | XML. The `:xml` keyword argument causes clj-tagsoup to take into 36 | consideration the XML header when detecting the encoding. 37 | 38 | There are two other options for parsing XML: 39 | 40 | * `parse-xml` just invokes `clojure.xml/parse` with TagSoup, so 41 | the output format is compatible with `clojure.xml` and is not 42 | the one described above. 43 | * `lazy-parse-xml` (introduced in clj-tagsoup 0.3.0) returns a 44 | lazy sequence of `Event` records defined by `clojure.data.xml`, 45 | similarly to the `source-seq` function from that library. 
46 | 47 | Example 48 | ------- 49 | 50 | *project.clj*: 51 | ```clojure 52 | (defproject clj-tagsoup-example "0.0.1" 53 | :dependencies [[clj-tagsoup/clj-tagsoup "0.3.0"]]) 54 | ``` 55 | 56 | `lein repl`: 57 | 58 | ```clojure 59 | (use 'pl.danieljanus.tagsoup) 60 | => nil 61 | 62 | (parse "http://example.com") 63 | => [:html {} 64 | [:head {} 65 | [:title {} "Example Web Page"]] 66 | [:body {} 67 | [:p {} "You have reached this web page by typing \"example.com\",\n\"example.net\",\n or \"example.org\" into your web browser."] 68 | [:p {} "These domain names are reserved for use in documentation and are not available \n for registration. See " 69 | [:a {:shape "rect", :href "http://www.rfc-editor.org/rfc/rfc2606.txt"} "RFC \n 2606"] 70 | ", Section 3."]]] 71 | ``` 72 | 73 | FAQ 74 | --- 75 | 76 | * Why not just use [Enlive]? 77 | 78 | Truth be told, I wrote clj-tagsoup prior to discovering Enlive, which is an excellent library. That said, 79 | I believe clj-tagsoup has its niche. Here is an _à la carte_ list of differences between the two: 80 | 81 | - Enlive is a full-blown templating library; clj-tagsoup just parses HTML (and XML). 82 | - Unlike Enlive, clj-tagsoup's `parse` function goes out of its way to return parsed data in a proper 83 | encoding. It will detect the `<meta http-equiv="Content-Type">` tag in your data and reinterpret the input 84 | stream in the indicated encoding as needed. 85 | - clj-tagsoup boasts a way to lazily parse XML with TagSoup. 86 | 87 | * What's with the dependency on stax-utils? 88 | 89 | It's for `lazy-parse-xml`. It's needed because that function uses [clojure.data.xml], which under the hood 90 | uses the StAX API. TagSoup is a SAX parser, so a bridge between the two parsing APIs is needed. 
91 | 92 | If you don't use `lazy-parse-xml`, you can optionally exclude stax-utils from your project.clj, like this: 93 | 94 | :dependencies [[clj-tagsoup "0.3.0" :exclusions [net.java.dev.stax-utils/stax-utils]]] 95 | 96 | Author 97 | ------ 98 | 99 | clj-tagsoup was written by [Daniel Janus]. 100 | 101 | [cl-html-parse]: http://www.cliki.net/CL-HTML-Parse 102 | [clojure.data.xml]: https://github.com/clojure/data.xml 103 | [reader]: http://richhickey.github.com/clojure-contrib/branch-1.1.x/duck-streams-api.html#clojure.contrib.duck-streams/reader 104 | [Daniel Janus]: http://danieljanus.pl 105 | [Enlive]: http://github.com/cgrand/enlive 106 | [TagSoup]: http://home.ccil.org/~cowan/XML/tagsoup/ 107 | [Leiningen]: http://github.com/technomancy/leiningen 108 | [hiccup]: http://github.com/weavejester/hiccup 109 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject clj-tagsoup "0.3.0" 2 | :description "A HTML parser for Clojure." 
3 | :dependencies [[org.clojure/clojure "1.2.0"] 4 | [org.clojure/data.xml "0.0.8"] 5 | [org.clojars.nathell/tagsoup "1.2.1"] 6 | [net.java.dev.stax-utils/stax-utils "20040917"]]) 7 | -------------------------------------------------------------------------------- /src/pl/danieljanus/tagsoup.clj: -------------------------------------------------------------------------------- 1 | (ns pl.danieljanus.tagsoup 2 | (:require [clojure.zip :as zip] 3 | [clojure.xml :as xml] 4 | [clojure.data.xml :as lazy-xml]) 5 | (:import (org.ccil.cowan.tagsoup Parser) 6 | (java.net URI URL MalformedURLException Socket) 7 | (java.io InputStream File FileInputStream ByteArrayInputStream BufferedInputStream InputStreamReader BufferedReader) 8 | (javax.xml.stream XMLEventReader XMLStreamConstants) 9 | (javax.xml.stream.events Attribute StartElement XMLEvent) 10 | (javanet.staxutils ContentHandlerToXMLEventWriter XMLEventPipe) 11 | (org.xml.sax Attributes InputSource))) 12 | 13 | (defn- attributes-map 14 | "Converts an Attributes object into a Clojure map," 15 | [^Attributes attrs] 16 | (into {} 17 | (map #(vector (keyword (.getQName attrs %)) (.getValue attrs %)) (range (.getLength attrs))))) 18 | 19 | (defn tag 20 | "Returns the tag name of a given HTML tree node as a keyword." 21 | [node] 22 | (first node)) 23 | 24 | (defn attributes 25 | "Returns the attributes of a given HTML tree node as a Clojure map." 26 | [node] 27 | (second node)) 28 | 29 | (defn children 30 | "Returns a seq of children nodes of a given HTML tree node." 31 | [node] 32 | (rest (rest node))) 33 | 34 | (defn- encoding-from-content-type 35 | "Strips the character-set name from a Content-Type: HTTP header value." 36 | [^String content-type] 37 | (when content-type 38 | (second (re-find #"charset=(.*)$" (.toLowerCase content-type))))) 39 | 40 | (defmulti #^{:doc "Like clojure.java.io/reader, but 41 | attempts to convert its argument to an InputStream. 
Returns a map 42 | mapping :stream to the stream and, potentially, :encoding to the 43 | encoding detected on that stream."} 44 | input-stream class) 45 | 46 | (defmethod input-stream InputStream [#^InputStream x] 47 | {:stream x}) 48 | 49 | (defmethod input-stream File [#^File x] 50 | {:stream (FileInputStream. x)}) 51 | 52 | (defmethod input-stream URL [#^URL x] 53 | (if (= "file" (.getProtocol x)) 54 | (input-stream (File. (.getPath x))) 55 | (let [connection (.openConnection x)] 56 | {:stream (.getInputStream connection), :encoding (-> connection (.getHeaderField "Content-Type") encoding-from-content-type)}))) 57 | 58 | (defmethod input-stream URI [#^URI x] 59 | (input-stream (.toURL x))) 60 | 61 | (defmethod input-stream String [#^String x] 62 | (try (let [url (URL. x)] 63 | (input-stream url)) 64 | (catch MalformedURLException e 65 | (input-stream (File. x))))) 66 | 67 | (defmethod input-stream Socket [#^Socket x] 68 | {:stream (.getInputStream x)}) 69 | 70 | (defmethod input-stream :default [x] 71 | (throw (Exception. (str "Cannot open " (pr-str x) " as an input stream.")))) 72 | 73 | (defn read-xml-encoding-declaration 74 | "Reads XML encoding declaration from a BufferedInputStream." 75 | [^BufferedInputStream stream] 76 | (let [arr-size 1024 77 | arr (make-array Byte/TYPE arr-size)] 78 | (.mark stream arr-size) 79 | (loop [offset 0] 80 | (let [nread (.read stream arr offset (- arr-size offset))] 81 | (if (or (= nread -1) (= (+ offset nread) arr-size)) 82 | arr 83 | (recur (+ offset nread))))) 84 | (.reset stream) 85 | (let [s (String. arr (java.nio.charset.Charset/forName "ISO-8859-1"))] 86 | (when (.startsWith s " ((input-stream source) :stream) BufferedInputStream.) 94 | source (InputSource. stream) 95 | xml-encoding (read-xml-encoding-declaration stream)] 96 | (when xml-encoding 97 | (.setEncoding source xml-encoding)) 98 | [parser source])) 99 | 100 | (defn- startparse-tagsoup 101 | "A startparse function compatible with clojure.xml." 
102 | [input content-handler] 103 | (let [[^Parser parser ^InputSource source] (make-parser-and-source input)] 104 | (.setContentHandler parser content-handler) 105 | (.parse parser source) 106 | parser)) 107 | 108 | (defn parse 109 | "Parses a file or HTTP URL. file may be anything that can be fed 110 | to clojure.java.io/reader. If strip-whitespace is true 111 | removes empty (whitespace-only) PCDATA from in between the tags, which 112 | makes the resulting tree cleaner. If prefer-header-http-info is true 113 | and the encoding is specified in both tag and the 114 | HTTP headers (in this case, input must be a URL or a string 115 | representing one), the latter is preferred." 116 | [input & {:keys [xml strip-whitespace prefer-header-http-info], :or {strip-whitespace true}}] 117 | (with-local-vars [tree (zip/vector-zip []) pcdata "" reparse false] 118 | (let [{:keys [stream encoding]} (input-stream input) 119 | stream (BufferedInputStream. stream) 120 | source (InputSource. stream) 121 | reparse-exception (Exception. "reparse") 122 | xml-encoding (when xml (read-xml-encoding-declaration stream)) 123 | _ (.mark stream 65536) 124 | _ (.setEncoding source (or (and xml xml-encoding) encoding)) 125 | flush-pcdata #(let [data (var-get pcdata)] 126 | (when-not (empty? data) 127 | (when-not (and strip-whitespace (re-find #"^\s+$" data)) 128 | (var-set tree (-> tree var-get (zip/append-child data)))) 129 | (var-set pcdata ""))) 130 | parser (proxy [Parser] [] 131 | (pcdata [buf offset length] 132 | (var-set pcdata (str (var-get pcdata) (String. 
buf offset length)))) 133 | (startElement [uri localname qname attrs] 134 | (flush-pcdata) 135 | (let [attrs (attributes-map attrs) 136 | tag (keyword localname)] 137 | (when (and (= tag :meta) 138 | (let [^String http-equiv (attrs :http-equiv)] 139 | (and http-equiv (= (.toLowerCase http-equiv) "content-type")))) 140 | (let [charset (encoding-from-content-type (attrs :content))] 141 | (when (and charset 142 | (not (and encoding prefer-header-http-info)) 143 | (not (var-get reparse))) 144 | (.setEncoding source charset) 145 | (var-set reparse true) 146 | (.reset stream) 147 | (throw reparse-exception)))) 148 | (var-set tree (-> tree var-get 149 | (zip/append-child []) 150 | (zip/down) 151 | (zip/rightmost) 152 | (zip/append-child tag) 153 | (zip/append-child attrs))))) 154 | (endElement [uri localname qname] 155 | (flush-pcdata) 156 | (var-set tree (-> tree var-get zip/up))))] 157 | (try 158 | (.parse parser source) 159 | (catch Exception e 160 | (if (= e reparse-exception) 161 | (do 162 | (var-set pcdata "") 163 | (var-set tree (zip/vector-zip [])) 164 | (.parse parser source)) 165 | (throw e)))) 166 | (first (remove string? (zip/root (var-get tree))))))) 167 | 168 | (defn parse-string 169 | "Parses a given string as HTML, passing options to `parse'." 170 | [^String s & options] 171 | (apply parse (-> s .getBytes ByteArrayInputStream.) options)) 172 | 173 | (defn parse-xml 174 | "Parses a given XML using TagSoup and returns the parse result 175 | in the same format as clojure.xml/parse." 176 | [input] 177 | (xml/parse input startparse-tagsoup)) 178 | 179 | (defn- xml-name 180 | "Returns the local part of the name of a given XML entity as a keyword." 181 | [^StartElement x] 182 | (keyword (.getLocalPart (.getName x)))) 183 | 184 | (defn eventize 185 | "Convert a javax.xml.stream.events.XMLEvent to a clojure.data.xml.Event." 
186 | [^XMLEvent ev] 187 | (condp = (.getEventType ev) 188 | XMLStreamConstants/START_ELEMENT 189 | (lazy-xml/event :start-element 190 | (xml-name ev) 191 | (into {} (map (fn [^Attribute attr] [(xml-name attr) (.getValue attr)]) (iterator-seq (.getAttributes ev)))) 192 | nil) 193 | XMLStreamConstants/END_ELEMENT 194 | (lazy-xml/event :end-element 195 | (xml-name ev) 196 | nil 197 | nil) 198 | XMLStreamConstants/CHARACTERS 199 | (lazy-xml/event :characters nil nil (.getData ev)) 200 | nil)) 201 | 202 | (defn event-seq 203 | "Like clojure.data.xml/pull-seq, but works on XMLEventReaders instead of stream readers." 204 | [^XMLEventReader ereader] 205 | (keep eventize (iterator-seq ereader))) 206 | 207 | (defn cancellable-lazy-parse-xml 208 | [input] 209 | (let [pipe (XMLEventPipe.) 210 | ereader (.getReadEnd pipe) 211 | ewriter (.getWriteEnd pipe) 212 | [^Parser parser ^InputSource source] (make-parser-and-source input)] 213 | (.setContentHandler parser (ContentHandlerToXMLEventWriter. ewriter)) 214 | {:future (future 215 | (try 216 | (.parse parser source) 217 | (finally (.close ewriter)))) 218 | :data (event-seq ereader)})) 219 | 220 | (defn lazy-parse-xml 221 | "Parses the XML using TagSoup and as result, returns a lazy 222 | sequence of elements in the same format as 223 | clojure.data.xml/source-seq. Parsing happens on a background thread, 224 | from which XML events are reported to the calling thread via a 225 | XMLEventPipe." 226 | [input] 227 | (:data (cancellable-lazy-parse-xml input))) 228 | -------------------------------------------------------------------------------- /test/pl/danieljanus/tagsoup_test.clj: -------------------------------------------------------------------------------- 1 | (ns pl.danieljanus.tagsoup-test 2 | (:use 3 | pl.danieljanus.tagsoup 4 | clojure.test)) 5 | 6 | (deftest parse-string-test 7 | (is (= [:html {} [:body {} [:p {} "foo"]]] 8 | (parse-string "

foo

")))) 9 | --------------------------------------------------------------------------------