├── .gitignore ├── .travis.yml ├── CHANGELOG.org ├── README.markdown ├── TODO.org ├── docs └── uberdoc.html ├── project.clj ├── src └── yokogiri │ └── core.clj └── test └── yokogiri └── test └── core.clj /.gitignore: -------------------------------------------------------------------------------- 1 | /pom.xml 2 | *jar 3 | /lib 4 | /classes 5 | /native 6 | /.lein-failures 7 | /checkouts 8 | /.lein-deps-sum 9 | /target 10 | pom.xml.asc 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: clojure 2 | lein: lein2 3 | script: lein2 with-profile dev midje 4 | jdk: 5 | - openjdk6 6 | - openjdk7 7 | - oraclejdk7 8 | -------------------------------------------------------------------------------- /CHANGELOG.org: -------------------------------------------------------------------------------- 1 | * <2014-06-13 Fri> -- Version 1.5.8 2 | ** Create page from a string with #'yokogiri.core/create-page 3 | * <2013-12-25 Wed> -- Version 1.5.5 4 | ** Fixed a bug with make-client options not being set 5 | ** Added more usage to docstrings 6 | ** Improved consistency and breadth of docstrings in general 7 | ** Renamed get-client-options to web-client-options and privatized it 8 | ** In turn, renamed get-client-options-map to get-client-options 9 | ** Documented the set of possible options that can be set on a client 10 | ** Added a few tests around make-client 11 | ** yokogiri.core/xpath now returns a vector instead of an ArrayList 12 | * <2013-11-30 Sat> -- Version 1.5.4 13 | ** Bumped version to 1.5.4 14 | ** Updated HTMLUnit dependency to 2.12 15 | ** Added the ability to get and set options on a client 16 | - See: http://htmlunit.sourceforge.net/apidocs/com/gargoylesoftware/htmlunit/WebClientOptions.html 17 | - And: https://github.com/devn/yokogiri/blob/master/src/yokogiri/core.clj#L9 18 | 
-------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # yokogiri 2 | 3 | A thin Clojure wrapper around HTMLUnit with support for xpath and css selectors. 4 | 5 | ## status 6 | 7 | [![Build Status](https://travis-ci.org/devn/yokogiri.png)](https://travis-ci.org/devn/yokogiri) 8 | 9 | [![Clojars Project](http://clojars.org/yokogiri/latest-version.svg)](http://clojars.org/yokogiri) 10 | 11 | ## getting started 12 | 13 | In your `project.clj`: `[yokogiri "1.5.8"]` 14 | ```clojure 15 | (ns myproject.core 16 | (:require [yokogiri.core :as $])) 17 | ``` 18 | 19 | ## usage 20 | ```clojure 21 | ;; Require yokogiri 22 | (ns myproject.core 23 | (:require [yokogiri.core :as y])) 24 | 25 | ;; Make a client 26 | (y/make-client) 27 | 28 | ;; with javascript disabled (look at the docstring for make-client 29 | ;; for all of the available options.): 30 | (let [a-client (y/make-client :javascript false)] 31 | (y/get-client-options a-client)) 32 | 33 | ;; Curious what options are set by default? 34 | (y/get-client-options (y/make-client)) 35 | ;=> {:redirects true, :javascript true, ...} 36 | 37 | ;; XPATH && CSS Scraping 38 | ;; First, we make a client, and get a page. 
39 | (let [client (y/make-client) 40 | page (y/get-page client "http://example.com")] 41 | 42 | ;; XPATH 43 | (y/xpath page "//a") 44 | ;=> [#]>] 45 | 46 | (map y/attrs (y/xpath page "//a")) 47 | ;=> ({:text "More information...", :href "http://www.iana.org/domains/example"}) 48 | 49 | (map y/node-text (y/xpath page "//a")) 50 | ;=> ("More information...") 51 | 52 | ;; CSS 53 | (map y/node-text (y/css page "div.footer-beta-feedback")) 54 | 55 | ;; Get specific attributes 56 | (map #(select-keys (y/attrs %) [:href]) 57 | (y/css page "div.link a"))) 58 | 59 | ;; Other Usage Notes: 60 | ;; We don't *have to* create a client in order to get a page and do stuff with it: 61 | (y/get-page "http://example.com/") 62 | 63 | ;; Dynamically rebind *client* to get a new, temporary client within a scope: 64 | (y/with-client (y/make-client :javascript false) 65 | (y/get-page "http://www.example.com/")) 66 | 67 | ;; Treat a local HTML file as a page: 68 | (y/xpath (y/as-page "docs/uberdoc.html") "//a") 69 | 70 | ;; Treat an HTML string as a page: 71 | (let [html-string "bar"] 72 | (y/xpath (y/create-page html-string) "//a")) 73 | ``` 74 | 75 | ## documentation 76 | 77 | Check out the [nicely formatted documentation](https://rawgithub.com/devn/yokogiri/master/docs/uberdoc.html). 78 | 79 | ## license 80 | 81 | Copyright (C) 2013 Devin Walters 82 | 83 | Distributed under the Eclipse Public License, the same as Clojure. 84 | -------------------------------------------------------------------------------- /TODO.org: -------------------------------------------------------------------------------- 1 | * TODO Allow library consumer to set the BrowserVersion 2 | * TODO What does parity between Yokogiri and Nokogiri look like? 3 | I doubt the answer is 1:1. Nokogiri does more than I think Yokogiri 4 | ought to. Writing vs Reading, Traversal vs Modification, 5 | Abstraction vs Specificity. 
6 | -------------------------------------------------------------------------------- /docs/uberdoc.html: -------------------------------------------------------------------------------- 1 | 2 | yokogiri -- Marginalia

yokogiri

1.5.8


Barebones Nokogiri for Clojure

3032 |

dependencies

org.clojure/clojure
1.5.1
net.sourceforge.htmlunit/htmlunit
2.13
se.fishtank/css-selectors
1.0.2



(this space intentionally left almost blank)
 
3033 |
(ns yokogiri.core
3034 |   (:require [clojure.java.io :as io])
3035 |   (:import [com.gargoylesoftware.htmlunit StringWebResponse WebClient BrowserVersion WebClientOptions]
3036 |            [com.gargoylesoftware.htmlunit.html HtmlPage DomNode DomAttr HTMLParser]
3037 |            [org.w3c.dom NamedNodeMap Node]
3038 |            [se.fishtank.css.selectors.dom DOMNodeSelector]))
3039 |
(set! *warn-on-reflection* true)

Returns the client options object for a WebClient.

3040 |
(defn- web-client-options
  "Returns the options object held by the given WebClient."
  [^WebClient client]
  (. client getOptions))
3042 |
(def set-client-options-map
  "Maps each option keyword to a two-argument function that takes a
  WebClientOptions object and a value, and calls the matching setter."
  {:activex-native
   (fn [^WebClientOptions o v] (.setActiveXNative o v))
   :applet
   (fn [^WebClientOptions o v] (.setAppletEnabled o v))
   :block-popups
   (fn [^WebClientOptions o v] (.setPopupBlockerEnabled o v))
   :css
   (fn [^WebClientOptions o v] (.setCssEnabled o v))
   :geolocation
   (fn [^WebClientOptions o v] (.setGeolocationEnabled o v))
   :homepage
   (fn [^WebClientOptions o v] (.setHomePage o v))
   :insecure-ssl
   (fn [^WebClientOptions o v] (.setUseInsecureSSL o v))
   :print-content-on-failing-status
   (fn [^WebClientOptions o v] (.setPrintContentOnFailingStatusCode o v))
   :redirects
   (fn [^WebClientOptions o v] (.setRedirectEnabled o v))
   :throw-on-failing-status
   (fn [^WebClientOptions o v] (.setThrowExceptionOnFailingStatusCode o v))
   :throw-on-script-error
   (fn [^WebClientOptions o v] (.setThrowExceptionOnScriptError o v))
   :timeout
   (fn [^WebClientOptions o v] (.setTimeout o v))
   :tracking
   (fn [^WebClientOptions o v] (.setDoNotTrackEnabled o v))
   :javascript
   (fn [^WebClientOptions o v] (.setJavaScriptEnabled o v))})
3057 |
;; Forward declaration: set-client-options! refers to *client* before the
;; defonce further down gives it a value.
(declare ^:dynamic *client*)

Sets options on the client.

3058 | 3059 |

Usage:

3060 | 3061 |
(let [client (make-client)]
3062 |   (set-client-options! client {:redirects false}))
3063 | ;=> #<WebClient com.gargoylesoftware.htmlunit.WebClient@7622ccf2>
3064 | 
3065 | 3066 |

Available Options:

3067 | 3068 |
:activex-native                   bool
3069 | :applet                           bool
3070 | :css                              bool
3071 | :geolocation                      bool
3072 | :insecure-ssl                     bool
3073 | :print-content-on-failing-status  bool
3074 | :redirects                        bool
3075 | :throw-on-failing-status          bool
3076 | :throw-on-script-error            bool
3077 | :tracking                         bool
3078 | :javascript                       bool
3079 | :homepage                         string
3080 | :timeout                          integer
3081 | 
3082 |
(defn set-client-options!
  "Sets options on a WebClient and returns that client. With one
  argument the options are applied to *client*.

  opts is a map (or sequence of [key value] pairs) whose keys must be
  keys of set-client-options-map. Every key is checked before any
  setter runs, so an unknown key throws IllegalArgumentException and
  leaves the client untouched.

  **Usage:**

    (let [client (make-client)]
      (set-client-options! client {:redirects false}))

  **Available Options:**

    :activex-native                   bool
    :applet                           bool
    :block-popups                     bool
    :css                              bool
    :geolocation                      bool
    :insecure-ssl                     bool
    :print-content-on-failing-status  bool
    :redirects                        bool
    :throw-on-failing-status          bool
    :throw-on-script-error            bool
    :tracking                         bool
    :javascript                       bool
    :homepage                         string
    :timeout                          integer"
  ([opts] (set-client-options! *client* opts))
  ([^WebClient client opts]
     (let [^WebClientOptions client-opts (web-client-options client)
           ;; (map first ...) rather than keys, so a seq of pairs still works.
           unknown (remove #(contains? set-client-options-map %)
                           (map first opts))]
       ;; Previously an unknown key looked up nil and calling it threw a
       ;; bare NullPointerException after earlier options were applied.
       (when (seq unknown)
         (throw (IllegalArgumentException.
                 (str "Unknown client option(s): " (pr-str (vec unknown))
                      ". Valid options: "
                      (pr-str (vec (sort (keys set-client-options-map))))))))
       (doseq [[k v] opts]
         ((get set-client-options-map k) client-opts v))
       client)))

Returns a map of all options currently set on a client.

3090 | 3091 |

Usage:

3092 | 3093 |
user> (let [client (make-client :redirects false)]
3094 |         (get-client-options client))
3095 | ;=> {:javascript true, :redirects false, ...}
3096 | 
3097 |
(defn get-client-options
  "Returns a map of the options currently set on a client.

  The print-content flag is reported under two keys:
  :print-content-on-failing-status, which is the key the setter map
  uses, and :print-content-on-failing-status-code, which is kept for
  backward compatibility.

  **Usage:**

    user> (let [client (make-client :redirects false)]
            (get-client-options client))
    ;=> {:javascript true, :redirects false, ...}"
  [^WebClient client]
  (let [^WebClientOptions opts (web-client-options client)
        print-content? (.getPrintContentOnFailingStatusCode opts)]
    {:activex-native                       (.isActiveXNative opts)
     :applet                               (.isAppletEnabled opts)
     :block-popups                         (.isPopupBlockerEnabled opts)
     :css                                  (.isCssEnabled opts)
     :geolocation                          (.isGeolocationEnabled opts)
     :homepage                             (.getHomePage opts)
     :insecure-ssl                         (.isUseInsecureSSL opts)
     :javascript                           (.isJavaScriptEnabled opts)
     ;; Same name as the key in set-client-options-map.
     :print-content-on-failing-status      print-content?
     ;; Legacy key, retained so existing readers keep working.
     :print-content-on-failing-status-code print-content?
     :redirects                            (.isRedirectEnabled opts)
     :throw-on-failing-status              (.isThrowExceptionOnFailingStatusCode opts)
     :throw-on-script-error                (.isThrowExceptionOnScriptError opts)
     :timeout                              (.getTimeout opts)
     :tracking                             (.isDoNotTrackEnabled opts)}))

Constructs a new WebClient.

3114 | 3115 |

Usage:

3116 | 3117 |
user> (make-client)
3118 | ;=> #<WebClient com.gargoylesoftware.htmlunit.WebClient@124d43a8>
3119 | 
3120 | 3121 |

With Options:

3122 | 3123 |
user> (make-client :geolocation true
3124 |                    :block-popups false)
3125 | ;=> #<WebClient com.gargoylesoftware.htmlunit.WebClient@4473f04f>
3126 | 
3127 | 3128 |

Available Options:

3129 | 3130 |
:activex-native                   bool
3131 | :applet                           bool
3132 | :css                              bool
3133 | :geolocation                      bool
3134 | :insecure-ssl                     bool
3135 | :print-content-on-failing-status  bool
3136 | :redirects                        bool
3137 | :throw-on-failing-status          bool
3138 | :throw-on-script-error            bool
3139 | :tracking                         bool
3140 | :javascript                       bool
3141 | :homepage                         string
3142 | :timeout                          integer
3143 | 
3144 | 3145 |

See also: yokogiri.core/set-client-options!

3146 |
(defn make-client
  "Constructs a new WebClient, optionally applying keyword options
  through set-client-options!.

  **Usage:**

    user> (make-client)

  **With Options:**

    user> (make-client :geolocation true
                       :block-popups false)

  **Available Options:**

    :activex-native                   bool
    :applet                           bool
    :block-popups                     bool
    :css                              bool
    :geolocation                      bool
    :insecure-ssl                     bool
    :print-content-on-failing-status  bool
    :redirects                        bool
    :throw-on-failing-status          bool
    :throw-on-script-error            bool
    :tracking                         bool
    :javascript                       bool
    :homepage                         string
    :timeout                          integer

  _See also: yokogiri.core/set-client-options!_"
  [& {:as opts}]
  (let [client (WebClient.)]
    ;; Configure the client allocated above. The previous code built a
    ;; second WebClient for the options case and discarded this one.
    (if (seq opts)
      (set-client-options! client opts)
      client)))
3152 |
;; Default client used by the arities that take no explicit client.
;; defonce keeps the existing value if this form is evaluated again.
(defonce ^:dynamic *client* (make-client))

Takes a client which will be bound to *client* 3153 | within the scope of the form.

3154 | 3155 |

Usage:

3156 | 3157 |
user> (with-client (make-client :javascript false)
3158 |         (get-page "http://www.example.com/"))
3159 | ;=> #<HtmlPage HtmlPage(http://www.example.com/)@1536532984>
3160 | 
3161 |
(defmacro with-client
  "Evaluates body with *client* dynamically bound to the client c.

  **Usage:**

    user> (with-client (make-client :javascript false)
            (get-page \"http://www.example.com/\"))"
  [c & body]
  ;; Builds the form (binding [*client* c] body...) explicitly.
  (list* `binding [`*client* c] body))

Takes a string, returns an HtmlPage.

3165 | 3166 |

Usage:

3167 | 3168 |
user> (create-page "<html><body><a href=\"http://example.com\">Link</a></body></html>")
3169 | ;=> #<HtmlPage HtmlPage(file://fake-response-url)@478170219>
3170 | 
3171 |
(defn create-page
  "Parses a string of markup and returns the page produced by
  HTMLParser/parseHtml. The response is given a placeholder file URL,
  and a freshly constructed WebClient supplies the window.

  **Usage:**

    user> (create-page some-html-string)"
  [xml]
  (let [fake-url     (io/as-url "file://fake-response-url")
        web-response (StringWebResponse. xml fake-url)
        window       (.getCurrentWindow (WebClient.))]
    (HTMLParser/parseHtml web-response window)))

Takes a client and a url, returns an HtmlPage.

3180 | 3181 |

Usage:

3182 | 3183 |
user> (get-page (make-client) "http://www.example.com/")
3184 | ;=> #<HtmlPage HtmlPage(http://www.example.com/)@478170219>
3185 | 
3186 |
(defn get-page
  "Fetches url with the given WebClient and returns what its getPage
  method returns. With only a url, *client* is used.

  **Usage:**

    user> (get-page (make-client) \"http://www.example.com/\")"
  ([url]
     (get-page *client* url))
  ([^WebClient client, ^String url]
     (. client getPage url)))

Takes a path as a string and creates a Page you can access with 3190 | #'yokogiri.core/xpath, #'yokogiri.core/css, etc.

3191 | 3192 |

Usage:

3193 | 3194 |
user> (as-page "docs/uberdoc.html")
3195 | ;=> #<HtmlPage HtmlPage(file:/home/user/yokogiri/docs/uberdoc.html)@171016649>
3196 | 
3197 |
(defn as-page
  "Takes a file path, converts it to a URL string and loads it with
  get-page. With only a path, *client* is used.

  **Usage:**

    user> (as-page \"docs/uberdoc.html\")"
  ([path]
     (as-page *client* path))
  ([client path]
     (let [file-url (io/as-url (io/file path))]
       (get-page client (str file-url)))))

Takes an HtmlPage and an xpath string. Returns a vector of nodes 3200 | which match the provided xpath string.

3201 | 3202 |

Usage:

3203 | 3204 |
user> (let [page (get-page your-client "http://www.example.com")]
3205 |         (xpath page "//a"))
3206 | ;=> [#<HtmlAnchor HtmlAnchor[<a href="http://www.iana.org/domains/example">]>]
3207 | 
3208 |
(defn xpath
  "Runs an xpath query against an HtmlPage and returns the matching
  nodes as a vector.

  **Usage:**

    user> (let [page (get-page your-client \"http://www.example.com\")]
            (xpath page \"//a\"))"
  [^HtmlPage page, ^String xpath]
  (let [matches (.getByXPath page xpath)]
    (vec matches)))

Takes an HtmlPage and an xpath string. Returns the first matching 3211 | node which matches the provided xpath string.

3212 | 3213 |

Usage:

3214 | 3215 |
user> (first-by-xpath
3216 |         (get-page your-client "http://www.example.com/")
3217 |         "//a")
3218 | ;=> #<HtmlAnchor HtmlAnchor[<a href="http://www.iana.org/domains/example">]>
3219 | 
3220 |
(defn first-by-xpath
  "Runs an xpath query against an HtmlPage and returns the result of
  the page's getFirstByXPath method.

  **Usage:**

    user> (first-by-xpath
            (get-page your-client \"http://www.example.com/\")
            \"//a\")"
  [^HtmlPage page, ^String xpath]
  (. page getFirstByXPath xpath))

Returns matches for a given CSS selector

3223 | 3224 |

Usage:

3225 | 3226 |
user> (css your-page "a.gbzt")
3227 | ;=> (#<HtmlAnchor HtmlAnchor[<a onclick...>]>, ...)
3228 | 
3229 | 3230 |

http://www.goodercode.com/wp/use-css-selectors-with-htmlunit/ 3231 | TODO: Bumping the version of css-selectors to 1.0.4 breaks 3232 | querying by CSS.

3233 |
(defn css
  "Returns a seq of the nodes in an HtmlPage that match a CSS selector.
  Because the result goes through seq, an empty match set yields nil.

  **Usage:**

    user> (css your-page \"a.gbzt\")"
  [^HtmlPage page, ^String selector]
  (-> (DOMNodeSelector. (.getDocumentElement page))
      (.querySelectorAll selector)
      seq))

Returns a node's XML representation.

3237 | 3238 |

Usage:

3239 | 3240 |
user> (node-xml
3241 |         (first-by-xpath
3242 |           (get-page (make-client) "http://www.example.com/")
3243 |          "//a"))
3244 | ;=> <a href="http://www.iana.org/domains/example">\
3245 | 
3246 | 3247 |

More information...\ 3248 | \

3249 |
(defn node-xml
  "Returns the result of calling asXml on a DomNode.

  **Usage:**

    user> (node-xml
            (first-by-xpath
              (get-page (make-client) \"http://www.example.com/\")
              \"//a\"))"
  [^DomNode node]
  (. node asXml))

Returns a node's text value

3259 | 3260 |

Usage:

3261 | 3262 |
user> (node-text #<HtmlAnchor HtmlAnchor[<a class="foo" id="bar" href="http://example.com">]>)
3263 | ;=> "Search"
3264 | 
3265 |
(defn node-text
  "Returns the result of calling asText on a DomNode.

  **Usage:**

    user> (node-text (first-by-xpath your-page \"//a\"))"
  [^DomNode node]
  (. node asText))

Returns a clojure map of attributes for a given node

3268 | 3269 |

Usage:

3270 | 3271 |
user> (attr-map #<HtmlAnchor HtmlAnchor[<a class="foo" id="bar" href="http://example.com">]>)
3272 | ;=> {:text "Search", :href "http://example.com", :id "bar", :class "foo"}
3273 | 
3274 | 3275 |

See also: yokogiri.core/attrs

3276 |
(defn attr-map
  "Returns a map of a node's attributes, keyed by keywordized attribute
  name, plus a :text entry holding the node's text. :text is added
  last, so it replaces any attribute that is itself named text.

  _See also: yokogiri.core/attrs_"
  [^DomNode node]
  (let [^NamedNodeMap attributes (.getAttributes node)
        add-attr (fn [m idx]
                   (let [^DomAttr a (.item attributes idx)]
                     (assoc m (keyword (.getName a)) (.getValue a))))
        from-attrs (reduce add-attr {} (range (.getLength attributes)))]
    (assoc from-attrs :text (node-text node))))

See also: yokogiri.core/attr-map

3285 |
;; Alias for attr-map. Holds the var rather than the function value, so
;; calls through attrs see later redefinitions of attr-map.
(def  attrs #'yokogiri.core/attr-map)

Returns the HtmlUnit DomAttr objects for a given node

3286 | 3287 |

See also: yokogiri.core/attr-map

3288 | 3289 |

TODO: http://htmlunit.sourceforge.net/apidocs/com/gargoylesoftware/htmlunit/html/DomAttr.html

3290 |
(defn- dom-attr
  "Returns a lazy sequence of the items in a node's attribute map, in
  index order.

  _See also: yokogiri.core/attr-map_"
  [^DomNode node]
  (let [^NamedNodeMap attributes (.getAttributes node)]
    (map (fn [idx] (.item attributes idx))
         (range 0 (.getLength attributes)))))
3295 |
(comment
3296 |   (def c (make-client))
3297 |   (def p (get-page c "http://www.example.com/"))
3298 |   (xpath p "//a")
3299 |   (map attrs (css p "p")))
 
-------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject yokogiri "1.5.8" 2 | :description "Barebones Nokogiri for Clojure" 3 | :license {:name "Eclipse Public License - Version 1.0" 4 | :url "http://www.eclipse.org/legal/epl-v10.html" 5 | :comments "Same as Clojure" 6 | :distribution :repo} 7 | :url "https://github.com/devn/yokogiri" 8 | :dependencies [[org.clojure/clojure "1.5.1"] 9 | [net.sourceforge.htmlunit/htmlunit "2.13"] 10 | [se.fishtank/css-selectors "1.0.2"]] 11 | :profiles {:dev {:dependencies [[midje "1.5.1" :exclusions [org.clojure/clojure]]] 12 | :plugins [[lein-midje "3.1.3"] 13 | [lein-marginalia "0.7.1"]]}} 14 | :min-lein-version "2.0.0") 15 | -------------------------------------------------------------------------------- /src/yokogiri/core.clj: -------------------------------------------------------------------------------- 1 | (ns yokogiri.core 2 | (:require [clojure.java.io :as io]) 3 | (:import [com.gargoylesoftware.htmlunit StringWebResponse WebClient BrowserVersion WebClientOptions] 4 | [com.gargoylesoftware.htmlunit.html HtmlPage DomNode DomAttr HTMLParser] 5 | [org.w3c.dom NamedNodeMap Node] 6 | [se.fishtank.css.selectors.dom DOMNodeSelector])) 7 | 8 | (set! *warn-on-reflection* true) 9 | 10 | (defn- web-client-options 11 | "Returns the client options object for a WebClient." 
12 | [^WebClient client] (.getOptions client)) 13 | 14 | (def set-client-options-map 15 | {:activex-native #(.setActiveXNative ^WebClientOptions %1 %2) 16 | :applet #(.setAppletEnabled ^WebClientOptions %1 %2) 17 | :block-popups #(.setPopupBlockerEnabled ^WebClientOptions %1 %2) 18 | :css #(.setCssEnabled ^WebClientOptions %1 %2) 19 | :geolocation #(.setGeolocationEnabled ^WebClientOptions %1 %2) 20 | :homepage #(.setHomePage ^WebClientOptions %1 %2) 21 | :insecure-ssl #(.setUseInsecureSSL ^WebClientOptions %1 %2) 22 | :print-content-on-failing-status #(.setPrintContentOnFailingStatusCode ^WebClientOptions %1 %2) 23 | :redirects #(.setRedirectEnabled ^WebClientOptions %1 %2) 24 | :throw-on-failing-status #(.setThrowExceptionOnFailingStatusCode ^WebClientOptions %1 %2) 25 | :throw-on-script-error #(.setThrowExceptionOnScriptError ^WebClientOptions %1 %2) 26 | :timeout #(.setTimeout ^WebClientOptions %1 %2) 27 | :tracking #(.setDoNotTrackEnabled ^WebClientOptions %1 %2) 28 | :javascript #(.setJavaScriptEnabled ^WebClientOptions %1 %2)}) 29 | 30 | (declare ^:dynamic *client*) 31 | 32 | (defn set-client-options! 33 | "Sets options on the client. 34 | 35 | **Usage:** 36 | 37 | (let [client (make-client)] 38 | (set-client-options! client {:redirects false})) 39 | ;=> # 40 | 41 | **Available Options:** 42 | 43 | :activex-native bool 44 | :applet bool 45 | :css bool 46 | :geolocation bool 47 | :insecure-ssl bool 48 | :print-content-on-failing-status bool 49 | :redirects bool 50 | :throw-on-failing-status bool 51 | :throw-on-script-error bool 52 | :tracking bool 53 | :javascript bool 54 | :homepage string 55 | :timeout integer" 56 | ([opts] (set-client-options! 
*client* opts)) 57 | ([^WebClient client opts] 58 | (let [^WebClientOptions client-opts (web-client-options client)] 59 | (doseq [[k v] opts] 60 | (let [setter-fn (get set-client-options-map k)] 61 | (setter-fn client-opts v))) 62 | client))) 63 | 64 | (defn get-client-options 65 | "Returns a map of all options currently set on a client. 66 | 67 | **Usage:** 68 | 69 | user> (let [client (make-client :redirects false)] 70 | (get-client-options client)) 71 | ;=> {:javascript true, :redirects false, ...}" 72 | [^WebClient client] 73 | (let [^WebClientOptions opts (web-client-options ^WebClient client)] 74 | {:activex-native (. opts isActiveXNative) 75 | :applet (. opts isAppletEnabled) 76 | :block-popups (. opts isPopupBlockerEnabled) 77 | :css (. opts isCssEnabled) 78 | :geolocation (. opts isGeolocationEnabled) 79 | :homepage (. opts getHomePage) 80 | :insecure-ssl (. opts isUseInsecureSSL) 81 | :javascript (. opts isJavaScriptEnabled) 82 | :print-content-on-failing-status-code (. opts getPrintContentOnFailingStatusCode) 83 | :redirects (. opts isRedirectEnabled) 84 | :throw-on-failing-status (. opts isThrowExceptionOnFailingStatusCode) 85 | :throw-on-script-error (. opts isThrowExceptionOnScriptError) 86 | :timeout (. opts getTimeout) 87 | :tracking (. opts isDoNotTrackEnabled)})) 88 | 89 | (defn make-client 90 | "Constructs a new WebClient. 
91 | 92 | **Usage:** 93 | 94 | user> (make-client) 95 | ;=> # 96 | 97 | **With Options:** 98 | 99 | user> (make-client :geolocation true 100 | :block-popups false) 101 | ;=> # 102 | 103 | **Available Options:** 104 | 105 | :activex-native bool 106 | :applet bool 107 | :css bool 108 | :geolocation bool 109 | :insecure-ssl bool 110 | :print-content-on-failing-status bool 111 | :redirects bool 112 | :throw-on-failing-status bool 113 | :throw-on-script-error bool 114 | :tracking bool 115 | :javascript bool 116 | :homepage string 117 | :timeout integer 118 | 119 | _See also: yokogiri.core/set-client-options!_" 120 | [& {:as opts}] 121 | (let [client (new WebClient)] 122 | (if-not (empty? opts) 123 | (set-client-options! (new WebClient) opts) 124 | client))) 125 | 126 | (defonce ^:dynamic *client* (make-client)) 127 | 128 | (defmacro with-client 129 | "Takes a client which will be bound to *client* 130 | within the scope of the form. 131 | 132 | **Usage:** 133 | 134 | user> (with-client (make-client :javascript false) 135 | (get-page \"http://www.example.com/\")) 136 | ;=> #" 137 | [c & body] 138 | `(binding [*client* ~c] 139 | ~@body)) 140 | 141 | (defn create-page 142 | "Takes a string, returns an HtmlPage. 143 | 144 | **Usage:** 145 | 146 | user> (create-page \"Link\") 147 | ;=> #" 148 | [xml] 149 | (let [url (io/as-url "file://fake-response-url") 150 | response (StringWebResponse. xml url)] 151 | (HTMLParser/parseHtml response (.getCurrentWindow (WebClient.))))) 152 | 153 | (defn get-page 154 | "Takes a client and a url, returns an HtmlPage. 155 | 156 | **Usage:** 157 | 158 | user> (get-page (make-client) \"http://www.example.com/\") 159 | ;=> #" 160 | ([url] (get-page *client* url)) 161 | ([^WebClient client, ^String url] 162 | (.getPage ^WebClient client url))) 163 | 164 | (defn as-page 165 | "Takes a path as a string and creates a Page you can access with 166 | #'yokogiri.core/xpath, #'yokogiri.core/css, etc. 
167 | 168 | **Usage:** 169 | 170 | user> (as-page \"http://www.example.com/\") 171 | ;=> #" 172 | ([path] (as-page *client* path)) 173 | ([client path] (->> path io/file io/as-url str (get-page client)))) 174 | 175 | (defn xpath 176 | "Takes an HtmlPage and an xpath string. Returns a vector of nodes 177 | which match the provided xpath string. 178 | 179 | **Usage:** 180 | 181 | user> (let [page (get-page your-client \"http://www.example.com\")] 182 | (xpath page \"//a\")) 183 | ;=> [#]>]" 184 | [^HtmlPage page, ^String xpath] 185 | (into [] (.getByXPath page xpath))) 186 | 187 | (defn first-by-xpath 188 | "Takes an HtmlPage and an xpath string. Returns the first matching 189 | node which matches the provided xpath string. 190 | 191 | **Usage:** 192 | 193 | user> (first-by-xpath 194 | (get-page your-client \"http://www.example.com/\") 195 | \"//a\") 196 | ;=> #]>" 197 | [^HtmlPage page, ^String xpath] 198 | (.getFirstByXPath page xpath)) 199 | 200 | ;; _http://www.goodercode.com/wp/use-css-selectors-with-htmlunit/_ 201 | ;; _TODO: Bumping the version of css-selectors to 1.0.4 breaks_ 202 | ;; _querying by CSS._ 203 | (defn css 204 | "Returns matches for a given CSS selector 205 | 206 | **Usage:** 207 | 208 | user> (css your-client \"a.gbzt\") 209 | ;=> (#]>, ...)" 210 | [^HtmlPage page, ^String selector] 211 | (let [queryable-page (DOMNodeSelector. (. page getDocumentElement))] 212 | (seq (. queryable-page querySelectorAll selector)))) 213 | 214 | (defn node-xml 215 | "Returns a node's XML representation. 
216 | 217 | **Usage:** 218 | 219 | user> (node-xml 220 | (first-by-xpath 221 | (get-page (make-client) \"http://www.example.com/\") 222 | \"//a\")) 223 | ;=> \n More information...\n\n" 224 | [^DomNode node] 225 | (.asXml node)) 226 | 227 | (defn node-text 228 | "Returns a node's text value 229 | 230 | **Usage:** 231 | 232 | user> (node-text #]>) 233 | ;=> \"Search\"" 234 | [^DomNode node] 235 | (.asText node)) 236 | 237 | (defn attr-map 238 | "Returns a clojure map of attributes for a given node 239 | 240 | **Usage:** 241 | 242 | user> (attr-map #]>) 243 | ;=> {:text \"Search\", :href \"http://example.com\", :id \"bar\", :class \"foo\"} 244 | 245 | _See also: yokogiri.core/attrs_" 246 | [^DomNode node] 247 | (let [^NamedNodeMap attrs (.getAttributes node)] 248 | (loop [acc 0, res {}] 249 | (if (= acc (.getLength attrs)) 250 | (assoc res :text (node-text node)) 251 | (recur (inc acc) 252 | (let [^DomAttr attr (.item attrs acc)] 253 | (assoc res (keyword (.getName attr)) (.getValue attr)))))))) 254 | 255 | (def ^{:doc "_See also: yokogiri.core/attr-map_"} attrs #'yokogiri.core/attr-map) 256 | 257 | ;; _TODO: http://htmlunit.sourceforge.net/apidocs/com/gargoylesoftware/htmlunit/html/DomAttr.html_ 258 | (defn- dom-attr 259 | "Returns the HtmlUnit DomAttr objects for a given node 260 | 261 | _See also: yokogiri.core/attr-map_" 262 | [^DomNode node] 263 | (let [^NamedNodeMap attrs (.getAttributes node) 264 | len (.getLength attrs)] 265 | (map #(.item attrs %) (range 0 len)))) 266 | 267 | (comment 268 | (def c (make-client)) 269 | (def p (get-page c "http://www.example.com/")) 270 | (xpath p "//a") 271 | (map attrs (css p "p")) 272 | ) 273 | -------------------------------------------------------------------------------- /test/yokogiri/test/core.clj: -------------------------------------------------------------------------------- 1 | (ns yokogiri.test.core 2 | (:use [yokogiri.core]) 3 | (:use [midje.sweet])) 4 | 5 | (fact "These tests need to be improved, but they're 
better than nothing." 6 | true => true) 7 | 8 | (facts "yokogiri.core/make-client" 9 | (fact "It can create a client" 10 | (let [simple-client (make-client)] 11 | simple-client 12 | => truthy 13 | (class simple-client) 14 | => com.gargoylesoftware.htmlunit.WebClient)) 15 | 16 | (fact "It can create a client with options" 17 | (let [insecure-client (make-client :insecure-ssl true)] 18 | (:insecure-ssl (get-client-options insecure-client)) 19 | => true)) 20 | (fact "It can create a client with many non-default options" 21 | (let [c (make-client :insecure-ssl false 22 | :javascript false 23 | :css false) 24 | opts (get-client-options c)] 25 | (->> (-> opts 26 | (select-keys [:insecure-ssl :javascript :css]) 27 | (vals)) 28 | (every? false?))) 29 | => true)) 30 | 31 | (facts "About CSS and XPath queries" 32 | (let [c (make-client) 33 | p (get-page c "http://www.example.com/")] 34 | (fact "CSS works" 35 | (css p "a") 36 | => truthy) 37 | (fact "XPath works") 38 | (xpath p "//a") 39 | => truthy)) 40 | 41 | (facts "About creating page from string" 42 | (let [p (create-page "Link")] 43 | (fact "CSS works" 44 | (css p "a") 45 | => truthy) 46 | (fact "XPath works" 47 | (xpath p "//a") 48 | => truthy))) 49 | 50 | (facts "About *client*" 51 | (fact "You can dynamically rebind *client* to your own client." 52 | (with-client (make-client :javascript false) 53 | (:javascript (get-client-options *client*))) => false) 54 | (fact "get-page works when using the default *client*." 55 | (get-page "http://www.example.com/") => truthy) 56 | (fact "as-page works when using the default *client*." 57 | (as-page "docs/uberdoc.html") => truthy) 58 | (fact "set-client-options! works when using the default *client*." 59 | (set-client-options! {:javascript true}) => truthy)) 60 | 61 | 62 | --------------------------------------------------------------------------------