├── .circleci
│   └── config.yml
├── .gitignore
├── NEWS.md
├── README.md
├── bin
│   └── kaocha
├── deps.edn
├── doc
│   ├── 01-scraping-modes.md
│   ├── 02-caching.md
│   ├── 03-updates.md
│   ├── 04-parsing.md
│   ├── 05-http-interactions.md
│   ├── 06-error-handling.md
│   ├── 07-db.md
│   └── 08-development-mode.md
├── examples
│   ├── clojuredays.clj
│   ├── hackernews.clj
│   └── tree_species.clj
├── release.edn
├── src
│   └── skyscraper
│       ├── cache.clj
│       ├── context.clj
│       ├── core.clj
│       ├── data.clj
│       ├── db.clj
│       ├── dev.clj
│       ├── enlive_helpers.clj
│       └── traverse.clj
├── test
│   └── skyscraper
│       ├── basic_cookie_test.clj
│       ├── cache_test.clj
│       ├── character_encoding_test.clj
│       ├── db_self_pointing_test.clj
│       ├── db_test.clj
│       ├── error_handling_test.clj
│       ├── http_method_reset_test.clj
│       ├── mock_test.clj
│       ├── nondistinct_test.clj
│       ├── overridable_parse_fn_test.clj
│       ├── test_utils.clj
│       ├── traverse_test.clj
│       └── updates_test.clj
└── tests.edn
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2.1
2 | orbs:
3 |   kaocha: lambdaisland/kaocha@0.0.3
4 | jobs:
5 |   build:
6 |     working_directory: ~/skyscraper
7 |     docker:
8 |       - image: cimg/clojure:1.11.1
9 |         environment:
10 |           LEIN_ROOT: nbd
11 |           JVM_OPTS: -Xmx3200m
12 |     steps:
13 |       - checkout
14 |       - restore_cache:
15 |           key: skyscraper-{{ checksum "deps.edn" }}
16 |       - kaocha/execute:
17 |           clojure_version: 1.11.1
18 |       - save_cache:
19 |           paths:
20 |             - ~/.m2
21 |           key: skyscraper-{{ checksum "deps.edn" }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .\#*
2 | *.class
3 | *.jar
4 | *~
5 | /.clj-kondo
6 | /.cpcache
7 | /.lein-*
8 | /.lsp
9 | /.nrepl-port
10 | /.portal
11 | /checkouts
12 | /classes
13 | /target
14 | \#*
15 | pom.xml
16 | pom.xml.asc
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # History of Skyscraper releases
2 |
3 | ## 0.3.6 (2023-07-23)
4 |
5 | - Feature: New option `:use-http-headers-from-content` that can be set
6 | to `false` to disable charset detection based on the HTML response body.
7 | - Fix: Uncaught exceptions thrown by enhancers (like the DB one) should now be
8 | propagated to the toplevel and handled gracefully.
9 |
10 | ## 0.3.5 (2023-05-09)
11 |
12 | - Feature: New function `cached-document` for accessing a previous (cached)
13 | version of a page downloaded while in update mode.
14 | - Documentation: New example illustrating the use of a Redis cache backend.
15 | (Thanks to Patrick van de Glind, Carlo Sciolla, Alvin Francis Dumalus, and
16 | Oskar Gewalli for contributing!)
17 |
18 | ## 0.3.4 (2022-09-05)
19 |
20 | This release corrects the issue in 0.3.3 that caused its pom.xml to
21 | not include dependencies, but is otherwise the same.
22 |
23 | ## 0.3.3 (2022-09-04)
24 |
25 | - Feature: To facilitate debugging, processors can now set the
26 | `:skyscraper/description` key on contexts. These descriptions will be
27 | logged when downloading, instead of the URL, and won’t be propagated
28 | to child contexts.
29 | - Fix: Skyscraper now properly closes the cache when using `scrape!`
30 | and one of the processors throws an exception.
31 | - Fix: Skyscraper no longer complains when the server returns a quoted
32 | charset in the `Content-type` header.
33 | - Fix: `:skyscraper.traverse/priority` is no longer propagated to
34 | child contexts.
35 | - Infra: Skyscraper’s dependencies are now managed with cli-tools,
36 | with Kaocha being used for testing.
37 |
38 | ## 0.3.2 (2022-08-04)
39 |
40 | - Fix: Skyscraper no longer throws exceptions when using processed-cache
41 | and some of the processors don’t have `:cache-template`.
42 | - Fix: Skyscraper no longer throws exceptions when the server returns
43 | multiple Content-Type headers.
44 | - Fix: Processed cache no longer garbles non-ASCII strings on macOS.
45 |
46 | ## 0.3.1 (2022-07-31)
47 |
48 | - Backwards-incompatible API changes:
49 |   - `parse-fn` is now expected to take three arguments, the third being
50 |     the context. The aim of this change is to support cases where the
51 |     HTML is known to be malformed and needs context-aware preprocessing
52 |     before parsing. Built-in parse-fns have been updated to take the
53 |     additional argument.
54 |   - Cache backends are now expected to implement `java.io.Closeable`
55 |     in addition to `CacheBackend`. Built-in backends have been
56 |     updated to include no-op `close` methods.
57 | - Optimization: Skyscraper no longer generates indexes for columns
58 |   marked with `:skyscraper.db/key-columns` when creating the DB from
59 |   scratch. There is also a new option, `:ignore-db-keys`, to force
60 |   this at all times.
61 | - Skyscraper now retries downloads upon encountering a timeout.
62 | - Bug fixes:
63 |   - Fixed dev/scrape misbehaving when redefining processors while scraping is suspended.
64 |   - Fixed scrape mishandling errors with `:download-mode` set to `:sync`.
65 |   - Fixed an off-by-one bug in handling `:retries`.
66 |   - Retry counts are now correctly reset on successful download.
67 |
68 | ## 0.3.0 (2020-02-17)
69 |
70 | - Skyscraper has been rewritten from scratch to be asynchronous and multithreaded,
71 |   based on [core.async]. See [Scraping modes] for details.
72 | - Skyscraper now supports saving the scrape results to [a SQLite database][db].
73 | - In addition to the classic `scrape` function that returns a lazy sequence of nodes, there is an
74 |   alternative, non-lazy, imperative interface (`scrape!`) that treats producing new results as
75 |   side-effects.
76 | - [reaver] (using JSoup) is now available as an optional underlying HTML parsing engine, as an alternative to Enlive.
77 | - `:parse-fn` and `:http-options` can now be provided either per-page or globally. (Thanks to Alexander Solovyov for the suggestion.)
78 | - All options are now optional, including a sane default for `process-fn`.
79 | - Backwards-incompatible API changes:
80 |   - The `skyscraper` namespace has been renamed to `skyscraper.core`.
81 |   - Processors are now named by keywords.
82 |   - `defprocessor` now takes a keyword name, and registers a function in the
83 |     global registry instead of defining it. This means that it’s no longer possible
84 |     to call one processor from another: if you need that, define `process-fn` as a
85 |     named function.
86 |   - The context values corresponding to `:processor` keys are now expected to
87 |     be keywords.
88 |   - `scrape` no longer guarantees the order in which the site will be scraped.
89 |     In particular, two different invocations of `scrape` are not guaranteed to return
90 |     the scraped data in the same order. If you need that guarantee, set
91 |     `parallelism` and `max-connections` to 1.
92 |   - The cache interface has been overhauled. Caching now works by storing binary blobs
93 |     (rather than strings), along with metadata (e.g., HTTP headers). Caches created
94 |     by Skyscraper 0.1 or 0.2 cannot be reused for 0.3.
95 |   - [Error handling] has been reworked.
96 |   - `get-cache-keys` has been removed. If you want the same effect, include `:cache-key` in the desired contexts.
97 |
98 | [core.async]: https://github.com/clojure/core.async
99 | [Scraping modes]: doc/01-scraping-modes.md
100 | [db]: doc/07-db.md
101 | [Error handling]: doc/06-error-handling.md
102 | [reaver]: https://github.com/mischov/reaver
103 |
104 | ## 0.2.3 (2016-11-17)
105 |
106 | - New feature: Custom parse functions.
107 | - New feature: Customizable error handling strategies.
108 | - Bugfix: `:only` now doesn’t barf on keys not appearing in seed.
109 |
110 | ## 0.2.2 (2016-05-06)
111 |
112 | - Skyscraper now uses Timbre for logging.
113 | - New cache backend: `MemoryCache`.
114 | - `download` now supports arbitrarily many retries.
115 | - A situation where a context has a processor but no URL now triggers a warning instead of throwing an exception.
116 |
117 | ## 0.2.1 (2015-12-17)
118 |
119 | - New function: `get-cache-keys`.
120 | - `scrape` and friends can now accept a keyword as the first argument.
121 | - Cache keys are now accessible from within processors (under the
122 | `:cache-key` key in the context).
123 | - New `scrape` options: `:only` and `:postprocess`.
124 | - `scrape-csv` now accepts an `:all-keys` argument and has been rewritten using a helper function, `save-dataset-to-csv`.
125 |
126 | ## 0.2.0 (2015-10-03)
127 |
128 | - Skyscraper now supports pluggable cache backends.
129 | - The caching mechanism has been completely overhauled and Skyscraper no longer
130 | creates temporary files when the HTML cache is disabled.
131 | - Support for capturing scraping results to CSV via `scrape-csv`.
132 | - Support for updating existing scrapes: new processor flag `:updatable`,
133 | `scrape` now has an `:update` option.
134 | - New `scrape` option: `:retries`.
135 | - Fixed a bug whereby scraping huge datasets would result in an `OutOfMemoryError`.
136 | (`scrape` no longer holds onto the head of the lazy seq it produces).
137 |
138 | ## 0.1.2 (2015-09-17)
139 |
140 | - A processor can now return one context only. (Thanks to Bryan Maass.)
141 | - The `processed-cache` option to `scrape` now works as advertised.
142 | - New `scrape` option: `:html-cache`. (Thanks to ayato-p.)
143 | - Namespaced keywords are now resolved correctly to processors.
144 | (Thanks to ayato-p.)
145 | - New official `defprocessor` clauses: `:url-fn` and `:cache-key-fn`.
146 |   - Note: these clauses existed in previous versions but were undocumented.
147 | - All contexts except the root ones are now guaranteed to contain the `:url` key.
148 |
149 | ## 0.1.1 (2015-08-24)
150 |
151 | - Processors (`process-fn` functions) can now access current context.
152 | - Skyscraper now uses [clj-http] to issue HTTP GET requests.
153 | - Skyscraper can now auto-detect page encoding thanks to clj-http’s `decode-body-headers` feature.
154 | - `scrape` now supports a `http-options` argument to override HTTP options (e.g., timeouts).
155 | - Skyscraper’s output is now fully lazy (i.e., guaranteed to be non-chunking).
156 | - Fixed a bug where relative URLs were incorrectly resolved in certain circumstances.
157 |
158 | [clj-http]: https://github.com/dakrone/clj-http
159 |
160 | ## 0.1.0 (2015-08-11)
161 |
162 | - First public release.
163 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Skyscraper
2 |
3 | A framework that helps you build structured dumps of whole websites.
4 |
5 | [Clojars Project](https://clojars.org/skyscraper)
6 | [CircleCI](https://circleci.com/gh/nathell/skyscraper)
7 | [cljdoc](https://cljdoc.org/d/skyscraper/skyscraper)
8 |
9 | ## Concepts
10 |
11 | ### Structural scraping and scrape trees
12 |
13 | Think of [Enlive]. It allows you to parse arbitrary HTML and extract various bits of information out of it: subtrees or parts of subtrees determined by selectors. You can then convert this information to some other format, easier for machine consumption, or process it in whatever other way you wish. This is called _scraping_.
14 |
15 | Now imagine that you have to parse a lot of HTML documents. They all come from the same site, so most of them are structured in the same way and can be scraped using the same sets of selectors. But not all of them. There’s an index page, which has a different layout and needs to be treated in its own peculiar way, with pagination and all. There are pages that group together individual pages in categories. And so on. Treating single pages is easy, but with whole collections of pages, you quickly find yourself writing a lot of boilerplate code.
16 |
17 | In particular, you realize that you can’t just `wget -r` the whole thing and then parse each page in turn. Rather, you want to simulate the workflow of a user who tries to “click through” the website to obtain the information she’s interested in. Sites have tree-like structure, and you want to keep track of this structure as you traverse the site, and reflect it in your output. I call it “structural scraping”, and the tree of traversed pages and information extracted from each one – the “scrape tree”.
18 |
19 | [Enlive]: https://github.com/cgrand/enlive
20 |
21 | ### Contexts
22 |
23 | A “context” is a map from keywords to arbitrary data. Think of it as “everything we have scraped so far”. A context has two special keys, `:url` and `:processor`, which contain the next URL to visit and the processor to handle it with (see below).
24 |
25 | Scraping works by transforming a context into a list of contexts. You can think of it as a list monad. The initial list of contexts is supplied by the user, and typically contains a single map with a URL and a root processor.
26 |
27 | A typical function producing an initial list of contexts (a _seed_) looks like this:
28 |
29 | ```clojure
30 | (defn seed [& _]
31 |   [{:url "http://www.example.com",
32 |     :processor :root-page}])
33 | ```
34 |
35 | ### Processors
36 |
37 | A “processor” is a unit of scraping: a function that processes sets of HTML pages in a uniform way.
38 |
39 | Processors are defined with the `defprocessor` macro (which registers the processing function in a global registry). A typical processor, for a site’s landing page that contains links to other pages within table cells, might look like this:
40 |
41 | ```clojure
42 | (defprocessor :landing-page
43 |   :cache-template "mysite/index"
44 |   :process-fn (fn [res context]
45 |                 (for [a (select res [:td :a])]
46 |                   {:page (text a),
47 |                    :url (href a),
48 |                    :processor :subpage})))
49 | ```
50 |
51 | The most important clause is `:process-fn`. This is the function called by the processor to extract new information from a page and include it in the context. It takes two parameters:
52 |
53 | 1. an Enlive resource corresponding to the parsed HTML tree of the page being processed,
54 | 2. the current context (i.e., combined outputs of all processors so far).
55 |
56 | The output should be a seq of maps that each have a new URL and a new processor (specified as a keyword) to invoke next.
57 |
58 | ## Where to go from here
59 |
60 | Explore the [documentation]. Have a look at examples in the `examples/` directory of the repo. Read the docstrings, especially those of `scrape` and `defprocessor`.
61 |
62 | If something is unclear, or you have suggestions or encounter a bug, please create an issue!
63 |
64 | [documentation]: https://cljdoc.org/d/skyscraper/skyscraper
65 |
66 | ## Caveats
67 |
68 | Skyscraper is work in progress. Some things are missing. The API is still in flux. Function and macro names, input and output formats are liable to change at any time. Suggestions of improvements are welcome (preferably as GitHub issues), as are pull requests.
69 |
70 | ## License
71 |
72 | Copyright (C) 2015–2022 Daniel Janus, http://danieljanus.pl
73 |
74 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
75 |
76 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
77 |
78 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
79 |
--------------------------------------------------------------------------------
/bin/kaocha:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | clojure -M:test:run-test "$@"
3 |
--------------------------------------------------------------------------------
/deps.edn:
--------------------------------------------------------------------------------
1 | {:paths ["src"]
2 |  :deps {clj-http/clj-http {:mvn/version "3.12.3"}
3 |         com.taoensso/timbre {:mvn/version "5.2.1"}
4 |         crouton/crouton {:mvn/version "0.1.2"}
5 |         enlive/enlive {:mvn/version "1.1.6"}
6 |         org.clojure/core.async {:mvn/version "1.5.648"}
7 |         org.clojure/core.incubator {:mvn/version "0.1.4"}
8 |         org.clojure/data.csv {:mvn/version "1.0.1"}
9 |         org.clojure/data.priority-map {:mvn/version "1.1.0"}
10 |         org.clojure/java.jdbc {:mvn/version "0.7.12"}
11 |         org.jsoup/jsoup {:mvn/version "1.15.3"}
12 |         org.xerial/sqlite-jdbc {:mvn/version "3.36.0.3"}
13 |         reaver/reaver {:mvn/version "0.1.3"}}
14 |  :aliases {:test {:extra-paths ["test"]
15 |                   :extra-deps {com.github.nathell/cartestian {:git/sha "c6cb24aa8ae9e08a6f6cfccee0a606bfba965fa0"}
16 |                                hiccup/hiccup {:mvn/version "1.0.5"}
17 |                                lambdaisland/kaocha {:mvn/version "1.69.1069"}
18 |                                ring/ring {:mvn/version "1.9.5"}
19 |                                tortue/spy {:mvn/version "2.13.0"}}}
20 |            :run-test {:main-opts ["-m" "kaocha.runner"]}
21 |            :release {:extra-deps {applied-science/deps-library {:mvn/version "0.4.0"}}
22 |                      :main-opts ["-m" "applied-science.deps-library"]}}}
23 |
--------------------------------------------------------------------------------
/doc/01-scraping-modes.md:
--------------------------------------------------------------------------------
1 | # Scraping modes and asynchrony
2 |
3 | ## Entry points
4 |
5 | Skyscraper is implemented using [core.async][1], and by default scraping happens asynchronously on multiple threads. There are several entry-point functions to invoke scraping; they all take the same arguments (see the docstring of the first one for details), but differ in how they return results.
6 |
7 | ### `skyscraper.core/scrape`: lazy sequence of contexts
8 |
9 | This is the only mode that was available in Skyscraper versions prior to 0.3. The `scrape` function returns a lazy sequence of contexts that are leaves in the scraping tree. After fully consuming this sequence, the underlying core.async channel will be closed and the worker threads terminated.
10 |
11 | ### `skyscraper.core/scrape!`: imperative, eager
12 |
13 | While `scrape` tries to provide a functional, sequence-based interface, `scrape!` takes the stance that scraping is a side-effectful process. It is eager (returns after scraping has completed) and returns `nil`. There are two ways to actually access the scraping results:
14 |
15 | 1. Provide the `:db` or `:db-file` options. This will cause `scrape!` to output a SQLite database. See [details][2].
16 | 2. Provide the `:leaf-chan` or `:item-chan` options: core.async channels to which vectors of leaf contexts or interim contexts, respectively, will be delivered.
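   |
   | For instance, here is a minimal sketch of consuming leaf contexts via `:leaf-chan` (assuming `seed` and the relevant processors are defined elsewhere):
   |
   | ```clojure
   | (require '[clojure.core.async :as async]
   |          '[skyscraper.core :as core])
   |
   | (let [leaf-chan (async/chan)]
   |   ;; consume vectors of leaf contexts as they are delivered
   |   (async/go-loop []
   |     (when-let [batch (async/<! leaf-chan)]
   |       (doseq [ctx batch]
   |         (println "leaf:" ctx))
   |       (recur)))
   |   ;; scrape! is eager: it returns nil once scraping has completed
   |   (core/scrape! seed :leaf-chan leaf-chan))
   | ```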
17 |
18 | ### `skyscraper.dev/scrape`: for interactive use in the REPL
19 |
20 | This one is described in [Development Mode][3].
21 |
22 | ### `skyscraper.traverse/launch`: advanced
23 |
24 | Scraping in Skyscraper is actually implemented atop a more primitive abstraction: parallelized context tree traversal (see the docstring on `skyscraper.traverse`). All other scraping functions are implemented on top of `launch`. Use it in the rare cases when you want strict control over when to terminate scraping.
25 |
26 | ## Asynchrony
27 |
28 | Scraping spins up a number of _worker threads_ that actually perform the scraping (parsing HTML, running processors, and sometimes downloading data – see below). The number of worker threads is configurable: by default it’s 4, but you can override it with the `:parallelism` setting.
29 |
30 | ## Download modes
31 |
32 | By default, Skyscraper downloads pages using clj-http’s async request facility. This means that the downloads actually happen on threads other than the ones managed by Skyscraper. You can change this by specifying `:download-mode :sync`.
33 |
34 | When `:download-mode` is `:async` (the default), there can be more simultaneous downloads than there are worker threads. Use `:max-connections` to limit that number.
35 |
36 | Use `:parallelism 1 :download-mode :sync` to simulate sequential scraping flow à la Skyscraper 0.2. This is useful when you want to precisely control the order in which your pages will be scraped.
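   |
   | For example (assuming `seed` is defined):
   |
   | ```clojure
   | ;; strictly sequential scraping, one page at a time
   | (skyscraper.core/scrape seed
   |                         :parallelism 1
   |                         :download-mode :sync)
   | ```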
37 |
38 | [1]: https://github.com/clojure/core.async
39 | [2]: 07-db.md
40 | [3]: 08-development-mode.md
41 |
--------------------------------------------------------------------------------
/doc/02-caching.md:
--------------------------------------------------------------------------------
1 | # Caching
2 |
3 | Skyscraper has two kinds of caches: one holding the raw downloaded HTML before parsing and processing (“HTML cache”), and one holding the results of parsing and processing individual pages (“processed cache”). Both caches are enabled by default, but can be disabled as needed.
4 |
5 | In normal usage (i.e., scraping a site using code that is known to work), it is recommended to keep the processed cache enabled. The HTML cache can be disabled in this case without many drawbacks, as Skyscraper will not attempt to redownload a page that has already been processed. The advantage of disabling the HTML cache is saving disk space: web pages are typically markup-heavy and the interesting pieces constitute a tiny part of them, so the HTML cache can grow much faster than the processed cache. This can be problematic when scraping huge sites.
6 |
7 | The converse is true when developing Skyscraper-based code and writing your own processors. The HTML cache comes in handy as you try out different ways of obtaining the desired information from the page at hand, since each page only has to be downloaded once. On the other hand, disabling the processed cache guarantees that the information will be recomputed when the scraping code changes.
8 |
9 | Skyscraper supports pluggable cache backends. The core library contains a protocol that the backends should conform to (`CacheBackend`), as well as several implementations of that protocol: one that doesn’t actually cache data, just pretending to be doing so (a “null cache”), one that stores data in the filesystem, and one that keeps it in memory. See the file `skyscraper/cache.clj` for details.
10 |
11 | By default, both the HTML cache and the processed cache use the FS backend and are configured to live in `~/skyscraper-data`, respectively under `cache/html` and `cache/processed`.
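   |
   | As a sketch of overriding these defaults (assuming `seed` is defined; a string value is shorthand for a filesystem cache rooted at that path, and `false` presumably disables a cache altogether):
   |
   | ```clojure
   | (require '[skyscraper.cache :as cache]
   |          '[skyscraper.core :as core])
   |
   | (core/scrape seed
   |              :html-cache (cache/fs "/tmp/my-html-cache") ; custom location
   |              :processed-cache false)                     ; always recompute
   | ```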
12 |
13 | Since version 0.3.1, Skyscraper expects all `CacheBackend` implementations to also implement `java.io.Closeable`. This is so that caches can cleanly dispose of, say, database connections. Skyscraper calls `.close` on both caches as part of cleanup when the scraping process ends.
14 |
15 | ## Cache keys
16 |
17 | Every page cached by Skyscraper is stored under a cache key (a string). It is up to the user to construct a key in a unique way for each page, based on information available in the context. Typically, the key is hierarchical, containing a logical “path” to the page separated by slashes (it may or may not correspond to the page’s URL).
18 |
19 | To facilitate construction of such keys, Skyscraper provides a micro-templating framework. The key templates can be specified in a `cache-template` parameter of the `defprocessor` macro. When a template contains parts prefixed by a colon and containing lower-case characters and dashes, these are replaced by corresponding context elements. As an example, the template `"mysite/:surname/:name"`, given the context `{:name "John", :surname "Doe"}`, will generate the key `"mysite/Doe/John"`.
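   |
   | Put together, a (hypothetical) processor using this template could look like:
   |
   | ```clojure
   | (defprocessor :user-page
   |   :cache-template "mysite/:surname/:name"
   |   ;; :name and :surname must already be present in the context,
   |   ;; filled in by an upstream processor, to determine the cache key
   |   :process-fn (fn [res context]
   |                 []))
   | ```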
20 |
--------------------------------------------------------------------------------
/doc/03-updates.md:
--------------------------------------------------------------------------------
1 | # Updating scraped sites
2 |
3 | Often, once you successfully scrape a site in full, you then want to periodically update the scraped data: redownload and rescrape only what is necessary. Skyscraper offers several options to assist you.
4 |
5 | ## The brute-force way: wiping cache
6 |
7 | If you’re using either of the HTML or processed caches (see [Caching][1]), then Skyscraper will reuse the already downloaded data for further processing. This means that rerunning a successful scrape with enabled caching will not trigger any HTTP request, even if the original site has changed.
8 |
9 | The most obvious (but also slowest) way to proceed is by clearing the cache (e.g., `rm -r ~/skyscraper-data/cache`), forcing Skyscraper to redownload everything.
10 |
11 | ## The on-demand way: `:update` and `:updatable`
12 |
13 | You can mark some processors as `:updatable`. These will typically correspond to non-leaf nodes of your scraping tree.
14 |
15 | ```clojure
16 | (defprocessor :landing-page
17 |   :cache-template "mysite/index"
18 |   :updatable true
19 |   :process-fn …)
20 | ```
21 |
22 | The value for `:updatable` can be either `true` (meaning “always update”), `false` (meaning “never update” – the default), or a function that should take a context and decide whether to update.
23 |
24 | Just setting `:updatable` has no effect on its own. However, when you invoke [one of the scraping entry-points][2] with `:update` set to `true`, Skyscraper will force re-downloading and re-processing of an updatable page.
25 |
26 | Sometimes, you will want to consult two versions of the document being processed: the freshly-downloaded one and the one that was already present in the cache. The former is passed to the `:process-fn` as normal; to get the latter, you can call `cached-document` on the context. Note that your processor must be prepared for `cached-document` returning nil, indicating a first-time scrape. See `skyscraper.updates-test` for a contrived example.
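   |
   | A minimal sketch of such a processor (contrived logic; `cached-document` is assumed here to be referred in from `skyscraper.core`, and `seed` to be defined):
   |
   | ```clojure
   | (defprocessor :landing-page
   |   :cache-template "mysite/index"
   |   :updatable true
   |   :process-fn (fn [res context]
   |                 (if-let [previous (cached-document context)]
   |                   ;; we have both the fresh document (res) and the cached one
   |                   [{:first-time false}]
   |                   ;; cached-document returned nil: a first-time scrape
   |                   [{:first-time true}])))
   |
   | ;; re-run with updating enabled:
   | (scrape! seed :update true)
   | ```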
27 |
28 | ## The optimization: `:uncached-only`
29 |
30 | Regardless of whether `:update` is enabled or not, Skyscraper normally processes the whole site (some of it potentially coming from the cache). Sometimes, you want to prune the
31 | scraping tree to uncached or updatable pages only, so that scraping only yields contexts corresponding to pages that are actually new.
32 |
33 | The `:uncached-only` option to `scrape` does exactly that.
34 |
35 | Be aware that in this mode scraping can do too little: pruning a page from the scraping tree also means pruning the entire subtree rooted at that page. Use it judiciously.
36 |
37 | [1]: 02-caching.md
38 | [2]: 01-scraping-modes.md
39 |
--------------------------------------------------------------------------------
/doc/04-parsing.md:
--------------------------------------------------------------------------------
1 | # Parsing
2 |
3 | By default, Skyscraper parses the downloaded documents into Enlive resources. However, this can be overridden by supplying a custom _parse function_.
4 |
5 | Parse functions take a byte array (a binary blob), a map of HTTP headers (clj-http’s [header map][1]), and the current context, and should return a parsed representation of the document. Out of the box, Skyscraper provides three such functions:
6 |
7 | - `parse-enlive` – parses the blob as an HTML document with Enlive and returns an Enlive resource (a seq of maps);
8 | - `parse-reaver` – parses the blob as an HTML document with Reaver and returns an instance of `org.jsoup.nodes.Document`;
9 | - `parse-string` – parses the blob as a string.
10 |
11 | As you can see, the parsed representation can be anything, as long as your processors can work with it. The output of the parse function will be fed to the processor’s `:process-fn` as the first argument.
12 |
13 | You can specify a parse function in two ways:
14 |
15 | - for the whole scraping process – as a `:parse-fn` option to `scrape` or `scrape!`;
16 | - on a per-processor basis, in the `:parse-fn` clause in `defprocessor`.
17 |
18 | If both are specified, the per-processor definition prevails. See `examples/tree_species.clj` for an example of a scraper that uses Reaver, and
19 | `test/skyscraper/overridable_parse_fn_test.clj` for an example of a per-processor parse function.
20 |
21 | You can implement custom parse functions yourself, e.g., for parsing PDF documents, CSV sheets, etc. If you are parsing a text-based format, call `parse-string` first and
22 | process the resulting string – this ensures that the blob is interpreted with the correct encoding as specified in the headers. Both `parse-enlive` and `parse-reaver` do this.
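   |
   | As a sketch, here is a custom parse function for CSV content that builds on `parse-string` for charset handling (`clojure.data.csv` is already among Skyscraper’s dependencies):
   |
   | ```clojure
   | (require '[clojure.data.csv :as csv]
   |          '[skyscraper.core :as core])
   |
   | (defn parse-csv
   |   "Parses the downloaded blob as CSV, yielding a seq of row vectors."
   |   [body headers context]
   |   (csv/read-csv (core/parse-string body headers context)))
   | ```
   |
   | It can then be passed as `:parse-fn`, either globally or in a `defprocessor`.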
23 |
24 | ## Why do parse functions take context as an argument?
25 |
26 | You might be wondering why. After all, parsing should be generic and not depend on the particular place you’re in, right?
27 |
28 | That’ll be the case 99% of the time — and indeed the built-in parse functions just ignore the third argument — but sometimes you’ll need the context. For example, in one of my scrapers, I need to parse content that is essentially HTML, but retrieved from an archive that sometimes returns garbled data: random sequences of bytes that need to be cut off before proceeding with normal HTML parsing. Sometimes this falls in the middle of an HTML tag. I solve this by keeping a standalone “cut-off list” that a custom parse function
29 | uses, based on the context, to remove the garbage.
30 |
31 | [1]: https://github.com/dakrone/clj-http#headers
32 |
--------------------------------------------------------------------------------
/doc/05-http-interactions.md:
--------------------------------------------------------------------------------
1 | # HTTP interactions
2 |
3 | By default, Skyscraper makes HTTP requests with the GET method. However, it is possible to override this and use any method you want: just return a context with an `:http/method` key from your processor. Set it to `:post`, `:put`, `:patch`, or what have you.
4 |
5 | Skyscraper actually doesn’t have any explicit support for the `:http/method` field. Instead, it just extracts any fields with the `http` namespace from contexts, and passes them as arguments to clj-http’s [`request`][1] function (unnamespacing them). This means that, for example, you can log in to a password-protected site by combining `:http/method :post` and sending your credentials in `:http/form-params`.
6 |
7 | For example:
8 |
9 | ```clojure
10 | (defprocessor :login
11 |   :process-fn (fn [document context]
12 |                 [{:url "/login",
13 |                   :http/method :post,
14 |                   :http/form-params {:username "johndoe", :password "t0p$3cr3+"}}]))
15 | ```
16 |
17 | Note that, unlike most keys in contexts, the `http`-namespaced keys are one-off: they don’t get propagated to subsequent processors further down in the scraping tree. So if you tell Skyscraper to send a POST request to grab some page, and the processor for that page returns contexts pointing to other pages, those will again be requested with GET unless you
18 | set the method explicitly.
19 |
20 | The one exception to this rule is `:http/cookies`. Skyscraper will automatically merge cookies from HTTP responses into the map stored under that key, will preserve it across contexts, and clj-http will send them back to the server on every request. Thus, for the most part, you don’t need to worry about cookies: they Just Work™.
21 |
22 | [1]: https://cljdoc.org/d/clj-http/clj-http/3.10.0/api/clj-http.client
23 |
--------------------------------------------------------------------------------
/doc/06-error-handling.md:
--------------------------------------------------------------------------------
1 | # Error handling
2 |
3 | When scraping, many things can go wrong (and, per [Murphy’s law][1], they will). The possible failures fall into two broad categories:
4 |
5 | ## Problems with processing (a processor throws an exception)
6 |
7 | In this case, Skyscraper will gracefully propagate the exception to the scraping entry-point (i.e., it will be caught by the worker thread, all worker threads will terminate, and the exception will be re-thrown by the main thread after it has cleaned up).
8 |
9 | ## Problems with downloading (e.g., broken connections, HTTP 404 or 500 errors)
10 |
11 | The behaviour here is customizable. The default logic is:
12 |
13 | - if it’s a connection error or an HTTP error other than a 50x, propagate the exception as above;
14 | - if it’s an HTTP 50x error, retry up to `:retries` times (default 5), and after that propagate the exception.
15 |
16 | You can override this by supplying a `:download-error-handler` option to the toplevel scraping function. If supplied, it should be a function taking three arguments:
17 |
18 | - the exception as thrown by clj-http;
19 | - the scraping option map as passed to the toplevel function;
20 | - the context containing the `:url` on which scraping failed.
21 |
22 | The error handler should return a vector of contexts that will be used as download results. Note that these contexts will _not_ normally be processed further – they will be returned as leaves of the scraping tree. To ignore the error, return an empty seq. To terminate scraping with an error, call `signal-error`. To simulate returning a successful clj-http response and continue as if the download had succeeded, call `respond-with`.
23 |
24 | See `skyscraper.core/default-download-error-handler` for an example of how to implement such a function.
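   |
   | For instance, a handler that skips 404s and defers to the default behaviour otherwise might look like this (a sketch; it assumes the exception’s `ex-data` carries clj-http’s response map with a `:status` key):
   |
   | ```clojure
   | (require '[skyscraper.core :as core])
   |
   | (defn ignore-404-handler [error options context]
   |   (if (= 404 (:status (ex-data error)))
   |     []  ; pretend the page yielded no results
   |     (core/default-download-error-handler error options context)))
   |
   | ;; (core/scrape! seed :download-error-handler ignore-404-handler)
   | ```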
25 |
26 | [1]: https://en.wikipedia.org/wiki/Murphy%27s_law
27 |
--------------------------------------------------------------------------------
/doc/07-db.md:
--------------------------------------------------------------------------------
1 | # Database integration
2 |
3 | ## Introduction
4 |
5 | Skyscraper can automatically emit the result of scraping in the form of a relational database. One table will be created for each database-enabled processor, with each row corresponding to an output context created by that processor. Each row will be automatically assigned an ID, and parent-child relationships will be modelled as foreign keys.
6 |
7 | The only database management system currently supported is SQLite (Skyscraper comes with a dependency on SQLite), but support for other DBMSs is planned for future versions of Skyscraper.
8 |
9 | To specify that a table should be generated for a given processor, add a `:skyscraper.db/columns` option to `defprocessor`. For example:
10 |
11 | ```clojure
12 | (defprocessor :users
13 |   :process-fn (fn [res ctx]
14 |                 [{:name "John", :surname "Doe"}])
15 |   :skyscraper.db/columns [:name :surname])
16 | ```
17 |
18 | (For the sake of clarity and focus, this document doesn’t rely on any particular site’s structure. Our example processors will return hardcoded data.)
19 |
20 | Now, when you invoke Skyscraper like this:
21 |
22 | ```clojure
23 | (scrape! [{:url "http://example.com", :processor :users}]
24 |          :db-file "/tmp/demo.sqlite")
25 | ```
26 |
27 | Skyscraper will create a SQLite database in the given file, containing one table named `users` with four columns: two textual ones that you have specified, and two additional integer ones named `id` and `parent`. That is, it will conform to the following schema:
28 |
29 | ```sql
30 | CREATE TABLE users (id integer primary key, parent integer, name text, surname text);
31 | ```
32 |
33 | The `id` will be an internal, autogenerated primary key. It is not guaranteed to be stable – it is possible for two identical invocations of Skyscraper to generate different tables.
34 |
35 | The `parent` column will be described below.
36 |
37 | ## Tree structure
38 |
39 | Let us expand our worked example. Consider the following processor definitions:
40 |
41 | ```clojure
42 | (defprocessor :users
43 |   :process-fn (fn [res ctx]
44 |                 [{:name "John", :surname "Doe", :url "/", :processor :accounts}])
45 |   :skyscraper.db/columns [:name :surname])
46 |
47 | (defprocessor :accounts
48 |   :process-fn (fn [res ctx]
49 |                 [{:bank-account "0123-4567"}
50 |                  {:bank-account "8888-9999"}])
51 |   :skyscraper.db/columns [:bank-account])
52 | ```
53 |
54 | Running `scrape!` as above will now generate the following database:
55 |
56 | ```sql
57 | sqlite> select * from users;
58 | id = 1
59 | parent =
60 | name = John
61 | surname = Doe
62 |
63 | sqlite> select * from accounts;
64 | id = 1
65 | parent = 1
66 | bank_account = 0123-4567
67 |
68 | id = 2
69 | parent = 1
70 | bank_account = 8888-9999
71 | ```
72 |
73 | Because, in the scrape tree, nodes corresponding to the `:accounts` processor are children of those of `:users`, the `parent` column in the `accounts` table references the `id` column in `users`.
74 |
75 | Note that this database doesn’t contain redundant data, but you can still easily obtain user data for each account by simply `JOIN`ing the tables together.
76 |
77 | ## Key columns and updating
78 |
79 | There’s a gotcha: if you re-run Skyscraper with the above settings, it will duplicate the already existing records in the database. This is because, normally, there is no way to tell whether the newly-scraped records correspond to data we already have. For example, the records may differ in some details (e.g., timestamps), but still refer to the same entity.
80 |
81 | Therefore, you have to be explicit about which fields uniquely identify a context for a given DB-enabled processor. For instance:
82 |
83 | ```clojure
84 | (defprocessor :users
85 |   :process-fn (fn [res ctx]
86 |                 [{:name "John", :surname "Doe", :phone "123-4567"}])
87 |   :skyscraper.db/columns [:name :surname :phone]
88 |   :skyscraper.db/key-columns [:name :surname])
89 | ```
90 |
91 | In this case, rather than bluntly executing an `INSERT` for each encountered row, Skyscraper will only insert the row when it doesn’t already exist in the DB. If it does exist, Skyscraper will update it when necessary. For example, imagine John’s phone number changes:
92 |
93 | ```clojure
94 | (defprocessor :users
95 |   :process-fn (fn [res ctx]
96 |                 [{:name "John", :surname "Doe", :phone "765-4321"}])
97 |   :skyscraper.db/columns [:name :surname :phone]
98 |   :skyscraper.db/key-columns [:name :surname])
99 | ```
100 |
101 | A repeated invocation of Skyscraper will now produce a database with the updated record.
102 |
103 | **Note:** If you use this feature, Skyscraper will create a unique index for every occurrence of `:skyscraper.db/key-columns`, slowing down inserts (as SQLite needs to update the index on every insert). You can pass the `:ignore-db-keys true` option to revert to plain `INSERT`s; if the database didn’t exist prior to scraping, Skyscraper will do that automatically. It is currently advisable to use `:skyscraper.db/key-columns` for small scrapes only, and to regenerate the DB from scratch each time for larger scrapes.
104 |
105 | ## Tips and caveats
106 |
107 | Often, you have a paginated structure where a page contains a number of records you’re interested in, plus a link to the next page. A natural approach is to have the processor return the "next-page" context along with the actual records. So the return value of scraping `/page/1` might look like this:
108 |
109 | ```clojure
110 | [{:name "John Doe", :url "/person/1", :processor :person}
111 | {:name "Jane Smith", :url "/person/2", :processor :person}
112 | ;; more records
113 | {:page 2, :url "/page/2", :processor :page}]
114 | ```
115 |
116 | If you DB-enable such a processor (presumably with the `:name` column), Skyscraper will duly emit a null-name row for the last context. There are several ways to cope with this:
117 |
118 | - You could change your scraping structure, having a top-level processor that just scans the pagination and produces links to pages, and the `page` processor returning just the records of interest (see the sketch after this list).
119 | - Alternatively, you could just bite the bullet and accept this situation, remembering to insert `NOT NULL` in your queries where appropriate. In general, it is a good idea to treat the Skyscraper-generated database as an interim step of your scraping flow, and have a cleaning step further downstream (the data structure you’re trying to recreate will sometimes not correspond faithfully to the scraping structure).
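   |
   | A sketch of the first approach (with contrived data in place of actual selectors):
   |
   | ```clojure
   | (defprocessor :page-list
   |   ;; walks the pagination only; emits one context per listing page
   |   :process-fn (fn [res ctx]
   |                 (for [n (range 1 10)] ; pretend we found 9 pages
   |                   {:page n, :url (str "/page/" n), :processor :page})))
   |
   | (defprocessor :page
   |   ;; emits only the records of interest, so every row has a :name
   |   :skyscraper.db/columns [:name]
   |   :process-fn (fn [res ctx]
   |                 [{:name "John Doe", :url "/person/1", :processor :person}
   |                  {:name "Jane Smith", :url "/person/2", :processor :person}]))
   | ```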
120 |
--------------------------------------------------------------------------------
/doc/08-development-mode.md:
--------------------------------------------------------------------------------
1 | # Development mode: Developing scrapers interactively
2 |
3 | Here’s a common workflow for developing a scraper with Skyscraper:
4 |
5 | 1. Start with a seed.
6 | 2. Download and parse the first page in that seed.
7 | 3. Experiment in the REPL with extracting desired data out of that page.
8 | 4. Once you’re satisfied with the outcome, define a processor.
9 | 5. Proceed with scraping until a page with yet-undefined processor is encountered.
10 | 6. Repeat steps 3–5 until all processors are defined.
11 |
12 | Skyscraper provides some functions (in the namespace `skyscraper.dev`) to assist in this process.
13 |
14 | ## A worked example
15 |
16 | As an example, we will develop a scraper to download and extract the data from UK Forest Research’s tree species database. At the time of writing (7 January 2020), you can access it here: https://www.forestresearch.gov.uk/tools-and-resources/tree-species-database/
17 |
18 | The complete code for this example can be found in the `examples/` directory.
19 |
20 | Launch a REPL and create a new namespace:
21 |
22 | ```clojure
23 | (ns skyscraper.examples.tree-species
24 |   (:require [reaver]
25 |             [skyscraper.core :as core :refer [defprocessor]]))
26 |
27 | (require '[skyscraper.dev :refer :all])
28 | ```
29 |
30 | Note that we have referred to the `skyscraper.dev` namespace for development; we’ll remove that `require` later, when we’re done. We have also opted to use Reaver (rather than the default Enlive) as the HTML parser.
31 |
32 | Define the seed:
33 |
34 | ```clojure
35 | (def seed [{:url "https://www.forestresearch.gov.uk/tools-and-resources/tree-species-database"
36 |             :processor :tree-list
37 |             :page 1}])
38 | ```
39 |
40 | We start at the first page of a paginated list, containing links to pages describing individual tree species as well as to the next page.
41 |
42 | At this point, we can run our first scrape, even though we haven’t defined the `:tree-list` processor yet:
43 |
44 | ```clojure
45 | (scrape seed :parse-fn core/parse-reaver)
46 | ```
47 |
48 | Skyscraper will run for a while and eventually say:
49 | ```
50 | 20-01-08 10:07:18 serpent INFO [skyscraper.dev:48] - Scraping suspended in processor :tree-list
51 | ```
52 | It will also helpfully open a browser for you, pointed at a local copy of the page it doesn’t know how to parse yet. (The styling will usually be broken, but it won’t affect your ability to extract the data.)
53 |
54 | Use your browser’s DevTools to look at that page. You’ll notice that each tree species has its own div of class `listing-item`, all of which are contained in a div of class `listing--trees`. Let us test this hypothesis in the REPL:
55 |
56 | ```clojure
57 | (count (reaver/select (document) ".listing--trees"))
58 | ;=> 1
59 | (count (reaver/select (document) ".listing--trees .listing-item"))
60 | ;=> 10
61 | ```
62 |
63 | Note that we call `document` here, and it returns the Reaver parse tree of the page that you were looking at in the browser.
64 |
65 | Let’s try to extract the data. For each species, we want the English name, Latin name, and a link to the details page:
66 |
67 | ```clojure
68 | (reaver/extract-from (document) ".listing--trees .listing-item"
69 |                      [:english-name :latin-name :url]
70 |                      ".listing__heading" reaver/text
71 |                      ".listing__metadata" reaver/text
72 |                      ".listing-item" (reaver/attr :href))
73 | ;=> ({:english-name "Common (or black) alder (CAR)",
74 | ; :latin-name "Alnus glutinosa",
75 | ; :url "/tools-and-resources/tree-species-database/common-or-black-alder-car/"}
76 | ; ...)
77 | ```
78 |
79 | Let’s also see if we can find a link to the next page:
80 |
81 | ```clojure
82 | (-> (reaver/select (document) ".pagination__item--next")
83 |     first
84 |     (reaver/attr :href))
85 | ;=> "/tools-and-resources/tree-species-database/page/2/"
86 | ```
87 |
88 | We now have everything we need to define a processor:
89 |
90 | ```clojure
91 | (defprocessor :tree-list
92 |   :process-fn (fn [doc ctx]
93 |                 (concat
94 |                  (reaver/extract-from doc ".listing--trees .listing-item"
95 |                                       [:english-name :latin-name :url :processor]
96 |                                       ".listing__heading" reaver/text
97 |                                       ".listing__metadata" reaver/text
98 |                                       ".listing-item" (reaver/attr :href)
99 |                                       ".listing-item" (constantly :tree))
100 |                  (when-let [next-page-url (-> (reaver/select doc ".pagination__item--next") first (reaver/attr :href))]
101 |                    [{:url next-page-url
102 |                      :processor :tree-list
103 |                      :page (inc (:page ctx))}]))))
104 | ```
105 |
106 | Note that we have added the next processor name to the `extract-from` invocation, and also replaced the `(document)` calls (meant to be used interactively) with the first argument of `process-fn`.
107 |
108 | You can now check whether this processor works correctly on the current resource and context:
109 |
110 | ```clojure
111 | (run-last-processor)
112 | ```
113 |
114 | It should return 11 maps: 10 species plus the next page of the list. All good so far! We are ready to proceed.
115 |
116 | Re-run the scrape as before:
117 |
118 | ```clojure
119 | (scrape seed :parse-fn core/parse-reaver)
120 | ```
121 |
122 | Skyscraper will clean up after the previous attempt and suspend scraping on the species detail page this time. For the sake of example, we will do the simplest thing and just grab the HTML description:
123 |
124 | ```clojure
125 | (defprocessor :tree
126 |   :process-fn (fn [doc ctx]
127 |                 {:description-html (when-let [description (reaver/select doc ".is-typeset--article")]
128 |                                      (.html description))}))
129 | ```
130 |
131 | And we’re done! If you launch the scrape again, it will now run to completion. We are now ready to remove the `(require '[skyscraper.dev :refer :all])` line and run the usual `skyscraper.core/scrape` function to obtain structured data.
132 |
133 | The complete code in `examples/` also contains cache templates for both processors.
134 |
135 | ## Implementation note
136 |
137 | Development mode is currently implemented using `item-chan`. This means that you can’t pass a custom `item-chan` to `skyscraper.dev/scrape` because it will be overridden.
138 |
139 | It also means that when you redefine a processor after a suspended `scrape` and then call `scrape` again, it is unable to just continue as if the new definition had been in effect all along. Instead, it will run the previous scrape to completion and then start a new one. This should not be a big problem in practice, but it may cause Skyscraper to download pages more often than you might expect. Enable HTML cache to remedy this.
140 |
141 | In development mode, `parallelism` is set to 1.
142 |
--------------------------------------------------------------------------------
/examples/clojuredays.clj:
--------------------------------------------------------------------------------
1 | (ns clojuredays
2 |   (:require [reaver]
3 |             [skyscraper.core :as sky]
4 |             [skyscraper.cache :as sky.cache]
5 |             [taoensso.carmine :as car]))
6 |
7 | ; Redis setup as CacheBackend
8 | ; this example assumes you have a local Redis instance running
9 |
10 | ; tested with
11 | ; deps.edn:
12 | ; com.taoensso/carmine {:mvn/version "3.2.0-alpha1"}
13 | ; reaver/reaver {:mvn/version "0.1.3"}
14 |
15 | (def redis-conn {:pool {} :spec {:uri "redis://127.0.0.1:6379"}})
16 | (defmacro wcar* [& body] `(car/wcar redis-conn ~@body))
17 |
18 | (defn redis-cache
19 |   []
20 |   (reify
21 |     sky.cache/CacheBackend
22 |     (save-blob [_ key blob metadata]
23 |       (wcar*
24 |        (car/set key {:blob blob
25 |                      :meta metadata})))
26 |     (load-blob [_ key]
27 |       (wcar*
28 |        (car/get key)))
29 |     java.io.Closeable
30 |     (close [_]
31 |       nil)))
32 |
33 | (def seed
34 |   [{:url "https://clojuredays.org",
35 |     :processor :editions
36 |     :page :index}])
37 |
38 | (sky/defprocessor :editions
39 |   :cache-template "dcd/:page"
40 |   :skyscraper.db/columns [:edition]
41 |   :skyscraper.db/key-columns [:edition]
42 |   :process-fn (fn [doc _ctx]
43 |                 (reaver/extract-from doc
44 |                                      "aside > a.item" ; or "#sidebar > a.item"
45 |                                      [:edition :url :processor]
46 |                                      ".item" reaver/text
47 |                                      ".item" (reaver/attr :href)
48 |                                      ".item" (constantly :sponsors))))
49 |
50 | (sky/defprocessor :sponsors
51 |   :cache-template "dcd/:edition"
52 |   :skyscraper.db/columns [:sponsor_url :sponsor_name]
53 |   :skyscraper.db/key-columns [:sponsor_url :sponsor_name]
54 |   :process-fn (fn [doc _ctx]
55 |                 (reaver/extract-from doc
56 |                                      ".sponsors > a" ; "#sponsors .sponsors > a"
57 |                                      [:sponsor_url :sponsor_name]
58 |                                      ".sponsor" (reaver/attr :href)
59 |                                      ".sponsor > img" (reaver/attr :alt))))
60 |
61 | (defn run [cache]
62 |   (sky/scrape! seed
63 |                :html-cache cache
64 |                :db-file "/tmp/dcd.db"
65 |                :parse-fn sky/parse-reaver))
66 |
67 | (run (redis-cache))
--------------------------------------------------------------------------------
/examples/hackernews.clj:
--------------------------------------------------------------------------------
1 | ;; This example is incomplete and will be completed later on.
2 |
3 | (ns skyscraper.hackernews
4 |   (:require
5 |    [net.cgrand.enlive-html :refer [select attr? text emit* has pred first-child last-child nth-child]]
6 |    [skyscraper.core :refer :all]
7 |    [skyscraper.enlive-helpers :refer [href]]))
8 |
9 | (defn seed [& _]
10 |   [{:url "https://news.ycombinator.com/",
11 |     :processor :stories}])
12 |
13 | (defn extract-number [item]
14 |   (when-let [s (re-find #"\d+" (text item))]
15 |     (Long/parseLong s)))
16 |
17 | (defprocessor :stories
18 |   :cache-template "hn/index"
19 |   :process-fn (fn [res context]
20 |                 (let [rows (select res [:table :> [:tr (nth-child 3)] :table :tr])]
21 |                   (for [[title-row author-row _] (partition 3 (drop-last 2 rows))
22 |                         :let [a-title (first (select title-row [:td.title :a]))
23 |                               a-author (first (select author-row [[:a (nth-child 2)]]))]]
24 |                     {:story-url (href a-title),
25 |                      :title (text a-title),
26 |                      :score (extract-number (first (select author-row [:span.score])))
27 |                      :author (when (= (-> author-row :content second :content (nth 2)) " by ")
28 |                                (text a-author)),
29 |                      :time (text (first (select author-row [:span.age :a])))
30 |                      :num-comments (extract-number (text (first (select author-row [:td :> [:a last-child]])))),
31 |                      :url (href (select author-row [:span.age :a]))
32 |                      :id (extract-number (href (select author-row [:span.age :a])))
33 |                      :processor :comments}))))
34 |
35 | (defprocessor :comments
36 |   :cache-template "hn/story/:id"
37 |   :process-fn (fn [res context]
38 |                 {:commenters (mapv text (select res [:a.hnuser]))}))
39 |
--------------------------------------------------------------------------------
/examples/tree_species.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.examples.tree-species
2 |   (:require [reaver]
3 |             [skyscraper.core :as core :refer [defprocessor]]))
4 |
5 | (def seed [{:url "https://www.forestresearch.gov.uk/tools-and-resources/tree-species-database"
6 |             :processor :tree-list
7 |             :page 1}])
8 |
9 | (defprocessor :tree-list
10 |   :cache-template "tree-species/list/page/:page"
11 |   :skyscraper.db/columns [:english-name :latin-name]
12 |   :skyscraper.db/key-columns [:english-name]
13 |   :process-fn (fn [doc ctx]
14 |                 (concat
15 |                  (reaver/extract-from doc "#tree-listing > div"
16 |                                       [:english-name :latin-name :url :processor]
17 |                                       "h3" reaver/text
18 |                                       "i" reaver/text
19 |                                       "a" (reaver/attr :href)
20 |                                       "a" (constantly :tree))
21 |                  (when-let [next-page-url (-> (reaver/select doc ".forestry__pagination-last:not(.disabled) a") first (reaver/attr :href))]
22 |                    [{:url next-page-url
23 |                      :processor :tree-list
24 |                      :page (inc (:page ctx))}]))))
25 |
26 | (defprocessor :tree
27 |   :cache-template "tree-species/tree/:english-name"
28 |   :skyscraper.db/columns [:description-html]
29 |   :process-fn (fn [doc ctx]
30 |                 {:description-html (when-let [description (reaver/select doc ".forestry-body > .container > .row > .col-8")]
31 |                                      (.html description))}))
32 |
33 | (defn run []
34 |   (core/scrape! seed
35 |                 :html-cache true
36 |                 :db-file "/tmp/trees.db"
37 |                 :parse-fn core/parse-reaver))
38 |
--------------------------------------------------------------------------------
/release.edn:
--------------------------------------------------------------------------------
1 | {:group-id "skyscraper"
2 | :artifact-id "skyscraper"
3 | :scm-url "https://github.com/nathell/skyscraper"}
4 |
--------------------------------------------------------------------------------
/src/skyscraper/cache.clj:
--------------------------------------------------------------------------------
1 | ;;;; Skyscraper - Cache backends
2 |
3 | (ns skyscraper.cache
4 |   (:refer-clojure :exclude [load load-string])
5 |   (:require [clojure.edn :as edn]
6 |             [clojure.java.io :as io])
7 |   (:import [java.io Closeable InputStream OutputStream]))
8 |
9 | ;;; Netstrings
10 |
11 | (let [bytes-type (type (byte-array []))]
12 |   (defn- byte-array? [item]
13 |     (= (type item) bytes-type)))
14 |
15 | (defn- write-netstring
16 |   [^OutputStream stream item]
17 |   (let [^bytes b (cond (byte-array? item) item
18 |                        (string? item) (.getBytes item)
19 |                        :otherwise (.getBytes (pr-str item)))]
20 |     (.write stream (.getBytes (str (count b))))
21 |     (.write stream (int \:))
22 |     (.write stream b)
23 |     (.write stream (int \,))))
24 |
25 | (defn- read-netstring
26 |   [^InputStream stream]
27 |   (loop [len 0]
28 |     (let [ch (.read stream)]
29 |       (cond (<= 48 ch 57) (recur (+ (* 10 len) ch -48))
30 |             (= ch 58) (let [arr (byte-array len)]
31 |                         (.read stream arr)
32 |                         (assert (= (.read stream) 44))
33 |                         arr)
34 |             :otherwise (throw (Exception. "colon needed after length"))))))
35 |
36 | ;;; Actual cache
37 |
38 | (defprotocol CacheBackend
39 |   "Provides facilities for caching downloaded blobs (typically HTML),
40 |   potentially enriched with some metadata (typically headers), in
41 |   some kind of storage. Implementations of this protocol can be passed
42 |   as `:html-cache` and `:processed-cache` options to
43 |   [[skyscraper.core/scrape]]."
44 |   (save-blob [cache key blob metadata])
45 |   (load-blob [cache key]))
46 |
47 | ;; An implementation of CacheBackend that stores the blobs in a
48 | ;; filesystem under a specific directory (root-dir). The blobs are
49 | ;; stored as netstrings (http://cr.yp.to/proto/netstrings.txt),
50 | ;; prefixed with metadata EDN also stored as a netstring. The
51 | ;; filenames correspond to the stored keys. root-dir must end in a
52 | ;; path separator (/).
53 | (deftype FSCache
54 |     [root-dir]
55 |   CacheBackend
56 |   (save-blob [cache key blob metadata]
57 |     (let [meta-str (pr-str metadata)
58 |           file (str root-dir key)]
59 |       (io/make-parents file)
60 |       (with-open [f (io/output-stream file)]
61 |         (write-netstring f meta-str)
62 |         (write-netstring f blob))))
63 |   (load-blob [cache key]
64 |     (try
65 |       (with-open [f (io/input-stream (str root-dir key))]
66 |         (let [meta-blob (read-netstring f)
67 |               blob (read-netstring f)]
68 |           {:meta (edn/read-string (String. meta-blob))
69 |            :blob blob}))
70 |       (catch Exception _ nil)))
71 |   Closeable
72 |   (close [cache]
73 |     nil))
74 |
75 | (defn fs
76 |   "Creates a filesystem-based cache backend with a given root directory."
77 |   [root-dir]
78 |   (FSCache. (str root-dir "/")))
79 |
80 | ;; A dummy implementation of CacheBackend that doesn't actually cache data.
81 | (deftype NullCache
82 |     []
83 |   CacheBackend
84 |   (save-blob [_ _ _ _] nil)
85 |   (load-blob [_ _] nil)
86 |   Closeable
87 |   (close [_] nil))
88 |
89 | (defn null
90 |   "Creates a null cache backend."
91 |   []
92 |   (NullCache.))
93 |
94 | (extend-protocol CacheBackend
95 |   nil
96 |   (save-blob [_ _ _ _] nil)
97 |   (load-blob [_ _] nil))
98 |
99 | ;; An in-memory implementation of CacheBackend backed by an atom.
100 | (deftype MemoryCache
101 |     [storage]
102 |   CacheBackend
103 |   (save-blob [cache key blob metadata]
104 |     (swap! storage assoc key {:blob blob, :meta metadata}))
105 |   (load-blob [cache key]
106 |     (@storage key))
107 |   Closeable
108 |   (close [cache]
109 |     nil))
110 |
111 | (defn memory
112 |   "Creates a memory cache backend."
113 |   []
114 |   (MemoryCache. (atom {})))
115 |
--------------------------------------------------------------------------------
/src/skyscraper/context.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.context)
2 |
3 | (defn dissoc-internal
4 |   "Dissocs the context keys that shouldn't be carried over to further processing."
5 |   [context]
6 |   (let [removed-keys #{:processor :url :skyscraper.core/new-items :skyscraper.core/retry :skyscraper.core/prev-response
7 |                        :skyscraper/description :skyscraper.traverse/priority}]
8 |     (into {}
9 |           (remove (fn [[k _]] (or (contains? removed-keys k)
10 |                                   (and (keyword? k)
11 |                                        (= (namespace k) "http")
12 |                                        (not= k :http/cookies)))))
13 |           context)))
14 |
15 |
16 | (defn dissoc-leaf-keys
17 | "Dissocs the context keys that shouldn't appear in the resulting channel
18 | or sequence of leaf nodes."
19 | [context]
20 | (dissoc context
21 | :skyscraper.core/cache-key
22 | :skyscraper.core/current-processor
23 | :skyscraper.core/next-stage
24 | :skyscraper.core/options
25 | :skyscraper.core/response
26 | :skyscraper.core/stage
27 | :skyscraper.traverse/handler
28 | :skyscraper.traverse/call-protocol
29 | :skyscraper.traverse/priority
30 | :skyscraper/description
31 | :http/cookies))
32 |
33 | (defn describe
34 | "Returns a user-friendly version of a context that doesn't include
35 | Skyscraper's internal keys, but does include the currently running
36 | processor name."
37 | [context]
38 | (let [processor (:skyscraper.core/current-processor context)]
39 | (cond-> context
40 | true dissoc-internal
41 | true dissoc-leaf-keys
42 | true (merge (select-keys context [:processor :url])) ; reattach
43 | processor (assoc :skyscraper.core/current-processor-name (:name processor))
44 | true pr-str)))
45 |
46 | (defn describe-url
47 | [context]
48 | (or (:skyscraper/description context)
49 | (:url context)))
50 |
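51 | ;; Illustrative example (not part of the original file): what the helpers
52 | ;; above strip from a context.
53 | (comment
54 |   (dissoc-internal {:title "X", :processor :page, :http/form-params {:q 1}, :http/cookies {"id" "1"}})
55 |   ;; => {:title "X", :http/cookies {"id" "1"}}
56 |   ;; :processor is internal, and :http/* keys other than :http/cookies are per-request
57 |   )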
--------------------------------------------------------------------------------
/src/skyscraper/core.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.core
2 | (:require
3 | [clj-http.client :as http]
4 | [clj-http.conn-mgr :as http-conn]
5 | [clj-http.core :as http-core]
6 | [clj-http.headers :as http-headers]
7 | [clojure.edn :as edn]
8 | [clojure.set :refer [intersection]]
9 | [clojure.string :as string]
10 | [net.cgrand.enlive-html :as enlive]
11 | [reaver]
12 | [skyscraper.cache :as cache]
13 | [skyscraper.context :as context]
14 | [skyscraper.db :as sqlite]
15 | [skyscraper.traverse :as traverse]
16 | [taoensso.timbre :refer [debugf infof warnf errorf]])
17 | (:import [java.net URL]
18 | [java.nio.charset Charset]))
19 |
20 | ;;; Directories
21 |
22 | (def output-dir
23 | "All Skyscraper output, either temporary or final, goes under here."
24 | (str (System/getProperty "user.home") "/skyscraper-data/"))
25 |
26 | (def html-cache-dir
27 | "Local copies of downloaded HTML files go here."
28 | (str output-dir "cache/html/"))
29 |
30 | (def processed-cache-dir
31 | "Cache storing the interim results of processing HTML files."
32 | (str output-dir "cache/processed/"))
33 |
34 | ;;; Micro-templating framework
35 |
36 | (defn- format-template
37 | "Fills in a template string with moving parts from m. template should be
38 | a string containing 'variable names' starting with colons; these names
39 | are extracted, converted to keywords and looked up in m, which should be
40 | a map (or a function taking keywords and returning strings).
41 |
42 | Example:
43 | ```clojure
44 | (format-template \":group/:user/index\" {:user \"joe\", :group \"admins\"})
45 | ;=> \"admins/joe/index\"
46 | ```"
47 | [template m]
48 | (let [re #":[a-z-]+"
49 | keys (map #(keyword (subs % 1)) (re-seq re template))
50 | fmt (string/replace template re "%s")]
51 | (apply format fmt (map m keys))))
52 |
53 | ;;; Cache
54 |
55 | (defn- sanitize-cache
56 | "Converts a cache argument to the processor to a CacheBackend if it
57 | isn't one already."
58 | [value cache-dir]
59 | (cond
60 | (string? value) (cache/fs value)
61 | (= value true) (cache/fs cache-dir)
62 | (not value) (cache/null)
63 | :otherwise value))
64 |
65 | ;;; Defining processors
66 |
67 | (defonce
68 | ^{:doc "The global registry of processors: an atom containing a map from
69 | keywords naming processors to the processor definitions."}
70 | processors (atom {}))
71 |
72 | (defmacro with-processor-definitions
73 | "Runs body with processors defined as defs, restoring previous definitions
74 | afterwards.
75 | Note: don't use this unless you're skyscraper.dev."
76 | [defs & body]
77 | `(let [previous# @processors]
78 | (try
79 | (reset! processors ~defs)
80 | ~@body
81 | (finally (reset! processors previous#)))))
82 |
83 | (defn- default-process-fn
84 | "The default function that becomes a processor's :process-fn
85 | if you don't specify one."
86 | [resource context]
87 | [{::unimplemented true, ::resource resource, ::context context}])
88 |
89 | (defn defprocessor
90 | "Registers a processor named `name` with arguments `args`.
91 |
92 | `name` should be a keyword. `args`, optional keys and values, may include:
93 |
94 | - `:process-fn` – a function that takes a resource and a parent context, and returns a
95 | sequence of child contexts (corresponding to the scraped resource). Alternatively,
96 | it can return one context only, in which case it will be wrapped in a sequence.
97 | - `:cache-template` – a string specifying the template for cache keys. Ignored when
98 | `:cache-key-fn` is specified.
99 | - `:cache-key-fn` – a function taking the context and returning the cache key. Overrides
100 | `:cache-template`. Useful when mere templating does not suffice.
101 | - `:url-fn` – a one-argument function taking the context and returning the URL to visit.
102 | By default, Skyscraper just extracts the value under the `:url` key from the context.
103 | - `:updatable` – a boolean (false by default). When true, the pages accessed by this
104 | processor are considered to change often. When Skyscraper is run in update mode (see
105 | below), these pages will be re-downloaded and re-processed even if they had been present
106 | in the HTML or processed caches, respectively.
107 | - `:parse-fn` – a custom function that will be used to produce Enlive resources from
108 | downloaded documents. This can be useful, for instance, if you want to use reaver rather
109 |   than Enlive; if you are scraping something other than HTML (e.g., PDFs via a custom
110 |   parser); or when you're scraping malformed HTML and need an interim fixup step before
111 | parsing.
112 | - `:skyscraper.db/columns` – a vector of keys that are supposed to exist in the resulting
113 | contexts; the corresponding values will be emitted as a database row when `:db` or
114 | `:db-file` is supplied as a scrape argument.
115 | - `:skyscraper.db/key-columns` – a vector of keys that, when
116 |   supplied, will be used to upsert records to the database and treated as
117 | a unique key to match existing database records against."
118 | [name & {:as args}]
119 | (swap! processors assoc name (merge {:name name, :process-fn default-process-fn} args)))
120 |
121 | (defn- get-option
122 | "Some options can be specified either in the processor definition or
123 | during scraping; in this case, the per-processor one takes precedence."
124 | ([context options k] (get-option context options k nil))
125 | ([context options k default]
126 | (or (get-in context [::current-processor k])
127 | (get options k)
128 | default)))
129 |
130 | (defn- ensure-distinct-seq
131 | "If x is a sequence, removes duplicates from it, else returns a vector
132 | containing x only."
133 | [x]
134 | (if (map? x) [x] (doall (distinct x))))
135 |
136 | (defn run-processor
137 | "Runs a processor named by processor-name on document."
138 | ([processor-name document] (run-processor processor-name document {}))
139 | ([processor-name document context]
140 | (let [processor (or (@processors processor-name)
141 | {:name processor-name, :process-fn default-process-fn})]
142 | (ensure-distinct-seq ((:process-fn processor) document context)))))
143 |
144 | (defn allows?
145 | "True if all keys in m1 that are also in m2 have equal values in both maps."
146 | [m1 m2]
147 | (let [ks (intersection (set (keys m1)) (set (keys m2)))]
148 | (if (seq ks)
149 | (let [f (apply juxt ks)]
150 | (= (f m1) (f m2)))
151 | true)))
152 |
153 | (defn- filter-contexts
154 | "If `:only` was supplied in `options`, returns `contexts` filtered by it
155 | as specified in the docstring of `scrape`, else returns all contexts."
156 | [{:keys [only] :as options} contexts]
157 | (if only
158 | (let [filter-fn (if (fn? only)
159 | only
160 | (fn [x] (some #(allows? % x) (ensure-distinct-seq only))))]
161 | (filter filter-fn contexts))
162 | contexts))
163 |
164 | (defn merge-urls
165 | "Fills the missing parts of new-url (which can be either absolute,
166 | root-relative, or relative) with corresponding parts from url
167 | (an absolute URL) to produce a new absolute URL."
168 | [url new-url]
169 | (if (string/starts-with? new-url "?")
170 | (str (string/replace url #"\?.*" "") new-url)
171 | (str (URL. (URL. url) new-url))))
172 |
173 | (defn- merge-contexts
174 | "Given two contexts, `old` as passed to a processor as input, and
175 | `new` as returned by the processor, returns a merged context that
176 | will be fed to child processors."
177 | [old new]
178 | (let [preserve (context/dissoc-internal old)
179 | new-url (if-let [u (:url new)]
180 | (merge-urls (:url old) u))
181 | new (if new-url
182 | (assoc new :url new-url)
183 | new)]
184 | (merge preserve new)))
185 |
186 | (defn- string-resource
187 | "Returns an Enlive resource for a HTML snippet passed as a string."
188 | [s]
189 | (enlive/html-resource (java.io.StringReader. s)))
190 |
191 | (defn- strip-quotes
192 | [s]
193 | (if (and (string/starts-with? s "\"")
194 | (string/ends-with? s "\""))
195 | (subs s 1 (dec (count s)))
196 | s))
197 |
198 | (defn parse-string
199 | "Parses `body`, a byte-array, as a string encoded with
200 | content-type provided in `headers`. If `try-html?` is true,
201 |   tries to look for a charset declaration in the `<meta>` tags
202 |   in `body`."
203 | ([headers ^bytes body _context] (parse-string headers body _context false))
204 | ([headers ^bytes body _context try-html?]
205 | (let [stream1 (when try-html?
206 | (java.io.ByteArrayInputStream. body))
207 | body-map (when try-html?
208 | (http/parse-html stream1))
209 | additional-headers (if try-html?
210 | (http/get-headers-from-body body-map)
211 | {})
212 | all-headers (merge headers additional-headers)
213 | content-type (get all-headers "content-type")
214 | content-type (cond-> content-type
215 | (vector? content-type) first)]
216 | (String. body (Charset/forName (strip-quotes (http/detect-charset content-type)))))))
217 |
218 | (defn parse-enlive
219 | "Parses a byte array as a Enlive resource."
220 | [headers body context]
221 | (string-resource (parse-string headers body context (:use-http-headers-from-content (::options context) true))))
222 |
223 | (defn parse-reaver
224 | "Parses a byte array as a JSoup/Reaver document."
225 | [headers body context]
226 | (reaver/parse (parse-string headers body context (:use-http-headers-from-content (::options context) true))))
227 |
228 | ;;; Scraping
229 |
230 | (defn- extract-namespaced-keys
231 | "Filters `m`, returning a map with only the keys whose namespace is `ns`."
232 | [ns m]
233 | (into {}
234 | (comp (filter #(= (namespace (key %)) ns))
235 | (map (fn [[k v]] [(keyword (name k)) v])))
236 | m))
237 |
238 | ;; The scraping engine is implemented on top of skyscraper.traverse,
239 | ;; but each step (download, parse, run processor, store in cache) is
240 | ;; decomposed into several stages collectively known as a "pipeline".
241 | ;; Steps in the pipeline normally run from left to right, mostly
242 | ;; sequentially (except for `download-handler` which is async), and
243 | ;; after the last step, we return to the first one. The current stage
244 | ;; is stored as `::stage` in the context. A handler can override
245 | ;; the next one by setting `::next-stage`.
246 |
247 | (defn- make-pipeline
248 | "Returns a list of symbols naming functions that implement the pipeline steps."
249 | [{:keys [download-mode] :as options}]
250 | `[init-handler
251 | check-cache-handler
252 | ~(case download-mode
253 | :async `download-handler
254 | :sync `sync-download-handler)
255 | store-cache-handler
256 | process-handler
257 | split-handler])
258 |
259 | (defn- advance-pipeline
260 | "Advances `context` to the next stage in `pipeline`."
261 | [pipeline context]
262 | (let [next-stage (or (::next-stage context)
263 | (->> pipeline
264 | (drop-while #(not= % (::stage context)))
265 | second)
266 | (when (and (:processor context) (:url context))
267 | (first pipeline)))]
268 | (when (and (:processor context) (not (:url context)))
269 | (warnf "Encountered context with processor but no URL: %s" (context/describe context)))
270 | (if next-stage
271 | (-> context
272 | (dissoc ::next-stage)
273 | (assoc ::stage next-stage
274 | ::traverse/handler (if (= next-stage `download-handler)
275 | `download-handler
276 | `sync-handler)
277 | ::traverse/call-protocol (if (= next-stage `download-handler)
278 | :callback
279 | :sync)))
280 | (context/dissoc-leaf-keys context))))
281 |
282 | (defn- init-handler
283 | "Sets up context with `::current-processor` and `::cache-key`."
284 | [context options]
285 | (let [{:keys [cache-template cache-key-fn]} (merge options (@processors (:processor context)))
286 | cache-key-fn (or cache-key-fn
287 | #(when cache-template
288 | (format-template cache-template %)))]
289 | [(assoc context
290 | ::current-processor (@processors (:processor context))
291 | ::cache-key (cache-key-fn context))]))
292 |
293 | (defn- updatable?
294 | "Should we redownload the given context even if we have it cached?"
295 | [context]
296 | (let [updatable (get-in context [::current-processor :updatable])]
297 | (if (ifn? updatable)
298 | (updatable context)
299 | updatable)))
300 |
301 | (defn- skip-download?
302 | [context options]
303 | (or (not (:update options))
304 | (not (updatable? context))))
305 |
306 | ;; TODO: deduplicate code around here
307 | (defn- maybe-retrieve-from-http-cache
308 | "When a context's cache-key exists in the cache, fetches the associated
309 | data."
310 | [context options]
311 | (if-let [key (::cache-key context)]
312 | (if-let [item (cache/load-blob (:html-cache options) key)]
313 | {:body (:blob item), :headers (:meta item)})))
314 |
315 | (defn- maybe-retrieve-from-processed-cache
316 | "Likewise, for processed cache."
317 | [context options]
318 | (when (skip-download? context options)
319 | (if-let [key (::cache-key context)]
320 | (if-let [item (cache/load-blob (:processed-cache options) key)]
321 | (edn/read-string (String. (:blob item) "UTF-8"))))))
322 |
323 | (defn- check-cache-handler
324 | "If context is cached, loads the cached data and skips [[download-handler]],
325 | otherwise returns it as-is."
326 | [context options]
327 | (let [processed-result (maybe-retrieve-from-processed-cache context options)
328 | cached-response (maybe-retrieve-from-http-cache context options)]
329 | (cond
330 | (and (:uncached-only options)
331 | (or processed-result
332 | (and cached-response (skip-download? context options))))
333 | #_=> []
334 | processed-result
335 | #_=> [(assoc context
336 | ::new-items (map (partial merge-contexts context) processed-result)
337 | ::next-stage `split-handler)]
338 | (and cached-response (skip-download? context options))
339 | #_=> [(assoc context
340 | ::response cached-response
341 | ::next-stage `process-handler)]
342 | cached-response
343 | #_=> [(assoc context
344 | ::prev-response cached-response)]
345 | :otherwise
346 | #_=> [context])))
347 |
348 | (defn- wait
349 | "If ms-or-fn is a number, Thread/sleep that many milliseconds, otherwise
350 | assume that it's a zero-argument function, call it and sleep for the resulting
351 | number."
352 | [ms-or-fn]
353 | (when ms-or-fn
354 | (let [ms (if (number? ms-or-fn)
355 | ms-or-fn
356 | (ms-or-fn))]
357 | (Thread/sleep ms))))
358 |
359 | (defn signal-error
360 | "Call this function from `download-error-handler` to cause scraping to signal an error."
361 | [error context]
362 | [{:skyscraper.traverse/error error,
363 | :skyscraper.traverse/context context}])
364 |
365 | (defn respond-with
366 | "Call this function from `download-error-handler` to continue scraping as if download had succeeded."
367 | [response {:keys [pipeline] :as options} context]
368 | [(cond-> (advance-pipeline pipeline context)
369 | true (assoc ::response response)
370 | (:cookies response) (update :http/cookies merge (:cookies response)))])
371 |
372 | (defn default-download-error-handler
373 | "By default, when clj-http returns an error (e.g., when the server returns 4xx or 5xx),
374 | Skyscraper will call this function to determine what to do next.
375 | This handler causes Skyscraper to retry up to `retries` times for 5xx status codes,
376 | and to throw an exception otherwise."
377 | [error options context]
378 | (let [{:keys [status]} (ex-data error)
379 | retry? (or (and status (>= status 500))
380 | (re-find #"timed out" (str (.getMessage error))))
381 | retry (inc (or (::retry context) 0))]
382 | (if (and retry? (<= retry (:retries options)))
383 | (do
384 | (warnf "[download] Unexpected error %s, retry %s, context %s" error retry (context/describe context))
385 | [(assoc context ::retry retry)])
386 | (do
387 | (warnf "[download] Unexpected error %s, giving up, context %s" error (context/describe context))
388 | (signal-error error context)))))
389 |
390 | (defn- download-handler
391 | "Asynchronously downloads the page specified by context."
392 | [context {:keys [connection-manager download-semaphore sleep] :as options} callback]
393 | (debugf "Running download-handler: %s" (:processor context))
394 | (let [req (merge {:method :get, :url (:url context)}
395 | (extract-namespaced-keys "http" context))
396 | success-fn (fn [response]
397 | (debugf "[download] Downloaded %s" (context/describe-url context))
398 | (.release download-semaphore)
399 | (callback (respond-with response options context)))
400 | error-fn (fn [error]
401 | (.release download-semaphore)
402 | (let [error-handler (:download-error-handler options)]
403 | (callback (error-handler error options context))))]
404 | (debugf "[download] Waiting")
405 | (.acquire download-semaphore)
406 | (infof "[download] Downloading %s" (context/describe-url context))
407 | (let [req (merge {:async? true,
408 | :connection-manager connection-manager}
409 | req (get-option context options :http-options))
410 | request-fn (or (:request-fn options)
411 | http/request)]
412 | (wait sleep)
413 | (request-fn req
414 | success-fn
415 | error-fn))))
416 |
417 | (defn- sync-download-handler
418 | "Synchronous version of download-handler."
419 | [context {:keys [connection-manager sleep pipeline] :as options}]
420 | (let [req (merge {:method :get, :url (:url context), :connection-manager connection-manager}
421 | (extract-namespaced-keys "http" context)
422 | (get-option context options :http-options))
423 | request-fn (or (:request-fn options)
424 | http/request)]
425 | (try
426 | (infof "[download] Downloading %s" (context/describe-url context))
427 | (wait sleep)
428 | (let [resp (request-fn req)]
429 | (debugf "[download] Downloaded %s" (context/describe-url context))
430 | [(cond-> (advance-pipeline pipeline context)
431 | true (assoc ::response resp)
432 | (:cookies resp) (update :http/cookies merge (:cookies resp)))])
433 | (catch Exception error
434 | (let [error-handler (:download-error-handler options)]
435 | (error-handler error options context))))))
436 |
437 | (defn- store-cache-handler
438 | "Assuming context has downloaded data, stores it in HTML cache if
439 | applicable and returns it unmodified."
440 | [context options]
441 | (when-let [key (::cache-key context)]
442 | (cache/save-blob (:html-cache options) key (get-in context [::response :body]) (get-in context [::response :headers])))
443 | [context])
444 |
445 | (defn- parsed-document
446 | [context options response-key]
447 | (let [parse (get-option context options :parse-fn)
448 | {:keys [headers body]} (response-key context)]
449 | (when body
450 | (parse (into (http-headers/header-map) headers) body context))))
451 |
452 | (defn cached-document
453 | "Returns a previously-cached, parsed version of the document currently being processed."
454 | [context]
455 | (parsed-document context (::options context) ::prev-response))
456 |
457 | (defn- process-handler
458 | "Runs the processor specified by context on itself. Returns a single context
459 | with the processor results as `::new-items`."
460 | [context options]
461 | (let [document (parsed-document context options ::response)
462 | processor-name (:processor context)
463 | result (run-processor processor-name document context)]
464 | (when-let [key (::cache-key context)]
465 | (cache/save-blob (:processed-cache options) key (.getBytes (pr-str result) "UTF-8") nil))
466 | [(assoc context ::new-items (map (partial merge-contexts context) result))]))
467 |
468 | (defn- split-handler
469 | "Extracts `::new-items` out of the supplied contexts and prunes the scraping
470 | tree if necessary."
471 | [context options]
472 | (->> (::new-items context)
473 | (map #(assoc % ::stage `split-handler))
474 | (filter-contexts options)))
475 |
476 | (defn- sync-handler
477 | "A handler that runs the squashed pipeline."
478 | [{:keys [::stage processor] :as context} options]
479 | (debugf "Running sync-handler: %s %s" stage processor)
480 | (let [f (ns-resolve *ns* stage)
481 | results (f context options)
482 | maybe-advance-pipeline (if (= stage `sync-download-handler)
483 | (fn [pipeline context] context)
484 | advance-pipeline)]
485 | (map (partial maybe-advance-pipeline (:pipeline options)) results)))
486 |
487 | (defn initialize-seed
488 | "Ensures the seed is a seq and sets up internal keys."
489 | [{:keys [download-mode pipeline] :as options} seed]
490 | (let [seed (ensure-distinct-seq seed)]
491 | (mapv #(->> %
492 | (merge {::options options})
493 | (advance-pipeline pipeline))
494 | seed)))
495 |
496 | (def default-options
497 | "Default scraping options."
498 | {:max-connections 10,
499 | :retries 5,
500 | :conn-mgr-options {},
501 | :parse-fn parse-enlive,
502 | :download-mode :async,
503 | :download-error-handler default-download-error-handler,
504 | :http-options {:redirect-strategy :lax,
505 | :as :byte-array,
506 | :socket-timeout 30000,
507 | :connection-timeout 30000}})
508 |
509 | (defn initialize-options
510 | "Initializes scraping options, ensuring that the caches are
511 | instances of [[CacheBackend]], and a db is present if `:db-file`
512 | was supplied."
513 | [options]
514 | (let [options (merge default-options options)
515 | html-cache (sanitize-cache (:html-cache options) html-cache-dir)
516 | processed-cache (sanitize-cache (:processed-cache options) processed-cache-dir)]
517 | (merge options
518 | (sqlite/initialize-db-options options)
519 | {:pipeline (make-pipeline options)
520 | :enhance? ::new-items
521 | :html-cache html-cache
522 | :processed-cache processed-cache
523 | :on-end #(try
524 | (.close html-cache)
525 | (finally
526 | (.close processed-cache)))
527 | :connection-manager (case (:download-mode options)
528 | :sync (http-conn/make-reusable-conn-manager (:conn-mgr-options options))
529 | :async (http-conn/make-reuseable-async-conn-manager (:conn-mgr-options options)))
530 | :download-semaphore (java.util.concurrent.Semaphore. (:max-connections options))})))
531 |
532 | (defn scrape
533 | "Runs scraping on seed (an initial context or sequence of contexts), returning
534 | a lazy sequence of leaf contexts.
535 |
536 | `options` may include the ones supported by [[skyscraper.traverse/launch]],
537 | as well as:
538 |
539 | - `:conn-mgr-options` – Skyscraper will create a clj-http connection manager
540 | with these options (a sync or async one, depending on `:download-mode`)
541 | and use it across all HTTP requests it makes.
542 | See [[clj-http.conn-mgr/make-reusable-conn-manager]] and
543 | [[clj-http.conn-mgr/make-reusable-async-conn-manager]] for details on the
544 | options you can specify here.
545 | - `:db` – a clojure.java.jdbc compatible db-spec that, when passed, will
546 | cause scraping to generate a SQL database of results. See
547 |   `doc/07-db.md` for a walkthrough. Only supports SQLite.
548 | - `:db-file` – an alternative to `:db`, a filename or path that will
549 | be used to construct a SQLite db-spec.
550 | - `:ignore-db-keys` – if true, Skyscraper will insert (instead of upserting)
551 | rows into the DB specified by `:db` or `:db-file`, as if none of the
552 | processors specified `:skyscraper.db/key-columns`. Defaults to true
553 | if the DB didn't exist.
554 | - `:download-error-handler` – a function called when clj-http returns an
555 |   error when downloading; see `doc/06-error-handling.md` for details.
556 | - `:download-mode` – can be `:async` (default) or `:sync`. When async,
557 | Skyscraper will use clj-http's asynchronous mode to make HTTP requests.
558 | - `:html-cache` – the HTTP cache to use. Can be an instance of `CacheBackend`,
559 | a string (meaning a directory to use for a filesystem cache), `nil` or `false`
560 | (meaning no cache), or `true` (meaning a filesystem cache in the default
561 | location, [[html-cache-dir]]). Defaults to `nil`.
562 | - `:http-options` – a map of additional options that will be passed to
563 | [[clj-http.core/request]].
564 | - `:max-connections` – maximum number of HTTP requests that can be active
565 | at any time.
566 | - `:only` – prunes the scrape tree to only include matching contexts; this can be
567 | a map (specifying to only include records whose values, if present, coincide with
568 | the map) or a predicate (meaning to filter contexts on it).
569 | - `:parse-fn` – a function that takes 3 arguments: a map of HTTP headers,
570 | a byte array containing the downloaded document, and the context,
571 | and returns a parsed representation of that document. Skyscraper provides
572 | [[parse-string]], [[parse-enlive]], and [[parse-reaver]] out of the box.
573 | Defaults to [[parse-enlive]].
574 | - `:processed-cache` – the processed cache to use. Same possible values as
575 |   for `:html-cache`. Defaults to `nil`.
576 | - `:request-fn` – the HTTP request function to use. Defaults to [[clj-http.core/request]].
577 | Skyscraper relies on the API of clj-http, so only override this if you
578 | know what you're doing.
579 | - `:retries` – maximum number of times that Skyscraper will retry downloading
580 | a page until it gives up. Defaults to 5.
581 | - `:sleep` – sleep this many milliseconds before each request, or a niladic fn
582 | that returns a number of milliseconds. Useful for throttling. It's probably
583 | best to set `:parallelism` to 1 together with this.
584 | - `:uncached-only` – prune the scrape tree, yielding only the nodes that haven't been
585 |   scraped yet. See `doc/03-updates.md`.
586 | - `:use-http-headers-from-content` – if true (the default for [[parse-enlive]] and
587 |   [[parse-reaver]]), obeys the charset declaration from `<meta http-equiv>` or
588 |   `<meta charset>` tags. Slows down scraping because it invokes an extra HTML
589 | parse (with Crouton, as per clj-http) just to detect the charset.
590 | - `:update` – run in update mode (see `doc/03-updates.md`)."
591 | [seed & {:as options}]
592 | (let [options (initialize-options options)
593 | seed (initialize-seed options seed)]
594 | (traverse/leaf-seq seed options)))
595 |
596 | (defn scrape!
597 | "Like scrape, but eager: terminates after scraping has succeeded. Returns nil.
598 | Pass `:db`, `:db-file`, `:leaf-chan`, or `:item-chan` to access scraped data.
599 |
600 |   `options` are the same as in [[scrape]]."
601 | [seed & {:as options}]
602 | (let [options (initialize-options options)
603 | seed (initialize-seed options seed)]
604 | (traverse/traverse! seed options)))
605 |
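606 | ;; Illustrative sketch (not part of the original file): a minimal two-level
607 | ;; scraper wired together with the options documented above. The URL and
608 | ;; selectors are invented for the example.
609 | (comment
610 |   (defprocessor ::index
611 |     :cache-template "demo/index"
612 |     :process-fn (fn [res ctx]
613 |                   (for [a (enlive/select res [:a])]
614 |                     {:title (enlive/text a)
615 |                      :url (get-in a [:attrs :href])
616 |                      :processor ::page})))
617 |   (defprocessor ::page
618 |     :cache-template "demo/page/:title"
619 |     :process-fn (fn [res ctx]
620 |                   [{:heading (enlive/text (first (enlive/select res [:h1])))}]))
621 |   (doall (scrape [{:url "https://example.com/", :processor ::index}]
622 |                  :html-cache true
623 |                  :sleep 1000
624 |                  :parallelism 1)))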
--------------------------------------------------------------------------------
/src/skyscraper/data.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.data
2 | "Internal namespace for helper functions that deal with data.")
3 |
4 | (defn separate
5 | "Splits s into elements that satisfy f and ones that don't."
6 | [f s]
7 | [(filter f s) (filter (complement f) s)])
8 |
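9 | ;; Illustrative example:
10 | ;; (separate even? [1 2 3 4 5]) ;=> [(2 4) (1 3 5)]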
--------------------------------------------------------------------------------
/src/skyscraper/db.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.db
2 | (:require
3 | [clojure.core.async :as async]
4 | [clojure.core.strint :refer [<<]]
5 | [clojure.java.io :as io]
6 | [clojure.java.jdbc :as jdbc]
7 | [clojure.set :as set]
8 | [clojure.string :as string]
9 | [skyscraper.context :as context]
10 | [skyscraper.data :refer [separate]]
11 | [skyscraper.traverse :as traverse]
12 | [taoensso.timbre :refer [debugf warnf]]))
13 |
14 | (defn- keyword->db-name
15 | "Converts a keyword (naming a DB table or column) to a string
16 | suitable for use in SQL queries."
17 | [k]
18 | (string/replace (name k) "-" "_"))
19 |
20 | (defn- db-name->keyword
21 | "The inverse of keyword->db-name."
22 | [str]
23 | (keyword (string/replace str "_" "-")))
24 |
25 | (defn- create-index-ddl
26 | "Returns SQL to create an index on a given table and columns. Note that
27 | Skyscraper only creates one index per table, so it's sufficient to just
28 | name it after the table."
29 | [table-name key-column-names]
30 | (let [index-name (str "idx_" table-name)]
31 | (str "CREATE UNIQUE INDEX " index-name " ON " table-name " (" (string/join ", " key-column-names) ")")))
32 |
33 | (defn- create-index
34 | "Creates a unique index for table-name on key-column-names."
35 | [db table-name key-column-names]
36 | (when (seq key-column-names)
37 | (jdbc/execute! db
38 | (create-index-ddl table-name key-column-names))))
39 |
40 | (defn- create-table
41 | "Creates a table in db containing the given column-names. If key-column-names
42 | is non-empty, also creates a unique index on those columns."
43 | [db table-name column-names key-column-names]
44 | (jdbc/execute! db
45 | (jdbc/create-table-ddl
46 | table-name
47 | (into [["id" :integer "primary key"]
48 | ["parent" :integer]]
49 | (for [col column-names :when (not= col "parent")]
50 | [col :text]))))
51 | (create-index db table-name key-column-names))
52 |
53 | (defn- query-context-ids
54 | "Selects the rows corresponding to the upserted contexts, to retrieve
55 | their database-assigned IDs."
56 | [db table-name key-columns key-column-names ctxs]
57 | (let [key-part (string/join ", " key-column-names)
58 | values-1 (str "(" (string/join ", " (repeat (count key-column-names) "?")) ")")
59 | values (string/join ", " (repeat (count ctxs) values-1))
60 | null-clause (string/join " or " (map #(str % " is null") key-column-names)) ;; XXX: this might return too broad a result set
61 | query (<< "select * from ~{table-name} where (~{key-part}) in (values~{values}) or ~{null-clause}") ;; XXX: only select id + key columns, not *
62 | params (mapcat (apply juxt key-columns) ctxs)]
63 | (jdbc/query db (into [query] params)
64 | {:identifiers db-name->keyword})))
65 |
66 | (defn- upsert-multi-row-sql
67 | "Returns SQL for upsert-multi!"
68 | [table-name column-names key-column-names values]
69 | (let [nc (count column-names)
70 | vcs (map count values)
71 | non-key-column-names (vec (set/difference (set column-names) (set key-column-names)))
72 | comma-join (partial string/join ", ")
73 | qmarks (repeat (first vcs) "?")]
74 | (if (not (and (or (zero? nc) (= nc (first vcs))) (apply = vcs)))
75 | (throw (IllegalArgumentException. "insert! called with inconsistent number of columns / values"))
76 | (into [(str (<< "INSERT INTO ~{table-name} (~(comma-join column-names)) VALUES (~(comma-join qmarks))")
77 | (when (seq key-column-names)
78 | (let [set-clause (string/join ", " (map #(str % " = excluded." %) non-key-column-names))
79 | do-clause (if (empty? non-key-column-names)
80 | "NOTHING"
81 | (str "UPDATE SET " set-clause))]
82 | (<< " ON CONFLICT (~(comma-join key-column-names)) DO ~{do-clause}"))))]
83 | values))))
84 |
85 | (defn- upsert-multi!
86 | "Like clojure.java.jdbc/insert-multi!, but updates the existing rows
87 | where key-column-names match supplied ones. Requires rows to be a
88 | sequence of vectors. Not wrapped in a transaction.
89 | Equivalent to insert-multi! if key-column-names is empty.
90 | Note: This is currently implemented as an INSERT ... ON CONFLICT DO
91 | UPDATE, which requires a DBMS able to support this syntax (SQLite
92 | 3.24+ or PostgreSQL 9.5+)."
93 | [db table-name column-names key-column-names rows]
94 | (jdbc/db-do-prepared db false
95 | (upsert-multi-row-sql table-name column-names key-column-names rows)
96 | {:multi? true}))
97 |
98 | (defn- upsert-multi-ensure-table!
99 | "Tries an upsert-multi!, and if it fails due to a missing table,
100 | creates it and tries again."
101 | [db table-name column-names key-column-names rows]
102 | (try
103 | (upsert-multi! db table-name column-names key-column-names rows)
104 | (catch org.sqlite.SQLiteException e
105 | (condp #(string/includes? %2 %1) (.getMessage e)
106 | "no such table"
107 | #_=> (do
108 | (create-table db table-name column-names key-column-names)
109 | (upsert-multi! db table-name column-names key-column-names rows))
110 | "ON CONFLICT clause does not match any PRIMARY KEY or UNIQUE constraint"
111 | #_=> (do
112 | (create-index db table-name key-column-names)
113 | (upsert-multi! db table-name column-names key-column-names rows))
114 | (throw e)))))
115 |
116 | (defn- ensure-types-single
117 | "Returns context, emitting warnings if the fields named by columns
118 | don't exist or are not of expected type (int for :parent, nilable string
119 | otherwise)."
120 | [columns context]
121 | (doseq [[k v] context
122 | :when (contains? columns k)
123 | :let [check (if (= k :parent) int? #(or (nil? %) (string? %)))]
124 | :when (not (check v))]
125 | (warnf "Wrong type for key %s, value %s" k v))
126 | (doseq [column columns
127 | :when (and (not= column :parent)
128 | (not (contains? context column)))]
129 | (warnf "Context contains no value for key %s: %s" column (context/describe context)))
130 | (merge (zipmap columns (repeat nil))
131 | context))
132 |
133 | (defn- ensure-types
134 | "Ensures types of all contexts as per ensure-types-single."
135 | [columns ctxs]
136 | (mapv (partial ensure-types-single columns) ctxs))
137 |
138 | (defn- extract-ids
139 | "Given a sequence of ctxs that are assumed to exist in the given db table,
140 | queries the DB for them and assocs each one's id as :parent."
141 | ;; Remember that this runs after the processor's :process-fn, so
142 | ;; calling it :parent ensures that the child processors will encounter
143 | ;; this in the expected place.
144 | [db table-name key-columns key-column-names ctxs]
145 | (let [inserted-rows (query-context-ids db table-name key-columns key-column-names ctxs)
146 | inserted-row-ids (into {}
147 | (map (fn [r] [(select-keys r key-columns) (:id r)]))
148 | inserted-rows)]
149 | (map (fn [ctx]
150 | (assoc ctx :parent (get inserted-row-ids (select-keys ctx key-columns))))
151 | ctxs)))
152 |
153 | (defn- extract-ids-from-last-rowid
154 | "Given a sequence of ctxs that have just been successfully inserted,
155 | assocs each one's id in the DB as :parent based on last_insert_rowid()
156 | (SQLite-specific)."
157 | [db ctxs]
158 | (let [rowid (-> (jdbc/query db "select last_insert_rowid() rowid") first :rowid)]
159 | (map #(assoc %1 :parent %2) ctxs (range (inc (- rowid (count ctxs))) (inc rowid)))))
160 |
161 | (defn upsert-contexts
162 | "Inserts new contexts into a given db table, returning them augmented
163 | with the `:parent` fields corresponding to the DB-generated primary
164 | keys. If `key-columns` (a vector of column names) is provided,
165 | does an upsert rather than an insert, checking for conflicts on
166 | those columns and updating db accordingly."
167 | [db table key-columns columns ctxs]
168 | (debugf "Upserting %s rows" (count ctxs))
169 | (when (seq ctxs)
170 | (let [ctxs (ensure-types (set columns) ctxs)
171 | table-name (keyword->db-name table)
172 | column-names (mapv keyword->db-name columns)
173 | key-column-names (mapv keyword->db-name key-columns)
174 | rows (map (apply juxt columns) ctxs)]
175 | (upsert-multi-ensure-table! db table-name column-names key-column-names rows)
176 | (if (seq key-column-names)
177 | (extract-ids db table-name key-columns key-column-names ctxs)
178 | (extract-ids-from-last-rowid db ctxs)))))
179 |
180 | (defn maybe-store-in-db
181 | "Wraps upsert-context, skipping contexts that contain ::skip."
182 | [db {:keys [name ::columns ::key-columns] :as processor} ignore-db-keys contexts]
183 | (if (and db columns)
184 | (let [columns (distinct (conj columns :parent))
185 | [skipped inserted] (separate ::skip contexts)
186 | new-items (upsert-contexts db name (when-not ignore-db-keys key-columns) columns inserted)]
187 | (into (vec skipped) new-items))
188 | contexts))
189 |
190 | (defn enhancer
191 | "An enhancer that upserts supplied batches of contexts into
192 | the database."
193 | [{:keys [db ignore-db-keys]} channels]
194 | (jdbc/with-db-transaction [db db]
195 | (traverse/enhancer-loop
196 | channels
197 | (fn [item]
198 | (let [new-items (:skyscraper.core/new-items item)
199 | updated (maybe-store-in-db db (:skyscraper.core/current-processor item) ignore-db-keys new-items)]
200 | (assoc item :skyscraper.core/new-items updated))))))
201 |
202 | (defn initialize-db-options
203 | "Sets up DB-related options: handles :db-file and :enhancer, autodetects :ignore-db-keys."
204 | [{:keys [db db-file ignore-db-keys]}]
205 | (let [db (or db
206 | (when db-file
207 | {:classname "org.sqlite.JDBC"
208 | :subprotocol "sqlite"
209 | :subname db-file}))
210 | file (cond (nil? db) nil
211 | (map? db) (io/file (:subname db))
212 | (string? db) (io/file (:subname (#'jdbc/parse-properties-uri (java.net.URI. (#'jdbc/strip-jdbc db))))) ; yuck! accessing innards of clojure.java.jdbc
213 | :otherwise (throw (Exception. ":db needs to be a map or a string")))]
214 | {:db db
215 | :enhancer (when db enhancer)
216 | :ignore-db-keys (or ignore-db-keys
217 | (when file
218 | (or (not (.exists file))
219 | (zero? (.length file)))))}))
220 |
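221 | ;; Illustrative sketch (not part of the original file): how a processor opts
222 | ;; into DB output. With ::columns set, each scraped context becomes a row in a
223 | ;; table named after the processor; ::key-columns turns inserts into upserts.
224 | (comment
225 |   (skyscraper.core/defprocessor :books
226 |     :process-fn (fn [res ctx] [{:title "T", :author "A"}])
227 |     :skyscraper.db/columns [:title :author]
228 |     :skyscraper.db/key-columns [:title])
229 |   (skyscraper.core/scrape! [{:url "https://example.com/", :processor :books}]
230 |                            :db-file "/tmp/books.sqlite"))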
--------------------------------------------------------------------------------
/src/skyscraper/dev.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.dev
2 | "Tools for interactive development of scrapers. See [doc/development-mode.md]
3 | for an overview and example."
4 | (:require
5 | [clojure.core.async :refer [chan alts!!]]
6 | [clojure.java.browse :refer [browse-url]]
7 | [clojure.java.io :as io]
8 | [skyscraper.core :as core]
9 | [skyscraper.traverse :as traverse]
10 | [taoensso.timbre :as log]))
11 |
12 | (defn- browse-context
13 | "Dumps the given context's response to a temporary file
14 | and opens a browser on it."
15 | [ctx]
16 | (let [f (java.io.File/createTempFile "skyscraper-" ".html")]
17 | (with-open [is (io/input-stream (get-in ctx [::core/response :body]))
18 | os (io/output-stream f)]
19 | (io/copy is os))
20 | (browse-url f)))
21 |
22 | (def ^:private scrape-data (atom nil))
23 |
24 | (defn cleanup
25 | "Runs a previously terminated [[scrape]] to completion."
26 | []
27 | (when-let [{{:keys [item-chan terminate-chan]} :channels, processors :processors} @scrape-data]
28 | (log/infof "Resuming suspended scrape to clean up")
29 | (core/with-processor-definitions processors
30 | (loop []
31 | (let [alts-res (alts!! [item-chan terminate-chan])
32 | [val port] alts-res]
33 | (if (= port terminate-chan)
34 | (reset! scrape-data nil)
35 | (recur)))))))
36 |
37 | (defn scrape
38 | "A variant of [[skyscraper.core/scrape!]] that will stop and open a
39 | browser on the first encountered processor that isn't defined or doesn't
40 | have a process-fn."
41 | [seed & {:as options}]
42 | (cleanup)
43 | (let [item-chan (chan)
44 | options (core/initialize-options (assoc options :item-chan item-chan :parallelism 1))
45 | seed (core/initialize-seed options seed)
46 | {:keys [terminate-chan] :as channels} (traverse/launch seed options)]
47 | (loop []
48 | (let [alts-res (alts!! [item-chan terminate-chan])
49 | [val port] alts-res]
50 | (if (= port terminate-chan)
51 | nil
52 | (if-let [{:keys [::core/resource ::core/context]} (first (filter #(::core/unimplemented %) val))]
53 | (do (reset! scrape-data {:resource resource, :context context, :channels channels, :processors @core/processors})
54 | (browse-context context)
55 | (log/infof "Scraping suspended in processor %s" (:processor context))
56 | nil)
57 | (recur)))))))
58 |
59 | (defn document
60 | "Returns the parsed document that the last invocation of [[scrape]]
61 | has stopped on."
62 | []
63 | (:resource @scrape-data))
64 |
65 | (defn run-last-processor
66 | "Calls the processor whose invocation caused [[scrape]] to stop."
67 | []
68 | (if-let [{:keys [resource context]} @scrape-data]
69 | (core/run-processor (:processor context) resource context)
70 | (throw (ex-info "No interactive scraping in progress" {}))))
71 |
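72 | ;; Illustrative REPL session (not part of the original file; URL invented):
73 | (comment
74 |   (scrape [{:url "https://example.com/", :processor ::index}])
75 |   ;; ...browser opens on the page the undefined ::index would process...
76 |   (document)           ; inspect the parsed resource at the stopping point
77 |   (run-last-processor) ; try out a freshly (re)defined :process-fn
78 |   (cleanup))           ; run the suspended scrape to completion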
--------------------------------------------------------------------------------
/src/skyscraper/enlive_helpers.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.enlive-helpers
2 | "Utility functions for use in Enlive-based scrapers."
3 | (:require
4 | [net.cgrand.enlive-html :as enlive]))
5 |
6 | (defn href
7 | "Returns the href of an `` node, potentially wrapped in
8 | another node."
9 | [x]
10 | (cond
11 | (nil? x) nil
12 | (and (map? x) (= :a (:tag x))) (-> x :attrs :href)
13 | :otherwise (href (first (enlive/select x [:a])))))
14 |
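15 | ;; Illustrative examples:
16 | ;; (href {:tag :a, :attrs {:href "/x"}})                        ;=> "/x"
17 | ;; (href {:tag :li, :content [{:tag :a, :attrs {:href "/y"}}]}) ;=> "/y"
18 | ;; (href nil)                                                   ;=> nil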
--------------------------------------------------------------------------------
/src/skyscraper/traverse.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.traverse
2 | "Parallelized context tree traversal.
3 |
4 | First, some definitions (sketchy – some details omitted):
5 |
6 | 1. A _handler_ is a function taking a map and returning a seq of
7 | maps (or a symbol naming such a function).
8 | 2. A _context_ is a map that may contain a special key,
9 | `::handler`, describing a handler that you may run on it.
10 |
11 | Now imagine that we have a root context. We can run its `::handler`
12 | on it, obtaining a series of child contexts. If these contexts in
13 | turn contain their own `::handler`s, we can invoke each on its
14 | associated context, obtaining another series of grandchild contexts.
15 | Repeatedly applying this process gives rise to a tree, called a
16 | _context tree_.
17 |
18 | We call that tree _implicit_ because it is never reified as a whole
19 | in the process; rather, its nodes are computed individually.
20 |
21 | This ns implements context tree traversal parallelized using core.async,
22 | with the following provisos:
23 |
24 | - A handler can be either synchronous (in which case it's a function
25 | taking context and returning seq of contexts) or asynchronous (in
26 | which case it takes a seq of contexts and a callback, should return
27 | immediately, and should arrange for that callback to be called with
28 | a list of return contexts when it's ready). Whether a handler is
29 | synchronous or asynchronous depends on a context's `::call-protocol`.
30 |
31 | - It supports context priorities, letting you control the order in which
32 | the context tree nodes will be visited. These are specified by the
33 |   `::priority` context key: the lower the number, the higher the priority."
34 | (:require
35 | [clojure.core.async :as async
36 |     :refer [<! <!! >! >!! alts! alts!!
37 | chan close! go go-loop put! thread]]
38 | [clojure.data.priority-map :refer [priority-map]]
39 | [clojure.java.io :as io]
40 | [skyscraper.data :refer [separate]]
41 | [taoensso.timbre :as timbre :refer [debugf infof warnf errorf]]))
42 |
43 | (defn- priority [ctx]
44 | (::priority ctx 0))
45 |
46 | (defn- add-to-todo [todo new-items]
47 | (if (map? todo)
48 | (into todo (map #(vector % (priority %)) new-items))
49 | (into todo new-items)))
50 |
51 | (defn- initial-state [prioritize? items]
52 | {:todo (add-to-todo (if prioritize?
53 | (priority-map)
54 | (list))
55 | items)
56 | :doing #{}})
57 |
58 | (defn- gimme [{:keys [todo doing] :as s}]
59 | (if-let [popped (first todo)]
60 | (let [popped (if (map? todo) (key popped) popped)]
61 | [popped
62 | {:todo (pop todo)
63 | :doing (conj doing popped)}])
64 | [nil s]))
65 |
66 | (defn- pop-n [n todo]
67 | (let [pop' #(if (empty? %) % (pop %))]
68 | (if-not (map? todo)
69 | [(take n todo) (nth (iterate pop' todo) n)]
70 | [(map key (take n todo)) (nth (iterate pop' todo) n)])))
71 |
72 | (defn- done [{:keys [todo doing] :as s}
73 | {:keys [done new-items] :as t}
74 | want]
75 | (cond
76 | (not (contains? doing done)) {:unexpected done, :state s}
77 |
78 | (::error (first new-items)) {:want want, :error (first new-items)}
79 |
80 | :otherwise
81 | (let [all-todo (add-to-todo todo new-items)
82 | [giveaway new-todo] (pop-n want all-todo)
83 | doing (-> doing (disj done) (into giveaway))]
84 | {:want (- want (count giveaway))
85 | :giveaway giveaway
86 | :terminate (and (empty? doing) (empty? new-todo))
87 | :state {:todo new-todo, :doing doing}})))
88 |
89 | (defn- atomic-spit [path data]
90 | (let [temp (java.io.File/createTempFile "spit" nil)]
91 | (spit temp data)
92 | (.renameTo temp (io/file path))))
93 |
94 | (defn- read-resume-file [filename]
95 | (when (and filename (.exists (io/file filename)))
96 | (let [{:keys [todo doing]} (read-string (slurp filename))]
97 | {:todo (-> (list)
98 | (into doing)
99 | (into todo))
100 | :doing #{}})))
101 |
102 | (defn- governor [{:keys [prioritize? parallelism resume-file]} seed {:keys [control-chan data-chan terminate-chan]}]
103 | (go-loop [state (or (read-resume-file resume-file)
104 | (initial-state prioritize? seed))
105 | want 0
106 | terminating nil]
107 | (when resume-file
108 | (atomic-spit resume-file state))
109 | (debugf "[governor] Waiting for message")
110 |     (let [message (<! control-chan)]
111 |       (debugf "[governor] Got message: %s" message)
112 |       (cond
113 |         terminating (do
114 |                       (>! data-chan {::terminate true})
115 | (if (= terminating 1)
116 | (do
117 | (when resume-file
118 | (.delete (io/file resume-file)))
119 | (close! terminate-chan))
120 | (recur state want (dec terminating))))
121 | (= message :gimme) (let [[res state] (gimme state)]
122 | (debugf "[governor] Giving")
123 | (if res
124 | (do
125 | (>! data-chan res)
126 | (recur state want nil))
127 | (recur state (inc want) nil)))
128 | :otherwise (let [{:keys [unexpected want giveaway terminate state error]}
129 | (done state message want)]
130 | (cond
131 | unexpected (do
132 | (errorf "[governor] Unexpected message: %s" message)
133 | (recur state want nil))
134 | terminate (do
135 | (debugf "[governor] Entering termination mode")
136 | (dotimes [i want]
137 | (>! data-chan {::terminate true}))
138 | (recur state want (- parallelism want)))
139 | error (do
140 | (debugf "[governor] Error encountered, entering abnormal termination")
141 | (loop [cnt (- parallelism want)]
142 | (when (pos? cnt)
143 |                                 (let [msg (<! control-chan)]
144 |                                   (recur (if (= msg :gimme)
145 |                                            (dec cnt)
146 |                                            cnt)))))
147 |                             (dotimes [i parallelism]
148 |                               (>! data-chan {::terminate true}))
149 | (>! terminate-chan error)
150 | (close! terminate-chan))
151 | :else (do
152 | (debugf "[governor] Giving away: %d" (count giveaway))
153 | (doseq [item giveaway]
154 | (>! data-chan item))
155 | (recur state want nil))))))))
156 |
157 | (defn- processed [context results]
158 | {:done context, :new-items results})
159 |
160 | (defn- propagate-new-contexts [{:keys [item-chan leaf-chan control-chan]} enhance i context new-contexts]
161 | (let [enhanced (mapv enhance new-contexts)
162 | [non-leaves leaves] (separate ::handler enhanced)
163 | err (first (filter ::error enhanced))]
164 | (debugf "[worker %d] %d leaves, %d inner nodes produced" i (count leaves) (count non-leaves))
165 | (when (and item-chan (seq new-contexts))
166 | (>!! item-chan new-contexts))
167 | (when (and leaf-chan (seq leaves))
168 | (>!! leaf-chan leaves))
169 | (when err
170 | (timbre/error err (format "[worker %d] Handler threw an error" i)))
171 | (>!! control-chan (processed context (if err
172 | [err]
173 | non-leaves)))))
174 |
175 | (defn- wrapped-error [context error]
176 | {::context context, ::error error})
177 |
178 | (defmacro capture-errors [context & body]
179 | `(let [context# ~context]
180 | (try
181 | ~@body
182 | (catch Exception e#
183 | [(wrapped-error context# e#)]))))
184 |
185 | (defn enhancer-loop [{:keys [enhancer-input-chan enhancer-output-chan]} f]
186 | (loop []
187 |     (when-let [item (async/<!! enhancer-input-chan)]
188 |       (let [new-item (try
189 |                        (f item)
190 |                        (catch Exception e
191 |                          (wrapped-error item e)))]
192 |         (async/>!! enhancer-output-chan new-item)
193 | (recur))))) ; XXX: do we want to recur even if an error had occurred?
194 |
195 | (defn- worker [{:keys [enhance?] :as options} i {:keys [control-chan data-chan enhancer-input-chan enhancer-output-chan] :as channels}]
196 | (let [options (assoc options ::worker i)
197 | enhance (fn [x]
198 | (if (and enhancer-input-chan enhance? (enhance? x))
199 | (do
200 | (>!! enhancer-input-chan x)
201 |                       (<!! enhancer-output-chan))
202 |                     x))]
203 |     (thread
204 |       (loop []
205 |         ;; ask the governor for more work
206 |         (>!! control-chan :gimme)
207 | (debugf "[worker %d] Waiting for reply" i)
208 |   (let [{:keys [::terminate ::handler ::call-protocol] :as context} (<!! data-chan)]
243 |   See `chan->seq` for an example of how to
244 | put it together."
245 | [seed options]
246 | (let [{:keys [parallelism leaf-chan item-chan enhancer on-end] :as options} (merge default-options options)
247 | channels (merge {:control-chan (chan)
248 | :data-chan (chan)
249 | :terminate-chan (chan)
250 | :leaf-chan leaf-chan
251 | :item-chan item-chan
252 | :on-end on-end}
253 | (when enhancer
254 | {:enhancer-input-chan (chan)
255 | :enhancer-output-chan (chan)}))]
256 | (governor options seed channels)
257 | (dotimes [i parallelism]
258 | (worker options i channels))
259 | (cond-> channels
260 | enhancer (assoc :enhancer-terminate-chan
261 | (thread
262 | (enhancer options channels)
263 | nil)))))
264 |
265 | (defn- throw-handler-error!
266 | "Throws an ExceptionInfo about a handler throwing an error."
267 | [error]
268 | (throw (ex-info "Handler threw an error"
269 | (::context error)
270 | (::error error))))
271 |
272 | (defn close-all!
273 | "Closes channels used by the traversal process."
274 | [channels]
275 | (doseq [[k ch] channels :when (and ch (not (#{:enhancer-terminate-chan :on-end} k)))]
276 | (close! ch))
277 | (when-let [ch (:enhancer-terminate-chan channels)]
278 |     (<!! ch))
279 |   (when-let [on-end (:on-end channels)]
280 |     (on-end)))
298 | (defn- chan->seq [ch {:keys [terminate-chan] :as channels}]
299 | (lazy-seq
300 | (let [[items out-ch] (alts!! [ch terminate-chan])]
301 | (cond
302 | (and items (= out-ch terminate-chan))
303 | #_=> (do
304 | (close-all! channels)
305 | (throw-handler-error! items))
306 | items
307 | #_=> (concat items (chan->seq ch channels))
308 | :otherwise
309 | #_=> (close-all! channels)))))
310 |
311 | (defn leaf-seq
312 | "Returns a lazy seq of leaf nodes from a tree traversal. Any channels
313 | created will be automatically closed when the seq is fully consumed."
314 | [seed options]
315 | (let [leaf-chan (chan)
316 | options (assoc options :leaf-chan leaf-chan)
317 | channels (launch seed options)]
318 | (chan->seq leaf-chan channels)))
319 |
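320 | ;; Illustrative sketch (not part of the original file): traversing a tiny
321 | ;; implicit tree. The handler takes [context options] here, mirroring how
322 | ;; skyscraper.core's handlers are written; this is a sketch, not a spec of
323 | ;; the calling convention.
324 | (comment
325 |   (defn countdown [{:keys [n]} _options]
326 |     (if (zero? n)
327 |       [{:value :done}]
328 |       [{:n (dec n), ::handler `countdown, ::call-protocol :sync}]))
329 |   (doall (leaf-seq [{:n 3, ::handler `countdown, ::call-protocol :sync}]
330 |                    {:parallelism 2})))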
--------------------------------------------------------------------------------
/test/skyscraper/basic_cookie_test.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.basic-cookie-test
2 | (:require
3 | [clojure.test :as test :refer [deftest is]]
4 | [hiccup.page :refer [html5]]
5 | [net.cgrand.enlive-html :refer [select text]]
6 | [ring.middleware.cookies :refer [wrap-cookies]]
7 | [skyscraper.cache :as cache]
8 | [skyscraper.core :refer [defprocessor scrape scrape!]]
9 | [skyscraper.enlive-helpers :refer [href]]
10 | [skyscraper.test-utils :refer [make-seed resp-page with-server]]
11 | [taoensso.timbre :as timbre]))
12 |
13 | (defn handler [{:keys [cookies uri]}]
14 | (condp = uri
15 | "/" {:status 200,
16 | :headers {"Set-Cookie" "secret=donttellanyone"},
17 | :body (html5 [:a {:href "/secret"} "Got a cookie?"])}
18 | "/secret" (resp-page
19 | [:p (if (= (get-in cookies ["secret" :value]) "donttellanyone")
20 | "You got it!"
21 | "You ain't got it")])
22 | {:status 404}))
23 |
24 | (defprocessor ::root
25 | :process-fn (fn [res ctx]
26 | (for [link (select res [:a])]
27 | {:link-text (text link), :url (href link), :processor ::secret})))
28 |
29 | (defprocessor ::secret
30 | :process-fn (fn [res ctx]
31 | (for [item (select res [:p])]
32 | {:target (text item)})))
33 |
34 | (deftest basic-cookie-test
35 | (timbre/set-level! :warn)
36 | (let [seed (make-seed ::root)]
37 | (with-server (wrap-cookies handler)
38 | (is (= (scrape seed)
39 | [{:link-text "Got a cookie?", :target "You got it!"}])))))
40 |
--------------------------------------------------------------------------------
/test/skyscraper/cache_test.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.cache-test
2 | (:require [cartestian.core]
3 | [cartestian.test :refer [with-combinations]]
4 | [clojure.java.io :as io]
5 | [clojure.test :refer :all]
6 | [net.cgrand.enlive-html :refer [select text]]
7 | [skyscraper.cache :as cache]
8 | [skyscraper.core :refer :all]
9 | [skyscraper.enlive-helpers :refer [href]]
10 | [skyscraper.test-utils :refer :all]
11 | [taoensso.timbre :as timbre]))
12 |
13 | (defn close-aware-cache
14 | [closed?]
15 | (let [ensure-not-closed! #(when @closed?
16 | (throw (Exception. "cache already closed")))]
17 | (reify
18 | cache/CacheBackend
19 | (save-blob [cache key blob metadata]
20 | (ensure-not-closed!)
21 | nil)
22 | (load-blob [cache key]
23 | (ensure-not-closed!)
24 | nil)
25 | java.io.Closeable
26 | (close [cache]
27 | (ensure-not-closed!)
28 | (reset! closed? true)))))
29 |
30 | (defn handler [{:keys [uri]}]
31 | (condp = uri
32 | "/" (resp-page [:h1 "Hello world"])
33 | "/parent" (resp-page [:a {:href "child"} "Next"])
34 | "/child" (resp-page [:h1 "Text"])))
35 |
36 | (defprocessor ::start
37 | :cache-template "index"
38 | :process-fn (fn [res ctx]
39 | (for [x (select res [:h1])]
40 | {:text (text x)})))
41 |
42 | (defprocessor ::parent
43 | ;; no cache-template
44 | :process-fn (fn [res ctx]
45 | (for [x (select res [:a])]
46 | {:url (href x), :processor ::child})))
47 |
48 | (defprocessor ::child
49 | :cache-template "child"
50 | :process-fn (fn [res ctx]
51 | (for [x (select res [:h1])]
52 | {:text (text x)})))
53 |
54 | (defprocessor ::kaboom
55 | :cache-template "kaboom"
56 | :process-fn (fn [res ctx]
57 | (throw (Exception.))))
58 |
59 | (deftest test-closing-cache
60 | (with-combinations [v {:interface [:lazy :imperative]
61 | :succeeding [true false]}]
62 | (let [closed? (atom false)
63 | cache (close-aware-cache closed?)
64 | seed (make-seed (if (:succeeding v) ::start ::kaboom))
65 | run! (case (:interface v)
66 | :lazy #(dorun (scrape seed :html-cache cache))
67 | :imperative #(scrape! seed :html-cache cache))]
68 | (with-server handler
69 | (testing "scraping should close the cache"
70 | (if (:succeeding v)
71 | (is (nil? (run!)))
72 | (is (thrown? Exception (run!))))
73 | (is @closed?))
74 | (testing "subsequent scraping should throw an exception"
75 | (is (thrown? Exception (run!))))))))
76 |
77 | (defn test-fs-cache []
78 | (cache/fs (str (java.nio.file.Files/createTempDirectory "html-cache" (into-array java.nio.file.attribute.FileAttribute [])))))
79 |
80 | (deftest test-processed-cache-with-missing-keys
81 | (with-server handler
82 | (testing "scrape works correctly"
83 | (is (= (scrape (make-seed ::parent "/parent")
84 | :processed-cache (test-fs-cache))
85 | [{:text "Text"}])))))
86 |
--------------------------------------------------------------------------------
/test/skyscraper/character_encoding_test.clj:
--------------------------------------------------------------------------------
1 | (ns skyscraper.character-encoding-test
2 | (:require
3 | [clojure.test :as test :refer [deftest is testing]]
4 | [net.cgrand.enlive-html :refer [select text]]
5 | [skyscraper.cache :as cache]
6 | [skyscraper.core :refer [defprocessor scrape scrape!]]
7 | [skyscraper.enlive-helpers :refer [href]]
8 | [skyscraper.test-utils :refer [make-seed resp-page with-server]]
9 | [taoensso.timbre :as timbre]))
10 |
11 | (def polish-text "Filmuj rzeź żądań, pość, gnęb chłystków")
12 | (def polish-html-latin2 (.getBytes (str "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-2\"></head><body><h1>" polish-text "</h1><p>" polish-text "</p></body></html>") "ISO-8859-2"))
--------------------------------------------------------------------------------
/test/skyscraper/mock_test.clj:
--------------------------------------------------------------------------------
15 | (defn dummy-site-content [i]
16 |   (format "<html><h1>Number %s</h1>
%s</html>"
17 |           i
18 |           (if (>= i 100)
19 |             ""
20 |             (apply str (for [n (range 10) :let [m (+ n (* 10 i))] :when (pos? m)] (format "<a href='/%s'>Page %s</a>" m m))))))
21 |
22 | (defn url-number
23 | [url]
24 | (try
25 | (Integer/parseInt (last (string/split url #"/")))
26 | (catch Exception _ 0)))
27 |
28 | (def hits (atom 0))
29 |
30 | (defn mock-request
31 | [{:keys [url async?]} & [success-fn error-fn]]
32 | (swap! hits inc)
33 | (let [response {:headers {"content-type" "text/html; charset=utf-8"}
34 | :body (.getBytes (dummy-site-content (url-number url)))}]
35 | (if async?
36 | (success-fn response)
37 | response)))
38 |
39 | (defn process-root [res {:keys [i] :as ctx}]
40 | (let [numtext (text (first (select res [:h1])))
41 | number (Long/parseLong (subs numtext 7))
42 | subpages (select res [:a])]
43 | (if (seq subpages)
44 | (for [a subpages]
45 | (assoc
46 | {:processor :root, :url (href a), :i (inc i)}
47 | (keyword (str "level" i)) number))
48 | {:number number})))
49 |
50 | (defprocessor :root
51 | :cache-key-fn (fn [ctx] (str "numbers/" (url-number (:url ctx))))
52 | :process-fn process-root)
53 |
54 | (defprocessor :root-uncached
55 | :process-fn process-root)
56 |
57 | (defn seed [& _]
58 | [{:url "http://localhost/0", :i 0, :processor :root}])
59 |
60 | (defn seed-uncached [& _]
61 | [{:url "http://localhost/0", :i 0, :processor :root-uncached}])
62 |
63 | (timbre/set-level! :warn)
64 |
65 | (deftest basic-scraping
66 | (is (= (count (scrape (seed)
67 | :html-cache nil
68 | :processed-cache nil
69 | :request-fn mock-request))
70 | 900))
71 | (is (= (count (scrape (seed)
72 | :html-cache nil
73 | :processed-cache nil
74 | :request-fn mock-request
75 | :download-mode :sync))
76 | 900)))
77 |
78 | (deftest test-only
79 | (reset! hits 0)
80 | (is (= (->> (scrape (seed)
81 | :only {:level1 1}
82 | :html-cache nil
83 | :processed-cache nil
84 | :request-fn mock-request)
85 | (map :number)
86 | (sort))
87 | (range 100 200))))
88 |
89 | (deftest caches
90 | (reset! hits 0)
91 | (let [hcache (cache/memory)
92 | pcache (cache/memory)]
93 | (is (= (count (scrape (seed) :request-fn mock-request :html-cache hcache :processed-cache pcache)) 900))
94 | (let [hits-before @hits
95 | _ (dorun (scrape (seed) :request-fn mock-request :html-cache hcache :processed-cache pcache))
96 | hits-after @hits
97 | _ (dorun (scrape (seed) :request-fn mock-request :html-cache hcache :processed-cache pcache :parallelism 1 :download-mode :sync))
98 | hits-after-sync @hits]
99 | (is (= hits-before hits-after hits-after-sync)))
100 | (let [res1 (doall (scrape (seed) :request-fn mock-request :html-cache hcache :processed-cache pcache))
101 | res2 (doall (scrape (seed) :request-fn mock-request :html-cache hcache :processed-cache nil))
102 | res3 (doall (scrape (seed) :request-fn mock-request :html-cache nil :processed-cache pcache))
103 | res4 (doall (scrape (seed) :request-fn mock-request :html-cache nil :processed-cache nil))
104 | res5 (doall (scrape (seed-uncached) :request-fn mock-request :html-cache nil :processed-cache nil))]
105 | (is (apply = (mapv #(sort-by :number %) [res1 res2 res3 res4 res5]))))))
106 |
107 | (deftest test-merge-urls
108 | (are [y z] (= (merge-urls "https://foo.pl/bar/baz" y) z)
109 | "http://bar.uk/baz/foo" "http://bar.uk/baz/foo"
110 | "//bar.uk/baz/foo" "https://bar.uk/baz/foo"
111 | "/baz/foo" "https://foo.pl/baz/foo"
112 | "foo" "https://foo.pl/bar/foo"))
113 |
114 | (deftest test-allows
115 | (is (allows? {:k1 1, :k2 2} {:k1 1, :k3 3}))
116 | (is (not (allows? {:k1 1, :k2 2} {:k1 1, :k2 3})))
117 | (is (allows? {:k1 1} {:k1 1, :k2 2}))
118 | (is (allows? {} {:k1 1, :k2 2}))
119 | (is (allows? {:k1 1} {:k2 2})))
120 |
121 | (defprocessor :nil-url-test-processor-root
122 | :cache-template "nil-url"
123 | :process-fn (fn [res ctx]
124 | (for [a (select res [:a])]
125 | {:title (text a), :url (href a), :processor :nil-url-test-processor-child})))
126 |
127 | (defprocessor :nil-url-test-processor-child
128 | :cache-template "nil-url/:title"
129 | :process-fn (fn [res ctx]
130 | [{:info (text (first (select res [:h1])))}]))
131 |
132 | (deftest test-nil-url
133 |   (let [html-main "<a href=\"child\">link</a><a>non-link</a>"
134 | html-child "