├── .editorconfig
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── extra_docs
│   └── pause_resume_grab_sites.sh
├── grab-site
├── gs-dump-urls
├── gs-server
├── images
│   ├── dashboard.png
│   └── scriptorium.jpg
├── libgrabsite
│   ├── 404.html
│   ├── __init__.py
│   ├── dashboard.html
│   ├── dashboard_client.py
│   ├── default_cookies.txt
│   ├── dump_urls.py
│   ├── dupes.py
│   ├── dupespotter.py
│   ├── favicon.ico
│   ├── ignore_sets
│   │   ├── blogs
│   │   ├── coppermine
│   │   ├── facebook
│   │   ├── forums
│   │   ├── global
│   │   ├── imdb
│   │   ├── mediawiki
│   │   ├── meetupeverywhere
│   │   ├── nogravatar
│   │   ├── noonion
│   │   ├── nosortedindex
│   │   ├── pinterest
│   │   ├── reddit
│   │   ├── singletumblr
│   │   ├── twitter
│   │   └── youtube
│   ├── main.py
│   ├── server.py
│   ├── wpull_hooks.py
│   └── wpull_tweaks.py
├── setup.py
└── tests
    ├── offline-tests
    └── online-tests
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = tab
5 | indent_size = 4
6 | end_of_line = lf
7 | charset = utf-8
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | sudo: required
3 | dist: xenial
4 | python:
5 | - 3.7
6 |
7 | before_install:
8 | - sudo apt-get update
9 | - sudo apt-get install -y --no-install-recommends libxml2-dev libxslt1-dev libre2-dev pkg-config
10 |
11 | install:
12 | - travis_retry pip install --upgrade pip setuptools
13 | - travis_retry pip install --no-binary lxml --upgrade .
14 |
15 | script:
16 | - ./tests/offline-tests
17 | - ./tests/online-tests
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | grab-site license:
2 |
3 | Copyright (c) 2015 Ivan Kozik
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
23 |
24 |
25 | grab-site includes code from ArchiveBot, which is licensed as:
26 |
27 | Copyright (c) 2013 David Yip
28 |
29 | Permission is hereby granted, free of charge, to any person obtaining a copy
30 | of this software and associated documentation files (the "Software"), to deal
31 | in the Software without restriction, including without limitation the rights
32 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
33 | copies of the Software, and to permit persons to whom the Software is
34 | furnished to do so, subject to the following conditions:
35 |
36 | The above copyright notice and this permission notice shall be included in
37 | all copies or substantial portions of the Software.
38 |
39 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
40 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
43 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
44 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
45 | THE SOFTWARE.
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | grab-site
2 | =========
3 |
4 | [![Build status][travis-image]][travis-url]
5 |
6 | grab-site is an easy preconfigured web crawler designed for backing up websites.
7 | Give grab-site a URL and it will recursively crawl the site and write
8 | [WARC files](https://www.archiveteam.org/index.php?title=The_WARC_Ecosystem).
9 | Internally, grab-site uses [a fork](https://github.com/ArchiveTeam/ludios_wpull) of
10 | [wpull](https://github.com/chfoo/wpull) for crawling.
11 |
12 | grab-site gives you
13 |
14 | * a dashboard with all of your crawls, showing which URLs are being
15 | grabbed, how many URLs are left in the queue, and more.
16 |
17 | * the ability to add ignore patterns when the crawl is already running.
18 | This allows you to skip the crawling of junk URLs that would
19 | otherwise prevent your crawl from ever finishing. See below.
20 |
21 | * an extensively tested default ignore set ([global](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/global))
22 | as well as additional (optional) ignore sets for forums, reddit, etc.
23 |
24 | * duplicate page detection: links are not followed on pages whose
25 | content duplicates an already-seen page.
26 |
27 | The URL queue is kept on disk instead of in memory. If you're really lucky,
28 | grab-site will manage to crawl a site with ~10M pages.
29 |
30 | 
31 |
32 | Note: if you have any problems whatsoever installing or getting grab-site to run,
33 | please [file an issue](https://github.com/ArchiveTeam/grab-site/issues) - thank you!
34 |
35 | The installation methods below are the only ones supported in our GitHub issues.
36 | Please do not modify the installation steps unless you really know what you're
37 | doing, with both Python packaging and your operating system. grab-site runs
38 | on a specific version of Python (3.7 or 3.8) and with specific dependency versions.
39 |
40 | **Contents**
41 |
42 | - [Install on Ubuntu 18.04, 20.04, 22.04, Debian 10 (buster), Debian 11 (bullseye)](#install-on-ubuntu-1804-2004-2204-debian-10-buster-debian-11-bullseye)
43 | - [Install on NixOS](#install-on-nixos)
44 | - [Install on another distribution lacking Python 3.7.x or 3.8.x](#install-on-another-distribution-lacking-python-37x-or-38x)
45 | - [Install on macOS](#install-on-macos)
46 | - [Install on Windows 10 (experimental)](#install-on-windows-10-experimental)
47 | - [Upgrade an existing install](#upgrade-an-existing-install)
48 | - [Usage](#usage)
49 | - [`grab-site` options, ordered by importance](#grab-site-options-ordered-by-importance)
50 | - [Warnings](#warnings)
51 | - [Tips for specific websites](#tips-for-specific-websites)
52 | - [Changing ignores during the crawl](#changing-ignores-during-the-crawl)
53 | - [Inspecting the URL queue](#inspecting-the-url-queue)
54 | - [Preventing a crawl from queuing any more URLs](#preventing-a-crawl-from-queuing-any-more-urls)
55 | - [Stopping a crawl](#stopping-a-crawl)
56 | - [Advanced `gs-server` options](#advanced-gs-server-options)
57 | - [Viewing the content in your WARC archives](#viewing-the-content-in-your-warc-archives)
58 | - [Inspecting WARC files in the terminal](#inspecting-warc-files-in-the-terminal)
59 | - [Automatically pausing grab-site processes when free disk is low](#automatically-pausing-grab-site-processes-when-free-disk-is-low)
60 | - [Thanks](#thanks)
61 | - [Help](#help)
62 |
63 |
64 |
65 | Install on Ubuntu 18.04, 20.04, 22.04, Debian 10 (buster), Debian 11 (bullseye)
66 | ---
67 |
68 | 1. On Debian, use `su` to become root if `sudo` is not configured to give you access.
69 |
70 | ```
71 | sudo apt-get update
72 | sudo apt-get install --no-install-recommends \
73 | wget ca-certificates git build-essential libssl-dev zlib1g-dev \
74 | libbz2-dev libreadline-dev libsqlite3-dev libffi-dev libxml2-dev \
75 | libxslt1-dev libre2-dev pkg-config
76 | ```
77 |
78 | If you see `Unable to locate package`, run the two commands again.
79 |
80 | 2. As a **non-root** user:
81 |
82 | ```
83 | wget https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer
84 | chmod +x pyenv-installer
85 | ./pyenv-installer
86 | ~/.pyenv/bin/pyenv install 3.8.15
87 | ~/.pyenv/versions/3.8.15/bin/python -m venv ~/gs-venv
88 | ~/gs-venv/bin/pip install --no-binary lxml --upgrade git+https://github.com/ArchiveTeam/grab-site
89 | ```
90 |
91 | `--no-binary lxml` is necessary for the html5-parser build.
92 |
93 | 3. Add this to your `~/.bashrc` or `~/.zshrc`:
94 |
95 | ```
96 | PATH="$PATH:$HOME/gs-venv/bin"
97 | ```
98 |
99 | and then restart your shell (e.g. by opening a new terminal tab/window).
100 |
101 |
102 | Install on NixOS
103 | ---
104 |
105 | grab-site was removed from nixpkgs master; 23.05 is the last release to contain grab-site.
106 |
107 | ```
108 | nix-env -f https://github.com/NixOS/nixpkgs/archive/release-23.05.tar.gz -iA grab-site
109 | ```
110 |
111 | or, if you are using profiles (i.e. when you have flakes enabled):
112 |
113 | ```
114 | nix profile install nixpkgs/release-22.11#grab-site
115 | ```
116 |
117 |
118 | Install on another distribution lacking Python 3.7.x or 3.8.x
119 | ---
120 |
121 | After installing [uv](https://docs.astral.sh/uv/), you can run
122 | ```
123 | uv tool install --python=3.8 --no-binary-package lxml git+https://github.com/ArchiveTeam/grab-site/
124 | ```
125 |
126 |
127 | Install on macOS
128 | ---
129 |
130 | On OS X 10.10 - macOS 11:
131 |
132 | 1. Run `locale` in your terminal. If the output includes "UTF-8", you
133 | are all set. If it does not, your terminal is misconfigured and grab-site
134 | will fail to start. This can be corrected with:
135 |
136 | - Terminal.app: Preferences... -> Profiles -> Advanced -> **check** Set locale environment variables on startup
137 |
138 | - iTerm2: Preferences... -> Profiles -> Terminal -> Environment -> **check** Set locale variables automatically
139 |
140 | ### Using Homebrew (**Intel Mac**)
141 |
142 | For M1 Macs, use the next section instead of this one.
143 |
144 | 2. Install Homebrew using the install step on https://brew.sh/
145 |
146 | 3. Run:
147 |
148 | ```
149 | brew update
150 | brew install python@3.8 libxslt re2 pkg-config
151 | /usr/local/opt/python@3.8/bin/python3 -m venv ~/gs-venv
152 | PKG_CONFIG_PATH="/usr/local/opt/libxml2/lib/pkgconfig" ~/gs-venv/bin/pip install --no-binary lxml --upgrade git+https://github.com/ArchiveTeam/grab-site
153 | ```
154 |
155 | 4. To put the `grab-site` binaries in your PATH, add this to your `~/.zshrc` (macOS 10.15, 11+) or `~/.bash_profile` (earlier):
156 |
157 | ```
158 | PATH="$PATH:$HOME/gs-venv/bin"
159 | ```
160 |
161 | and then restart your shell (e.g. by opening a new terminal tab/window).
162 |
163 | ### Using Homebrew (**M1 Mac**)
164 |
165 | 2. Install Homebrew using the install step on https://brew.sh/
166 |
167 | If you already have a Homebrew install at `/usr/local`, you may need to first remove that old Intel-based Homebrew install.
168 |
169 | 3. Run:
170 |
171 | ```
172 | brew update
173 | brew install python@3.8 libxslt re2 pkg-config
174 | /opt/homebrew/opt/python@3.8/bin/python3 -m venv ~/gs-venv
175 | PKG_CONFIG_PATH="/opt/homebrew/opt/libxml2/lib/pkgconfig" ~/gs-venv/bin/pip install --no-binary lxml --upgrade git+https://github.com/ArchiveTeam/grab-site
176 | ```
177 |
178 | 4. To put the `grab-site` binaries in your PATH, add this to your `~/.zshrc` (macOS 10.15, 11+) or `~/.bash_profile` (earlier):
179 |
180 | ```
181 | PATH="$PATH:$HOME/gs-venv/bin"
182 | ```
183 |
184 | and then restart your shell (e.g. by opening a new terminal tab/window).
185 |
186 |
187 |
188 | Install on Windows 10 (experimental)
189 | ---
190 |
191 | On Windows 10 Fall Creators Update (version 1709) or newer:
192 |
193 | 1. Start menu -> search "feature" -> Turn Windows features on or off
194 |
195 | 2. Scroll down, check "Windows Subsystem for Linux" and click OK.
196 |
197 | 3. Wait for install and click "Restart now"
198 |
199 | 4. Start menu -> Store
200 |
201 | 5. Search for "Ubuntu" in the store and install Ubuntu (publisher: Canonical Group Limited).
202 |
203 | 6. Start menu -> Ubuntu
204 |
205 | 7. Wait for install and create a user when prompted.
206 |
207 | 8. Follow the [Ubuntu 18.04, 20.04, 22.04, Debian 10 (buster), Debian 11 (bullseye)](#install-on-ubuntu-1804-2004-2204-debian-10-buster-debian-11-bullseye) steps.
208 |
209 |
210 |
211 | Upgrade an existing install
212 | ---
213 |
214 | To update grab-site, simply run the `~/gs-venv/bin/pip install ...` or
215 | `nix-env ...` command used to install it originally (see above).
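
For example, with the pyenv + venv install from above:

```
~/gs-venv/bin/pip install --no-binary lxml --upgrade git+https://github.com/ArchiveTeam/grab-site
```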
216 |
217 | After upgrading, stop `gs-server` with `kill` or ctrl-c, then start it again.
218 | Existing `grab-site` crawls will automatically reconnect to the new server.
219 |
220 |
221 |
222 | Usage
223 | ---
224 |
225 | First, start the dashboard with:
226 |
227 | ```
228 | gs-server
229 | ```
230 |
231 | and point your browser to http://127.0.0.1:29000/
232 |
233 | Note: gs-server listens on all interfaces by default, so you can reach the
234 | dashboard by a non-localhost IP as well, e.g. a LAN or WAN IP. (Sub-note:
235 | no code execution capabilities are exposed on any interface.)
236 |
237 | Then, start as many crawls as you want with:
238 |
239 | ```
240 | grab-site 'URL'
241 | ```
242 |
243 | Do this inside tmux unless they're very short crawls.
244 |
245 | grab-site outputs WARCs, logs, and control files to a new subdirectory in the
246 | directory from which you launched `grab-site`, referred to here as "DIR".
247 | (Use `ls -lrt` to find it.)
248 |
249 | You can pass multiple `URL` arguments to include them in the same crawl,
250 | whether they are on the same domain or different domains entirely.
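
For example (the URLs are illustrative):

```
grab-site 'https://example.com/' 'https://blog.example.org/'
```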
251 |
252 | warcprox users: [warcprox](https://github.com/internetarchive/warcprox) breaks the
253 | dashboard's WebSocket; please make your browser skip the proxy for whichever
254 | host/IP you're using to reach the dashboard.
255 |
256 | ### `grab-site` options, ordered by importance
257 |
258 | Options can come before or after the URL.
259 |
260 | * `--1`: grab just `URL` and its page requisites, without recursing.
261 |
262 | * `--igsets=IGSET1,IGSET2`: use ignore sets `IGSET1` and `IGSET2`.
263 |
264 | Ignore sets are used to avoid requesting junk URLs using a pre-made set of
265 | regular expressions. See [the full list of available ignore sets](https://github.com/ArchiveTeam/grab-site/tree/master/libgrabsite/ignore_sets).
266 |
267 | The [global](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/global)
268 | ignore set is implied and enabled unless `--no-global-igset` is used.
269 |
270 | The ignore sets can be changed during the crawl by editing the `DIR/igsets` file.
271 |
272 | * `--no-global-igset`: don't add the [global](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/global) ignore set.
273 |
274 | * `--no-offsite-links`: don't follow links to other domains (by default, they are followed to a depth of 1).
275 |
276 | grab-site always grabs page requisites (e.g. inline images and stylesheets), even if
277 | they are on other domains. By default, grab-site also grabs linked pages to a depth
278 | of 1 on other domains. To turn off this behavior, use `--no-offsite-links`.
279 |
280 | Using `--no-offsite-links` may prevent all kinds of useful images, video, audio, downloads,
281 | etc. from being grabbed, because these are often hosted on a CDN or subdomain, and
282 | thus would otherwise not be included in the recursive crawl.
283 |
284 | * `-i` / `--input-file`: Load list of URLs-to-grab from a local file or from a
285 | URL; like `wget -i`. File must be a newline-delimited list of URLs.
286 | Combine with `--1` to avoid a recursive crawl on each URL.
287 |
288 | * `--igon`: Print all URLs being ignored to the terminal and dashboard. Can be
289 | changed during the crawl by `touch`ing or `rm`ing the `DIR/igoff` file.
290 | This is slower because it needs to find the specific regexp to blame.
291 |
292 | * `--no-video`: Skip the download of videos by both mime type and file extension.
293 | Skipped videos are logged to `DIR/skipped_videos`. Can be
294 | changed during the crawl by `touch`ing or `rm`ing the `DIR/video` file.
295 |
296 | * `--no-sitemaps`: don't queue URLs from `sitemap.xml` at the root of the site.
297 |
298 | * `--max-content-length=N`: Skip the download of any response that claims a
299 | Content-Length larger than `N`. (default: -1, don't skip anything).
300 | Skipped URLs are logged to `DIR/skipped_max_content_length`. Can be changed
301 | during the crawl by editing the `DIR/max_content_length` file.
302 |
303 | * `--no-dupespotter`: Disable dupespotter, a plugin that skips the extraction
304 | of links from pages that look like duplicates of earlier pages. Disable this
305 | for sites that are directory listings, because they frequently trigger false
306 | positives.
307 |
308 | * `--concurrency=N`: Use `N` connections to fetch in parallel (default: 2).
309 | Can be changed during the crawl by editing the `DIR/concurrency` file.
310 |
311 | * `--delay=N`: Wait `N` milliseconds (default: 0) between requests on each concurrent fetcher.
312 | Can be a range like X-Y to use a random delay between X and Y. Can be changed during
313 | the crawl by editing the `DIR/delay` file.
314 |
315 | * `--import-ignores=FILE`: Copy `FILE` to `DIR/ignores` before the crawl begins.
316 |
317 | * `--warc-max-size=BYTES`: Try to limit each WARC file to around `BYTES` bytes
318 | before rolling over to a new WARC file (default: 5368709120, which is 5GiB).
319 | Note that the resulting WARC files may be drastically larger if there are very
320 | large responses.
321 |
322 | * `--level=N`: recurse `N` levels instead of `inf` levels.
323 |
324 | * `--page-requisites-level=N`: recurse page requisites `N` levels instead of `5` levels.
325 |
326 | * `--ua=STRING`: Send User-Agent: `STRING` instead of pretending to be Firefox on Windows.
327 |
328 | * `--id=ID`: Use id `ID` for the crawl instead of a random 128-bit id. This must be unique for every crawl.
329 |
330 | * `--dir=DIR`: Put control files, temporary files, and unfinished WARCs in `DIR`
331 | (default: a directory name based on the URL, date, and first 8 characters of the id).
332 |
333 | * `--finished-warc-dir=FINISHED_WARC_DIR`: absolute path to a directory into
334 | which finished `.warc.gz` and `.cdx` files will be moved.
335 |
336 | * `--permanent-error-status-codes=STATUS_CODES`: A comma-separated list of
337 | HTTP status codes to treat as a permanent error and therefore **not** retry
338 | (default: `401,403,404,405,410`). Other error responses tried another 2
339 | times for a total of 3 tries (customizable with `--wpull-args=--tries=N`).
340 | Note that, unlike wget, wpull puts retries at the end of the queue.
341 |
342 | * `--wpull-args=ARGS`: String containing additional arguments to pass to wpull;
343 | see `wpull --help`. `ARGS` is split with `shlex.split` and individual
344 | arguments can contain spaces if quoted, e.g.
345 | `--wpull-args="--youtube-dl \"--youtube-dl-exe=/My Documents/youtube-dl\""`
346 |
347 | Examples:
348 |
349 | * `--wpull-args=--no-skip-getaddrinfo` to respect `/etc/hosts` entries.
350 | * `--wpull-args=--no-warc-compression` to write uncompressed WARC files.
351 |
352 | * `--which-wpull-args-partial`: Print a partial list of wpull arguments that
353 | would be used and exit. Excludes grab-site-specific features, and removes
354 | `DIR/` from paths. Useful for reporting bugs on wpull without grab-site involvement.
355 |
356 | * `--which-wpull-command`: Populate `DIR/` but don't start wpull; instead print
357 | the command that would have been used to start wpull with all of the
358 | grab-site functionality.
359 |
360 | * `--debug`: print a lot of debug information.
361 |
362 | * `--help`: print help text.
363 |
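A typical invocation combining several of the options above (the URL and values are illustrative):

```
grab-site 'https://forum.example.com/' --igsets=forums --no-offsite-links --concurrency=1 --delay=250-750
```
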
364 | ### Warnings
365 |
366 | If you pay no attention to your crawls, a crawl may head down some infinite bot
367 | trap and stay there forever. The site owner may eventually notice high CPU use
368 | or log activity, then IP-ban you.
369 |
370 | grab-site does not respect `robots.txt` files, because they frequently
371 | [whitelist only approved robots](https://github.com/robots.txt),
372 | [hide pages embarrassing to the site owner](https://web.archive.org/web/20140401024610/http://www.thecrimson.com/robots.txt),
373 | or block image or stylesheet resources needed for proper archival.
374 | [See also](https://www.archiveteam.org/index.php?title=Robots.txt).
375 | Because of this, very rarely you might run into a robot honeypot and receive
376 | an abuse@ complaint. Your host may require a prompt response to such a complaint
377 | for your server to stay online. Therefore, we recommend against crawling the
378 | web from a server that hosts your critical infrastructure.
379 |
380 | Don't run grab-site on GCE (Google Compute Engine); as happened to me, your
381 | entire API project may get nuked after a few days of crawling the web, with
382 | no recourse. Good alternatives include OVH ([OVH](https://www.ovh.com/us/dedicated-servers/),
383 | [So You Start](https://www.soyoustart.com/us/essential-servers/),
384 | [Kimsufi](https://www.kimsufi.com/us/en/index.xml)), and online.net's
385 | [dedicated](https://www.online.net/en/dedicated-server) and
386 | [Scaleway](https://www.scaleway.com/) offerings.
387 |
388 | ### Tips for specific websites
389 |
390 | #### Website requiring login / cookies
391 |
392 | Log in to the website in Chrome or Firefox. Use the cookies.txt extension
393 | [for Chrome](https://github.com/daftano/cookies.txt) or
394 | [for Firefox](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/)
395 | to copy your cookies in Netscape format. Paste the cookie data into a new
396 | file. Start grab-site with `--wpull-args=--load-cookies=ABSOLUTE_PATH_TO_COOKIES_FILE`.
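
For example (the URL and cookie-file path are illustrative):

```
grab-site 'https://members.example.com/' --wpull-args=--load-cookies=/home/you/example-cookies.txt
```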
397 |
398 | #### Static websites; WordPress blogs; Discourse forums
399 |
400 | The defaults usually work fine.
401 |
402 | #### Blogger / blogspot.com blogs
403 |
404 | The defaults work fine except for blogs with a JavaScript-only Dynamic Views theme.
405 |
406 | Some blogspot.com blogs use "[Dynamic Views](https://support.google.com/blogger/answer/1229061?hl=en)"
407 | themes that require JavaScript and serve absolutely no HTML content. In rare
408 | cases, you can get JavaScript-free pages by appending `?m=1`
409 | ([example](https://happinessbeyondthought.blogspot.com/?m=1)). Otherwise, you
410 | can archive parts of these blogs through Google Cache instead
411 | ([example](https://webcache.googleusercontent.com/search?q=cache:http://blog.datomic.com/))
412 | or by using https://archive.is/ instead of grab-site.
413 |
414 | #### Tumblr blogs
415 |
416 | Either don't crawl from Europe (because tumblr redirects to a GDPR `/privacy/consent` page), or add `Googlebot` to the user agent:
417 |
418 | ```
419 | --ua "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/70.0 but not really nor Googlebot/2.1"
420 | ```
421 |
422 | Use [`--igsets=singletumblr`](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/singletumblr)
423 | to avoid crawling the homepages of other tumblr blogs.
424 |
425 | If you don't care about who liked or reblogged a post, add `\?from_c=` to the
426 | crawl's `ignores`.
427 |
428 | Some tumblr blogs appear to require JavaScript, but they are actually just
429 | hiding the page content with CSS. You are still likely to get a complete crawl.
430 | (See the links in the page source for https://X.tumblr.com/archive).
431 |
432 | #### Subreddits
433 |
434 | Use [`--igsets=reddit`](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/reddit)
435 | and add a `/` at the end of the URL to avoid crawling all subreddits.
436 |
437 | When crawling a subreddit, you **must** get the casing of the subreddit right
438 | for the recursive crawl to work. For example,
439 |
440 | ```
441 | grab-site https://www.reddit.com/r/Oculus/ --igsets=reddit
442 | ```
443 |
444 | will crawl only a few pages instead of the entire subreddit. The correct casing is:
445 |
446 | ```
447 | grab-site https://www.reddit.com/r/oculus/ --igsets=reddit
448 | ```
449 |
450 | You can hover over the "Hot"/"New"/... links at the top of the page to see the correct casing.
451 |
452 | #### Directory listings ("Index of ...")
453 |
454 | Use `--no-dupespotter` to avoid triggering false positives on the duplicate
455 | page detector. Without it, the crawl may miss large parts of the directory tree.
456 |
457 | #### Very large websites
458 |
459 | Use `--no-offsite-links` to stay on the main website and avoid crawling linked pages on other domains.
460 |
461 | #### Websites that are likely to ban you for crawling fast
462 |
463 | Use `--concurrency=1 --delay=500-1500`.
464 |
465 | #### MediaWiki sites in English
466 |
467 | Use [`--igsets=mediawiki`](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/mediawiki).
468 | Note that this ignore set ignores old page revisions.
469 |
470 | #### MediaWiki sites in other languages
471 |
472 | You will probably have to add ignores with translated `Special:*` URLs based on
473 | [ignore_sets/mediawiki](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/mediawiki).
474 |
475 | #### Forums that aren't Discourse
476 |
477 | Forums require more manual intervention with ignore patterns.
478 | [`--igsets=forums`](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/forums)
479 | is a useful starting point for most forums, but you will have to add other ignore
480 | patterns, including one to ignore individual-forum-post pages if there are
481 | too many posts to crawl. (Generally, crawling the thread pages is enough.)
482 |
483 | #### GitHub issues / pull requests
484 |
485 | Find the highest issue number from an issues page ([example](https://github.com/rust-lang/rust/issues)) and use:
486 |
487 | ```
488 | grab-site --1 https://github.com/rust-lang/rust/issues/{1..30000}
489 | ```
490 |
491 | This relies on your shell to expand the argument to thousands of arguments.
492 | If there are too many arguments, you may have to write the URLs to a file
493 | and use `grab-site -i` instead:
494 |
495 | ```
496 | for i in {1..30000}; do echo https://github.com/rust-lang/rust/issues/$i >> .urls; done
497 | grab-site --1 -i .urls
498 | ```
499 |
500 | #### Websites whose domains have just expired but are still up at the webhost
501 |
502 | Use a [DNS history](https://www.google.com/search?q=historical+OR+history+dns)
503 | service to find the old IP address (the DNS "A" record) for the domain. Add a
504 | line to your `/etc/hosts` to point the domain to the old IP. Start a crawl
505 | with `--wpull-args=--no-skip-getaddrinfo` to make wpull use `/etc/hosts`.
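
A sketch of the workflow (the IP address and domain are placeholders):

```
echo '203.0.113.7 example.com' | sudo tee -a /etc/hosts
grab-site 'http://example.com/' --wpull-args=--no-skip-getaddrinfo
```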
506 |
507 | #### twitter.com/user
508 |
509 | Use [snscrape](https://github.com/JustAnotherArchivist/snscrape) to get a list
510 | of tweets for a user. Redirect `snscrape`'s output to a list of URLs with
511 | `> urls` and pass this file to `grab-site --1 -i urls`.
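
A sketch, assuming snscrape's `twitter-user` scraper and its default output of one URL per result (the username is illustrative):

```
snscrape twitter-user exampleuser > urls
grab-site --1 -i urls
```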
512 |
513 | Alternatively, use [webrecorder.io](https://webrecorder.io/) instead of
514 | grab-site. It has an autoscroll feature and you can download the WARCs.
515 |
516 | Keep in mind that scrolling `twitter.com/user` returns a maximum of 3200 tweets,
517 | while a [from:user](https://twitter.com/search?q=from%3Ainternetarchive&src=typd&f=realtime&qf=off&lang=en)
518 | query can return more.
519 |
520 |
521 |
522 | Changing ignores during the crawl
523 | ---
524 | While the crawl is running, you can edit `DIR/ignores` and `DIR/igsets`; the
525 | changes will be applied within a few seconds.
526 |
527 | `DIR/igsets` is a comma-separated list of ignore sets to use.
528 |
529 | `DIR/ignores` is a newline-separated list of [Python 3 regular expressions](https://pythex.org/)
530 | to use in addition to the ignore sets.
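
For example (`DIR` stands for your crawl directory; the added pattern is illustrative):

```
echo 'global,forums' > DIR/igsets
echo '^https?://junk\.example\.com/' >> DIR/ignores
```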
531 |
532 | You can `rm DIR/igoff` to display all URLs that are being filtered out
533 | by the ignores, and `touch DIR/igoff` to turn it back off.
534 |
535 | Note that ignores will not apply to any of the crawl's start URLs.
536 |
537 |
538 |
539 | Inspecting the URL queue
540 | ---
541 | Inspecting the URL queue is usually not necessary, but may be helpful
542 | for adding ignores before grab-site crawls a large number of junk URLs.
543 |
544 | To dump the queue, run:
545 |
546 | ```
547 | gs-dump-urls DIR/wpull.db todo
548 | ```
549 |
550 | Four other statuses can be used besides `todo`:
551 | `done`, `error`, `in_progress`, and `skipped`.
552 |
553 | You may want to pipe the output to `sort` and `less`:
554 |
555 | ```
556 | gs-dump-urls DIR/wpull.db todo | sort | less -S
557 | ```
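
The same works for the other statuses, e.g. to review failed URLs:

```
gs-dump-urls DIR/wpull.db error | sort | less -S
```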
558 |
559 |
560 |
561 | Preventing a crawl from queuing any more URLs
562 | ---
563 | `rm DIR/scrape`. Responses will no longer be scraped for URLs. Scraping cannot
564 | be re-enabled for a crawl.
565 |
566 |
567 |
568 | Stopping a crawl
569 | ---
570 | You can `touch DIR/stop` or press ctrl-c, which will do the same. You will
571 | have to wait for the current downloads to finish.
572 |
573 |
574 |
575 | Advanced `gs-server` options
576 | ---
577 | These environment variables control what `gs-server` listens on:
578 |
579 | * `GRAB_SITE_INTERFACE` (default `0.0.0.0`)
580 | * `GRAB_SITE_PORT` (default `29000`)
581 |
582 | These environment variables control which server each `grab-site` process connects to:
583 |
584 | * `GRAB_SITE_HOST` (default `127.0.0.1`)
585 | * `GRAB_SITE_PORT` (default `29000`)
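
For example, to run the dashboard on localhost only on a non-default port, and point a crawl at it (values are illustrative):

```
GRAB_SITE_INTERFACE=127.0.0.1 GRAB_SITE_PORT=29001 gs-server
GRAB_SITE_HOST=127.0.0.1 GRAB_SITE_PORT=29001 grab-site 'URL'
```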
586 |
587 |
588 |
589 | Viewing the content in your WARC archives
590 | ---
591 |
592 | Try [ReplayWeb.page](https://replayweb.page/) or [webrecorder-player](https://github.com/webrecorder/webrecorder-player).
593 |
594 |
595 |
596 | Inspecting WARC files in the terminal
597 | ---
598 | `zless` is a wrapper over `less` that can be used to view raw WARC content:
599 |
600 | ```
601 | zless DIR/FILE.warc.gz
602 | ```
603 |
604 | `zless -S` will turn off line wrapping.
605 |
606 | Note that grab-site requests uncompressed HTTP responses to avoid
607 | double-compression in .warc.gz files and to make zless output more useful.
608 | However, some servers will send compressed responses anyway.
609 |
610 |
611 |
612 | Automatically pausing grab-site processes when free disk is low
613 | ---
614 |
615 | If you automatically upload and remove finished .warc.gz files, you can still
616 | run into a situation where grab-site processes fill up your disk faster than
617 | your uploader process can handle. To prevent this situation, you can customize
618 | and run [this script](https://github.com/ArchiveTeam/grab-site/blob/master/extra_docs/pause_resume_grab_sites.sh),
619 | which will pause and resume grab-site processes as your free disk space
620 | crosses a threshold value.
621 |
622 |
623 |
624 | Thanks
625 | ---
626 |
627 | grab-site is made possible only because of [wpull](https://github.com/chfoo/wpull),
628 | written by [Christopher Foo](https://github.com/chfoo) who spent a year
629 | making something much better than wget. ArchiveTeam's most pressing
630 | issue with wget at the time was that it kept the entire URL queue in memory
631 | instead of on disk. wpull has many other advantages over wget, including
632 | better link extraction and Python hooks.
633 |
634 | Thanks to [David Yip](https://github.com/yipdw), who created
635 | [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot). The wpull
636 | hooks in ArchiveBot served as the basis for grab-site. The original ArchiveBot
637 | dashboard inspired the newer dashboard now used in both projects.
638 |
639 | Thanks to [Falcon Darkstar Momot](https://github.com/falconkirtaran) for
640 | the many wpull 2.x fixes that were rolled into
641 | [ArchiveTeam/wpull](https://github.com/ArchiveTeam/wpull).
642 |
643 | Thanks to [JustAnotherArchivist](https://github.com/JustAnotherArchivist)
644 | for investigating my wpull issues.
645 |
646 | Thanks to [BrowserStack](https://www.browserstack.com/) for providing free
647 | browser testing for grab-site, which we use to make sure the dashboard works
648 | in various browsers.
649 |
650 | [BrowserStack](https://www.browserstack.com/)
651 |
652 |
653 |
654 | Help
655 | ---
656 | grab-site bugs and questions are welcome in
657 | [grab-site/issues](https://github.com/ArchiveTeam/grab-site/issues).
658 |
659 | Terminal output in your bug report should be surrounded by triple backquotes, like this:
660 |
661 |
.*', b"", body)
161 |
162 | # nsslabs.com has this
163 | body = re.sub(br'
.{1,4000}?
', b"", body)
164 |
165 | # sbs.com.au has generated /css_ filenames
166 | body = re.sub(br'/css_[-_A-Za-z0-9]{10,100}\.css', b"", body)
167 |
168 | # stopbadware.org has some differing autogenerated ', b"", body)
170 |
171 | # Drupal generates items based on the URL
172 | # Generated class="" also spotted on non-Drupal www.minutouno.com
173 | # Duplicate class="" on stopbadware.org
174 | body = re.sub(br'<(body|div)( id="[^"]+")? class="[^"]+"( class="[^"]+")?( data-src="[^"]{1,2000}")?', b"", body)
175 |
176 | return body
177 |
178 |
179 | def compare_bodies(body1, body2, url1, url2):
180 | # TODO: handle non-utf-8 bodies
181 | for line in difflib.unified_diff(
182 | body1.decode("utf-8", "replace").splitlines(keepends=True),
183 | body2.decode("utf-8", "replace").splitlines(keepends=True),
184 | fromfile=url1,
185 | tofile=url2):
186 | if not "\n" in line:
187 | line += "\n"
188 | sys.stdout.buffer.write(line.encode("utf-8"))
189 |
190 |
191 | def compare_unprocessed_bodies(up_body1, up_body2, url1, url2):
192 | body1 = process_body(up_body1, url1)
193 | body2 = process_body(up_body2, url2)
194 | print("{} == md5({!r})".format(md5_url(url1), url1))
195 | print("{} == md5({!r})".format(md5_url(url2), url2))
196 | print("After processing,")
197 | print("len(body({!r})) == {}".format(url1, len(body1)))
198 | print("len(body({!r})) == {}".format(url2, len(body2)))
199 | compare_bodies(body1, body2, url1, url2)
200 |
201 |
202 | def main():
203 | try:
204 | os.makedirs(cache_dir)
205 | except OSError:
206 | pass
207 |
208 | assert os.path.exists(cache_dir)
209 |
210 | if len(sys.argv) == 2:
211 | # Just save and print the body
212 | print(get_body(sys.argv[1]))
213 | elif len(sys.argv) == 3:
214 | url1, url2 = sys.argv[1], sys.argv[2]
215 | compare_unprocessed_bodies(get_body(url1), get_body(url2), url1, url2)
216 | else:
217 | assert 0, sys.argv
218 |
219 |
220 | if __name__ == '__main__':
221 | main()
222 |
--------------------------------------------------------------------------------
/libgrabsite/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ArchiveTeam/grab-site/0eaf88628f4d8ef5af4df4c13f594606a41de3cd/libgrabsite/favicon.ico
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/blogs:
--------------------------------------------------------------------------------
1 | # All 'blogs' ignores have been migrated to 'global';
2 | # there is no need to specify --igsets=blogs
3 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/coppermine:
--------------------------------------------------------------------------------
1 | (?:displayimage|thumbnails)\.php[?&]album=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)
2 | ratepic\.php
3 | addfav\.php\?.*ref=displayimage\.php
4 | displayimage\.php\?.*slideshow=\d+
5 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/facebook:
--------------------------------------------------------------------------------
1 | ^https?://error\.facebook\.com/common/scribe_endpoint\.php\?c=
2 | ^https?://www\.facebook\.com/[^/]+/(posts/|app_)[^/]+\?(ref=page_internal&)?_fb_noscript=
3 | ^https?://www\.facebook\.com/[^/]+/photos/(pb|a)\.[^/]+/[^/]+/.{4}/
4 | ^https?://www\.facebook\.com/[^/]+/photos/(pb|a)\.[^/]+/[^/]+/\?type=
5 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/forums:
--------------------------------------------------------------------------------
1 | /cron\.php\?
2 | /external\.php\?type=rss
3 | /login\.php\?
4 | /newreply\.php\?
5 | /private\.php\?
6 | /privmsg\.php\?
7 | /register\.php\?
8 | /sendmessage\.php\?
9 | /subscription\.php\?
10 | /posting\.php\?
11 | /viewtopic\.php\?.+&view=(next|previous)
12 | /viewtopic\.php\?.+&hilit=
13 | /feed\.php\?
14 | /index\.php\?option=com_mailto
15 | &view=login&return=
16 | &format=opensearch
17 | /misc\.php\?do=whoposted
18 | /newthread\.php\?
19 | /post_thanks\.php\?
20 | /blog_post\.php\?do=newblog
21 | /forumdisplay\.php.*[\?&]do=markread
22 | /userpoll/vote\.php\?
23 | /showthread\.php.*[\?&]goto=(next(old|new)est|newpost)
24 | /editpost\.php\?
25 | /\?view=getlastpost$
26 | /index\.php\?sharelink=
27 | /ucp\.php\?mode=delete_cookies
28 | /index.php\?action=(verificationcode|reporttm|emailuser|quickmod2)
29 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/global:
--------------------------------------------------------------------------------
1 | # URLs that are very likely to be endless loops
2 | %25252525
3 | /App_Themes/.+/App_Themes/
4 | /bxSlider/.+/bxSlider/
5 | /bxSlider/bxSlider/
6 | /slides/slides/.+/slides/
7 | /slides/.+/slides/slides/
8 | /slides/slides/slides/
9 | /js/js/.+/js/
10 | /js/.+/js/js/
11 | /js/js/js/
12 | /css/css/.+/css/
13 | /css/.+/css/css/
14 | /css/css/css/
15 | /styles/styles/.+/styles/
16 | /styles/.+/styles/styles/
17 | /styles/styles/styles/
18 | /scripts/scripts/.+/scripts/
19 | /scripts/.+/scripts/scripts/
20 | /scripts/scripts/scripts/
21 | /images/images/.+/images/
22 | /images/.+/images/images/
23 | /images/images/images/
24 | /img/img/.+/img/
25 | /img/.+/img/img/
26 | /img/img/img/
27 | /clientscript/clientscript/.+/clientscript/
28 | /clientscript/.+/clientscript/clientscript/
29 | /clientscript/clientscript/clientscript/
30 | /lib/exe/.*lib[-_]exe[-_]lib[-_]exe[-_]
31 | ^https?://{any_start_netloc}/.*&amp;
32 | ^https?://{any_start_netloc}/.*amp%3Bamp%3Bamp%3B
33 | ^https?://{any_start_netloc}/.+/plugins/ultimate-social-media-plus/.+/like/like/
34 |
35 | # URLs that are very likely incorrectly extracted by wpull
36 | /(%5C)+(%22|%27)
37 | /%5C/%5C/
38 | /%27\+[^/]+\+%27
39 | /%22\+[^/]+\+%22
40 | /%27%20\+[^/]+\+%20%27
41 | /%22%20\+[^/]+\+%20%22
42 | /\\+(%22|%27)
43 | /\\+["']
44 | /\\/\\/
45 | /'\+[^/]+\+'
46 | ^https?://{any_start_netloc}/.+/%3Ca%20href=
47 | ^https?://www\.youtube\.com/.*\[\[.+\]\]
48 | ^https?://www\.youtube\.com/.*\{\{.+\}\}
49 |
50 | ^https?://www\.google\.com/recaptcha/(api|mailhide/d\?)
51 | ^https?://www\.google\.com/accounts/AccountChooser
52 | ^https?://accounts\.google\.com/(SignUp|ServiceLogin|AccountChooser|a/UniversalLogin)
53 |
54 | # CAPTCHAs on ASP.NET sites
55 | ^https?://[^/]+/.+/CaptchaImage\.axd
56 |
57 | # We don't want to change language
58 | ^https?://www\.flickr\.com/change_language\.gne
59 |
60 | # Tracking scripts, tracking pixels, analytics
61 | ^https?://geo\.yahoo\.com/b\?
62 | ^https?://b\.scorecardresearch\.com/
63 | ^https?://pixel\.blog\.hu/
64 | ^https?://pixel\.redditmedia\.com/pixel/
65 | ^https?://alb\.reddit\.com/
66 | ^https?://pixel\.(quantserve|wp)\.com/
67 | ^https?://(www|ssl)\.google-analytics\.com/(r/)?(__utm\.gif|collect\?)
68 | ^https?://p\.opt\.fimserve\.com/
69 | ^https?://.+/js-agent\.newrelic\.com/nr-\d{3}(\.min)?\.js$
70 | ^https?://.+/stats\.g\.doubleclick\.net/dc\.js$
71 | ^https?://.+/js/chartbeat\.js$
72 | ^https?://[^/]+\.xiti\.com/hit\.xiti\?
73 | ^https?://[^/]+\.services\.livejournal\.com/ljcounter
74 | ^https?://beacon\.wikia-services\.com/
75 | ^https?://s\d+\.sitemeter\.com/(js/counter\.js|meter\.asp)
76 | ^https?://www\.amazon\.com/.+/logging/log-action\.html
77 |
78 | # The tracking on warnerbros.com inexplicably links to bogus warnerbros.com/\d+ pages
79 | ^https?://www\.warnerbros\.com/\d+$
80 |
81 | # Inaccessible and dead sites that are frequently-linked
82 | ^https?://i\.dev\.cdn\.turner\.com/
83 | ^https?://[^/]+\.corp\.ne1\.yahoo\.com/
84 | ^https?://prod-preview\.wired\.com/
85 | ^https?://(www\.)?(megaupload|filesonic|wupload)\.com/
86 |
87 | # Links to TED and TED embeds are common enough that we need to ignore their
88 | # videos to prevent WARC bloat
89 | ^https?://video-subtitle\.tedcdn\.com/
90 | ^https?://download\.ted\.com/
91 |
92 | # Avoid bloating WARCs with TMZ videos
93 | ^https?://tmz\.vo\.llnwd\.net/
94 |
95 | # Avoid hitting radio and TV streams, which can hang crawls for a long time.
96 | # Note that we also detect and abort Icecast/SHOUTcast responses in
97 | # wpull_hooks.py, so some of these ignores are no longer necessary.
98 | ^https?://([^\./]+\.)?stream\.publicradio\.org/
99 | ^https?://av\.rasset\.ie/av/live/
100 | ^https?://gcnplayer\.gcnlive\.com/.+
101 | ^https?://mp3\.ffh\.de/
102 | ^https?://(audio\d?|nfw)\.video\.ria\.ru/
103 | ^https?://[^\./]+\.radioscoop\.(com|net):\d+/
104 | ^https?://[^\./]+\.streamchan\.org:\d+/
105 | ^https?://[^/]*musicproxy\.s12\.de/
106 | ^https?://relay\.broadcastify\.com/
107 | ^https?://audio\d?\.radioreference\.com/
108 | ^https?://[^/]+\.akadostream\.ru(:\d+)?/
109 | ^https?://play(\d+)?\.radio13\.ru:8000/
110 | ^https?://stream(\d+)?\.media\.rambler\.ru/
111 | ^https?://pub(\d+)?\.di\.fm/
112 | ^https?://[^/]+\.streamtheworld\.com/
113 | ^https?://[^/]+\.gaduradio\.pl/
114 | ^https?://r-a-d\.io/.+\.mp3$
115 | ^https?://mp3tslg\.tdf-cdn\.com/
116 | ^https?://[^/]+/anony/mjpg\.cgi$
117 | ^https?://[^/]+/mjpg/video\.mjpg
118 | ^https?://air\.radiorecord\.ru(:\d+)?/
119 | ^https?://[^/]+\.rastream\.com(:\d+)?/
120 | ^https?://audiots\.scdn\.arkena\.com/
121 | ^https?://[a-z0-9]+\.cdn\.dvmr\.fr(:\d+)?/.+\.mp3
122 |
123 | # Avoid following any kind of 'share' or 'bookmark' link
124 | ^https?://(www|draft)\.blogger\.com/(navbar\.g|post-edit\.g|delete-comment\.g|comment-iframe\.g|share-post\.g|email-post\.g|blog-this\.g|delete-backlink\.g|rearrange|blog_this\.pyra)\?
125 | ^https?://(www|px\.srvcs)\.tumblr\.com/(impixu\?|share(/link/?)?\?|reblog/)
126 | ^https?://plus\.google\.com/share\?
127 | ^https?://(apis|plusone)\.google\.com/_/\+1/
128 | ^https?://(ssl\.|www\.)?reddit\.com/(login\?dest=|submit\?|static/button/button)
129 | ^https?://(www\.)?digg\.com/submit\?
130 | ^https?://(www\.)?facebook\.com/(plugins/(share_button|like(box)?)\.php|sharer/sharer\.php|sharer?\.php|dialog/(feed|share))\?
131 | ^https?://(www\.)?facebook\.com/v[\d\.]+/plugins/like\.php
132 | ^https?://social-plugins\.line\.me/lineit/share
133 | ^https?://(www\.)?twitter\.com/(share\?|intent/((re)?tweet|favorite)|home/?\?status=|\?status=)
134 | ^https?://platform\d?\.twitter\.com/widgets/tweet_button.html\?
135 | ^https?://www\.newsvine\.com/_wine/save\?
136 | ^https?://www\.netvibes\.com/subscribe\.php\?
137 | ^https?://add\.my\.yahoo\.com/(rss|content)\?
138 | ^https?://www\.addtoany\.com/(add_to/|share_save\?)
139 | ^https?://www\.addthis\.com/bookmark\.php\?
140 | ^https?://([^\.]+\.)?pinterest\.com/pin/create/
141 | ^https?://www\.linkedin\.com/(cws/share|shareArticle)\?
142 | ^https?://(www\.)?stumbleupon\.com/(submit\?|badge/embed/)
143 | ^https?://csp\.cyworld\.com/bi/bi_recommend_pop\.php\?
144 | ^https?://share\.flipboard\.com/bookmarklet/popout\?
145 | ^https?://flattr.com/submit/auto\?
146 | ^https?://(www\.)?myspace\.com/Modules/PostTo/
147 | ^https?://www\.google\.com/bookmarks/mark\?
148 | ^https?://myweb2\.search\.yahoo\.com/myresults/bookmarklet\?
149 | ^https?://vuible\.com/pins-settings/
150 | ^https?://news\.ycombinator\.com/submitlink\?
151 | ^https?://reporter\.es\.msn\.com/\?fn=contribute
152 | ^https?://www\.blinklist\.com/index\.php\?Action=Blink/addblink\.php
153 | ^https?://sphinn\.com/index\.php\?c=post&m=submit&
154 | ^https?://posterous\.com/share\?
155 | ^https?://del\.icio\.us/post\?
156 | ^https?://delicious\.com/(save|post)\?
157 | ^https?://(www\.)?friendfeed\.com/share\?
158 | ^https?://(www\.)?xing\.com/(app/user\?op=share|social_plugins/share\?)
159 | ^https?://iwiw\.hu/pages/share/share\.jsp\?
160 | ^https?://memori(\.qip)?\.ru/link/\?
161 | ^https?://wow\.ya\.ru/posts_(add|share)_link\.xml\?
162 | ^https?://connect\.mail\.ru/share\?
163 | ^https?://zakladki\.yandex\.ru/newlink\.xml\?
164 | ^https?://vkontakte\.ru/share\.php\?
165 | ^https?://www\.odnoklassniki\.ru/dk\?st\.cmd=addShare
166 | ^https?://www\.google\.com/(reader/link\?|buzz/post\?)
167 | ^https?://service\.weibo\.com/share/share\.php\?
168 | ^https?://(www\.)?technorati\.com/faves/?\?add=
169 | ^https?://bufferapp\.com/add\?
170 | ^https?://b\.hatena\.ne\.jp/add\?
171 | ^https?://api\.addthis\.com/
172 | ^https?://bookmark\.naver\.com/post\?
173 | ^https?://(www\.)?instapaper\.com/hello2\?
174 | ^https?://getpocket\.com/(save|edit)/?\?
175 | ^https?://medium\.com/_/(vote|bookmark|subscribe)/
176 | ^https?://telegram\.me/share/url\?
177 |
178 | # mail.google.com requires login but shows up on the web surprisingly often
179 | ^https?://mail\.google\.com/mail/
180 |
181 | # This is the default gravatar that you don't want a million copies of
182 | ^https?://(\d|www|secure)\.gravatar\.com/avatar/ad516503a11cd5ca435acc9bb6523536
183 |
184 | # imageshack's 404 page that you would be hitting quite often otherwise
185 | ^https?://imageshack\.com/lost$
186 |
187 | # A loop on khaleejtimes.com
188 | ^https?://www\.khaleejtimes\.com/.+/kt_.+/kt_
189 | ^https?://www\.khaleejtimes\.com/.+/images/.+/images/
190 | ^https?://www\.khaleejtimes\.com/.+/imgactv/.+/imgactv/
191 |
192 | # More loops
193 | ^https?://photobucket\.com/.+/albums/.+/albums/
194 | ^https?://([^/]+\.)?gdcvault\.com(/.*/|/)(fonts(/.*/|/)fonts/|css(/.*/|/)css/|img(/.*/|/)img/)
195 | ^https?://static\.licdn\.com/sc/p/com\.linkedin\.nux(:|%3A)nux-static-content(\+|%2B)[\d\.]+/f/
196 | ^https?://static\.licdn\.com/sc/p/.+/f//
197 | ^https?://tm\.uol\.com\.br/h/.+/h/
198 | ^https?://((s-)?static\.ak\.fbcdn\.net|(connect\.|www\.)?facebook\.com)/connect\.php/js/.*rsrc\.php
199 | ^https?://web\.archive\.org/web/[^/]+/https?\:/[^/]+\.addthis\.com/.+/static/.+/static/
200 | ^https?://[^/]+\.libsyn\.com/.+/%2[02]https?:/
201 | ^https?://www\.infomous\.com/cloud_widget/lib/lib/
202 |
203 | # This specifically catches only *invalid* flickr.com links extracted by wpull
204 | ^https?://www\.flickr\.com/(explore/|photos/[^/]+/(sets/\d+/(page\d+/)?)?)\d+_[a-f0-9]+(_[a-z])?\.jpg$
205 |
206 | # Avoid grabbing thousands of these; they page-requisite each other
207 | ^https?://media\.opb\.org/clips/embed/.+\.js$
208 |
209 | # Per-post and per-comment Atom feeds
210 | ^https?://www\.blogger\.com/feeds/\d+/posts/default/\d+
211 | ^https?://www\.blogger\.com/feeds/\d+/\d+/comments/default/\d+
212 |
213 | # Bogus /disqus.com path
214 | ^https?://.+/.+/disqus\.com/forums/$
215 |
216 | # Bogus literal "/page/%d/" URLs (not filled with a number)
217 | ^https?://{any_start_netloc}(/.*|/)page/%d/$
218 |
219 | # Bogus URLs on tumblr blogs
220 | ^https?://{any_start_netloc}/.*(\?|%5Cx26)route=(/page/:page|/archive/:year/:month|/tagged/:tag|/post/:id|/image/:post_id)
221 | ^https?://{any_start_netloc}/.*%5Cx26route=/archive
222 |
223 | # There are too many avatars on tumblr.com
224 | ^https?://\d+\.media\.tumblr\.com/avatar_.+_16\.pn[gj]$
225 |
226 | ^https?://www\.livejournal\.com/(tools/memadd|update|(identity/)?login)\.bml\?
227 | ^https?://[^\.]+\.livejournal\.com/.+/\*sup_ru/ru/UTF-8/
228 | ^https?://[^\.]+\.livejournal\.com/.+http://[^\.]+\.livejournal\.com/
229 |
230 | ^https?://www\.dreamwidth\.org/tools/(memadd|tellafriend)\?
231 |
232 | ^https?://r-login\.wordpress\.com/remote-login\.php
233 | ^https?://{any_start_netloc}/(wp-admin/|wp-login\.php\?)
234 | ^https?://[^/]+\.facebook\.com/login\.php
235 |
236 | # Ignore /search.*updated-(min|max)= blogspot pagination because all posts are
237 | # crawled anyway via the _archive.html pages. Need to ignore on all domains
238 | # because blogspot also runs on non-blogspot.com domains.
239 | ^https?://{any_start_netloc}/search(/label/[^\?]+|\?q=[^&]+|)[\?&]updated-(min|max)=\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d.*&max-results=\d+
240 |
241 | # Ignore bogus /CSI/ links on blogspot.com
242 | ^https?://.+\.blogspot\.(com|in|com\.au|co\.uk|jp|co\.nz|ca|de|it|fr|se|sg|es|pt|com\.br|ar|mx|kr)/(\d{4}/\d{2}/|search/label/)(CSI/$|.*/CSI/CSI/CSI/)
243 |
244 | # Links to ?share=(twitter|facebook|reddit|email|google-plus-1) etc.
245 | # These typically redirect.
246 | ^https?://{any_start_netloc}/.+[\?&]share=[a-z]{4,}
247 |
248 | # Per-comment links
249 | ^https?://{any_start_netloc}/.+[\?&]mode=reply
250 | ^https?://{any_start_netloc}/.+[\?&](replyto(com)?|like_comment)=\d+
251 | ^https?://{any_start_netloc}/.+\?showComment(=|%5C)\d+
252 | ^https?://{any_start_netloc}/.+/quote-comment-\d+/$
253 | ^https?://{any_start_netloc}/.+/jetpack-comment/\?blogid=\d+&postid=\d+
254 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/imdb:
--------------------------------------------------------------------------------
1 | # Intended for archiving imdb forums
2 |
3 | ^http://b\.scorecardresearch\.com/
4 | ^http://ad\.doubleclick\.net/
5 | ^http://www\.imdb\.com/rd/
6 | ^http://www\.imdb\.com/.+\?ref_=
7 | ^http://www\.imdb\.com/.+/board/flat/
8 | ^http://www\.imdb\.com/.+/board/inline/
9 | ^http://www\.imdb\.com/.+/board/thread/
10 | ^http://www\.imdb\.com/help/boards_posting\.html
11 | ^http://www\.imdb\.com/register/
12 | ^http://www\.imdb\.com/.+/board/.+/\d+\?d=
13 | ^http://www\.imdb\.com/.+/videogallery/.+/.+/
14 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/mediawiki:
--------------------------------------------------------------------------------
1 | # This ignore set avoids grabbing the full history of each page, because there
2 | # are generally far too many ?oldid= pages to crawl completely.
3 | ^https?://{any_start_netloc}/.+[\?&]oldid=\d+
4 | ^https?://{any_start_netloc}/.+[\?&]curid=\d+
5 | ^https?://{any_start_netloc}/.+[\?&]limit=(20|100|250|500)
6 | ^https?://{any_start_netloc}/.+[\?&]hide(minor|bots|anons|liu|myself|redirs|links|trans|patrolled)=
7 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:(UserLogin|UserLogout|Translate|MobileFeedback|MobileOptions|RecentChangesLinked|Diff|MobileDiff)
8 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:RecentChanges&from=\d+
9 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:ListFiles&dir=prev&offset=\d+
10 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:(ListFiles|PrefixIndex).*&
11 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:ListFiles.*&user=
12 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:Log/
13 | ^https?://{any_start_netloc}/.+[\?&]action=edit&section=(\d+|new)
14 | ^https?://{any_start_netloc}/.+[\?&]feed(format)?=atom
15 | ^https?://{any_start_netloc}/.+[\?&]printable=yes
16 | ^https?://{any_start_netloc}/.+[\?&]mobileaction=
17 | ^https?://{any_start_netloc}/.+[\?&]undo(after)?=\d+
18 | ^https?://{any_start_netloc}/.+[\?&]lqt_method=
19 |
20 | # Links to pages that don't exist
21 | ^https?://{any_start_netloc}/.+[\?&]redlink=1
22 |
23 | # Loops
24 | ^https?://{any_start_netloc}/.*User_talk:.+/User_talk:
25 | ^https?://{any_start_netloc}/.*User_blog:.+/User_blog:
26 | ^https?://{any_start_netloc}/.*User:.+/User:
27 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/meetupeverywhere:
--------------------------------------------------------------------------------
1 | ^https?://.*\.meetup\.com/login/
2 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/nogravatar:
--------------------------------------------------------------------------------
1 | ^https?://(\d|secure)\.gravatar\.com/avatar/
2 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/noonion:
--------------------------------------------------------------------------------
1 | ^https?://[^/]+\.onion/
2 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/nosortedindex:
--------------------------------------------------------------------------------
1 | # These are the "sort by" links on "index of" directory listings
2 | \?C=[NMSD];O=[AD]$
3 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/pinterest:
--------------------------------------------------------------------------------
1 | ^https?://www\.pinterest\.com/[^/]+/\^/[^/]+/
2 | ^https?://www\.pinterest\.com/[^/]+/[^/]+/\^/[^/]+/
3 | ^https?://www\.pinterest\.com/[^/]+/[^/]+\.[^/]+
4 | ^https?://www\.pinterest\.com/[^/]+/[^/]+/[^/]+\.[^/]+
5 | ^https?://www\.pinterest\.com/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\.js
6 | ^https?://www\.pinterest\.com/[^/]+/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\.js
7 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/reddit:
--------------------------------------------------------------------------------
1 | # These ignores are designed for archiving subreddits. Note that not
2 | # all comments will be downloaded because many comments are collapsed
3 | # by reddit.
4 |
5 | ^https?://(www|old)\.reddit\.com/gold\?goldtype=
6 | # URLs with utm_ can (hopefully) be safely ignored because reddit also sends
7 | # href=""s without the utm_ trackers.
8 | ^https?://(www|old)\.reddit\.com/r/[^/]+/.*[\?&]utm_
9 | ^https?://(www|old)\.reddit\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/[a-z0-9]+
10 | ^https?://(www|old)\.reddit\.com/r/[^/]+/comments/[a-z0-9]+.*\?sort=
11 | ^https?://(www|old)\.reddit\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/\.compact
12 | ^https?://(www|old)\.reddit\.com/r/[^/]+/(top|new|rising|controversial|gilded|ads)/.+[\?&]after=
13 | ^https?://(www|old)\.reddit\.com/r/[^/]+/related/
14 | ^https?://(www|old)\.reddit\.com/r/[^/]+/(gilded)?\.mobile\?
15 | ^https?://(www|old)\.reddit\.com/r/[^/]+/search/?\?
16 | ^https?://(www|old)\.reddit\.com/r/[^/]+/wiki/(revisions|discussions)/user/.+
17 | ^https?://(www|old)\.reddit\.com/user/[^/]+/(comments/)?.+[\?&]sort=
18 | ^https?://(www|old)\.reddit\.com/.+/\.rss$
19 | \.reddit\.com/message/compose/?\?
20 | ^https?://(m|out|simple|amp)\.reddit\.com/
21 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/singletumblr:
--------------------------------------------------------------------------------
1 | # You generally want this ignore set if you are archiving a tumblr blog,
2 | # because tumblr blogs can have tens of thousands of links to other tumblr
3 | # blogs, and grab-site's default --offsite-links behavior will otherwise grab
4 | # all of their homepages.
5 | #
6 | # This homepage ignore won't apply to any of the start URLs given to grab-site.
7 |
8 | ^https?://[^/]+\.tumblr\.com/$
9 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/twitter:
--------------------------------------------------------------------------------
1 | ^https?://((?:www|mobile)\.)?twitter\.com/.+[\?&](?:id|lang|locale|screen_name|nav)=
2 | ^https?://mobile\.twitter\.com/i/anonymize\?data=
3 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/youtube:
--------------------------------------------------------------------------------
1 | \.?youtube\.com/user/[^/]+/(playlists|channels|videos)\?(flow|view|sort|live_view)=
2 |
--------------------------------------------------------------------------------
/libgrabsite/main.py:
--------------------------------------------------------------------------------
1 | import faulthandler
2 | faulthandler.enable()
3 |
4 | import re
5 | import os
6 | import sys
7 | import urllib.request
8 | import shutil
9 | import binascii
10 | import datetime
11 | import shlex
12 | import click
13 | import libgrabsite
14 |
15 | def print_version(ctx, param, value):
16 | if not value or ctx.resilient_parsing:
17 | return
18 | click.echo(libgrabsite.__version__)
19 | ctx.exit()
20 |
21 | def replace_2arg(args, arg, replacement):
22 | 	if arg not in args:
23 | 		return
24 | 	idx = args.index(arg)
25 | args.pop(idx)
26 | args.pop(idx)
27 | for r in reversed(replacement):
28 | args.insert(idx, r)
29 |
30 | def patch_dns_inet_is_multicast():
31 | """
32 | Patch dnspython's dns.inet.is_multicast to not raise ValueError:
33 | https://github.com/ArchiveTeam/grab-site/issues/111
34 | """
35 | import dns.inet
36 | is_multicast_dnspython = dns.inet.is_multicast
37 | def is_multicast(text):
38 | try:
39 | return is_multicast_dnspython(text)
40 | except Exception:
41 | return False
42 | dns.inet.is_multicast = is_multicast
43 |
44 | @click.command()
45 |
46 | @click.option('--concurrency', default=2, metavar='NUM',
47 | help='Use this many connections to fetch in parallel (default: 2).')
48 |
49 | @click.option('--concurrent', default=-1, metavar='NUM',
50 | help='Alias for --concurrency.')
51 |
52 | @click.option('--delay', default="0", metavar='DELAY',
53 | help=
54 | 'Time to wait between requests, in milliseconds (default: 0). '
55 | 'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX '
56 | 'for each request. Delay applies to each concurrent fetcher, not globally.')
57 |
58 | @click.option('--recursive/--1', default=True,
59 | help=
60 | '--recursive (default: true) to crawl under last /path/ component '
61 | 'recursively, or --1 to get just START_URL.')
62 |
63 | @click.option('--offsite-links/--no-offsite-links', default=True,
64 | help=
65 | '--offsite-links (default: true) to grab all links to a depth of 1 '
66 | 'on other domains, or --no-offsite-links to disable.')
67 |
68 | @click.option('--igsets', default="", metavar='LIST',
69 | help='Comma-separated list of ignore sets to use in addition to "global".')
70 |
71 | @click.option('--ignore-sets', default="", metavar='LIST',
72 | help='Alias for --igsets.')
73 |
74 | @click.option('--no-global-igset', is_flag=True,
75 | help='Do not add the "global" ignore set.')
76 |
77 | @click.option('--import-ignores', default=None, metavar='FILE',
78 | help='Copy this file to DIR/ignores before the crawl begins.')
79 |
80 | @click.option('--igon/--igoff', default=False,
81 | help=
82 | '--igon (default: false) to print all URLs being ignored to the terminal '
83 | 'and dashboard.')
84 |
85 | @click.option('--debug', is_flag=True, help='Print a lot of debugging information.')
86 |
87 | @click.option('--video/--no-video', default=True,
88 | help=
89 | '--no-video (default: false) to skip the download of videos by both '
90 | 'mime type and file extension. Skipped videos are logged to '
91 | 'DIR/skipped_videos')
92 |
93 | @click.option('-i', '--input-file', default=None, type=str,
94 | help=
95 | 'Load list of URLs-to-grab from a local file or from a URL; like wget -i. '
96 | 'File must be a newline-delimited list of URLs. '
97 | 'Combine with --1 to avoid a recursive crawl on each URL.')
98 |
99 | @click.option('--max-content-length', default=-1, metavar='N',
100 | help=
101 | "Skip the download of any response that claims a Content-Length "
102 | "larger than N (default: -1, don't skip anything).")
103 |
104 | @click.option('--level', default="inf", metavar='NUM',
105 | help='Recurse this many levels (default: inf).')
106 |
107 | @click.option('--page-requisites-level', default="5", metavar='NUM',
108 | 	help='Recurse this many levels for page requisites (default: 5).')
109 |
110 | @click.option('--warc-max-size', default=5368709120, metavar='BYTES',
111 | help=
112 | 'Try to limit each WARC file to around BYTES bytes before rolling over '
113 | 'to a new WARC file (default: 5368709120, which is 5GiB).')
114 |
115 | @click.option('--ua', default="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
116 | metavar='STRING', help='Send User-Agent: STRING instead of pretending to be Firefox on Windows.')
117 |
118 | @click.option('--wpull-args', default="",
119 | metavar='ARGS', help=
120 | r'String containing additional arguments to pass to wpull; '
121 | r'see ~/.local/bin/wpull --help. ARGS is split with shlex.split '
122 | r'and individual arguments can contain spaces if quoted, e.g. '
123 | r'--wpull-args="--youtube-dl \"--youtube-dl-exe=/My Documents/youtube-dl\""')
124 |
125 | @click.option('--sitemaps/--no-sitemaps', default=True,
126 | help=
127 | '--sitemaps (default: true) to queue URLs from sitemap.xml '
128 | 'at the root of the site, or --no-sitemaps to disable.')
129 |
130 | @click.option('--dupespotter/--no-dupespotter', default=True,
131 | help=
132 | '--dupespotter (default: true) to skip the extraction of links '
133 | 'from pages that look like duplicates of earlier pages, or '
134 | '--no-dupespotter to disable. Disable this for sites that are '
135 | 'directory listings.')
136 |
137 | @click.option('--id', default=None, type=str, metavar='ID',
138 | help=
139 | 'Use id ID for the crawl instead of a random 128-bit id. '
140 | 'This must be unique for every crawl.')
141 |
142 | @click.option('--dir', default=None, type=str, metavar='DIR', help=
143 | 'Put control files, temporary files, and unfinished WARCs in DIR '
144 | '(default: a directory name based on the URL, date, and first 8 '
145 | 'characters of the id).')
146 |
147 | @click.option('--finished-warc-dir', default=None, type=str, metavar='FINISHED_WARC_DIR',
148 | help=
149 | 'Absolute path to a directory into which finished .warc.gz and .cdx '
150 | 'files will be moved.')
151 |
152 | @click.option('--permanent-error-status-codes', default='401,403,404,405,410', type=str,
153 | metavar='STATUS_CODES',
154 | help=
155 | 'A comma-separated list of HTTP status codes to treat as a permanent '
156 | 'error and therefore *not* retry (default: 401,403,404,405,410)')
157 |
158 | @click.option('--which-wpull-args-partial', is_flag=True,
159 | help=
160 | 'Print a partial list of wpull arguments that would be used and exit. '
161 | 'Excludes grab-site-specific features, and removes DIR/ from paths. '
162 | 'Useful for reporting bugs on wpull without grab-site involvement.')
163 |
164 | @click.option('--which-wpull-command', is_flag=True,
165 | help=
166 | "Populate DIR/ but don't start wpull; instead print the command that would "
167 | "have been used to start wpull with all of the grab-site functionality.")
168 |
169 | @click.option('--version', is_flag=True, callback=print_version,
170 | expose_value=False, is_eager=True, help='Print version and exit.')
171 |
172 | @click.argument('start_url', nargs=-1, required=False)
173 |
174 | def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
175 | ignore_sets, no_global_igset, import_ignores, igon, debug, video, level,
176 | page_requisites_level, max_content_length, sitemaps, dupespotter, warc_max_size,
177 | ua, input_file, wpull_args, start_url, id, dir, finished_warc_dir,
178 | permanent_error_status_codes, which_wpull_args_partial, which_wpull_command):
179 | """
180 | Runs a crawl on one or more URLs. For additional help, see
181 |
182 | https://github.com/ArchiveTeam/grab-site/blob/master/README.md#usage
183 | """
184 | if not (input_file or start_url):
185 | 		print("Neither a START_URL nor --input-file= was specified; see --help", file=sys.stderr)
186 | sys.exit(1)
187 | elif input_file and start_url:
188 | print("Can't specify both START_URL and --input-file=; see --help", file=sys.stderr)
189 | sys.exit(1)
190 |
191 | span_hosts_allow = "page-requisites,linked-pages"
192 | if not offsite_links:
193 | span_hosts_allow = "page-requisites"
194 |
195 | if concurrent != -1:
196 | concurrency = concurrent
197 |
198 | if ignore_sets != "":
199 | igsets = ignore_sets
200 |
201 | if start_url:
202 | claim_start_url = start_url[0]
203 | else:
204 | input_file_is_remote = bool(re.match("^(ftp|https?)://", input_file))
205 | if input_file_is_remote:
206 | claim_start_url = input_file
207 | else:
208 | claim_start_url = 'file://' + os.path.abspath(input_file)
209 |
210 | if not id:
211 | id = binascii.hexlify(os.urandom(16)).decode('utf-8')
212 | ymd = datetime.datetime.utcnow().isoformat()[:10]
213 | no_proto_no_trailing = claim_start_url.split('://', 1)[1].rstrip('/')[:100]
214 | unwanted_chars_re = r'[^-_a-zA-Z0-9%\.,;@+=]'
215 | warc_name = "{}-{}-{}".format(re.sub(unwanted_chars_re, '-', no_proto_no_trailing).lstrip('-'), ymd, id[:8])
216 |
217 | # make absolute because wpull will start in temp/
218 | if not dir:
219 | working_dir = os.path.abspath(warc_name)
220 | else:
221 | working_dir = os.path.abspath(dir)
222 |
223 | LIBGRABSITE = os.path.dirname(libgrabsite.__file__)
224 | args = [
225 | "--debug" if debug else "--quiet",
226 | "-U", ua,
227 | "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
228 | "--header", "Accept-Language: en-US,en;q=0.5",
229 | "--no-check-certificate",
230 | "--no-robots",
231 | "--inet4-only",
232 | "--dns-timeout", "20",
233 | "--connect-timeout", "20",
234 | "--read-timeout", "900",
235 | "--session-timeout", str(86400 * 2),
236 | "--tries", "3",
237 | "--waitretry", "5",
238 | "--max-redirect", "8",
239 | "--output-file", "{}/wpull.log".format(working_dir),
240 | "--database", "{}/wpull.db".format(working_dir),
241 | "--plugin-script", "{}/wpull_hooks.py".format(LIBGRABSITE),
242 | "--save-cookies", "{}/cookies.txt".format(working_dir),
243 | "--delete-after",
244 | "--page-requisites",
245 | "--no-parent",
246 | "--concurrent", str(concurrency),
247 | "--warc-file", "{}/{}".format(working_dir, warc_name),
248 | "--warc-max-size", str(warc_max_size),
249 | "--warc-cdx",
250 | "--strip-session-id",
251 | "--escaped-fragment",
252 | "--level", level,
253 | "--page-requisites-level", page_requisites_level,
254 | "--span-hosts-allow", span_hosts_allow,
255 | "--load-cookies", "{}/default_cookies.txt".format(LIBGRABSITE),
256 | ]
257 |
258 | if os.name != "nt" and sys.platform != "cygwin":
259 | args += [
260 | "--debug-manhole"
261 | ]
262 |
263 | if finished_warc_dir is not None:
264 | args += ["--warc-move", finished_warc_dir]
265 |
266 | if sitemaps:
267 | args += ["--sitemaps"]
268 |
269 | if recursive:
270 | args += ["--recursive"]
271 |
272 | if wpull_args:
273 | args += shlex.split(wpull_args)
274 |
275 | DIR_input_file = os.path.join(working_dir, "input_file")
276 | if start_url:
277 | args.extend(start_url)
278 | else:
279 | args += ["--input-file", DIR_input_file]
280 |
281 | if which_wpull_args_partial:
282 | replace_2arg(args, "--output-file", ["--output-file", "wpull.log"])
283 | replace_2arg(args, "--database", ["--database", "wpull.db"])
284 | replace_2arg(args, "--plugin-script", [])
285 | replace_2arg(args, "--save-cookies", ["--save-cookies", "cookies.txt"])
286 | replace_2arg(args, "--load-cookies", [])
287 | replace_2arg(args, "--warc-file", ["--warc-file", warc_name])
288 | try:
289 | args.remove("--quiet")
290 | except ValueError:
291 | pass
292 | print(" ".join(shlex.quote(a) for a in args))
293 | return
294 |
295 | # Create DIR and DIR files only after which_wpull_args_* checks
296 | os.makedirs(working_dir)
297 | temp_dir = os.path.join(working_dir, "temp")
298 | os.makedirs(temp_dir)
299 |
300 | if input_file is not None:
301 | # wpull -i doesn't support URLs, so download the input file ourselves if necessary
302 | if input_file_is_remote:
303 | # TODO: use wpull with correct user agent instead of urllib.request
304 | # wpull -O fails: https://github.com/chfoo/wpull/issues/275
305 | u = urllib.request.urlopen(input_file)
306 | with open(DIR_input_file, "wb") as f:
307 | while True:
308 | s = u.read(1024 * 1024)
309 | if not s:
310 | break
311 | f.write(s)
312 | else:
313 | shutil.copyfile(input_file, DIR_input_file)
314 |
315 | with open("{}/id".format(working_dir), "w") as f:
316 | f.write(id)
317 |
318 | with open("{}/start_url".format(working_dir), "w") as f:
319 | f.write(claim_start_url)
320 |
321 | with open("{}/all_start_urls".format(working_dir), "w") as f:
322 | for u in start_url:
323 | f.write(u + "\n")
324 |
325 | with open("{}/concurrency".format(working_dir), "w") as f:
326 | f.write(str(concurrency))
327 |
328 | with open("{}/max_content_length".format(working_dir), "w") as f:
329 | f.write(str(max_content_length))
330 |
331 | with open("{}/igsets".format(working_dir), "w") as f:
332 | f.write("{}{}".format("" if no_global_igset else "global,", igsets))
333 |
334 | if video:
335 | with open("{}/video".format(working_dir), "w") as f:
336 | pass
337 |
338 | if not igon:
339 | with open("{}/igoff".format(working_dir), "w") as f:
340 | pass
341 |
342 | with open("{}/ignores".format(working_dir), "w") as f:
343 | if import_ignores is not None:
344 | f.write(open(import_ignores, "r").read())
345 |
346 | with open("{}/delay".format(working_dir), "w") as f:
347 | f.write(delay)
348 |
349 | with open("{}/scrape".format(working_dir), "w") as f:
350 | pass
351 |
352 | # We don't actually need to write control files for this mode to work, but the
353 | # only reason to use this is if you're starting wpull manually with modified
354 | # arguments, and wpull_hooks.py requires the control files.
355 | if which_wpull_command:
356 | bin = sys.argv[0].replace("/grab-site", "/wpull") # TODO
357 | print("GRAB_SITE_WORKING_DIR={} DUPESPOTTER_ENABLED={} {} {}".format(
358 | working_dir, int(dupespotter), bin, " ".join(shlex.quote(a) for a in args)))
359 | return
360 |
361 | patch_dns_inet_is_multicast()
362 |
363 | # Mutate argv, environ, cwd before we turn into wpull
364 | sys.argv[1:] = args
365 | os.environ["GRAB_SITE_WORKING_DIR"] = working_dir
366 | os.environ["DUPESPOTTER_ENABLED"] = "1" if dupespotter else "0"
367 | # We can use --warc-tempdir= to put WARC-related temporary files in a temp
368 | # directory, but wpull also creates non-WARC-related "resp_cb" temporary
369 | # files in the cwd, so we must start wpull in temp/ anyway.
370 | os.chdir(temp_dir)
371 |
372 | # Modify NO_DOCUMENT_STATUS_CODES
373 | # https://github.com/chfoo/wpull/issues/143
374 | from wpull.processor.web import WebProcessor
375 | WebProcessor.NO_DOCUMENT_STATUS_CODES = \
376 | tuple(int(code) for code in permanent_error_status_codes.split(","))
377 |
378 | import wpull.application.main
379 | # Don't let wpull install a handler for SIGINT or SIGTERM,
380 | # because we install our own in wpull_hooks.py.
381 | wpull.application.main.main(use_signals=False)
382 |
383 |
384 | if __name__ == '__main__':
385 | main()
386 |
--------------------------------------------------------------------------------
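
The command-line options defined in main.py above combine freely; a hedged usage sketch (URLs and file names are illustrative):

    # Recursive crawl with 4 fetchers, a random 250-750 ms delay per fetcher,
    # and offsite links disabled.
    grab-site --concurrency 4 --delay 250-750 --no-offsite-links https://example.com/blog/

    # Non-recursive fetch of every URL in a local list, like wget -i.
    grab-site --1 -i urls.txt

    # Print a partial list of the wpull arguments that would be used, then exit.
    grab-site --which-wpull-args-partial https://example.com/
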
/libgrabsite/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import txaio
4 | txaio.use_asyncio()
5 | import os
6 | import json
7 | import pprint
8 | import asyncio
9 | from autobahn.asyncio.websocket import WebSocketServerFactory, WebSocketServerProtocol
10 |
11 | class GrabberServerProtocol(WebSocketServerProtocol):
12 | def __init__(self):
13 | super().__init__()
14 | self.mode = None
15 |
16 | def onConnect(self, request):
17 | self.peer = request.peer
18 | print(f"{self.peer} connected")
19 | self.factory.clients.add(self)
20 |
21 | def onClose(self, wasClean, code, reason):
22 | print(f"{self.peer} disconnected")
23 | self.factory.clients.discard(self)
24 |
25 | def onMessage(self, payload, isBinary):
26 | obj = json.loads(payload.decode("utf-8"))
27 | type = obj["type"]
28 | if self.mode is None and type == "hello" and obj.get("mode"):
29 | mode = obj["mode"]
30 | if mode in ("dashboard", "grabber"):
31 | self.mode = mode
32 | if mode == "grabber":
33 | print(f'{self.peer} is grabbing {obj["url"]}')
34 | elif mode == "dashboard":
35 | user_agent = obj.get("user_agent", "(no User-Agent)")
36 | print(f"{self.peer} is dashboarding with {user_agent}")
37 | elif self.mode == "grabber":
38 | if type == "download":
39 | self.broadcast_to_dashboards({
40 | "type": type,
41 | "job_data": obj["job_data"],
42 | "url": obj["url"],
43 | "response_code": obj["response_code"],
44 | "wget_code": obj["response_message"]
45 | })
46 | elif type in ("stdout", "stderr"):
47 | self.broadcast_to_dashboards({
48 | "type": type,
49 | "job_data": obj["job_data"],
50 | "message": obj["message"]
51 | })
52 | elif type == "ignore":
53 | self.broadcast_to_dashboards({
54 | "type": type,
55 | "job_data": obj["job_data"],
56 | "url": obj["url"],
57 | "pattern": obj["pattern"],
58 | })
59 |
60 | def broadcast_to_dashboards(self, obj):
61 | for client in self.factory.clients:
62 | if client.mode == "dashboard":
63 | client.sendMessage(json.dumps(obj).encode("utf-8"))
64 |
65 | # Called when we get an HTTP request instead of a WebSocket request
66 | def sendServerStatus(self, redirectUrl=None, redirectAfter=0):
67 | requestPath = self.http_request_uri.split("?")[0]
68 | if requestPath == "/":
69 | self.send_page("dashboard.html", 200, "OK", "text/html; charset=UTF-8")
70 | elif requestPath == "/favicon.ico":
71 | self.send_page("favicon.ico", 200, "OK", "image/x-icon")
72 | else:
73 | self.send_page("404.html", 404, "Not Found", "text/html; charset=UTF-8")
74 |
75 | 	# Based on Autobahn's WebSocketServerProtocol.sendHtml
76 | def send_page(self, fname, code, status, content_type):
77 | with open(os.path.join(os.path.dirname(__file__), fname), "rb") as f:
78 | response_body = f.read()
79 | response = f"HTTP/1.1 {code} {status}\r\n"
80 | response += f"Content-Type: {content_type}\r\n"
81 | response += f"Content-Length: {len(response_body)}\r\n"
82 | response += "X-Frame-Options: DENY\r\n"
83 | response += "\r\n"
84 | self.sendData(response.encode("utf-8"))
85 | self.sendData(response_body)
86 |
87 |
88 | class GrabberServerFactory(WebSocketServerFactory):
89 | protocol = GrabberServerProtocol
90 |
91 | def __init__(self):
92 | super().__init__()
93 | self.clients = set()
94 |
95 |
96 | def main():
97 | loop = asyncio.get_event_loop()
98 | ports = list(int(p) for p in os.environ.get("GRAB_SITE_PORT", "29000").split(","))
99 | factory = GrabberServerFactory()
100 | interface = os.environ.get("GRAB_SITE_INTERFACE", "0.0.0.0")
101 | for port in ports:
102 | coro = loop.create_server(factory, interface, port)
103 | loop.run_until_complete(coro)
104 | print(f"grab-site server listening on {interface}:{port}")
105 |
106 | loop.run_forever()
107 |
108 |
109 | if __name__ == "__main__":
110 | main()
111 |
--------------------------------------------------------------------------------
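
The dashboard server above reads its interface and port list from the environment (GRAB_SITE_INTERFACE, GRAB_SITE_PORT); the crawler side reads GRAB_SITE_HOST and GRAB_SITE_PORT in wpull_hooks.py below. A minimal sketch, with the host addresses being illustrative:

    # Serve the dashboard on localhost only, on two ports.
    GRAB_SITE_INTERFACE=127.0.0.1 GRAB_SITE_PORT=29000,29001 gs-server

    # Point a crawl at a dashboard running on another machine.
    GRAB_SITE_HOST=10.0.0.5 GRAB_SITE_PORT=29000 grab-site https://example.com/
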
/libgrabsite/wpull_hooks.py:
--------------------------------------------------------------------------------
1 | import re
2 | import re2
3 | import os
4 | import sys
5 | import time
6 | import signal
7 | import random
8 | import functools
9 | import traceback
10 | import asyncio
11 | import urllib.parse
12 |
13 | from wpull.application.hook import Actions
14 | from wpull.application.plugin import WpullPlugin, PluginFunctions, hook, event
15 | from wpull.pipeline.app import AppSession
16 | from wpull.pipeline.item import URLRecord
17 | from wpull.pipeline.session import ItemSession
18 | from wpull.url import URLInfo
19 |
20 | from libgrabsite import wpull_tweaks, dashboard_client
21 | import libgrabsite
22 |
23 |
24 | working_dir = os.environ["GRAB_SITE_WORKING_DIR"]
25 | def cf(fname):
26 | return os.path.join(working_dir, fname)
27 |
28 | def re_compile(regexp):
29 | # Validate with re first, because re2 may be more prone to segfaulting on
30 | # bad regexps, and because re returns useful errors.
31 | re.compile(regexp)
32 | try:
33 | return re2.compile(regexp)
34 | except re.error:
35 | # Regular expressions with lookaround expressions cannot be compiled with
36 | # re2, so on error try compiling with re.
37 | return re.compile(regexp)
38 |
39 | def compile_combined_regexp(patterns):
40 | # If there are no patterns, we want to ignore nothing, not everything.
41 | if not patterns:
42 | return re_compile("$^")
43 | regexp = "|".join(map(lambda pattern: f"({pattern})", patterns))
44 | return re_compile(regexp)
45 |
46 | def include_ignore_line(line):
47 | return line and not line.startswith("#")
48 |
49 | ignore_sets_path = os.path.join(os.path.dirname(libgrabsite.__file__), "ignore_sets")
50 | def get_patterns_for_ignore_set(name: str):
51 | assert name != "", name
52 | with open(os.path.join(ignore_sets_path, name), "r", encoding="utf-8") as f:
53 | return f.read().strip("\n").split("\n")
54 |
55 | def swallow_exception(f):
56 | @functools.wraps(f)
57 | def wrapper(*args, **kwargs):
58 | try:
59 | return f(*args, **kwargs)
60 | except Exception:
61 | traceback.print_exc()
62 | return wrapper
63 |
64 | CONTROL_FILE_CACHE_SEC = 1.5
65 |
66 | def caching_decorator(f):
67 | cache = {}
68 | @functools.wraps(f)
69 | def wrapper(path):
70 | timestamp, val = cache.get(path, (-CONTROL_FILE_CACHE_SEC, None))
71 | if timestamp > (time.monotonic() - CONTROL_FILE_CACHE_SEC):
72 | #print(f"returning cached value {path} {val}")
73 | return val
74 | val = f(path)
75 | cache[path] = (time.monotonic(), val)
76 | #print(f"returning new value {path} {val}")
77 | return val
78 | return wrapper
79 |
80 | @caching_decorator
81 | def path_exists_with_cache(path):
82 | return os.path.exists(path)
83 |
84 | @caching_decorator
85 | def mtime_with_cache(path):
86 | return os.stat(path).st_mtime
87 |
88 | class FileChangedWatcher(object):
89 | def __init__(self, fname):
90 | self.fname = fname
91 | # Use a bogus mtime so that has_changed() returns True
92 | # at least once
93 | self.last_mtime = -1
94 |
95 | def has_changed(self):
96 | now_mtime = mtime_with_cache(self.fname)
97 | changed = now_mtime != self.last_mtime
98 | self.last_mtime = now_mtime
99 | if changed:
100 | print(f"Imported {self.fname}")
101 | return changed
102 |
103 |
104 | ICY_FIELD_PATTERN = re2.compile("(?i)^icy-|ice-|x-audiocast-")
105 | ICY_VALUE_PATTERN = re2.compile("(?i)^icecast")
106 |
107 | def get_content_length(response) -> int:
108 | try:
109 | return int(list(p for p in response.fields.get_all() if p[0] == "Content-Length")[0][1])
110 | except (IndexError, ValueError):
111 | return -1
112 |
113 | def has_content_type_video(response) -> bool:
114 | try:
115 | t = list(p for p in response.fields.get_all() if p[0] == "Content-Type")[0][1]
116 | return t.lower().startswith("video/")
117 | except (IndexError, ValueError):
118 | return False
119 |
120 | def response_status_code(response) -> int:
121 | statcode = 0
122 |
123 | try:
124 | # duck typing: assume the response is
125 | # wpull.protocol.http.request.Response
126 | statcode = response.status_code
127 | except (AttributeError, KeyError):
128 | pass
129 |
130 | try:
131 | # duck typing: assume the response is
132 | # wpull.protocol.ftp.request.Response
133 | statcode = response.reply.code
134 | except (AttributeError, KeyError):
135 | pass
136 |
137 | return statcode
138 |
139 | # Excluded vob, mpeg, mpg, avi because they are not found on the general web
140 | video_exts = set("webm mp4 m4v mkv ts 3gp 3g2 flv mov wmv ogv ogm".split(" "))
141 |
142 | def has_video_ext(url: str) -> bool:
143 | ext = url.rsplit(".")[-1]
144 | return ext.lower() in video_exts
145 |
146 | class GrabSitePlugin(WpullPlugin):
147 | def activate(self):
148 | wpull_tweaks.activate(self.app_session)
149 | self.loop = asyncio.get_event_loop()
150 | self.enable_stdio_capture()
151 | self.add_signal_handlers()
152 | self.init_job_data()
153 | self.init_ws()
154 | self.setup_watchers()
155 | self.all_start_urls = open(cf("all_start_urls")).read().rstrip("\n").split("\n")
156 | self.all_start_netlocs = set(urllib.parse.urlparse(url).netloc for url in self.all_start_urls)
157 | self.skipped_videos = open(cf("skipped_videos"), "w", encoding="utf-8")
158 | self.skipped_max_content_length = open(cf("skipped_max_content_length"), "w", encoding="utf-8")
159 | self.update_ignores()
160 | super().activate()
161 |
162 | def enable_stdio_capture(self):
163 | self.real_stdout_write = sys.stdout.buffer.write
164 | self.real_stderr_write = sys.stderr.buffer.write
165 | sys.stdout.buffer.write = self.stdout_write_both
166 | sys.stderr.buffer.write = self.stderr_write_both
167 |
168 | def print_to_terminal(self, s):
169 | self.real_stdout_write((s + "\n").encode("utf-8"))
170 | sys.stdout.buffer.flush()
171 |
172 | def graceful_stop_callback(self):
173 | self.print_to_terminal("\n^C detected, creating 'stop' file, please wait for exit...")
174 | with open(cf("stop"), "wb") as _f:
175 | pass
176 |
177 | def forceful_stop_callback(self):
178 | self.loop.stop()
179 |
180 | def add_signal_handlers(self):
181 | try:
182 | self.loop.add_signal_handler(signal.SIGINT, self.graceful_stop_callback)
183 | self.loop.add_signal_handler(signal.SIGTERM, self.forceful_stop_callback)
184 | except NotImplementedError:
185 | # Not supported on Windows
186 | pass
187 |
188 | def setup_watchers(self):
189 | self.watchers = {}
190 | for f in ["igsets", "ignores", "delay", "concurrency", "max_content_length"]:
191 | self.watchers[f] = FileChangedWatcher(cf(f))
192 |
193 | def put_ws_queue(self, obj):
194 | try:
195 | self.ws_queue.put_nowait(obj)
196 | except asyncio.QueueFull:
197 | pass
198 |
199 | def stdout_write_both(self, message):
200 | assert isinstance(message, bytes), message
201 | try:
202 | self.real_stdout_write(message)
203 | self.put_ws_queue({
204 | "type": "stdout",
205 | "job_data": self.job_data,
206 | "message": message.decode("utf-8")
207 | })
208 | except Exception as e:
209 | self.real_stderr_write((str(e) + "\n").encode("utf-8"))
210 |
211 | def stderr_write_both(self, message):
212 | assert isinstance(message, bytes), message
213 | try:
214 | self.real_stderr_write(message)
215 | self.put_ws_queue({
216 | "type": "stderr",
217 | "job_data": self.job_data,
218 | "message": message.decode("utf-8")
219 | })
220 | except Exception as e:
221 | self.real_stderr_write((str(e) + "\n").encode("utf-8"))
222 |
223 | def init_job_data(self):
224 | self.job_data = {
225 | "ident": open(cf("id")).read().strip(),
226 | "url": open(cf("start_url")).read().strip(),
227 | "started_at": os.stat(cf("start_url")).st_mtime,
228 | "max_content_length": -1,
229 | "suppress_ignore_reports": True,
230 | "video": True,
231 | "scrape": True,
232 | "concurrency": 2,
233 | "bytes_downloaded": 0,
234 | "items_queued": 0,
235 | "items_downloaded": 0,
236 | "delay_min": 0,
237 | "delay_max": 0,
238 | "r1xx": 0,
239 | "r2xx": 0,
240 | "r3xx": 0,
241 | "r4xx": 0,
242 | "r5xx": 0,
243 | "runk": 0,
244 | }
245 |
246 | def init_ws(self):
247 | self.ws_queue = asyncio.Queue(maxsize=250)
248 |
249 | ws_host = os.environ.get("GRAB_SITE_HOST", "127.0.0.1")
250 | ws_port = int(os.environ.get("GRAB_SITE_PORT", 29000))
251 | ws_url = f"ws://{ws_host}:{ws_port}"
252 |
253 | self.loop.create_task(dashboard_client.sender(self, ws_url))
254 |
255 | @swallow_exception
256 | def update_max_content_length(self):
257 | if not self.watchers["max_content_length"].has_changed():
258 | return
259 | with open(self.watchers["max_content_length"].fname, "r") as f:
260 | self.job_data["max_content_length"] = int(f.read().strip())
261 |
262 | @swallow_exception
263 | def update_delay(self):
264 | if not self.watchers["delay"].has_changed():
265 | return
266 | with open(self.watchers["delay"].fname, "r") as f:
267 | content = f.read().strip()
268 | if "-" in content:
269 | self.job_data["delay_min"], self.job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
270 | else:
271 | self.job_data["delay_min"] = self.job_data["delay_max"] = int(content)
272 |
273 | @swallow_exception
274 | def update_concurrency(self):
275 | if not self.watchers["concurrency"].has_changed():
276 | return
277 | with open(self.watchers["concurrency"].fname, "r") as f:
278 | concurrency = int(f.read().strip())
279 | if concurrency < 1:
280 | print(f"Warning: using 1 for concurrency instead of {concurrency} because it cannot be < 1")
281 | concurrency = 1
282 | self.job_data["concurrency"] = concurrency
283 | self.app_session.factory["PipelineSeries"].concurrency = concurrency
284 |
285 | stop_path = cf("stop")
286 | def should_stop(self):
287 | return path_exists_with_cache(self.stop_path)
288 |
289 | def should_ignore_url(self, url, record_info):
290 | return self.combined_ignore_regexp.search(url)
291 |
292 | igoff_path = cf("igoff")
293 | def update_igoff(self):
294 | self.job_data["suppress_ignore_reports"] = path_exists_with_cache(self.igoff_path)
295 |
296 | video_path = cf("video")
297 | def update_video(self):
298 | self.job_data["video"] = path_exists_with_cache(self.video_path)
299 |
300 | scrape_path = cf("scrape")
301 | @swallow_exception
302 | def update_scrape(self):
303 | scrape = path_exists_with_cache(self.scrape_path)
304 | self.job_data["scrape"] = scrape
305 | if not scrape:
306 | # Empty the list of scrapers, which will stop scraping for new URLs
307 | # but still keep going through what is already in the queue.
308 | self.app_session.factory["DemuxDocumentScraper"]._document_scrapers = []
309 |
310 | @swallow_exception
311 | def update_ignores(self):
312 | if not (self.watchers["igsets"].has_changed() or self.watchers["ignores"].has_changed()):
313 | return
314 |
315 | ignores = set()
316 |
317 | with open(cf("igsets"), "r") as f:
318 | igsets = f.read().strip("\r\n\t ,").split(',')
319 | if igsets == [""]:
320 | igsets = []
321 |
322 | for igset in igsets:
323 | for pattern in get_patterns_for_ignore_set(igset):
324 | if include_ignore_line(pattern):
325 | ignores.update(self.ignore_pattern_to_regexp_strings(pattern))
326 |
327 | with open(cf("ignores"), "r") as f:
328 | lines = f.read().strip("\n").split("\n")
329 | for pattern in lines:
330 | if include_ignore_line(pattern):
331 | ignores.update(self.ignore_pattern_to_regexp_strings(pattern))
332 |
333 | self.print_to_terminal(f"Using these {len(ignores)} ignores:")
334 | for ig in sorted(ignores):
335 | self.print_to_terminal(f"\t{ig}")
336 |
337 | self.compiled_ignores = [(ig, re_compile(ig)) for ig in ignores]
338 | self.combined_ignore_regexp = compile_combined_regexp(ignores)
339 |
340 | def ignore_pattern_to_regexp_strings(self, pattern):
341 | if "{any_start_netloc}" not in pattern:
342 | return [pattern]
343 |
344 | return [pattern.replace("{any_start_netloc}", re.escape(netloc)) for netloc in self.all_start_netlocs]
345 |
346 | def get_specific_ignore_pattern(self, url):
347 | for pattern, regexp in self.compiled_ignores:
348 | if regexp.search(url):
349 | # We can't use regexp.pattern because that quickly causes segfaults
350 | return pattern
351 |
352 | @hook(PluginFunctions.accept_url)
353 | def accept_url(self, item_session: ItemSession, verdict: bool, reasons: dict):
354 | record_info = item_session.url_record
355 | url_info = item_session.request.url_info
356 | url = url_info.raw
357 |
358 | self.update_ignores()
359 |
360 | if url.startswith("data:"):
361 | # data: URLs aren't something you can grab, so drop them to avoid ignore
362 | # checking and ignore logging.
363 | return False
364 |
365 | # Don't apply ignores to any of the start URLs
366 | if url in self.all_start_urls:
367 | # Return original verdict instead of True to avoid infinite retries
368 | return verdict
369 |
370 | should_ignore = self.should_ignore_url(url, record_info)
371 | if should_ignore:
372 | if not self.job_data["suppress_ignore_reports"]:
373 | pattern = self.get_specific_ignore_pattern(url)
374 | self.maybe_log_ignore(url, pattern)
375 | return False
376 |
377 | # If we get here, none of our ignores apply. Return the original verdict.
378 | return verdict
379 |
380 | def handle_result(self, url_info, record_info, error_info, response):
381 | self.update_igoff()
382 |
383 | self.job_data["bytes_downloaded"] += wpull_tweaks.response_body_size(response)
384 |
385 | response_code = 0
386 | response_message = ""
387 | if error_info:
388 | response_message = str(error_info)
389 | elif response:
390 | response_code = response_status_code(response)
391 | response_message = response.reason
392 | response_code_str = str(response_code)
393 |
394 | if len(response_code_str) == 3 and response_code_str[0] in "12345":
395 | self.job_data[f"r{response_code_str[0]}xx"] += 1
396 | else:
397 | self.job_data["runk"] += 1
398 |
399 | self.put_ws_queue({
400 | "type": "download",
401 | "job_data": self.job_data,
402 | "url": url_info.raw,
403 | "response_code": response_code,
404 | "response_message": response_message,
405 | })
406 |
407 | if self.should_stop():
408 | return Actions.STOP
409 |
410 | return Actions.NORMAL
411 |
412 | def maybe_log_ignore(self, url, pattern):
413 | if not self.job_data["suppress_ignore_reports"]:
414 | self.print_to_terminal(f"IGNOR {url}\n by {pattern}")
415 | self.put_ws_queue({
416 | "type": "ignore",
417 | "job_data": self.job_data,
418 | "url": url,
419 | "pattern": pattern
420 | })
421 |
422 | @event(PluginFunctions.queued_url)
423 | def queued_url(self, _url_info: URLInfo):
424 | self.job_data["items_queued"] += 1
425 |
426 | @event(PluginFunctions.dequeued_url)
427 | def dequeued_url(self, _url_info: URLInfo, _record_info: URLRecord):
428 | self.job_data["items_downloaded"] += 1
429 |
430 | @hook(PluginFunctions.handle_response)
431 | def handle_response(self, item_session: ItemSession):
432 | url_info = item_session.request.url_info
433 | record_info = item_session.url_record
434 | response = item_session.response
435 | error_info = None
436 | return self.handle_result(url_info, record_info, error_info, response)
437 |
438 | @hook(PluginFunctions.handle_error)
439 | def handle_error(self, item_session: ItemSession, error_info: BaseException):
440 | url_info = item_session.request.url_info
441 | record_info = item_session.url_record
442 | response = item_session.response
443 | return self.handle_result(url_info, record_info, error_info, response)
444 |
445 | @hook(PluginFunctions.handle_pre_response)
446 | def handle_pre_response(self, item_session: ItemSession):
447 | url_info = item_session.request.url_info
448 | response = item_session.response
449 | self.update_scrape()
450 |
451 | url = url_info.raw
452 |
453 | self.update_max_content_length()
454 | limit = self.job_data["max_content_length"]
455 | if limit != -1:
456 | length = get_content_length(response)
457 | if length > limit:
458 | self.skipped_max_content_length.write(url + "\n")
459 | self.skipped_max_content_length.flush()
460 | self.maybe_log_ignore(url, f"[content-length {length} over limit {limit}]")
461 | return Actions.FINISH
462 |
463 | self.update_video()
464 | if not self.job_data["video"]:
465 | if has_content_type_video(response) or has_video_ext(url):
466 | self.skipped_videos.write(url + "\n")
467 | self.skipped_videos.flush()
468 | self.maybe_log_ignore(url, "[video]")
469 | return Actions.FINISH
470 |
471 | # Check if server version starts with ICY
472 | if response.version == "ICY":
473 | self.maybe_log_ignore(url, "[icy version]")
474 | return Actions.FINISH
475 |
476 | # Loop through all the server headers for matches
477 | for field, value in response.fields.get_all():
478 | if ICY_FIELD_PATTERN.match(field):
479 | self.maybe_log_ignore(url, "[icy field]")
480 | return Actions.FINISH
481 |
482 | if field == "Server" and ICY_VALUE_PATTERN.match(value):
483 | self.maybe_log_ignore(url, "[icy server]")
484 | return Actions.FINISH
485 |
486 | # Nothing matched, allow download
487 | self.print_to_terminal(url + " ...")
488 | return Actions.NORMAL
489 |
490 | @hook(PluginFunctions.exit_status)
491 | def exit_status(self, _app_session: AppSession, code: int) -> int:
492 | print()
493 | print(f'Finished grab {self.job_data["ident"]} {self.job_data["url"]} with exit code {code}')
494 | print(f"Output is in directory:\n{working_dir}")
495 | return code
496 |
497 | @hook(PluginFunctions.wait_time)
498 | def wait_time(self, _seconds: float, _item_session: ItemSession, _error):
499 | self.update_delay()
500 | self.update_concurrency()
501 | return random.uniform(self.job_data["delay_min"], self.job_data["delay_max"]) / 1000
502 |
503 | @event(PluginFunctions.get_urls)
504 | def get_urls(self, item_session: ItemSession):
505 | url_info = item_session.request.url_info
506 | url = url_info.raw
507 | extra_urls = None
508 | # If we see this URL, also queue the URL for the :orig quality image
509 | if url.startswith("https://pbs.twimg.com/media/"):
510 | new_url = re.sub(":[a-z]{1,10}$", "", url) + ":orig"
511 | # see wpull/item.py:LinkType
512 | extra_urls = [dict(url=new_url, link_type="media", inline=True)]
513 | # Quora shows login-required screen unless you add ?share=1
514 | elif url.startswith("https://www.quora.com/") and not "?" in url:
515 | new_url = url + "?share=1"
516 | extra_urls = [dict(url=new_url, link_type="html")]
517 | return extra_urls
518 |
--------------------------------------------------------------------------------
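
The plugin above periodically re-reads a set of control files in the crawl's working directory (see setup_watchers, path_exists_with_cache, and should_stop), so a running crawl can be adjusted from another terminal. A hedged sketch, where DIR stands for the crawl's working directory and the directory name and pattern are illustrative:

    DIR=example.com-2024-01-01-abcd1234

    echo 6 > "$DIR/concurrency"           # change the number of fetchers
    echo 500-2000 > "$DIR/delay"          # random 500-2000 ms delay per fetcher
    echo '^https?://example\.com/calendar/' >> "$DIR/ignores"   # add an ignore pattern
    rm "$DIR/igoff"                       # start reporting ignored URLs
    touch "$DIR/stop"                     # finish current items and stop, like a single ^C
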
/libgrabsite/wpull_tweaks.py:
--------------------------------------------------------------------------------
1 | import os
2 | import hashlib
3 | import functools
4 |
5 | from wpull.database.sqltable import SQLiteURLTable
6 | from wpull.document.html import HTMLReader
7 | from wpull.processor.rule import ProcessingRule
8 |
9 | from libgrabsite import dupespotter, __version__
10 | from libgrabsite.dupes import DupesOnDisk
11 |
12 |
13 | def response_body_size(response) -> int:
14 | try:
15 | return response.body.size()
16 | except Exception:
17 | return 0
18 |
19 | class NoFsyncSQLTable(SQLiteURLTable):
20 | @classmethod
21 | def _apply_pragmas_callback(cls, connection, record):
22 | super()._apply_pragmas_callback(connection, record)
23 | connection.execute('PRAGMA synchronous=OFF')
24 |
25 |
26 | class DupeSpottingProcessingRule(ProcessingRule):
27 | def __init__(self, *args, **kwargs):
28 | self.dupes_db = kwargs.pop('dupes_db', None)
29 | super().__init__(*args, **kwargs)
30 |
31 | def scrape_document(self, item_session):
32 | response = item_session.response
33 | url_info = item_session.request.url_info
34 | url = url_info.raw
35 |
36 | if response_body_size(response) < 30 * 1024 * 1024:
37 | dupes_db = self.dupes_db
38 | body = response.body.content()
39 | if HTMLReader.is_response(response):
40 | body = dupespotter.process_body(body, url)
41 | digest = hashlib.md5(body).digest()
42 | if dupes_db is not None:
43 | dupe_of = dupes_db.get_old_url(digest)
44 | else:
45 | dupe_of = None
46 | if dupe_of is not None:
47 | # Don't extract links from pages we've already seen
48 | # to avoid loops that descend a directory endlessly
49 | print("DUPE {}\n OF {}".format(url, dupe_of))
50 | return
51 | else:
52 | if dupes_db is not None:
53 | dupes_db.set_old_url(digest, url)
54 |
55 | super().scrape_document(item_session)
56 |
57 |
58 | def activate(app_session):
59 | app_session.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable
60 |
61 | warc_recorder_cls = app_session.factory.class_map['WARCRecorder']
62 | warc_recorder_cls.DEFAULT_SOFTWARE_STRING = f'grab-site/{__version__} ' + warc_recorder_cls.DEFAULT_SOFTWARE_STRING
63 |
64 | if int(os.environ["DUPESPOTTER_ENABLED"]):
65 | dupes_db_location = os.path.join(os.environ["GRAB_SITE_WORKING_DIR"], "dupes_db")
66 | dupes_db = DupesOnDisk(dupes_db_location)
67 | app_session.factory.class_map['ProcessingRule'] = \
68 | functools.partial(DupeSpottingProcessingRule, dupes_db=dupes_db)
69 |
--------------------------------------------------------------------------------
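
activate() above is configured through two environment variables, GRAB_SITE_WORKING_DIR and DUPESPOTTER_ENABLED, which grab-site sets in main.py before handing control to wpull. Setting them by hand is only needed when starting wpull manually; --which-wpull-command prints the full command line including these variables. A sketch (the printed values are illustrative):

    grab-site --which-wpull-command https://example.com/
    # Prints something of the form:
    # GRAB_SITE_WORKING_DIR=<DIR> DUPESPOTTER_ENABLED=1 <path-to-wpull> --quiet -U ...
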
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | try:
4 | from setuptools import setup
5 | except ImportError:
6 | from distutils.core import setup
7 |
8 | import os
9 | import sys
10 | import libgrabsite
11 |
12 | install_requires = [
13 | "click>=6.3",
14 | "wpull @ https://github.com/ArchiveTeam/ludios_wpull/archive/refs/tags/3.0.9.zip",
15 | "manhole>=1.0.0",
16 | "lmdb>=0.89",
17 | "autobahn>=0.12.1",
18 | "google-re2>=1.0.6",
19 | "websockets>=6.0",
20 | ]
21 |
22 | if 'GRAB_SITE_NO_CCHARDET' not in os.environ:
23 | install_requires.append("cchardet>=1.0.0")
24 |
25 | setup(
26 | name="grab-site",
27 | version=libgrabsite.__version__,
28 | description="The archivist's web crawler: WARC output, dashboard for all crawls, dynamic ignore patterns",
29 | url="https://ludios.org/grab-site/",
30 | author="Ivan Kozik",
31 | author_email="ivan@ludios.org",
32 | classifiers=[
33 | "Programming Language :: Python :: 3",
34 | "Development Status :: 5 - Production/Stable",
35 | "Intended Audience :: End Users/Desktop",
36 | "License :: OSI Approved :: MIT License",
37 | "Topic :: Internet :: WWW/HTTP",
38 | ],
39 | scripts=["grab-site", "gs-server", "gs-dump-urls"],
40 | packages=["libgrabsite"],
41 | package_data={"libgrabsite": ["*.html", "*.ico", "*.txt", "ignore_sets/*"]},
42 | install_requires=install_requires,
43 | )
44 |
--------------------------------------------------------------------------------
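
An install sketch based on setup.py above; setting GRAB_SITE_NO_CCHARDET skips the optional cchardet requirement, per the environment check in install_requires:

    # From a checkout of the repository:
    pip install --upgrade .

    # Or, without the optional cchardet dependency:
    GRAB_SITE_NO_CCHARDET=1 pip install --upgrade .
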
/tests/offline-tests:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eu -o pipefail -o verbose
4 |
5 | grab-site --help
6 | grab-site --version
7 | gs-dump-urls --help
8 | python -c 'import libgrabsite.server'
9 |
--------------------------------------------------------------------------------
/tests/online-tests:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eu -o pipefail -o verbose
4 |
5 | server_log=$(mktemp)
6 | python -u -m http.server 0 > "$server_log" &
7 | port=
8 | # Try until server starts up
9 | while [[ "$port" = "" ]]; do
10 | port=$(grep -P -o 'port \d+' "$server_log" | cut -d ' ' -f 2 || true)
11 | sleep 0.1
12 | done
13 | server="http://127.0.0.1:$port/"
14 |
15 | trash=$(mktemp -d)
16 | cd "$trash"
17 |
18 | nodnspython=--wpull-args=--no-skip-getaddrinfo
19 |
20 | grab-site --1 "$server"
21 | grab-site $nodnspython --1 "$server"
22 | ls -l 127.0.0.1*
23 | for i in 127.0.0.1*/wpull.db; do
24 | gs-dump-urls "$i" done
25 | done
26 | grab-site $nodnspython --1 --permanent-error-status-codes=404 "$server"
27 | echo '.*' > ignores
28 | grab-site $nodnspython --import-ignores ignores "$server"
29 | grab-site $nodnspython --1 --id my-id --no-dupespotter --no-video --concurrent 3 "$server" "$server/another"
30 | # TODO: test -i with remote URL list
31 | echo "$server" > local-url-list
32 | grab-site $nodnspython --1 -i local-url-list
33 | grab-site $nodnspython --1 -i local-url-list --which-wpull-args-partial
34 | grab-site $nodnspython --1 -i local-url-list --which-wpull-command
35 |
36 | # kill http.server
37 | kill $!
38 |
--------------------------------------------------------------------------------