├── .editorconfig
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── extra_docs
│   └── pause_resume_grab_sites.sh
├── grab-site
├── gs-dump-urls
├── gs-server
├── images
│   ├── dashboard.png
│   └── scriptorium.jpg
├── libgrabsite
│   ├── 404.html
│   ├── __init__.py
│   ├── dashboard.html
│   ├── dashboard_client.py
│   ├── default_cookies.txt
│   ├── dump_urls.py
│   ├── dupes.py
│   ├── dupespotter.py
│   ├── favicon.ico
│   ├── ignore_sets
│   │   ├── blogs
│   │   ├── coppermine
│   │   ├── facebook
│   │   ├── forums
│   │   ├── global
│   │   ├── imdb
│   │   ├── mediawiki
│   │   ├── meetupeverywhere
│   │   ├── nogravatar
│   │   ├── noonion
│   │   ├── nosortedindex
│   │   ├── pinterest
│   │   ├── reddit
│   │   ├── singletumblr
│   │   ├── twitter
│   │   └── youtube
│   ├── main.py
│   ├── server.py
│   ├── wpull_hooks.py
│   └── wpull_tweaks.py
├── setup.py
└── tests
    ├── offline-tests
    └── online-tests
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = tab
5 | indent_size = 4
6 | end_of_line = lf
7 | charset = utf-8
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | sudo: required
3 | dist: xenial
4 | python:
5 | - 3.7
6 |
7 | before_install:
8 | - sudo apt-get update
9 | - sudo apt-get install -y --no-install-recommends libxml2-dev libxslt1-dev libre2-dev pkg-config
10 |
11 | install:
12 | - travis_retry pip install --upgrade pip setuptools
13 | - travis_retry pip install --no-binary lxml --upgrade .
14 |
15 | script:
16 | - ./tests/offline-tests
17 | - ./tests/online-tests
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | grab-site license:
2 |
3 | Copyright (c) 2015 Ivan Kozik
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
23 |
24 |
25 | grab-site includes code from ArchiveBot, which is licensed as:
26 |
27 | Copyright (c) 2013 David Yip
28 |
29 | Permission is hereby granted, free of charge, to any person obtaining a copy
30 | of this software and associated documentation files (the "Software"), to deal
31 | in the Software without restriction, including without limitation the rights
32 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
33 | copies of the Software, and to permit persons to whom the Software is
34 | furnished to do so, subject to the following conditions:
35 |
36 | The above copyright notice and this permission notice shall be included in
37 | all copies or substantial portions of the Software.
38 |
39 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
40 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
41 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
42 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
43 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
44 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
45 | THE SOFTWARE.
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | grab-site
2 | =========
3 |
4 | [![Build status][travis-image]][travis-url]
5 |
6 | grab-site is an easy preconfigured web crawler designed for backing up websites.
7 | Give grab-site a URL and it will recursively crawl the site and write
8 | [WARC files](https://www.archiveteam.org/index.php?title=The_WARC_Ecosystem).
9 | Internally, grab-site uses [a fork](https://github.com/ArchiveTeam/ludios_wpull) of
10 | [wpull](https://github.com/chfoo/wpull) for crawling.
11 |
12 | grab-site gives you
13 |
14 | * a dashboard with all of your crawls, showing which URLs are being
15 | grabbed, how many URLs are left in the queue, and more.
16 |
17 | * the ability to add ignore patterns when the crawl is already running.
18 | This allows you to skip the crawling of junk URLs that would
19 | otherwise prevent your crawl from ever finishing. See below.
20 |
21 | * an extensively tested default ignore set ([global](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/global))
22 | as well as additional (optional) ignore sets for forums, reddit, etc.
23 |
24 | * duplicate page detection: links are not followed on pages whose
25 | content duplicates an already-seen page.
26 |
27 | The URL queue is kept on disk instead of in memory. If you're really lucky,
28 | grab-site will manage to crawl a site with ~10M pages.
29 |
30 | 
31 |
32 | Note: if you have any problems whatsoever installing or getting grab-site to run,
33 | please [file an issue](https://github.com/ArchiveTeam/grab-site/issues) - thank you!
34 |
35 | The installation methods below are the only ones supported in our GitHub issues.
36 | Please do not modify the installation steps unless you really know what you're
37 | doing, with both Python packaging and your operating system. grab-site runs
38 | on a specific version of Python (3.7 or 3.8) and with specific dependency versions.
39 |
40 | **Contents**
41 |
42 | - [Install on Ubuntu 18.04, 20.04, 22.04, Debian 10 (buster), Debian 11 (bullseye)](#install-on-ubuntu-1804-2004-2204-debian-10-buster-debian-11-bullseye)
43 | - [Install on NixOS](#install-on-nixos)
44 | - [Install on another distribution lacking Python 3.7.x or 3.8.x](#install-on-another-distribution-lacking-python-37x-or-38x)
45 | - [Install on macOS](#install-on-macos)
46 | - [Install on Windows 10 (experimental)](#install-on-windows-10-experimental)
47 | - [Upgrade an existing install](#upgrade-an-existing-install)
48 | - [Usage](#usage)
49 | - [`grab-site` options, ordered by importance](#grab-site-options-ordered-by-importance)
50 | - [Warnings](#warnings)
51 | - [Tips for specific websites](#tips-for-specific-websites)
52 | - [Changing ignores during the crawl](#changing-ignores-during-the-crawl)
53 | - [Inspecting the URL queue](#inspecting-the-url-queue)
54 | - [Preventing a crawl from queuing any more URLs](#preventing-a-crawl-from-queuing-any-more-urls)
55 | - [Stopping a crawl](#stopping-a-crawl)
56 | - [Advanced `gs-server` options](#advanced-gs-server-options)
57 | - [Viewing the content in your WARC archives](#viewing-the-content-in-your-warc-archives)
58 | - [Inspecting WARC files in the terminal](#inspecting-warc-files-in-the-terminal)
59 | - [Automatically pausing grab-site processes when free disk is low](#automatically-pausing-grab-site-processes-when-free-disk-is-low)
60 | - [Thanks](#thanks)
61 | - [Help](#help)
62 |
63 |
64 |
65 | Install on Ubuntu 18.04, 20.04, 22.04, Debian 10 (buster), Debian 11 (bullseye)
66 | ---
67 |
68 | 1. On Debian, use `su` to become root if `sudo` is not configured to give you access.
69 |
70 | ```
71 | sudo apt-get update
72 | sudo apt-get install --no-install-recommends \
73 | wget ca-certificates git build-essential libssl-dev zlib1g-dev \
74 | libbz2-dev libreadline-dev libsqlite3-dev libffi-dev libxml2-dev \
75 | libxslt1-dev libre2-dev pkg-config
76 | ```
77 |
78 | If you see `Unable to locate package`, run the two commands again.
79 |
80 | 2. As a **non-root** user:
81 |
82 | ```
83 | wget https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer
84 | chmod +x pyenv-installer
85 | ./pyenv-installer
86 | ~/.pyenv/bin/pyenv install 3.8.15
87 | ~/.pyenv/versions/3.8.15/bin/python -m venv ~/gs-venv
88 | ~/gs-venv/bin/pip install --no-binary lxml --upgrade git+https://github.com/ArchiveTeam/grab-site
89 | ```
90 |
91 | `--no-binary lxml` is necessary for the html5-parser build.
92 |
93 | 3. Add this to your `~/.bashrc` or `~/.zshrc`:
94 |
95 | ```
96 | PATH="$PATH:$HOME/gs-venv/bin"
97 | ```
98 |
99 | and then restart your shell (e.g. by opening a new terminal tab/window).
100 |
101 |
102 | Install on NixOS
103 | ---
104 |
105 | grab-site was removed from nixpkgs master; 23.05 is the last release to contain grab-site.
106 |
107 | ```
108 | nix-env -f https://github.com/NixOS/nixpkgs/archive/release-23.05.tar.gz -iA grab-site
109 | ```
110 |
111 | or, if you are using profiles (i.e. when you have flakes enabled):
112 |
113 | ```
114 | nix profile install nixpkgs/release-22.11#grab-site
115 | ```
116 |
117 |
118 | Install on another distribution lacking Python 3.7.x or 3.8.x
119 | ---
120 |
121 | After installing [uv](https://docs.astral.sh/uv/), you can run
122 | ```
123 | uv tool install --python=3.8 --no-binary-package lxml git+https://github.com/ArchiveTeam/grab-site/
124 | ```
125 |
126 |
127 | Install on macOS
128 | ---
129 |
130 | On OS X 10.10 - macOS 11:
131 |
132 | 1. Run `locale` in your terminal. If the output includes "UTF-8", you
133 | are all set. If it does not, your terminal is misconfigured and grab-site
134 | will fail to start. This can be corrected with:
135 |
136 | - Terminal.app: Preferences... -> Profiles -> Advanced -> **check** Set locale environment variables on startup
137 |
138 | - iTerm2: Preferences... -> Profiles -> Terminal -> Environment -> **check** Set locale variables automatically
139 |
140 | ### Using Homebrew (**Intel Mac**)
141 |
142 | For M1 Macs, use the next section instead of this one.
143 |
144 | 2. Install Homebrew using the install step on https://brew.sh/
145 |
146 | 3. Run:
147 |
148 | ```
149 | brew update
150 | brew install python@3.8 libxslt re2 pkg-config
151 | /usr/local/opt/python@3.8/bin/python3 -m venv ~/gs-venv
152 | PKG_CONFIG_PATH="/usr/local/opt/libxml2/lib/pkgconfig" ~/gs-venv/bin/pip install --no-binary lxml --upgrade git+https://github.com/ArchiveTeam/grab-site
153 | ```
154 |
155 | 4. To put the `grab-site` binaries in your PATH, add this to your `~/.zshrc` (macOS 10.15, 11+) or `~/.bash_profile` (earlier):
156 |
157 | ```
158 | PATH="$PATH:$HOME/gs-venv/bin"
159 | ```
160 |
161 | and then restart your shell (e.g. by opening a new terminal tab/window).
162 |
163 | ### Using Homebrew (**M1 Mac**)
164 |
165 | 2. Install Homebrew using the install step on https://brew.sh/
166 |
167 | If you already have a Homebrew install at `/usr/local`, you may need to first remove that old Intel-based Homebrew install.
168 |
169 | 3. Run:
170 |
171 | ```
172 | brew update
173 | brew install python@3.8 libxslt re2 pkg-config
174 | /opt/homebrew/opt/python@3.8/bin/python3 -m venv ~/gs-venv
175 | PKG_CONFIG_PATH="/opt/homebrew/opt/libxml2/lib/pkgconfig" ~/gs-venv/bin/pip install --no-binary lxml --upgrade git+https://github.com/ArchiveTeam/grab-site
176 | ```
177 |
178 | 4. To put the `grab-site` binaries in your PATH, add this to your `~/.zshrc` (macOS 10.15, 11+) or `~/.bash_profile` (earlier):
179 |
180 | ```
181 | PATH="$PATH:$HOME/gs-venv/bin"
182 | ```
183 |
184 | and then restart your shell (e.g. by opening a new terminal tab/window).
185 |
186 |
187 |
188 | Install on Windows 10 (experimental)
189 | ---
190 |
191 | On Windows 10 Fall Creators Update (version 1709) or newer:
192 |
193 | 1. Start menu -> search "feature" -> Turn Windows features on or off
194 |
195 | 2. Scroll down, check "Windows Subsystem for Linux" and click OK.
196 |
197 | 3. Wait for install and click "Restart now"
198 |
199 | 4. Start menu -> Store
200 |
201 | 5. Search for "Ubuntu" in the store and install Ubuntu (publisher: Canonical Group Limited).
202 |
203 | 6. Start menu -> Ubuntu
204 |
205 | 7. Wait for install and create a user when prompted.
206 |
207 | 8. Follow the [Ubuntu 18.04, 20.04, 22.04, Debian 10 (buster), Debian 11 (bullseye)](#install-on-ubuntu-1804-2004-2204-debian-10-buster-debian-11-bullseye) steps.
208 |
209 |
210 |
211 | Upgrade an existing install
212 | ---
213 |
214 | To update grab-site, simply run the `~/gs-venv/bin/pip install ...` or
215 | `nix-env ...` command used to install it originally (see above).
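
For example, with the pyenv + venv install from above:

```
~/gs-venv/bin/pip install --no-binary lxml --upgrade git+https://github.com/ArchiveTeam/grab-site
```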
216 |
217 | After upgrading, stop `gs-server` with `kill` or ctrl-c, then start it again.
218 | Existing `grab-site` crawls will automatically reconnect to the new server.
219 |
220 |
221 |
222 | Usage
223 | ---
224 |
225 | First, start the dashboard with:
226 |
227 | ```
228 | gs-server
229 | ```
230 |
231 | and point your browser to http://127.0.0.1:29000/
232 |
233 | Note: gs-server listens on all interfaces by default, so you can reach the
234 | dashboard by a non-localhost IP as well, e.g. a LAN or WAN IP. (Sub-note:
235 | no code execution capabilities are exposed on any interface.)
236 |
237 | Then, start as many crawls as you want with:
238 |
239 | ```
240 | grab-site 'URL'
241 | ```
242 |
243 | Do this inside tmux unless they're very short crawls.
244 |
245 | grab-site outputs WARCs, logs, and control files to a new subdirectory in the
246 | directory from which you launched `grab-site`, referred to here as "DIR".
247 | (Use `ls -lrt` to find it.)
248 |
249 | You can pass multiple `URL` arguments to include them in the same crawl,
250 | whether they are on the same domain or different domains entirely.
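
For example (the URLs are illustrative):

```
grab-site 'https://example.com/' 'https://blog.example.org/'
```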
251 |
252 | warcprox users: [warcprox](https://github.com/internetarchive/warcprox) breaks the
253 | dashboard's WebSocket; please make your browser skip the proxy for whichever
254 | host/IP you're using to reach the dashboard.
255 |
256 | ### `grab-site` options, ordered by importance
257 |
258 | Options can come before or after the URL.
259 |
260 | * `--1`: grab just `URL` and its page requisites, without recursing.
261 |
262 | * `--igsets=IGSET1,IGSET2`: use ignore sets `IGSET1` and `IGSET2`.
263 |
264 | Ignore sets are used to avoid requesting junk URLs using a pre-made set of
265 | regular expressions. See [the full list of available ignore sets](https://github.com/ArchiveTeam/grab-site/tree/master/libgrabsite/ignore_sets).
266 |
267 | The [global](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/global)
268 | ignore set is implied and enabled unless `--no-global-igset` is used.
269 |
270 | The ignore sets can be changed during the crawl by editing the `DIR/igsets` file.
271 |
272 | * `--no-global-igset`: don't add the [global](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/global) ignore set.
273 |
274 | * `--no-offsite-links`: don't follow links to other domains (by default, they are followed to a depth of 1).
275 |
276 | grab-site always grabs page requisites (e.g. inline images and stylesheets), even if
277 | they are on other domains. By default, grab-site also grabs linked pages to a depth
278 | of 1 on other domains. To turn off this behavior, use `--no-offsite-links`.
279 |
280 | Using `--no-offsite-links` may prevent all kinds of useful images, video, audio, downloads,
281 | etc. from being grabbed, because these are often hosted on a CDN or subdomain, and
282 | thus would otherwise not be included in the recursive crawl.
283 |
284 | * `-i` / `--input-file`: Load list of URLs-to-grab from a local file or from a
285 | URL; like `wget -i`. File must be a newline-delimited list of URLs.
286 | Combine with `--1` to avoid a recursive crawl on each URL.
287 |
288 | * `--igon`: Print all URLs being ignored to the terminal and dashboard. Can be
289 | changed during the crawl by `touch`ing or `rm`ing the `DIR/igoff` file.
290 | This is slower because it needs to find the specific regexp to blame.
291 |
292 | * `--no-video`: Skip the download of videos by both mime type and file extension.
293 | Skipped videos are logged to `DIR/skipped_videos`. Can be
294 | changed during the crawl by `touch`ing or `rm`ing the `DIR/video` file.
295 |
296 | * `--no-sitemaps`: don't queue URLs from `sitemap.xml` at the root of the site.
297 |
298 | * `--max-content-length=N`: Skip the download of any response that claims a
299 | Content-Length larger than `N`. (default: -1, don't skip anything).
300 | Skipped URLs are logged to `DIR/skipped_max_content_length`. Can be changed
301 | during the crawl by editing the `DIR/max_content_length` file.
302 |
303 | * `--no-dupespotter`: Disable dupespotter, a plugin that skips the extraction
304 | of links from pages that look like duplicates of earlier pages. Disable this
305 | for sites that are directory listings, because they frequently trigger false
306 | positives.
307 |
308 | * `--concurrency=N`: Use `N` connections to fetch in parallel (default: 2).
309 | Can be changed during the crawl by editing the `DIR/concurrency` file.
310 |
311 | * `--delay=N`: Wait `N` milliseconds (default: 0) between requests on each concurrent fetcher.
312 | Can be a range like X-Y to use a random delay between X and Y. Can be changed during
313 | the crawl by editing the `DIR/delay` file.
314 |
315 | * `--import-ignores=FILE`: Copy `FILE` to `DIR/ignores` before the crawl begins.
316 |
317 | * `--warc-max-size=BYTES`: Try to limit each WARC file to around `BYTES` bytes
318 | before rolling over to a new WARC file (default: 5368709120, which is 5GiB).
319 | Note that the resulting WARC files may be drastically larger if there are very
320 | large responses.
321 |
322 | * `--level=N`: recurse `N` levels instead of `inf` levels.
323 |
324 | * `--page-requisites-level=N`: recurse page requisites `N` levels instead of `5` levels.
325 |
326 | * `--ua=STRING`: Send User-Agent: `STRING` instead of pretending to be Firefox on Windows.
327 |
328 | * `--id=ID`: Use id `ID` for the crawl instead of a random 128-bit id. This must be unique for every crawl.
329 |
330 | * `--dir=DIR`: Put control files, temporary files, and unfinished WARCs in `DIR`
331 | (default: a directory name based on the URL, date, and first 8 characters of the id).
332 |
333 | * `--finished-warc-dir=FINISHED_WARC_DIR`: absolute path to a directory into
334 | which finished `.warc.gz` and `.cdx` files will be moved.
335 |
336 | * `--permanent-error-status-codes=STATUS_CODES`: A comma-separated list of
337 | HTTP status codes to treat as a permanent error and therefore **not** retry
338 | (default: `401,403,404,405,410`). Other error responses tried another 2
339 | times for a total of 3 tries (customizable with `--wpull-args=--tries=N`).
340 | Note that, unlike wget, wpull puts retries at the end of the queue.
341 |
342 | * `--wpull-args=ARGS`: String containing additional arguments to pass to wpull;
343 | see `wpull --help`. `ARGS` is split with `shlex.split` and individual
344 | arguments can contain spaces if quoted, e.g.
345 | `--wpull-args="--youtube-dl \"--youtube-dl-exe=/My Documents/youtube-dl\""`
346 |
347 | Examples:
348 |
349 | * `--wpull-args=--no-skip-getaddrinfo` to respect `/etc/hosts` entries.
350 | * `--wpull-args=--no-warc-compression` to write uncompressed WARC files.
351 |
352 | * `--which-wpull-args-partial`: Print a partial list of wpull arguments that
353 | would be used and exit. Excludes grab-site-specific features, and removes
354 | `DIR/` from paths. Useful for reporting bugs on wpull without grab-site involvement.
355 |
356 | * `--which-wpull-command`: Populate `DIR/` but don't start wpull; instead print
357 | the command that would have been used to start wpull with all of the
358 | grab-site functionality.
359 |
360 | * `--debug`: print a lot of debug information.
361 |
362 | * `--help`: print help text.
363 |
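A typical invocation combining several of the options above (the URL and values are illustrative):

```
grab-site 'https://forum.example.com/' --igsets=forums --no-offsite-links --concurrency=1 --delay=250-750
```
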
364 | ### Warnings
365 |
366 | If you pay no attention to your crawls, a crawl may head down some infinite bot
367 | trap and stay there forever. The site owner may eventually notice high CPU use
368 | or log activity, then IP-ban you.
369 |
370 | grab-site does not respect `robots.txt` files, because they frequently
371 | [whitelist only approved robots](https://github.com/robots.txt),
372 | [hide pages embarrassing to the site owner](https://web.archive.org/web/20140401024610/http://www.thecrimson.com/robots.txt),
373 | or block image or stylesheet resources needed for proper archival.
374 | [See also](https://www.archiveteam.org/index.php?title=Robots.txt).
375 | Because of this, very rarely you might run into a robot honeypot and receive
376 | an abuse@ complaint. Your host may require a prompt response to such a complaint
377 | for your server to stay online. Therefore, we recommend against crawling the
378 | web from a server that hosts your critical infrastructure.
379 |
380 | Don't run grab-site on GCE (Google Compute Engine); as happened to me, your
381 | entire API project may get nuked after a few days of crawling the web, with
382 | no recourse. Good alternatives include OVH ([OVH](https://www.ovh.com/us/dedicated-servers/),
383 | [So You Start](https://www.soyoustart.com/us/essential-servers/),
384 | [Kimsufi](https://www.kimsufi.com/us/en/index.xml)), and online.net's
385 | [dedicated](https://www.online.net/en/dedicated-server) and
386 | [Scaleway](https://www.scaleway.com/) offerings.
387 |
388 | ### Tips for specific websites
389 |
390 | #### Website requiring login / cookies
391 |
392 | Log in to the website in Chrome or Firefox. Use the cookies.txt extension
393 | [for Chrome](https://github.com/daftano/cookies.txt) or
394 | [for Firefox](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/)
395 | to copy your cookies in Netscape format. Paste the cookie data into a new
396 | file. Start grab-site with `--wpull-args=--load-cookies=ABSOLUTE_PATH_TO_COOKIES_FILE`.
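
For example (the URL and cookie-file path are illustrative):

```
grab-site 'https://members.example.com/' --wpull-args=--load-cookies=/home/you/example-cookies.txt
```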
397 |
398 | #### Static websites; WordPress blogs; Discourse forums
399 |
400 | The defaults usually work fine.
401 |
402 | #### Blogger / blogspot.com blogs
403 |
404 | The defaults work fine except for blogs with a JavaScript-only Dynamic Views theme.
405 |
406 | Some blogspot.com blogs use "[Dynamic Views](https://support.google.com/blogger/answer/1229061?hl=en)"
407 | themes that require JavaScript and serve absolutely no HTML content. In rare
408 | cases, you can get JavaScript-free pages by appending `?m=1`
409 | ([example](https://happinessbeyondthought.blogspot.com/?m=1)). Otherwise, you
410 | can archive parts of these blogs through Google Cache instead
411 | ([example](https://webcache.googleusercontent.com/search?q=cache:http://blog.datomic.com/))
412 | or by using https://archive.is/ instead of grab-site.
413 |
414 | #### Tumblr blogs
415 |
416 | Either don't crawl from Europe (because tumblr redirects to a GDPR `/privacy/consent` page), or add `Googlebot` to the user agent:
417 |
418 | ```
419 | --ua "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/70.0 but not really nor Googlebot/2.1"
420 | ```
421 |
422 | Use [`--igsets=singletumblr`](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/singletumblr)
423 | to avoid crawling the homepages of other tumblr blogs.
424 |
425 | If you don't care about who liked or reblogged a post, add `\?from_c=` to the
426 | crawl's `ignores`.
427 |
428 | Some tumblr blogs appear to require JavaScript, but they are actually just
429 | hiding the page content with CSS. You are still likely to get a complete crawl.
430 | (See the links in the page source for https://X.tumblr.com/archive).
431 |
432 | #### Subreddits
433 |
434 | Use [`--igsets=reddit`](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/reddit)
435 | and add a `/` at the end of the URL to avoid crawling all subreddits.
436 |
437 | When crawling a subreddit, you **must** get the casing of the subreddit right
438 | for the recursive crawl to work. For example,
439 |
440 | ```
441 | grab-site https://www.reddit.com/r/Oculus/ --igsets=reddit
442 | ```
443 |
444 | will crawl only a few pages instead of the entire subreddit. The correct casing is:
445 |
446 | ```
447 | grab-site https://www.reddit.com/r/oculus/ --igsets=reddit
448 | ```
449 |
450 | You can hover over the "Hot"/"New"/... links at the top of the page to see the correct casing.
451 |
452 | #### Directory listings ("Index of ...")
453 |
454 | Use `--no-dupespotter` to avoid triggering false positives on the duplicate
455 | page detector. Without it, the crawl may miss large parts of the directory tree.
456 |
457 | #### Very large websites
458 |
459 | Use `--no-offsite-links` to stay on the main website and avoid crawling linked pages on other domains.
460 |
461 | #### Websites that are likely to ban you for crawling fast
462 |
463 | Use `--concurrency=1 --delay=500-1500`.
464 |
465 | #### MediaWiki sites in English
466 |
467 | Use [`--igsets=mediawiki`](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/mediawiki).
468 | Note that this ignore set ignores old page revisions.
469 |
470 | #### MediaWiki sites in other languages
471 |
472 | You will probably have to add ignores with translated `Special:*` URLs based on
473 | [ignore_sets/mediawiki](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/mediawiki).
474 |
475 | #### Forums that aren't Discourse
476 |
477 | Forums require more manual intervention with ignore patterns.
478 | [`--igsets=forums`](https://github.com/ArchiveTeam/grab-site/blob/master/libgrabsite/ignore_sets/forums)
479 | is a useful starting point for most forums, but you will have to add other ignore
480 | patterns, including one to ignore individual-forum-post pages if there are
481 | too many posts to crawl. (Generally, crawling the thread pages is enough.)
482 |
483 | #### GitHub issues / pull requests
484 |
485 | Find the highest issue number from an issues page ([example](https://github.com/rust-lang/rust/issues)) and use:
486 |
487 | ```
488 | grab-site --1 https://github.com/rust-lang/rust/issues/{1..30000}
489 | ```
490 |
491 | This relies on your shell to expand the argument to thousands of arguments.
492 | If there are too many arguments, you may have to write the URLs to a file
493 | and use `grab-site -i` instead:
494 |
495 | ```
496 | for i in {1..30000}; do echo https://github.com/rust-lang/rust/issues/$i >> .urls; done
497 | grab-site --1 -i .urls
498 | ```
499 |
500 | #### Websites whose domains have just expired but are still up at the webhost
501 |
502 | Use a [DNS history](https://www.google.com/search?q=historical+OR+history+dns)
503 | service to find the old IP address (the DNS "A" record) for the domain. Add a
504 | line to your `/etc/hosts` to point the domain to the old IP. Start a crawl
505 | with `--wpull-args=--no-skip-getaddrinfo` to make wpull use `/etc/hosts`.
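
A sketch of the workflow (the IP address and domain are placeholders):

```
echo '203.0.113.7 example.com' | sudo tee -a /etc/hosts
grab-site 'http://example.com/' --wpull-args=--no-skip-getaddrinfo
```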
506 |
507 | #### twitter.com/user
508 |
509 | Use [snscrape](https://github.com/JustAnotherArchivist/snscrape) to get a list
510 | of tweets for a user. Redirect `snscrape`'s output to a list of URLs with
511 | `> urls` and pass this file to `grab-site --1 -i urls`.
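
A sketch, assuming snscrape's `twitter-user` scraper and its default output of one URL per result (the username is illustrative):

```
snscrape twitter-user exampleuser > urls
grab-site --1 -i urls
```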
512 |
513 | Alternatively, use [webrecorder.io](https://webrecorder.io/) instead of
514 | grab-site. It has an autoscroll feature and you can download the WARCs.
515 |
516 | Keep in mind that scrolling `twitter.com/user` returns a maximum of 3200 tweets,
517 | while a [from:user](https://twitter.com/search?q=from%3Ainternetarchive&src=typd&f=realtime&qf=off&lang=en)
518 | query can return more.
519 |
520 |
521 |
522 | Changing ignores during the crawl
523 | ---
524 | While the crawl is running, you can edit `DIR/ignores` and `DIR/igsets`; the
525 | changes will be applied within a few seconds.
526 |
527 | `DIR/igsets` is a comma-separated list of ignore sets to use.
528 |
529 | `DIR/ignores` is a newline-separated list of [Python 3 regular expressions](https://pythex.org/)
530 | to use in addition to the ignore sets.
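
For example (`DIR` stands for your crawl directory; the added pattern is illustrative):

```
echo 'global,forums' > DIR/igsets
echo '^https?://junk\.example\.com/' >> DIR/ignores
```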
531 |
532 | You can `rm DIR/igoff` to display all URLs that are being filtered out
533 | by the ignores, and `touch DIR/igoff` to turn it back off.
534 |
535 | Note that ignores will not apply to any of the crawl's start URLs.
536 |
537 |
538 |
539 | Inspecting the URL queue
540 | ---
541 | Inspecting the URL queue is usually not necessary, but may be helpful
542 | for adding ignores before grab-site crawls a large number of junk URLs.
543 |
544 | To dump the queue, run:
545 |
546 | ```
547 | gs-dump-urls DIR/wpull.db todo
548 | ```
549 |
550 | Four other statuses can be used besides `todo`:
551 | `done`, `error`, `in_progress`, and `skipped`.
552 |
553 | You may want to pipe the output to `sort` and `less`:
554 |
555 | ```
556 | gs-dump-urls DIR/wpull.db todo | sort | less -S
557 | ```
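
The same works for the other statuses, e.g. to review failed URLs:

```
gs-dump-urls DIR/wpull.db error | sort | less -S
```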
558 |
559 |
560 |
561 | Preventing a crawl from queuing any more URLs
562 | ---
563 | `rm DIR/scrape`. Responses will no longer be scraped for URLs. Scraping cannot
564 | be re-enabled for a crawl.
565 |
566 |
567 |
568 | Stopping a crawl
569 | ---
570 | You can `touch DIR/stop` or press ctrl-c, which will do the same. You will
571 | have to wait for the current downloads to finish.
572 |
573 |
574 |
575 | Advanced `gs-server` options
576 | ---
577 | These environment variables control what `gs-server` listens on:
578 |
579 | * `GRAB_SITE_INTERFACE` (default `0.0.0.0`)
580 | * `GRAB_SITE_PORT` (default `29000`)
581 |
582 | These environment variables control which server each `grab-site` process connects to:
583 |
584 | * `GRAB_SITE_HOST` (default `127.0.0.1`)
585 | * `GRAB_SITE_PORT` (default `29000`)
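
For example, to run the dashboard on localhost only on a non-default port, and point a crawl at it (values are illustrative):

```
GRAB_SITE_INTERFACE=127.0.0.1 GRAB_SITE_PORT=29001 gs-server
GRAB_SITE_HOST=127.0.0.1 GRAB_SITE_PORT=29001 grab-site 'URL'
```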
586 |
587 |
588 |
589 | Viewing the content in your WARC archives
590 | ---
591 |
592 | Try [ReplayWeb.page](https://replayweb.page/) or [webrecorder-player](https://github.com/webrecorder/webrecorder-player).
593 |
594 |
595 |
596 | Inspecting WARC files in the terminal
597 | ---
598 | `zless` is a wrapper over `less` that can be used to view raw WARC content:
599 |
600 | ```
601 | zless DIR/FILE.warc.gz
602 | ```
603 |
604 | `zless -S` will turn off line wrapping.
605 |
606 | Note that grab-site requests uncompressed HTTP responses to avoid
607 | double-compression in .warc.gz files and to make zless output more useful.
608 | However, some servers will send compressed responses anyway.
609 |
610 |
611 |
612 | Automatically pausing grab-site processes when free disk is low
613 | ---
614 |
615 | If you automatically upload and remove finished .warc.gz files, you can still
616 | run into a situation where grab-site processes fill up your disk faster than
617 | your uploader process can handle. To prevent this situation, you can customize
618 | and run [this script](https://github.com/ArchiveTeam/grab-site/blob/master/extra_docs/pause_resume_grab_sites.sh),
619 | which will pause and resume grab-site processes as your free disk space
620 | crosses a threshold value.
621 |
622 |
623 |
624 | Thanks
625 | ---
626 |
627 | grab-site is made possible only because of [wpull](https://github.com/chfoo/wpull),
628 | written by [Christopher Foo](https://github.com/chfoo) who spent a year
629 | making something much better than wget. ArchiveTeam's most pressing
630 | issue with wget at the time was that it kept the entire URL queue in memory
631 | instead of on disk. wpull has many other advantages over wget, including
632 | better link extraction and Python hooks.
633 |
634 | Thanks to [David Yip](https://github.com/yipdw), who created
635 | [ArchiveBot](https://github.com/ArchiveTeam/ArchiveBot). The wpull
636 | hooks in ArchiveBot served as the basis for grab-site. The original ArchiveBot
637 | dashboard inspired the newer dashboard now used in both projects.
638 |
639 | Thanks to [Falcon Darkstar Momot](https://github.com/falconkirtaran) for
640 | the many wpull 2.x fixes that were rolled into
641 | [ArchiveTeam/wpull](https://github.com/ArchiveTeam/wpull).
642 |
643 | Thanks to [JustAnotherArchivist](https://github.com/JustAnotherArchivist)
644 | for investigating my wpull issues.
645 |
646 | Thanks to [BrowserStack](https://www.browserstack.com/) for providing free
647 | browser testing for grab-site, which we use to make sure the dashboard works
648 | in various browsers.
649 |
650 | [BrowserStack](https://www.browserstack.com/)
651 |
652 |
653 |
654 | Help
655 | ---
656 | grab-site bugs and questions are welcome in
657 | [grab-site/issues](https://github.com/ArchiveTeam/grab-site/issues).
658 |
659 | Terminal output in your bug report should be surrounded by triple backquotes, like this:
660 |
661 |
.*', b"", body)
161 |
162 | # nsslabs.com has this
163 | body = re.sub(br'
.{1,4000}?
', b"", body)
164 |
165 | # sbs.com.au has generated /css_ filenames
166 | body = re.sub(br'/css_[-_A-Za-z0-9]{10,100}\.css', b"", body)
167 |
168 | # stopbadware.org has some differing autogenerated ', b"", body)
170 |
171 | # Drupal generates items based on the URL
172 | # Generated class="" also spotted on non-Drupal www.minutouno.com
173 | # Duplicate class="" on stopbadware.org
174 | body = re.sub(br'<(body|div)( id="[^"]+")? class="[^"]+"( class="[^"]+")?( data-src="[^"]{1,2000}")?', b"", body)
175 |
176 | return body
177 |
178 |
179 | def compare_bodies(body1, body2, url1, url2):
180 | # TODO: handle non-utf-8 bodies
181 | for line in difflib.unified_diff(
182 | body1.decode("utf-8", "replace").splitlines(keepends=True),
183 | body2.decode("utf-8", "replace").splitlines(keepends=True),
184 | fromfile=url1,
185 | tofile=url2):
186 | if not "\n" in line:
187 | line += "\n"
188 | sys.stdout.buffer.write(line.encode("utf-8"))
189 |
190 |
191 | def compare_unprocessed_bodies(up_body1, up_body2, url1, url2):
192 | body1 = process_body(up_body1, url1)
193 | body2 = process_body(up_body2, url2)
194 | print("{} == md5({!r})".format(md5_url(url1), url1))
195 | print("{} == md5({!r})".format(md5_url(url2), url2))
196 | print("After processing,")
197 | print("len(body({!r})) == {}".format(url1, len(body1)))
198 | print("len(body({!r})) == {}".format(url2, len(body2)))
199 | compare_bodies(body1, body2, url1, url2)
200 |
201 |
202 | def main():
203 | try:
204 | os.makedirs(cache_dir)
205 | except OSError:
206 | pass
207 |
208 | assert os.path.exists(cache_dir)
209 |
210 | if len(sys.argv) == 2:
211 | # Just save and print the body
212 | print(get_body(sys.argv[1]))
213 | elif len(sys.argv) == 3:
214 | url1, url2 = sys.argv[1], sys.argv[2]
215 | compare_unprocessed_bodies(get_body(url1), get_body(url2), url1, url2)
216 | else:
217 | assert 0, sys.argv
218 |
219 |
220 | if __name__ == '__main__':
221 | main()
222 |
--------------------------------------------------------------------------------
/libgrabsite/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ArchiveTeam/grab-site/0eaf88628f4d8ef5af4df4c13f594606a41de3cd/libgrabsite/favicon.ico
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/blogs:
--------------------------------------------------------------------------------
1 | # All 'blogs' ignores have been migrated to 'global';
2 | # there is no need to specify --igsets=blogs
3 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/coppermine:
--------------------------------------------------------------------------------
1 | (?:displayimage|thumbnails)\.php[?&]album=(?:topn|toprated|lastcom|lastup|lastupby|random|lastcomby)
2 | ratepic\.php
3 | addfav\.php\?.*ref=displayimage\.php
4 | displayimage\.php\?.*slideshow=\d+
5 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/facebook:
--------------------------------------------------------------------------------
1 | ^https?://error\.facebook\.com/common/scribe_endpoint\.php\?c=
2 | ^https?://www\.facebook\.com/[^/]+/(posts/|app_)[^/]+\?(ref=page_internal&)?_fb_noscript=
3 | ^https?://www\.facebook\.com/[^/]+/photos/(pb|a)\.[^/]+/[^/]+/.{4}/
4 | ^https?://www\.facebook\.com/[^/]+/photos/(pb|a)\.[^/]+/[^/]+/\?type=
5 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/forums:
--------------------------------------------------------------------------------
1 | /cron\.php\?
2 | /external\.php\?type=rss
3 | /login\.php\?
4 | /newreply\.php\?
5 | /private\.php\?
6 | /privmsg\.php\?
7 | /register\.php\?
8 | /sendmessage\.php\?
9 | /subscription\.php\?
10 | /posting\.php\?
11 | /viewtopic\.php\?.+&view=(next|previous)
12 | /viewtopic\.php\?.+&hilit=
13 | /feed\.php\?
14 | /index\.php\?option=com_mailto
15 | &view=login&return=
16 | &format=opensearch
17 | /misc\.php\?do=whoposted
18 | /newthread\.php\?
19 | /post_thanks\.php\?
20 | /blog_post\.php\?do=newblog
21 | /forumdisplay\.php.*[\?&]do=markread
22 | /userpoll/vote\.php\?
23 | /showthread\.php.*[\?&]goto=(next(old|new)est|newpost)
24 | /editpost\.php\?
25 | /\?view=getlastpost$
26 | /index\.php\?sharelink=
27 | /ucp\.php\?mode=delete_cookies
28 | /index.php\?action=(verificationcode|reporttm|emailuser|quickmod2)
29 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/global:
--------------------------------------------------------------------------------
1 | # URLs that are very likely to be endless loops
2 | %25252525
3 | /App_Themes/.+/App_Themes/
4 | /bxSlider/.+/bxSlider/
5 | /bxSlider/bxSlider/
6 | /slides/slides/.+/slides/
7 | /slides/.+/slides/slides/
8 | /slides/slides/slides/
9 | /js/js/.+/js/
10 | /js/.+/js/js/
11 | /js/js/js/
12 | /css/css/.+/css/
13 | /css/.+/css/css/
14 | /css/css/css/
15 | /styles/styles/.+/styles/
16 | /styles/.+/styles/styles/
17 | /styles/styles/styles/
18 | /scripts/scripts/.+/scripts/
19 | /scripts/.+/scripts/scripts/
20 | /scripts/scripts/scripts/
21 | /images/images/.+/images/
22 | /images/.+/images/images/
23 | /images/images/images/
24 | /img/img/.+/img/
25 | /img/.+/img/img/
26 | /img/img/img/
27 | /clientscript/clientscript/.+/clientscript/
28 | /clientscript/.+/clientscript/clientscript/
29 | /clientscript/clientscript/clientscript/
30 | /lib/exe/.*lib[-_]exe[-_]lib[-_]exe[-_]
31 | ^https?://{any_start_netloc}/.*&amp;
32 | ^https?://{any_start_netloc}/.*amp%3Bamp%3Bamp%3B
33 | ^https?://{any_start_netloc}/.+/plugins/ultimate-social-media-plus/.+/like/like/
34 |
35 | # URLs that are very likely incorrectly extracted by wpull
36 | /(%5C)+(%22|%27)
37 | /%5C/%5C/
38 | /%27\+[^/]+\+%27
39 | /%22\+[^/]+\+%22
40 | /%27%20\+[^/]+\+%20%27
41 | /%22%20\+[^/]+\+%20%22
42 | /\\+(%22|%27)
43 | /\\+["']
44 | /\\/\\/
45 | /'\+[^/]+\+'
46 | ^https?://{any_start_netloc}/.+/%3Ca%20href=
47 | ^https?://www\.youtube\.com/.*\[\[.+\]\]
48 | ^https?://www\.youtube\.com/.*\{\{.+\}\}
49 |
50 | ^https?://www\.google\.com/recaptcha/(api|mailhide/d\?)
51 | ^https?://www\.google\.com/accounts/AccountChooser
52 | ^https?://accounts\.google\.com/(SignUp|ServiceLogin|AccountChooser|a/UniversalLogin)
53 |
54 | # CAPTCHAs on ASP.NET sites
55 | ^https?://[^/]+/.+/CaptchaImage\.axd
56 |
57 | # We don't want to change language
58 | ^https?://www\.flickr\.com/change_language\.gne
59 |
60 | # Tracking scripts, tracking pixels, analytics
61 | ^https?://geo\.yahoo\.com/b\?
62 | ^https?://b\.scorecardresearch\.com/
63 | ^https?://pixel\.blog\.hu/
64 | ^https?://pixel\.redditmedia\.com/pixel/
65 | ^https?://alb\.reddit\.com/
66 | ^https?://pixel\.(quantserve|wp)\.com/
67 | ^https?://(www|ssl)\.google-analytics\.com/(r/)?(__utm\.gif|collect\?)
68 | ^https?://p\.opt\.fimserve\.com/
69 | ^https?://.+/js-agent\.newrelic\.com/nr-\d{3}(\.min)?\.js$
70 | ^https?://.+/stats\.g\.doubleclick\.net/dc\.js$
71 | ^https?://.+/js/chartbeat\.js$
72 | ^https?://[^/]+\.xiti\.com/hit\.xiti\?
73 | ^https?://[^/]+\.services\.livejournal\.com/ljcounter
74 | ^https?://beacon\.wikia-services\.com/
75 | ^https?://s\d+\.sitemeter\.com/(js/counter\.js|meter\.asp)
76 | ^https?://www\.amazon\.com/.+/logging/log-action\.html
77 |
78 | # The tracking on warnerbros.com inexplicably links to bogus warnerbros.com/\d+ pages
79 | ^https?://www\.warnerbros\.com/\d+$
80 |
81 | # Inaccessible and dead sites that are frequently-linked
82 | ^https?://i\.dev\.cdn\.turner\.com/
83 | ^https?://[^/]+\.corp\.ne1\.yahoo\.com/
84 | ^https?://prod-preview\.wired\.com/
85 | ^https?://(www\.)?(megaupload|filesonic|wupload)\.com/
86 |
87 | # Links to TED and TED embeds are common enough that we need to ignore their
88 | # videos to prevent WARC bloat
89 | ^https?://video-subtitle\.tedcdn\.com/
90 | ^https?://download\.ted\.com/
91 |
92 | # Avoid bloating WARCs with TMZ videos
93 | ^https?://tmz\.vo\.llnwd\.net/
94 |
95 | # Avoid hitting radio and TV streams, which can hang crawls for a long time.
96 | # Note that we also detect and abort Icecast/SHOUTcast responses in
97 | # wpull_hooks.py, so some of these ignores are no longer necessary.
98 | ^https?://([^\./]+\.)?stream\.publicradio\.org/
99 | ^https?://av\.rasset\.ie/av/live/
100 | ^https?://gcnplayer\.gcnlive\.com/.+
101 | ^https?://mp3\.ffh\.de/
102 | ^https?://(audio\d?|nfw)\.video\.ria\.ru/
103 | ^https?://[^\./]+\.radioscoop\.(com|net):\d+/
104 | ^https?://[^\./]+\.streamchan\.org:\d+/
105 | ^https?://[^/]*musicproxy\.s12\.de/
106 | ^https?://relay\.broadcastify\.com/
107 | ^https?://audio\d?\.radioreference\.com/
108 | ^https?://[^/]+\.akadostream\.ru(:\d+)?/
109 | ^https?://play(\d+)?\.radio13\.ru:8000/
110 | ^https?://stream(\d+)?\.media\.rambler\.ru/
111 | ^https?://pub(\d+)?\.di\.fm/
112 | ^https?://[^/]+\.streamtheworld\.com/
113 | ^https?://[^/]+\.gaduradio\.pl/
114 | ^https?://r-a-d\.io/.+\.mp3$
115 | ^https?://mp3tslg\.tdf-cdn\.com/
116 | ^https?://[^/]+/anony/mjpg\.cgi$
117 | ^https?://[^/]+/mjpg/video\.mjpg
118 | ^https?://air\.radiorecord\.ru(:\d+)?/
119 | ^https?://[^/]+\.rastream\.com(:\d+)?/
120 | ^https?://audiots\.scdn\.arkena\.com/
121 | ^https?://[a-z0-9]+\.cdn\.dvmr\.fr(:\d+)?/.+\.mp3
122 |
123 | # Avoid following any kind of 'share' or 'bookmark' link
124 | ^https?://(www|draft)\.blogger\.com/(navbar\.g|post-edit\.g|delete-comment\.g|comment-iframe\.g|share-post\.g|email-post\.g|blog-this\.g|delete-backlink\.g|rearrange|blog_this\.pyra)\?
125 | ^https?://(www|px\.srvcs)\.tumblr\.com/(impixu\?|share(/link/?)?\?|reblog/)
126 | ^https?://plus\.google\.com/share\?
127 | ^https?://(apis|plusone)\.google\.com/_/\+1/
128 | ^https?://(ssl\.|www\.)?reddit\.com/(login\?dest=|submit\?|static/button/button)
129 | ^https?://(www\.)?digg\.com/submit\?
130 | ^https?://(www\.)?facebook\.com/(plugins/(share_button|like(box)?)\.php|sharer/sharer\.php|sharer?\.php|dialog/(feed|share))\?
131 | ^https?://(www\.)?facebook\.com/v[\d\.]+/plugins/like\.php
132 | ^https?://social-plugins\.line\.me/lineit/share
133 | ^https?://(www\.)?twitter\.com/(share\?|intent/((re)?tweet|favorite)|home/?\?status=|\?status=)
134 | ^https?://platform\d?\.twitter\.com/widgets/tweet_button.html\?
135 | ^https?://www\.newsvine\.com/_wine/save\?
136 | ^https?://www\.netvibes\.com/subscribe\.php\?
137 | ^https?://add\.my\.yahoo\.com/(rss|content)\?
138 | ^https?://www\.addtoany\.com/(add_to/|share_save\?)
139 | ^https?://www\.addthis\.com/bookmark\.php\?
140 | ^https?://([^\.]+\.)?pinterest\.com/pin/create/
141 | ^https?://www\.linkedin\.com/(cws/share|shareArticle)\?
142 | ^https?://(www\.)?stumbleupon\.com/(submit\?|badge/embed/)
143 | ^https?://csp\.cyworld\.com/bi/bi_recommend_pop\.php\?
144 | ^https?://share\.flipboard\.com/bookmarklet/popout\?
145 | ^https?://flattr.com/submit/auto\?
146 | ^https?://(www\.)?myspace\.com/Modules/PostTo/
147 | ^https?://www\.google\.com/bookmarks/mark\?
148 | ^https?://myweb2\.search\.yahoo\.com/myresults/bookmarklet\?
149 | ^https?://vuible\.com/pins-settings/
150 | ^https?://news\.ycombinator\.com/submitlink\?
151 | ^https?://reporter\.es\.msn\.com/\?fn=contribute
152 | ^https?://www\.blinklist\.com/index\.php\?Action=Blink/addblink\.php
153 | ^https?://sphinn\.com/index\.php\?c=post&m=submit&
154 | ^https?://posterous\.com/share\?
155 | ^https?://del\.icio\.us/post\?
156 | ^https?://delicious\.com/(save|post)\?
157 | ^https?://(www\.)?friendfeed\.com/share\?
158 | ^https?://(www\.)?xing\.com/(app/user\?op=share|social_plugins/share\?)
159 | ^https?://iwiw\.hu/pages/share/share\.jsp\?
160 | ^https?://memori(\.qip)?\.ru/link/\?
161 | ^https?://wow\.ya\.ru/posts_(add|share)_link\.xml\?
162 | ^https?://connect\.mail\.ru/share\?
163 | ^https?://zakladki\.yandex\.ru/newlink\.xml\?
164 | ^https?://vkontakte\.ru/share\.php\?
165 | ^https?://www\.odnoklassniki\.ru/dk\?st\.cmd=addShare
166 | ^https?://www\.google\.com/(reader/link\?|buzz/post\?)
167 | ^https?://service\.weibo\.com/share/share\.php\?
168 | ^https?://(www\.)?technorati\.com/faves/?\?add=
169 | ^https?://bufferapp\.com/add\?
170 | ^https?://b\.hatena\.ne\.jp/add\?
171 | ^https?://api\.addthis\.com/
172 | ^https?://bookmark\.naver\.com/post\?
173 | ^https?://(www\.)?instapaper\.com/hello2\?
174 | ^https?://getpocket\.com/(save|edit)/?\?
175 | ^https?://medium\.com/_/(vote|bookmark|subscribe)/
176 | ^https?://telegram\.me/share/url\?
177 |
178 | # mail.google.com requires login but shows up on the web surprisingly often
179 | ^https?://mail\.google\.com/mail/
180 |
181 | # This is the default gravatar that you don't want a million copies of
182 | ^https?://(\d|www|secure)\.gravatar\.com/avatar/ad516503a11cd5ca435acc9bb6523536
183 |
184 | # imageshack's 404 page that you would be hitting quite often otherwise
185 | ^https?://imageshack\.com/lost$
186 |
187 | # A loop on khaleejtimes.com
188 | ^https?://www\.khaleejtimes\.com/.+/kt_.+/kt_
189 | ^https?://www\.khaleejtimes\.com/.+/images/.+/images/
190 | ^https?://www\.khaleejtimes\.com/.+/imgactv/.+/imgactv/
191 |
192 | # More loops
193 | ^https?://photobucket\.com/.+/albums/.+/albums/
194 | ^https?://([^/]+\.)?gdcvault\.com(/.*/|/)(fonts(/.*/|/)fonts/|css(/.*/|/)css/|img(/.*/|/)img/)
195 | ^https?://static\.licdn\.com/sc/p/com\.linkedin\.nux(:|%3A)nux-static-content(\+|%2B)[\d\.]+/f/
196 | ^https?://static\.licdn\.com/sc/p/.+/f//
197 | ^https?://tm\.uol\.com\.br/h/.+/h/
198 | ^https?://((s-)?static\.ak\.fbcdn\.net|(connect\.|www\.)?facebook\.com)/connect\.php/js/.*rsrc\.php
199 | ^https?://web\.archive\.org/web/[^/]+/https?\:/[^/]+\.addthis\.com/.+/static/.+/static/
200 | ^https?://[^/]+\.libsyn\.com/.+/%2[02]https?:/
201 | ^https?://www\.infomous\.com/cloud_widget/lib/lib/
202 |
203 | # This specifically catches only *invalid* flickr.com links extracted by wpull
204 | ^https?://www\.flickr\.com/(explore/|photos/[^/]+/(sets/\d+/(page\d+/)?)?)\d+_[a-f0-9]+(_[a-z])?\.jpg$
205 |
206 | # Avoid grabbing thousands of these; they page-requisite each other
207 | ^https?://media\.opb\.org/clips/embed/.+\.js$
208 |
209 | # Per-post and per-comment Atom feeds
210 | ^https?://www\.blogger\.com/feeds/\d+/posts/default/\d+
211 | ^https?://www\.blogger\.com/feeds/\d+/\d+/comments/default/\d+
212 |
213 | # Bogus /disqus.com path
214 | ^https?://.+/.+/disqus\.com/forums/$
215 |
216 | # Bogus literal "/page/%d/" URLs (not filled with a number)
217 | ^https?://{any_start_netloc}(/.*|/)page/%d/$
218 |
219 | # Bogus URLs on tumblr blogs
220 | ^https?://{any_start_netloc}/.*(\?|%5Cx26)route=(/page/:page|/archive/:year/:month|/tagged/:tag|/post/:id|/image/:post_id)
221 | ^https?://{any_start_netloc}/.*%5Cx26route=/archive
222 |
223 | # There are too many avatars on tumblr.com
224 | ^https?://\d+\.media\.tumblr\.com/avatar_.+_16\.pn[gj]$
225 |
226 | ^https?://www\.livejournal\.com/(tools/memadd|update|(identity/)?login)\.bml\?
227 | ^https?://[^\.]+\.livejournal\.com/.+/\*sup_ru/ru/UTF-8/
228 | ^https?://[^\.]+\.livejournal\.com/.+http://[^\.]+\.livejournal\.com/
229 |
230 | ^https?://www\.dreamwidth\.org/tools/(memadd|tellafriend)\?
231 |
232 | ^https?://r-login\.wordpress\.com/remote-login\.php
233 | ^https?://{any_start_netloc}/(wp-admin/|wp-login\.php\?)
234 | ^https?://[^/]+\.facebook\.com/login\.php
235 |
236 | # Ignore /search.*updated-(min|max)= blogspot pagination because all posts are
237 | # crawled anyway via the _archive.html pages. Need to ignore on all domains
238 | # because blogspot also runs on non-blogspot.com domains.
239 | ^https?://{any_start_netloc}/search(/label/[^\?]+|\?q=[^&]+|)[\?&]updated-(min|max)=\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d.*&max-results=\d+
240 |
241 | # Ignore bogus /CSI/ links on blogspot.com
242 | ^https?://.+\.blogspot\.(com|in|com\.au|co\.uk|jp|co\.nz|ca|de|it|fr|se|sg|es|pt|com\.br|ar|mx|kr)/(\d{4}/\d{2}/|search/label/)(CSI/$|.*/CSI/CSI/CSI/)
243 |
244 | # Links to ?share=(twitter|facebook|reddit|email|google-plus-1) etc.
245 | # These typically redirect.
246 | ^https?://{any_start_netloc}/.+[\?&]share=[a-z]{4,}
247 |
248 | # Per-comment links
249 | ^https?://{any_start_netloc}/.+[\?&]mode=reply
250 | ^https?://{any_start_netloc}/.+[\?&](replyto(com)?|like_comment)=\d+
251 | ^https?://{any_start_netloc}/.+\?showComment(=|%5C)\d+
252 | ^https?://{any_start_netloc}/.+/quote-comment-\d+/$
253 | ^https?://{any_start_netloc}/.+/jetpack-comment/\?blogid=\d+&postid=\d+
254 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/imdb:
--------------------------------------------------------------------------------
1 | # Intended for archiving imdb forums
2 |
3 | ^http://b\.scorecardresearch\.com/
4 | ^http://ad\.doubleclick\.net/
5 | ^http://www\.imdb\.com/rd/
6 | ^http://www\.imdb\.com/.+\?ref_=
7 | ^http://www\.imdb\.com/.+/board/flat/
8 | ^http://www\.imdb\.com/.+/board/inline/
9 | ^http://www\.imdb\.com/.+/board/thread/
10 | ^http://www\.imdb\.com/help/boards_posting\.html
11 | ^http://www\.imdb\.com/register/
12 | ^http://www\.imdb\.com/.+/board/.+/\d+\?d=
13 | ^http://www\.imdb\.com/.+/videogallery/.+/.+/
14 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/mediawiki:
--------------------------------------------------------------------------------
1 | # This ignore set avoids grabbing the full history of each page, because there
2 | # are generally far too many ?oldid= pages to crawl completely.
3 | ^https?://{any_start_netloc}/.+[\?&]oldid=\d+
4 | ^https?://{any_start_netloc}/.+[\?&]curid=\d+
5 | ^https?://{any_start_netloc}/.+[\?&]limit=(20|100|250|500)
6 | ^https?://{any_start_netloc}/.+[\?&]hide(minor|bots|anons|liu|myself|redirs|links|trans|patrolled)=
7 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:(UserLogin|UserLogout|Translate|MobileFeedback|MobileOptions|RecentChangesLinked|Diff|MobileDiff)
8 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:RecentChanges&from=\d+
9 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:ListFiles&dir=prev&offset=\d+
10 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:(ListFiles|PrefixIndex).*&
11 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:ListFiles.*&user=
12 | ^https?://{any_start_netloc}/.+([\?&]title=|/)Special:Log/
13 | ^https?://{any_start_netloc}/.+[\?&]action=edit&section=(\d+|new)
14 | ^https?://{any_start_netloc}/.+[\?&]feed(format)?=atom
15 | ^https?://{any_start_netloc}/.+[\?&]printable=yes
16 | ^https?://{any_start_netloc}/.+[\?&]mobileaction=
17 | ^https?://{any_start_netloc}/.+[\?&]undo(after)?=\d+
18 | ^https?://{any_start_netloc}/.+[\?&]lqt_method=
19 |
20 | # Links to pages that don't exist
21 | ^https?://{any_start_netloc}/.+[\?&]redlink=1
22 |
23 | # Loops
24 | ^https?://{any_start_netloc}/.*User_talk:.+/User_talk:
25 | ^https?://{any_start_netloc}/.*User_blog:.+/User_blog:
26 | ^https?://{any_start_netloc}/.*User:.+/User:
27 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/meetupeverywhere:
--------------------------------------------------------------------------------
1 | ^https?://.*\.meetup\.com/login/
2 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/nogravatar:
--------------------------------------------------------------------------------
1 | ^https?://(\d|secure)\.gravatar\.com/avatar/
2 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/noonion:
--------------------------------------------------------------------------------
1 | ^https?://[^/]+\.onion/
2 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/nosortedindex:
--------------------------------------------------------------------------------
1 | # These are the "sort by" links on "index of" directory listings
2 | \?C=[NMSD];O=[AD]$
3 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/pinterest:
--------------------------------------------------------------------------------
1 | ^https?://www\.pinterest\.com/[^/]+/\^/[^/]+/
2 | ^https?://www\.pinterest\.com/[^/]+/[^/]+/\^/[^/]+/
3 | ^https?://www\.pinterest\.com/[^/]+/[^/]+\.[^/]+
4 | ^https?://www\.pinterest\.com/[^/]+/[^/]+/[^/]+\.[^/]+
5 | ^https?://www\.pinterest\.com/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\.js
6 | ^https?://www\.pinterest\.com/[^/]+/[^/]+/webapp/js/app/(desktop|common)/bundle-(jcrop|mapbox)\.js
7 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/reddit:
--------------------------------------------------------------------------------
1 | # These ignores are designed for archiving subreddits. Note that not
2 | # all comments will be downloaded because many comments are collapsed
3 | # by reddit.
4 |
5 | ^https?://(www|old)\.reddit\.com/gold\?goldtype=
6 | # URLs with utm_ can (hopefully) be safely ignored because reddit also sends
7 | # href=""s without the utm_ trackers.
8 | ^https?://(www|old)\.reddit\.com/r/[^/]+/.*[\?&]utm_
9 | ^https?://(www|old)\.reddit\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/[a-z0-9]+
10 | ^https?://(www|old)\.reddit\.com/r/[^/]+/comments/[a-z0-9]+.*\?sort=
11 | ^https?://(www|old)\.reddit\.com/r/[^/]+/comments/[a-z0-9]+/[^/]+/\.compact
12 | ^https?://(www|old)\.reddit\.com/r/[^/]+/(top|new|rising|controversial|gilded|ads)/.+[\?&]after=
13 | ^https?://(www|old)\.reddit\.com/r/[^/]+/related/
14 | ^https?://(www|old)\.reddit\.com/r/[^/]+/(gilded)?\.mobile\?
15 | ^https?://(www|old)\.reddit\.com/r/[^/]+/search/?\?
16 | ^https?://(www|old)\.reddit\.com/r/[^/]+/wiki/(revisions|discussions)/user/.+
17 | ^https?://(www|old)\.reddit\.com/user/[^/]+/(comments/)?.+[\?&]sort=
18 | ^https?://(www|old)\.reddit\.com/.+/\.rss$
19 | \.reddit\.com/message/compose/?\?
20 | ^https?://(m|out|simple|amp)\.reddit\.com/
21 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/singletumblr:
--------------------------------------------------------------------------------
1 | # You generally want this ignore set if you are archiving a tumblr blog,
2 | # because tumblr blogs can have tens of thousands of links to other tumblr
3 | # blogs, and grab-site's default --offsite-links behavior will otherwise grab
4 | # all of their homepages.
5 | #
6 | # This homepage ignore won't apply to any of the start URLs given to grab-site.
7 |
8 | ^https?://[^/]+\.tumblr\.com/$
9 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/twitter:
--------------------------------------------------------------------------------
1 | ^https?://((?:www|mobile)\.)?twitter\.com/.+[\?&](?:id|lang|locale|screen_name|nav)=
2 | ^https?://mobile\.twitter\.com/i/anonymize\?data=
3 |
--------------------------------------------------------------------------------
/libgrabsite/ignore_sets/youtube:
--------------------------------------------------------------------------------
1 | \.?youtube\.com/user/[^/]+/(playlists|channels|videos)\?(flow|view|sort|live_view)=
2 |
--------------------------------------------------------------------------------
/libgrabsite/main.py:
--------------------------------------------------------------------------------
1 | import faulthandler
2 | faulthandler.enable()
3 |
4 | import re
5 | import os
6 | import sys
7 | import urllib.request
8 | import shutil
9 | import binascii
10 | import datetime
11 | import shlex
12 | import click
13 | import libgrabsite
14 |
15 | def print_version(ctx, param, value):
16 | if not value or ctx.resilient_parsing:
17 | return
18 | click.echo(libgrabsite.__version__)
19 | ctx.exit()
20 |
21 | def replace_2arg(args, arg, replacement):
22 | 	if arg not in args:
23 | 		return
24 | 	idx = args.index(arg)
25 | args.pop(idx)
26 | args.pop(idx)
27 | for r in reversed(replacement):
28 | args.insert(idx, r)
29 |
30 | def patch_dns_inet_is_multicast():
31 | """
32 | Patch dnspython's dns.inet.is_multicast to not raise ValueError:
33 | https://github.com/ArchiveTeam/grab-site/issues/111
34 | """
35 | import dns.inet
36 | is_multicast_dnspython = dns.inet.is_multicast
37 | def is_multicast(text):
38 | try:
39 | return is_multicast_dnspython(text)
40 | except Exception:
41 | return False
42 | dns.inet.is_multicast = is_multicast
43 |
44 | @click.command()
45 |
46 | @click.option('--concurrency', default=2, metavar='NUM',
47 | help='Use this many connections to fetch in parallel (default: 2).')
48 |
49 | @click.option('--concurrent', default=-1, metavar='NUM',
50 | help='Alias for --concurrency.')
51 |
52 | @click.option('--delay', default="0", metavar='DELAY',
53 | help=
54 | 'Time to wait between requests, in milliseconds (default: 0). '
55 | 'Can be "NUM", or "MIN-MAX" to use a random delay between MIN and MAX '
56 | 'for each request. Delay applies to each concurrent fetcher, not globally.')
57 |
58 | @click.option('--recursive/--1', default=True,
59 | help=
60 | '--recursive (default: true) to crawl under last /path/ component '
61 | 'recursively, or --1 to get just START_URL.')
62 |
63 | @click.option('--offsite-links/--no-offsite-links', default=True,
64 | help=
65 | '--offsite-links (default: true) to grab all links to a depth of 1 '
66 | 'on other domains, or --no-offsite-links to disable.')
67 |
68 | @click.option('--igsets', default="", metavar='LIST',
69 | help='Comma-separated list of ignore sets to use in addition to "global".')
70 |
71 | @click.option('--ignore-sets', default="", metavar='LIST',
72 | help='Alias for --igsets.')
73 |
74 | @click.option('--no-global-igset', is_flag=True,
75 | help='Do not add the "global" ignore set.')
76 |
77 | @click.option('--import-ignores', default=None, metavar='FILE',
78 | help='Copy this file to DIR/ignores before the crawl begins.')
79 |
80 | @click.option('--igon/--igoff', default=False,
81 | help=
82 | '--igon (default: false) to print all URLs being ignored to the terminal '
83 | 'and dashboard.')
84 |
85 | @click.option('--debug', is_flag=True, help='Print a lot of debugging information.')
86 |
87 | @click.option('--video/--no-video', default=True,
88 | help=
89 | '--no-video (default: false) to skip the download of videos by both '
90 | 'mime type and file extension. Skipped videos are logged to '
91 | 'DIR/skipped_videos')
92 |
93 | @click.option('-i', '--input-file', default=None, type=str,
94 | help=
95 | 'Load list of URLs-to-grab from a local file or from a URL; like wget -i. '
96 | 'File must be a newline-delimited list of URLs. '
97 | 'Combine with --1 to avoid a recursive crawl on each URL.')
98 |
99 | @click.option('--max-content-length', default=-1, metavar='N',
100 | help=
101 | "Skip the download of any response that claims a Content-Length "
102 | "larger than N (default: -1, don't skip anything).")
103 |
104 | @click.option('--level', default="inf", metavar='NUM',
105 | help='Recurse this many levels (default: inf).')
106 |
107 | @click.option('--page-requisites-level', default="5", metavar='NUM',
108 | 	help='Recurse this many levels for page requisites (default: 5).')
109 |
110 | @click.option('--warc-max-size', default=5368709120, metavar='BYTES',
111 | help=
112 | 'Try to limit each WARC file to around BYTES bytes before rolling over '
113 | 'to a new WARC file (default: 5368709120, which is 5GiB).')
114 |
115 | @click.option('--ua', default="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
116 | metavar='STRING', help='Send User-Agent: STRING instead of pretending to be Firefox on Windows.')
117 |
118 | @click.option('--wpull-args', default="",
119 | metavar='ARGS', help=
120 | r'String containing additional arguments to pass to wpull; '
121 | r'see ~/.local/bin/wpull --help. ARGS is split with shlex.split '
122 | r'and individual arguments can contain spaces if quoted, e.g. '
123 | r'--wpull-args="--youtube-dl \"--youtube-dl-exe=/My Documents/youtube-dl\""')
124 |
125 | @click.option('--sitemaps/--no-sitemaps', default=True,
126 | help=
127 | '--sitemaps (default: true) to queue URLs from sitemap.xml '
128 | 'at the root of the site, or --no-sitemaps to disable.')
129 |
130 | @click.option('--dupespotter/--no-dupespotter', default=True,
131 | help=
132 | '--dupespotter (default: true) to skip the extraction of links '
133 | 'from pages that look like duplicates of earlier pages, or '
134 | '--no-dupespotter to disable. Disable this for sites that are '
135 | 'directory listings.')
136 |
137 | @click.option('--id', default=None, type=str, metavar='ID',
138 | help=
139 | 'Use id ID for the crawl instead of a random 128-bit id. '
140 | 'This must be unique for every crawl.')
141 |
142 | @click.option('--dir', default=None, type=str, metavar='DIR', help=
143 | 'Put control files, temporary files, and unfinished WARCs in DIR '
144 | '(default: a directory name based on the URL, date, and first 8 '
145 | 'characters of the id).')
146 |
147 | @click.option('--finished-warc-dir', default=None, type=str, metavar='FINISHED_WARC_DIR',
148 | help=
149 | 'Absolute path to a directory into which finished .warc.gz and .cdx '
150 | 'files will be moved.')
151 |
152 | @click.option('--permanent-error-status-codes', default='401,403,404,405,410', type=str,
153 | metavar='STATUS_CODES',
154 | help=
155 | 'A comma-separated list of HTTP status codes to treat as a permanent '
156 | 'error and therefore *not* retry (default: 401,403,404,405,410)')
157 |
158 | @click.option('--which-wpull-args-partial', is_flag=True,
159 | help=
160 | 'Print a partial list of wpull arguments that would be used and exit. '
161 | 'Excludes grab-site-specific features, and removes DIR/ from paths. '
162 | 'Useful for reporting bugs on wpull without grab-site involvement.')
163 |
164 | @click.option('--which-wpull-command', is_flag=True,
165 | help=
166 | "Populate DIR/ but don't start wpull; instead print the command that would "
167 | "have been used to start wpull with all of the grab-site functionality.")
168 |
169 | @click.option('--version', is_flag=True, callback=print_version,
170 | expose_value=False, is_eager=True, help='Print version and exit.')
171 |
172 | @click.argument('start_url', nargs=-1, required=False)
173 |
174 | def main(concurrency, concurrent, delay, recursive, offsite_links, igsets,
175 | ignore_sets, no_global_igset, import_ignores, igon, debug, video, level,
176 | page_requisites_level, max_content_length, sitemaps, dupespotter, warc_max_size,
177 | ua, input_file, wpull_args, start_url, id, dir, finished_warc_dir,
178 | permanent_error_status_codes, which_wpull_args_partial, which_wpull_command):
179 | """
180 | Runs a crawl on one or more URLs. For additional help, see
181 |
182 | https://github.com/ArchiveTeam/grab-site/blob/master/README.md#usage
183 | """
184 | if not (input_file or start_url):
185 | 		print("Neither a START_URL nor --input-file= was specified; see --help", file=sys.stderr)
186 | sys.exit(1)
187 | elif input_file and start_url:
188 | print("Can't specify both START_URL and --input-file=; see --help", file=sys.stderr)
189 | sys.exit(1)
190 |
191 | span_hosts_allow = "page-requisites,linked-pages"
192 | if not offsite_links:
193 | span_hosts_allow = "page-requisites"
194 |
195 | if concurrent != -1:
196 | concurrency = concurrent
197 |
198 | if ignore_sets != "":
199 | igsets = ignore_sets
200 |
201 | if start_url:
202 | claim_start_url = start_url[0]
203 | else:
204 | input_file_is_remote = bool(re.match("^(ftp|https?)://", input_file))
205 | if input_file_is_remote:
206 | claim_start_url = input_file
207 | else:
208 | claim_start_url = 'file://' + os.path.abspath(input_file)
209 |
210 | if not id:
211 | id = binascii.hexlify(os.urandom(16)).decode('utf-8')
212 | ymd = datetime.datetime.utcnow().isoformat()[:10]
213 | no_proto_no_trailing = claim_start_url.split('://', 1)[1].rstrip('/')[:100]
214 | unwanted_chars_re = r'[^-_a-zA-Z0-9%\.,;@+=]'
215 | warc_name = "{}-{}-{}".format(re.sub(unwanted_chars_re, '-', no_proto_no_trailing).lstrip('-'), ymd, id[:8])
216 |
217 | # make absolute because wpull will start in temp/
218 | if not dir:
219 | working_dir = os.path.abspath(warc_name)
220 | else:
221 | working_dir = os.path.abspath(dir)
222 |
223 | LIBGRABSITE = os.path.dirname(libgrabsite.__file__)
224 | args = [
225 | "--debug" if debug else "--quiet",
226 | "-U", ua,
227 | "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
228 | "--header", "Accept-Language: en-US,en;q=0.5",
229 | "--no-check-certificate",
230 | "--no-robots",
231 | "--inet4-only",
232 | "--dns-timeout", "20",
233 | "--connect-timeout", "20",
234 | "--read-timeout", "900",
235 | "--session-timeout", str(86400 * 2),
236 | "--tries", "3",
237 | "--waitretry", "5",
238 | "--max-redirect", "8",
239 | "--output-file", "{}/wpull.log".format(working_dir),
240 | "--database", "{}/wpull.db".format(working_dir),
241 | "--plugin-script", "{}/wpull_hooks.py".format(LIBGRABSITE),
242 | "--save-cookies", "{}/cookies.txt".format(working_dir),
243 | "--delete-after",
244 | "--page-requisites",
245 | "--no-parent",
246 | "--concurrent", str(concurrency),
247 | "--warc-file", "{}/{}".format(working_dir, warc_name),
248 | "--warc-max-size", str(warc_max_size),
249 | "--warc-cdx",
250 | "--strip-session-id",
251 | "--escaped-fragment",
252 | "--level", level,
253 | "--page-requisites-level", page_requisites_level,
254 | "--span-hosts-allow", span_hosts_allow,
255 | "--load-cookies", "{}/default_cookies.txt".format(LIBGRABSITE),
256 | ]
257 |
258 | if os.name != "nt" and sys.platform != "cygwin":
259 | args += [
260 | "--debug-manhole"
261 | ]
262 |
263 | if finished_warc_dir is not None:
264 | args += ["--warc-move", finished_warc_dir]
265 |
266 | if sitemaps:
267 | args += ["--sitemaps"]
268 |
269 | if recursive:
270 | args += ["--recursive"]
271 |
272 | if wpull_args:
273 | args += shlex.split(wpull_args)
274 |
275 | DIR_input_file = os.path.join(working_dir, "input_file")
276 | if start_url:
277 | args.extend(start_url)
278 | else:
279 | args += ["--input-file", DIR_input_file]
280 |
281 | if which_wpull_args_partial:
282 | replace_2arg(args, "--output-file", ["--output-file", "wpull.log"])
283 | replace_2arg(args, "--database", ["--database", "wpull.db"])
284 | replace_2arg(args, "--plugin-script", [])
285 | replace_2arg(args, "--save-cookies", ["--save-cookies", "cookies.txt"])
286 | replace_2arg(args, "--load-cookies", [])
287 | replace_2arg(args, "--warc-file", ["--warc-file", warc_name])
288 | try:
289 | args.remove("--quiet")
290 | except ValueError:
291 | pass
292 | print(" ".join(shlex.quote(a) for a in args))
293 | return
294 |
295 | # Create DIR and DIR files only after which_wpull_args_* checks
296 | os.makedirs(working_dir)
297 | temp_dir = os.path.join(working_dir, "temp")
298 | os.makedirs(temp_dir)
299 |
300 | if input_file is not None:
301 | # wpull -i doesn't support URLs, so download the input file ourselves if necessary
302 | if input_file_is_remote:
303 | # TODO: use wpull with correct user agent instead of urllib.request
304 | # wpull -O fails: https://github.com/chfoo/wpull/issues/275
305 | u = urllib.request.urlopen(input_file)
306 | with open(DIR_input_file, "wb") as f:
307 | while True:
308 | s = u.read(1024 * 1024)
309 | if not s:
310 | break
311 | f.write(s)
312 | else:
313 | shutil.copyfile(input_file, DIR_input_file)
314 |
315 | with open("{}/id".format(working_dir), "w") as f:
316 | f.write(id)
317 |
318 | with open("{}/start_url".format(working_dir), "w") as f:
319 | f.write(claim_start_url)
320 |
321 | with open("{}/all_start_urls".format(working_dir), "w") as f:
322 | for u in start_url:
323 | f.write(u + "\n")
324 |
325 | with open("{}/concurrency".format(working_dir), "w") as f:
326 | f.write(str(concurrency))
327 |
328 | with open("{}/max_content_length".format(working_dir), "w") as f:
329 | f.write(str(max_content_length))
330 |
331 | with open("{}/igsets".format(working_dir), "w") as f:
332 | f.write("{}{}".format("" if no_global_igset else "global,", igsets))
333 |
334 | if video:
335 | with open("{}/video".format(working_dir), "w") as f:
336 | pass
337 |
338 | if not igon:
339 | with open("{}/igoff".format(working_dir), "w") as f:
340 | pass
341 |
342 | with open("{}/ignores".format(working_dir), "w") as f:
343 | if import_ignores is not None:
344 | f.write(open(import_ignores, "r").read())
345 |
346 | with open("{}/delay".format(working_dir), "w") as f:
347 | f.write(delay)
348 |
349 | with open("{}/scrape".format(working_dir), "w") as f:
350 | pass
351 |
352 | # We don't actually need to write control files for this mode to work, but the
353 | # only reason to use this is if you're starting wpull manually with modified
354 | # arguments, and wpull_hooks.py requires the control files.
355 | if which_wpull_command:
356 | bin = sys.argv[0].replace("/grab-site", "/wpull") # TODO
357 | print("GRAB_SITE_WORKING_DIR={} DUPESPOTTER_ENABLED={} {} {}".format(
358 | working_dir, int(dupespotter), bin, " ".join(shlex.quote(a) for a in args)))
359 | return
360 |
361 | patch_dns_inet_is_multicast()
362 |
363 | # Mutate argv, environ, cwd before we turn into wpull
364 | sys.argv[1:] = args
365 | os.environ["GRAB_SITE_WORKING_DIR"] = working_dir
366 | os.environ["DUPESPOTTER_ENABLED"] = "1" if dupespotter else "0"
367 | # We can use --warc-tempdir= to put WARC-related temporary files in a temp
368 | # directory, but wpull also creates non-WARC-related "resp_cb" temporary
369 | # files in the cwd, so we must start wpull in temp/ anyway.
370 | os.chdir(temp_dir)
371 |
372 | # Modify NO_DOCUMENT_STATUS_CODES
373 | # https://github.com/chfoo/wpull/issues/143
374 | from wpull.processor.web import WebProcessor
375 | WebProcessor.NO_DOCUMENT_STATUS_CODES = \
376 | tuple(int(code) for code in permanent_error_status_codes.split(","))
377 |
378 | import wpull.application.main
379 | # Don't let wpull install a handler for SIGINT or SIGTERM,
380 | # because we install our own in wpull_hooks.py.
381 | wpull.application.main.main(use_signals=False)
382 |
383 |
384 | if __name__ == '__main__':
385 | main()
386 |
--------------------------------------------------------------------------------
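
The command-line options defined in main.py above combine freely; a hedged usage sketch (URLs and file names are illustrative):

    # Recursive crawl with 4 fetchers, a random 250-750 ms delay per fetcher,
    # and offsite links disabled.
    grab-site --concurrency 4 --delay 250-750 --no-offsite-links https://example.com/blog/

    # Non-recursive fetch of every URL in a local list, like wget -i.
    grab-site --1 -i urls.txt

    # Print a partial list of the wpull arguments that would be used, then exit.
    grab-site --which-wpull-args-partial https://example.com/
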
/libgrabsite/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import txaio
4 | txaio.use_asyncio()
5 | import os
6 | import json
7 | import pprint
8 | import asyncio
9 | from autobahn.asyncio.websocket import WebSocketServerFactory, WebSocketServerProtocol
10 |
11 | class GrabberServerProtocol(WebSocketServerProtocol):
12 | def __init__(self):
13 | super().__init__()
14 | self.mode = None
15 |
16 | def onConnect(self, request):
17 | self.peer = request.peer
18 | print(f"{self.peer} connected")
19 | self.factory.clients.add(self)
20 |
21 | def onClose(self, wasClean, code, reason):
22 | print(f"{self.peer} disconnected")
23 | self.factory.clients.discard(self)
24 |
25 | def onMessage(self, payload, isBinary):
26 | obj = json.loads(payload.decode("utf-8"))
27 | type = obj["type"]
28 | if self.mode is None and type == "hello" and obj.get("mode"):
29 | mode = obj["mode"]
30 | if mode in ("dashboard", "grabber"):
31 | self.mode = mode
32 | if mode == "grabber":
33 | print(f'{self.peer} is grabbing {obj["url"]}')
34 | elif mode == "dashboard":
35 | user_agent = obj.get("user_agent", "(no User-Agent)")
36 | print(f"{self.peer} is dashboarding with {user_agent}")
37 | elif self.mode == "grabber":
38 | if type == "download":
39 | self.broadcast_to_dashboards({
40 | "type": type,
41 | "job_data": obj["job_data"],
42 | "url": obj["url"],
43 | "response_code": obj["response_code"],
44 | "wget_code": obj["response_message"]
45 | })
46 | elif type in ("stdout", "stderr"):
47 | self.broadcast_to_dashboards({
48 | "type": type,
49 | "job_data": obj["job_data"],
50 | "message": obj["message"]
51 | })
52 | elif type == "ignore":
53 | self.broadcast_to_dashboards({
54 | "type": type,
55 | "job_data": obj["job_data"],
56 | "url": obj["url"],
57 | "pattern": obj["pattern"],
58 | })
59 |
60 | def broadcast_to_dashboards(self, obj):
61 | for client in self.factory.clients:
62 | if client.mode == "dashboard":
63 | client.sendMessage(json.dumps(obj).encode("utf-8"))
64 |
65 | # Called when we get an HTTP request instead of a WebSocket request
66 | def sendServerStatus(self, redirectUrl=None, redirectAfter=0):
67 | requestPath = self.http_request_uri.split("?")[0]
68 | if requestPath == "/":
69 | self.send_page("dashboard.html", 200, "OK", "text/html; charset=UTF-8")
70 | elif requestPath == "/favicon.ico":
71 | self.send_page("favicon.ico", 200, "OK", "image/x-icon")
72 | else:
73 | self.send_page("404.html", 404, "Not Found", "text/html; charset=UTF-8")
74 |
75 | 	# Based on Autobahn's WebSocketServerProtocol.sendHtml
76 | def send_page(self, fname, code, status, content_type):
77 | with open(os.path.join(os.path.dirname(__file__), fname), "rb") as f:
78 | response_body = f.read()
79 | response = f"HTTP/1.1 {code} {status}\r\n"
80 | response += f"Content-Type: {content_type}\r\n"
81 | response += f"Content-Length: {len(response_body)}\r\n"
82 | response += "X-Frame-Options: DENY\r\n"
83 | response += "\r\n"
84 | self.sendData(response.encode("utf-8"))
85 | self.sendData(response_body)
86 |
87 |
88 | class GrabberServerFactory(WebSocketServerFactory):
89 | protocol = GrabberServerProtocol
90 |
91 | def __init__(self):
92 | super().__init__()
93 | self.clients = set()
94 |
95 |
96 | def main():
97 | loop = asyncio.get_event_loop()
98 | ports = list(int(p) for p in os.environ.get("GRAB_SITE_PORT", "29000").split(","))
99 | factory = GrabberServerFactory()
100 | interface = os.environ.get("GRAB_SITE_INTERFACE", "0.0.0.0")
101 | for port in ports:
102 | coro = loop.create_server(factory, interface, port)
103 | loop.run_until_complete(coro)
104 | print(f"grab-site server listening on {interface}:{port}")
105 |
106 | loop.run_forever()
107 |
108 |
109 | if __name__ == "__main__":
110 | main()
111 |
--------------------------------------------------------------------------------
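
The dashboard server above reads its interface and port list from the environment (GRAB_SITE_INTERFACE, GRAB_SITE_PORT); the crawler side reads GRAB_SITE_HOST and GRAB_SITE_PORT in wpull_hooks.py below. A minimal sketch, with the host addresses being illustrative:

    # Serve the dashboard on localhost only, on two ports.
    GRAB_SITE_INTERFACE=127.0.0.1 GRAB_SITE_PORT=29000,29001 gs-server

    # Point a crawl at a dashboard running on another machine.
    GRAB_SITE_HOST=10.0.0.5 GRAB_SITE_PORT=29000 grab-site https://example.com/
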
/libgrabsite/wpull_hooks.py:
--------------------------------------------------------------------------------
1 | import re
2 | import re2
3 | import os
4 | import sys
5 | import time
6 | import signal
7 | import random
8 | import functools
9 | import traceback
10 | import asyncio
11 | import urllib.parse
12 |
13 | from wpull.application.hook import Actions
14 | from wpull.application.plugin import WpullPlugin, PluginFunctions, hook, event
15 | from wpull.pipeline.app import AppSession
16 | from wpull.pipeline.item import URLRecord
17 | from wpull.pipeline.session import ItemSession
18 | from wpull.url import URLInfo
19 |
20 | from libgrabsite import wpull_tweaks, dashboard_client
21 | import libgrabsite
22 |
23 |
24 | working_dir = os.environ["GRAB_SITE_WORKING_DIR"]
25 | def cf(fname):
26 | return os.path.join(working_dir, fname)
27 |
28 | def re_compile(regexp):
29 | # Validate with re first, because re2 may be more prone to segfaulting on
30 | # bad regexps, and because re returns useful errors.
31 | re.compile(regexp)
32 | try:
33 | return re2.compile(regexp)
34 | except re.error:
35 | # Regular expressions with lookaround expressions cannot be compiled with
36 | # re2, so on error try compiling with re.
37 | return re.compile(regexp)
38 |
39 | def compile_combined_regexp(patterns):
40 | # If there are no patterns, we want to ignore nothing, not everything.
41 | if not patterns:
42 | return re_compile("$^")
43 | regexp = "|".join(map(lambda pattern: f"({pattern})", patterns))
44 | return re_compile(regexp)
45 |
46 | def include_ignore_line(line):
47 | return line and not line.startswith("#")
48 |
49 | ignore_sets_path = os.path.join(os.path.dirname(libgrabsite.__file__), "ignore_sets")
50 | def get_patterns_for_ignore_set(name: str):
51 | assert name != "", name
52 | with open(os.path.join(ignore_sets_path, name), "r", encoding="utf-8") as f:
53 | return f.read().strip("\n").split("\n")
54 |
55 | def swallow_exception(f):
56 | @functools.wraps(f)
57 | def wrapper(*args, **kwargs):
58 | try:
59 | return f(*args, **kwargs)
60 | except Exception:
61 | traceback.print_exc()
62 | return wrapper
63 |
64 | CONTROL_FILE_CACHE_SEC = 1.5
65 |
66 | def caching_decorator(f):
67 | cache = {}
68 | @functools.wraps(f)
69 | def wrapper(path):
70 | timestamp, val = cache.get(path, (-CONTROL_FILE_CACHE_SEC, None))
71 | if timestamp > (time.monotonic() - CONTROL_FILE_CACHE_SEC):
72 | #print(f"returning cached value {path} {val}")
73 | return val
74 | val = f(path)
75 | cache[path] = (time.monotonic(), val)
76 | #print(f"returning new value {path} {val}")
77 | return val
78 | return wrapper
79 |
80 | @caching_decorator
81 | def path_exists_with_cache(path):
82 | return os.path.exists(path)
83 |
84 | @caching_decorator
85 | def mtime_with_cache(path):
86 | return os.stat(path).st_mtime
87 |
88 | class FileChangedWatcher(object):
89 | def __init__(self, fname):
90 | self.fname = fname
91 | # Use a bogus mtime so that has_changed() returns True
92 | # at least once
93 | self.last_mtime = -1
94 |
95 | def has_changed(self):
96 | now_mtime = mtime_with_cache(self.fname)
97 | changed = now_mtime != self.last_mtime
98 | self.last_mtime = now_mtime
99 | if changed:
100 | print(f"Imported {self.fname}")
101 | return changed
102 |
103 |
104 | ICY_FIELD_PATTERN = re2.compile("(?i)^icy-|ice-|x-audiocast-")
105 | ICY_VALUE_PATTERN = re2.compile("(?i)^icecast")
106 |
107 | def get_content_length(response) -> int:
108 | try:
109 | return int(list(p for p in response.fields.get_all() if p[0] == "Content-Length")[0][1])
110 | except (IndexError, ValueError):
111 | return -1
112 |
113 | def has_content_type_video(response) -> bool:
114 | try:
115 | t = list(p for p in response.fields.get_all() if p[0] == "Content-Type")[0][1]
116 | return t.lower().startswith("video/")
117 | except (IndexError, ValueError):
118 | return False
119 |
120 | def response_status_code(response) -> int:
121 | statcode = 0
122 |
123 | try:
124 | # duck typing: assume the response is
125 | # wpull.protocol.http.request.Response
126 | statcode = response.status_code
127 | except (AttributeError, KeyError):
128 | pass
129 |
130 | try:
131 | # duck typing: assume the response is
132 | # wpull.protocol.ftp.request.Response
133 | statcode = response.reply.code
134 | except (AttributeError, KeyError):
135 | pass
136 |
137 | return statcode
138 |
139 | # Excluded vob, mpeg, mpg, avi because they are not found on the general web
140 | video_exts = set("webm mp4 m4v mkv ts 3gp 3g2 flv mov wmv ogv ogm".split(" "))
141 |
142 | def has_video_ext(url: str) -> bool:
143 | ext = url.rsplit(".")[-1]
144 | return ext.lower() in video_exts
145 |
146 | class GrabSitePlugin(WpullPlugin):
147 | def activate(self):
148 | wpull_tweaks.activate(self.app_session)
149 | self.loop = asyncio.get_event_loop()
150 | self.enable_stdio_capture()
151 | self.add_signal_handlers()
152 | self.init_job_data()
153 | self.init_ws()
154 | self.setup_watchers()
155 | self.all_start_urls = open(cf("all_start_urls")).read().rstrip("\n").split("\n")
156 | self.all_start_netlocs = set(urllib.parse.urlparse(url).netloc for url in self.all_start_urls)
157 | self.skipped_videos = open(cf("skipped_videos"), "w", encoding="utf-8")
158 | self.skipped_max_content_length = open(cf("skipped_max_content_length"), "w", encoding="utf-8")
159 | self.update_ignores()
160 | super().activate()
161 |
162 | def enable_stdio_capture(self):
163 | self.real_stdout_write = sys.stdout.buffer.write
164 | self.real_stderr_write = sys.stderr.buffer.write
165 | sys.stdout.buffer.write = self.stdout_write_both
166 | sys.stderr.buffer.write = self.stderr_write_both
167 |
168 | def print_to_terminal(self, s):
169 | self.real_stdout_write((s + "\n").encode("utf-8"))
170 | sys.stdout.buffer.flush()
171 |
172 | def graceful_stop_callback(self):
173 | self.print_to_terminal("\n^C detected, creating 'stop' file, please wait for exit...")
174 | with open(cf("stop"), "wb") as _f:
175 | pass
176 |
177 | def forceful_stop_callback(self):
178 | self.loop.stop()
179 |
180 | def add_signal_handlers(self):
181 | try:
182 | self.loop.add_signal_handler(signal.SIGINT, self.graceful_stop_callback)
183 | self.loop.add_signal_handler(signal.SIGTERM, self.forceful_stop_callback)
184 | except NotImplementedError:
185 | # Not supported on Windows
186 | pass
187 |
188 | def setup_watchers(self):
189 | self.watchers = {}
190 | for f in ["igsets", "ignores", "delay", "concurrency", "max_content_length"]:
191 | self.watchers[f] = FileChangedWatcher(cf(f))
192 |
193 | def put_ws_queue(self, obj):
194 | try:
195 | self.ws_queue.put_nowait(obj)
196 | except asyncio.QueueFull:
197 | pass
198 |
199 | def stdout_write_both(self, message):
200 | assert isinstance(message, bytes), message
201 | try:
202 | self.real_stdout_write(message)
203 | self.put_ws_queue({
204 | "type": "stdout",
205 | "job_data": self.job_data,
206 | "message": message.decode("utf-8")
207 | })
208 | except Exception as e:
209 | self.real_stderr_write((str(e) + "\n").encode("utf-8"))
210 |
211 | def stderr_write_both(self, message):
212 | assert isinstance(message, bytes), message
213 | try:
214 | self.real_stderr_write(message)
215 | self.put_ws_queue({
216 | "type": "stderr",
217 | "job_data": self.job_data,
218 | "message": message.decode("utf-8")
219 | })
220 | except Exception as e:
221 | self.real_stderr_write((str(e) + "\n").encode("utf-8"))
222 |
223 | def init_job_data(self):
224 | self.job_data = {
225 | "ident": open(cf("id")).read().strip(),
226 | "url": open(cf("start_url")).read().strip(),
227 | "started_at": os.stat(cf("start_url")).st_mtime,
228 | "max_content_length": -1,
229 | "suppress_ignore_reports": True,
230 | "video": True,
231 | "scrape": True,
232 | "concurrency": 2,
233 | "bytes_downloaded": 0,
234 | "items_queued": 0,
235 | "items_downloaded": 0,
236 | "delay_min": 0,
237 | "delay_max": 0,
238 | "r1xx": 0,
239 | "r2xx": 0,
240 | "r3xx": 0,
241 | "r4xx": 0,
242 | "r5xx": 0,
243 | "runk": 0,
244 | }
245 |
246 | def init_ws(self):
247 | self.ws_queue = asyncio.Queue(maxsize=250)
248 |
249 | ws_host = os.environ.get("GRAB_SITE_HOST", "127.0.0.1")
250 | ws_port = int(os.environ.get("GRAB_SITE_PORT", 29000))
251 | ws_url = f"ws://{ws_host}:{ws_port}"
252 |
253 | self.loop.create_task(dashboard_client.sender(self, ws_url))
254 |
255 | @swallow_exception
256 | def update_max_content_length(self):
257 | if not self.watchers["max_content_length"].has_changed():
258 | return
259 | with open(self.watchers["max_content_length"].fname, "r") as f:
260 | self.job_data["max_content_length"] = int(f.read().strip())
261 |
262 | @swallow_exception
263 | def update_delay(self):
264 | if not self.watchers["delay"].has_changed():
265 | return
266 | with open(self.watchers["delay"].fname, "r") as f:
267 | content = f.read().strip()
268 | if "-" in content:
269 | self.job_data["delay_min"], self.job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
270 | else:
271 | self.job_data["delay_min"] = self.job_data["delay_max"] = int(content)
272 |
273 | @swallow_exception
274 | def update_concurrency(self):
275 | if not self.watchers["concurrency"].has_changed():
276 | return
277 | with open(self.watchers["concurrency"].fname, "r") as f:
278 | concurrency = int(f.read().strip())
279 | if concurrency < 1:
280 | print(f"Warning: using 1 for concurrency instead of {concurrency} because it cannot be < 1")
281 | concurrency = 1
282 | self.job_data["concurrency"] = concurrency
283 | self.app_session.factory["PipelineSeries"].concurrency = concurrency
284 |
285 | stop_path = cf("stop")
286 | def should_stop(self):
287 | return path_exists_with_cache(self.stop_path)
288 |
289 | def should_ignore_url(self, url, record_info):
290 | return self.combined_ignore_regexp.search(url)
291 |
292 | igoff_path = cf("igoff")
293 | def update_igoff(self):
294 | self.job_data["suppress_ignore_reports"] = path_exists_with_cache(self.igoff_path)
295 |
296 | video_path = cf("video")
297 | def update_video(self):
298 | self.job_data["video"] = path_exists_with_cache(self.video_path)
299 |
300 | scrape_path = cf("scrape")
301 | @swallow_exception
302 | def update_scrape(self):
303 | scrape = path_exists_with_cache(self.scrape_path)
304 | self.job_data["scrape"] = scrape
305 | if not scrape:
306 | # Empty the list of scrapers, which will stop scraping for new URLs
307 | # but still keep going through what is already in the queue.
308 | self.app_session.factory["DemuxDocumentScraper"]._document_scrapers = []
309 |
310 | @swallow_exception
311 | def update_ignores(self):
312 | if not (self.watchers["igsets"].has_changed() or self.watchers["ignores"].has_changed()):
313 | return
314 |
315 | ignores = set()
316 |
317 | with open(cf("igsets"), "r") as f:
318 | igsets = f.read().strip("\r\n\t ,").split(',')
319 | if igsets == [""]:
320 | igsets = []
321 |
322 | for igset in igsets:
323 | for pattern in get_patterns_for_ignore_set(igset):
324 | if include_ignore_line(pattern):
325 | ignores.update(self.ignore_pattern_to_regexp_strings(pattern))
326 |
327 | with open(cf("ignores"), "r") as f:
328 | lines = f.read().strip("\n").split("\n")
329 | for pattern in lines:
330 | if include_ignore_line(pattern):
331 | ignores.update(self.ignore_pattern_to_regexp_strings(pattern))
332 |
333 | self.print_to_terminal(f"Using these {len(ignores)} ignores:")
334 | for ig in sorted(ignores):
335 | self.print_to_terminal(f"\t{ig}")
336 |
337 | self.compiled_ignores = [(ig, re_compile(ig)) for ig in ignores]
338 | self.combined_ignore_regexp = compile_combined_regexp(ignores)
339 |
340 | def ignore_pattern_to_regexp_strings(self, pattern):
341 | if "{any_start_netloc}" not in pattern:
342 | return [pattern]
343 |
344 | return [pattern.replace("{any_start_netloc}", re.escape(netloc)) for netloc in self.all_start_netlocs]
345 |
346 | def get_specific_ignore_pattern(self, url):
347 | for pattern, regexp in self.compiled_ignores:
348 | if regexp.search(url):
349 | # We can't use regexp.pattern because that quickly causes segfaults
350 | return pattern
351 |
352 | @hook(PluginFunctions.accept_url)
353 | def accept_url(self, item_session: ItemSession, verdict: bool, reasons: dict):
354 | record_info = item_session.url_record
355 | url_info = item_session.request.url_info
356 | url = url_info.raw
357 |
358 | self.update_ignores()
359 |
360 | if url.startswith("data:"):
361 | # data: URLs aren't something you can grab, so drop them to avoid ignore
362 | # checking and ignore logging.
363 | return False
364 |
365 | # Don't apply ignores to any of the start URLs
366 | if url in self.all_start_urls:
367 | # Return original verdict instead of True to avoid infinite retries
368 | return verdict
369 |
370 | should_ignore = self.should_ignore_url(url, record_info)
371 | if should_ignore:
372 | if not self.job_data["suppress_ignore_reports"]:
373 | pattern = self.get_specific_ignore_pattern(url)
374 | self.maybe_log_ignore(url, pattern)
375 | return False
376 |
377 | # If we get here, none of our ignores apply. Return the original verdict.
378 | return verdict
379 |
380 | def handle_result(self, url_info, record_info, error_info, response):
381 | self.update_igoff()
382 |
383 | self.job_data["bytes_downloaded"] += wpull_tweaks.response_body_size(response)
384 |
385 | response_code = 0
386 | response_message = ""
387 | if error_info:
388 | response_message = str(error_info)
389 | elif response:
390 | response_code = response_status_code(response)
391 | response_message = response.reason
392 | response_code_str = str(response_code)
393 |
394 | if len(response_code_str) == 3 and response_code_str[0] in "12345":
395 | self.job_data[f"r{response_code_str[0]}xx"] += 1
396 | else:
397 | self.job_data["runk"] += 1
398 |
399 | self.put_ws_queue({
400 | "type": "download",
401 | "job_data": self.job_data,
402 | "url": url_info.raw,
403 | "response_code": response_code,
404 | "response_message": response_message,
405 | })
406 |
407 | if self.should_stop():
408 | return Actions.STOP
409 |
410 | return Actions.NORMAL
411 |
412 | def maybe_log_ignore(self, url, pattern):
413 | if not self.job_data["suppress_ignore_reports"]:
414 | self.print_to_terminal(f"IGNOR {url}\n by {pattern}")
415 | self.put_ws_queue({
416 | "type": "ignore",
417 | "job_data": self.job_data,
418 | "url": url,
419 | "pattern": pattern
420 | })
421 |
422 | @event(PluginFunctions.queued_url)
423 | def queued_url(self, _url_info: URLInfo):
424 | self.job_data["items_queued"] += 1
425 |
426 | @event(PluginFunctions.dequeued_url)
427 | def dequeued_url(self, _url_info: URLInfo, _record_info: URLRecord):
428 | self.job_data["items_downloaded"] += 1
429 |
430 | @hook(PluginFunctions.handle_response)
431 | def handle_response(self, item_session: ItemSession):
432 | url_info = item_session.request.url_info
433 | record_info = item_session.url_record
434 | response = item_session.response
435 | error_info = None
436 | return self.handle_result(url_info, record_info, error_info, response)
437 |
438 | @hook(PluginFunctions.handle_error)
439 | def handle_error(self, item_session: ItemSession, error_info: BaseException):
440 | url_info = item_session.request.url_info
441 | record_info = item_session.url_record
442 | response = item_session.response
443 | return self.handle_result(url_info, record_info, error_info, response)
444 |
445 | @hook(PluginFunctions.handle_pre_response)
446 | def handle_pre_response(self, item_session: ItemSession):
447 | url_info = item_session.request.url_info
448 | response = item_session.response
449 | self.update_scrape()
450 |
451 | url = url_info.raw
452 |
453 | self.update_max_content_length()
454 | limit = self.job_data["max_content_length"]
455 | if limit != -1:
456 | length = get_content_length(response)
457 | if length > limit:
458 | self.skipped_max_content_length.write(url + "\n")
459 | self.skipped_max_content_length.flush()
460 | self.maybe_log_ignore(url, f"[content-length {length} over limit {limit}]")
461 | return Actions.FINISH
462 |
463 | self.update_video()
464 | if not self.job_data["video"]:
465 | if has_content_type_video(response) or has_video_ext(url):
466 | self.skipped_videos.write(url + "\n")
467 | self.skipped_videos.flush()
468 | self.maybe_log_ignore(url, "[video]")
469 | return Actions.FINISH
470 |
471 | # Check if server version starts with ICY
472 | if response.version == "ICY":
473 | self.maybe_log_ignore(url, "[icy version]")
474 | return Actions.FINISH
475 |
476 | # Loop through all the server headers for matches
477 | for field, value in response.fields.get_all():
478 | if ICY_FIELD_PATTERN.match(field):
479 | self.maybe_log_ignore(url, "[icy field]")
480 | return Actions.FINISH
481 |
482 | if field == "Server" and ICY_VALUE_PATTERN.match(value):
483 | self.maybe_log_ignore(url, "[icy server]")
484 | return Actions.FINISH
485 |
486 | # Nothing matched, allow download
487 | self.print_to_terminal(url + " ...")
488 | return Actions.NORMAL
489 |
490 | @hook(PluginFunctions.exit_status)
491 | def exit_status(self, _app_session: AppSession, code: int) -> int:
492 | print()
493 | print(f'Finished grab {self.job_data["ident"]} {self.job_data["url"]} with exit code {code}')
494 | print(f"Output is in directory:\n{working_dir}")
495 | return code
496 |
497 | @hook(PluginFunctions.wait_time)
498 | def wait_time(self, _seconds: float, _item_session: ItemSession, _error):
499 | self.update_delay()
500 | self.update_concurrency()
501 | return random.uniform(self.job_data["delay_min"], self.job_data["delay_max"]) / 1000
502 |
503 | @event(PluginFunctions.get_urls)
504 | def get_urls(self, item_session: ItemSession):
505 | url_info = item_session.request.url_info
506 | url = url_info.raw
507 | extra_urls = None
508 | # If we see this URL, also queue the URL for the :orig quality image
509 | if url.startswith("https://pbs.twimg.com/media/"):
510 | new_url = re.sub(":[a-z]{1,10}$", "", url) + ":orig"
511 | # see wpull/item.py:LinkType
512 | extra_urls = [dict(url=new_url, link_type="media", inline=True)]
513 | # Quora shows login-required screen unless you add ?share=1
514 | elif url.startswith("https://www.quora.com/") and not "?" in url:
515 | new_url = url + "?share=1"
516 | extra_urls = [dict(url=new_url, link_type="html")]
517 | return extra_urls
518 |
--------------------------------------------------------------------------------
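
The plugin above periodically re-reads a set of control files in the crawl's working directory (see setup_watchers, path_exists_with_cache, and should_stop), so a running crawl can be adjusted from another terminal. A hedged sketch, where DIR stands for the crawl's working directory and the directory name and pattern are illustrative:

    DIR=example.com-2024-01-01-abcd1234

    echo 6 > "$DIR/concurrency"           # change the number of fetchers
    echo 500-2000 > "$DIR/delay"          # random 500-2000 ms delay per fetcher
    echo '^https?://example\.com/calendar/' >> "$DIR/ignores"   # add an ignore pattern
    rm "$DIR/igoff"                       # start reporting ignored URLs
    touch "$DIR/stop"                     # finish current items and stop, like a single ^C
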
/libgrabsite/wpull_tweaks.py:
--------------------------------------------------------------------------------
1 | import os
2 | import hashlib
3 | import functools
4 |
5 | from wpull.database.sqltable import SQLiteURLTable
6 | from wpull.document.html import HTMLReader
7 | from wpull.processor.rule import ProcessingRule
8 |
9 | from libgrabsite import dupespotter, __version__
10 | from libgrabsite.dupes import DupesOnDisk
11 |
12 |
13 | def response_body_size(response) -> int:
14 | try:
15 | return response.body.size()
16 | except Exception:
17 | return 0
18 |
19 | class NoFsyncSQLTable(SQLiteURLTable):
20 | @classmethod
21 | def _apply_pragmas_callback(cls, connection, record):
22 | super()._apply_pragmas_callback(connection, record)
23 | connection.execute('PRAGMA synchronous=OFF')
24 |
25 |
26 | class DupeSpottingProcessingRule(ProcessingRule):
27 | def __init__(self, *args, **kwargs):
28 | self.dupes_db = kwargs.pop('dupes_db', None)
29 | super().__init__(*args, **kwargs)
30 |
31 | def scrape_document(self, item_session):
32 | response = item_session.response
33 | url_info = item_session.request.url_info
34 | url = url_info.raw
35 |
36 | if response_body_size(response) < 30 * 1024 * 1024:
37 | dupes_db = self.dupes_db
38 | body = response.body.content()
39 | if HTMLReader.is_response(response):
40 | body = dupespotter.process_body(body, url)
41 | digest = hashlib.md5(body).digest()
42 | if dupes_db is not None:
43 | dupe_of = dupes_db.get_old_url(digest)
44 | else:
45 | dupe_of = None
46 | if dupe_of is not None:
47 | # Don't extract links from pages we've already seen
48 | # to avoid loops that descend a directory endlessly
49 | print("DUPE {}\n OF {}".format(url, dupe_of))
50 | return
51 | else:
52 | if dupes_db is not None:
53 | dupes_db.set_old_url(digest, url)
54 |
55 | super().scrape_document(item_session)
56 |
57 |
58 | def activate(app_session):
59 | app_session.factory.class_map['URLTableImplementation'] = NoFsyncSQLTable
60 |
61 | warc_recorder_cls = app_session.factory.class_map['WARCRecorder']
62 | warc_recorder_cls.DEFAULT_SOFTWARE_STRING = f'grab-site/{__version__} ' + warc_recorder_cls.DEFAULT_SOFTWARE_STRING
63 |
64 | if int(os.environ["DUPESPOTTER_ENABLED"]):
65 | dupes_db_location = os.path.join(os.environ["GRAB_SITE_WORKING_DIR"], "dupes_db")
66 | dupes_db = DupesOnDisk(dupes_db_location)
67 | app_session.factory.class_map['ProcessingRule'] = \
68 | functools.partial(DupeSpottingProcessingRule, dupes_db=dupes_db)
69 |
--------------------------------------------------------------------------------
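
activate() above is configured through two environment variables, GRAB_SITE_WORKING_DIR and DUPESPOTTER_ENABLED, which grab-site sets in main.py before handing control to wpull. Setting them by hand is only needed when starting wpull manually; --which-wpull-command prints the full command line including these variables. A sketch (the printed values are illustrative):

    grab-site --which-wpull-command https://example.com/
    # Prints something of the form:
    # GRAB_SITE_WORKING_DIR=<DIR> DUPESPOTTER_ENABLED=1 <path-to-wpull> --quiet -U ...
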
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | try:
4 | from setuptools import setup
5 | except ImportError:
6 | from distutils.core import setup
7 |
8 | import os
9 | import sys
10 | import libgrabsite
11 |
12 | install_requires = [
13 | "click>=6.3",
14 | "wpull @ https://github.com/ArchiveTeam/ludios_wpull/archive/refs/tags/3.0.9.zip",
15 | "manhole>=1.0.0",
16 | "lmdb>=0.89",
17 | "autobahn>=0.12.1",
18 | "google-re2>=1.0.6",
19 | "websockets>=6.0",
20 | ]
21 |
22 | if 'GRAB_SITE_NO_CCHARDET' not in os.environ:
23 | install_requires.append("cchardet>=1.0.0")
24 |
25 | setup(
26 | name="grab-site",
27 | version=libgrabsite.__version__,
28 | description="The archivist's web crawler: WARC output, dashboard for all crawls, dynamic ignore patterns",
29 | url="https://ludios.org/grab-site/",
30 | author="Ivan Kozik",
31 | author_email="ivan@ludios.org",
32 | classifiers=[
33 | "Programming Language :: Python :: 3",
34 | "Development Status :: 5 - Production/Stable",
35 | "Intended Audience :: End Users/Desktop",
36 | "License :: OSI Approved :: MIT License",
37 | "Topic :: Internet :: WWW/HTTP",
38 | ],
39 | scripts=["grab-site", "gs-server", "gs-dump-urls"],
40 | packages=["libgrabsite"],
41 | package_data={"libgrabsite": ["*.html", "*.ico", "*.txt", "ignore_sets/*"]},
42 | install_requires=install_requires,
43 | )
44 |
--------------------------------------------------------------------------------
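
An install sketch based on setup.py above; setting GRAB_SITE_NO_CCHARDET skips the optional cchardet requirement, per the environment check in install_requires:

    # From a checkout of the repository:
    pip install --upgrade .

    # Or, without the optional cchardet dependency:
    GRAB_SITE_NO_CCHARDET=1 pip install --upgrade .
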
/tests/offline-tests:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eu -o pipefail -o verbose
4 |
5 | grab-site --help
6 | grab-site --version
7 | gs-dump-urls --help
8 | python -c 'import libgrabsite.server'
9 |
--------------------------------------------------------------------------------
/tests/online-tests:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eu -o pipefail -o verbose
4 |
5 | server_log=$(mktemp)
6 | python -u -m http.server 0 > "$server_log" &
7 | port=
8 | # Try until server starts up
9 | while [[ "$port" = "" ]]; do
10 | port=$(grep -P -o 'port \d+' "$server_log" | cut -d ' ' -f 2 || true)
11 | sleep 0.1
12 | done
13 | server="http://127.0.0.1:$port/"
14 |
15 | trash=$(mktemp -d)
16 | cd "$trash"
17 |
18 | nodnspython=--wpull-args=--no-skip-getaddrinfo
19 |
20 | grab-site --1 "$server"
21 | grab-site $nodnspython --1 "$server"
22 | ls -l 127.0.0.1*
23 | for i in 127.0.0.1*/wpull.db; do
24 | gs-dump-urls "$i" done
25 | done
26 | grab-site $nodnspython --1 --permanent-error-status-codes=404 "$server"
27 | echo '.*' > ignores
28 | grab-site $nodnspython --import-ignores ignores "$server"
29 | grab-site $nodnspython --1 --id my-id --no-dupespotter --no-video --concurrent 3 "$server" "$server/another"
30 | # TODO: test -i with remote URL list
31 | echo "$server" > local-url-list
32 | grab-site $nodnspython --1 -i local-url-list
33 | grab-site $nodnspython --1 -i local-url-list --which-wpull-args-partial
34 | grab-site $nodnspython --1 -i local-url-list --which-wpull-command
35 |
36 | # kill http.server
37 | kill $!
38 |
--------------------------------------------------------------------------------