17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Nick Sweeting
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/bin/export-browser-history:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"; cd .. && pwd )"
4 |
5 | if [[ "$1" == "--chrome" ]]; then
6 | # Google Chrome / Chromium
7 | default=$(ls ~/Library/Application\ Support/Google/Chrome/Default/History)
8 | if [[ -e "$2" ]]; then
9 | cp "$2" "$REPO_DIR/output/sources/chrome_history.db.tmp"
10 | else
11 | echo "Defaulting to history db: $default"
12 | echo "Optionally specify the path to a different sqlite history database as the 2nd argument."
13 | cp "$default" "$REPO_DIR/output/sources/chrome_history.db.tmp"
14 | fi
15 | sqlite3 "$REPO_DIR/output/sources/chrome_history.db.tmp" "SELECT \"[\" || group_concat(json_object('timestamp', last_visit_time, 'description', title, 'href', url)) || \"]\" FROM urls;" > "$REPO_DIR/output/sources/chrome_history.json"
16 | rm "$REPO_DIR"/output/sources/chrome_history.db.*
17 | echo "Chrome history exported to:"
18 | echo " output/sources/chrome_history.json"
19 | fi
20 |
21 | if [[ "$1" == "--firefox" ]]; then
22 | # Firefox
23 | default=$(ls ~/Library/Application\ Support/Firefox/Profiles/*.default/places.sqlite)
24 | if [[ -e "$2" ]]; then
25 | cp "$2" "$REPO_DIR/output/sources/firefox_history.db.tmp"
26 | else
27 | echo "Defaulting to history db: $default"
28 | echo "Optionally specify the path to a different sqlite history database as the 2nd argument."
29 | cp "$default" "$REPO_DIR/output/sources/firefox_history.db.tmp"
30 | fi
31 | sqlite3 "$REPO_DIR/output/sources/firefox_history.db.tmp" "SELECT \"[\" || group_concat(json_object('timestamp', last_visit_date, 'description', title, 'href', url)) || \"]\" FROM moz_places;" > "$REPO_DIR/output/sources/firefox_history.json"
32 | rm "$REPO_DIR"/output/sources/firefox_history.db.*
33 | echo "Firefox history exported to:"
34 | echo " output/sources/firefox_history.json"
35 | fi
36 |
--------------------------------------------------------------------------------
/archiver/templates/link_index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | $title
5 |
6 |
7 |
8 |
9 |
35 |
--------------------------------------------------------------------------------
/archiver/archive_methods.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from functools import wraps
5 | from collections import defaultdict
6 | from datetime import datetime
7 | from subprocess import run, PIPE, DEVNULL
8 |
9 | from peekable import Peekable
10 |
11 | from index import wget_output_path, parse_json_link_index, write_link_index
12 | from links import links_after_timestamp
13 | from config import (
14 | CHROME_BINARY,
15 | FETCH_WGET,
16 | FETCH_WGET_REQUISITES,
17 | FETCH_PDF,
18 | FETCH_SCREENSHOT,
19 | FETCH_DOM,
20 | RESOLUTION,
21 | CHECK_SSL_VALIDITY,
22 | SUBMIT_ARCHIVE_DOT_ORG,
23 | FETCH_AUDIO,
24 | FETCH_VIDEO,
25 | FETCH_FAVICON,
26 | WGET_USER_AGENT,
27 | CHROME_USER_DATA_DIR,
28 | TIMEOUT,
29 | ANSI,
30 | ARCHIVE_DIR,
31 | )
32 | from util import (
33 | check_dependencies,
34 | progress,
35 | chmod_file,
36 | pretty_path,
37 | )
38 |
39 |
40 | _RESULTS_TOTALS = { # globals are bad, mmkay
41 | 'skipped': 0,
42 |     'succeeded': 0,
43 | 'failed': 0,
44 | }
45 |
46 | def archive_links(archive_path, links, source=None, resume=None):
47 | check_dependencies()
48 |
49 | to_archive = Peekable(links_after_timestamp(links, resume))
50 | idx, link = 0, to_archive.peek(0)
51 |
52 | try:
53 | for idx, link in enumerate(to_archive):
54 | link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
55 | archive_link(link_dir, link)
56 |
57 | except (KeyboardInterrupt, SystemExit, Exception) as e:
58 | print('{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
59 | **ANSI,
60 | now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
61 | idx=idx+1,
62 | timestamp=link['timestamp'],
63 | total=len(links),
64 | ))
65 | print(' Continue where you left off by running:')
66 | print(' {} {}'.format(
67 | pretty_path(sys.argv[0]),
68 | link['timestamp'],
69 | ))
70 | if not isinstance(e, KeyboardInterrupt):
71 | raise e
72 | raise SystemExit(1)
73 |
74 |
75 | def archive_link(link_dir, link, overwrite=True):
76 | """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
77 |
78 | update_existing = os.path.exists(link_dir)
79 | if update_existing:
80 | link = {
81 | **parse_json_link_index(link_dir),
82 | **link,
83 | }
84 | else:
85 | os.makedirs(link_dir)
86 |
87 | log_link_archive(link_dir, link, update_existing)
88 |
89 | if FETCH_WGET:
90 | link = fetch_wget(link_dir, link, overwrite=overwrite)
91 |
92 | if FETCH_PDF:
93 | link = fetch_pdf(link_dir, link, overwrite=overwrite)
94 |
95 | if FETCH_SCREENSHOT:
96 | link = fetch_screenshot(link_dir, link, overwrite=overwrite)
97 |
98 | if FETCH_DOM:
99 | link = fetch_dom(link_dir, link, overwrite=overwrite)
100 |
101 | if SUBMIT_ARCHIVE_DOT_ORG:
102 | link = archive_dot_org(link_dir, link, overwrite=overwrite)
103 |
104 | # if FETCH_AUDIO:
105 | # link = fetch_audio(link_dir, link, overwrite=overwrite)
106 |
107 | # if FETCH_VIDEO:
108 | # link = fetch_video(link_dir, link, overwrite=overwrite)
109 |
110 | if FETCH_FAVICON:
111 | link = fetch_favicon(link_dir, link, overwrite=overwrite)
112 |
113 | write_link_index(link_dir, link)
114 | # print()
115 |
116 | return link
117 |
118 | def log_link_archive(link_dir, link, update_existing):
119 | print('[{symbol_color}{symbol}{reset}] [{now}] "{title}"\n {blue}{url}{reset}'.format(
120 | symbol='*' if update_existing else '+',
121 | symbol_color=ANSI['black' if update_existing else 'green'],
122 | now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
123 | **link,
124 | **ANSI,
125 | ))
126 |
127 | print(' > {}{}'.format(pretty_path(link_dir), '' if update_existing else ' (new)'))
128 | if link['type']:
129 | print(' i {}'.format(link['type']))
130 |
131 |
132 |
133 | def attach_result_to_link(method):
134 | """
135 | Instead of returning a result={output:'...', status:'success'} object,
136 |     attach that result to the link's history & latest fields, then return
137 | the updated link object.
138 | """
139 | def decorator(fetch_func):
140 | @wraps(fetch_func)
141 | def timed_fetch_func(link_dir, link, overwrite=False, **kwargs):
142 | # initialize methods and history json field on link
143 | link['latest'] = link.get('latest') or {}
144 | link['latest'][method] = link['latest'].get(method) or None
145 | link['history'] = link.get('history') or {}
146 | link['history'][method] = link['history'].get(method) or []
147 |
148 | start_ts = datetime.now().timestamp()
149 |
150 |             # if a valid method output is already present, don't run the fetch function
151 | if link['latest'][method] and not overwrite:
152 | print(' √ {}'.format(method))
153 | result = None
154 | else:
155 | print(' > {}'.format(method))
156 | result = fetch_func(link_dir, link, **kwargs)
157 |
158 | end_ts = datetime.now().timestamp()
159 | duration = str(end_ts * 1000 - start_ts * 1000).split('.')[0]
160 |
161 | # append a history item recording fail/success
162 | history_entry = {
163 | 'timestamp': str(start_ts).split('.')[0],
164 | }
165 | if result is None:
166 | history_entry['status'] = 'skipped'
167 | elif isinstance(result.get('output'), Exception):
168 | history_entry['status'] = 'failed'
169 | history_entry['duration'] = duration
170 | history_entry.update(result or {})
171 | link['history'][method].append(history_entry)
172 | else:
173 |                 history_entry['status'] = 'succeeded'
174 | history_entry['duration'] = duration
175 | history_entry.update(result or {})
176 | link['history'][method].append(history_entry)
177 | link['latest'][method] = result['output']
178 |
179 | _RESULTS_TOTALS[history_entry['status']] += 1
180 |
181 | return link
182 | return timed_fetch_func
183 | return decorator
184 |
185 |
186 | @attach_result_to_link('wget')
187 | def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, timeout=TIMEOUT):
188 | """download full site using wget"""
189 |
190 | domain_dir = os.path.join(link_dir, link['domain'])
191 | existing_file = wget_output_path(link)
192 | if os.path.exists(domain_dir) and existing_file:
193 | return {'output': existing_file, 'status': 'skipped'}
194 |
195 | CMD = [
196 | # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
197 | *'wget -N -E -np -x -H -k -K -S --restrict-file-names=unix'.split(' '),
198 | *(('-p',) if FETCH_WGET_REQUISITES else ()),
199 | *(('--user-agent="{}"'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
200 | *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate',))),
201 | link['url'],
202 | ]
203 | end = progress(timeout, prefix=' ')
204 | try:
205 | result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # index.html
206 | end()
207 | output = wget_output_path(link, look_in=domain_dir)
208 |
209 | # Check for common failure cases
210 | if result.returncode > 0:
211 | print(' got wget response code {}:'.format(result.returncode))
212 | if result.returncode != 8:
213 | print('\n'.join(' ' + line for line in (result.stderr or result.stdout).decode().rsplit('\n', 10)[-10:] if line.strip()))
214 | if b'403: Forbidden' in result.stderr:
215 | raise Exception('403 Forbidden (try changing WGET_USER_AGENT)')
216 | if b'404: Not Found' in result.stderr:
217 | raise Exception('404 Not Found')
218 | if b'ERROR 500: Internal Server Error' in result.stderr:
219 | raise Exception('500 Internal Server Error')
220 | if result.returncode == 4:
221 | raise Exception('Failed wget download')
222 | except Exception as e:
223 | end()
224 | print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
225 | print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
226 | output = e
227 |
228 | return {
229 | 'cmd': CMD,
230 | 'output': output,
231 | }
232 |
233 |
234 | @attach_result_to_link('pdf')
235 | def fetch_pdf(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
236 | """print PDF of site to file using chrome --headless"""
237 |
238 | if link['type'] in ('PDF', 'image'):
239 | return {'output': wget_output_path(link)}
240 |
241 | if os.path.exists(os.path.join(link_dir, 'output.pdf')):
242 | return {'output': 'output.pdf', 'status': 'skipped'}
243 |
244 | CMD = [
245 | *chrome_headless(user_data_dir=user_data_dir),
246 | '--print-to-pdf',
247 | link['url']
248 | ]
249 | end = progress(timeout, prefix=' ')
250 | try:
251 | result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.pdf
252 | end()
253 | if result.returncode:
254 | print(' ', (result.stderr or result.stdout).decode())
255 | raise Exception('Failed to print PDF')
256 | chmod_file('output.pdf', cwd=link_dir)
257 | output = 'output.pdf'
258 | except Exception as e:
259 | end()
260 | print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
261 | print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
262 | output = e
263 |
264 | return {
265 | 'cmd': CMD,
266 | 'output': output,
267 | }
268 |
269 | @attach_result_to_link('screenshot')
270 | def fetch_screenshot(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR, resolution=RESOLUTION):
271 | """take screenshot of site using chrome --headless"""
272 |
273 | if link['type'] in ('PDF', 'image'):
274 | return {'output': wget_output_path(link)}
275 |
276 | if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
277 | return {'output': 'screenshot.png', 'status': 'skipped'}
278 |
279 | CMD = [
280 | *chrome_headless(user_data_dir=user_data_dir),
281 | '--screenshot',
282 | '--window-size={}'.format(resolution),
283 | '--hide-scrollbars',
284 | # '--full-page', # TODO: make this actually work using ./bin/screenshot fullPage: true
285 | link['url'],
286 | ]
287 | end = progress(timeout, prefix=' ')
288 | try:
289 |         result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout + 1)  # screenshot.png
290 | end()
291 | if result.returncode:
292 | print(' ', (result.stderr or result.stdout).decode())
293 | raise Exception('Failed to take screenshot')
294 | chmod_file('screenshot.png', cwd=link_dir)
295 | output = 'screenshot.png'
296 | except Exception as e:
297 | end()
298 | print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
299 | print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
300 | output = e
301 |
302 | return {
303 | 'cmd': CMD,
304 | 'output': output,
305 | }
306 |
307 | @attach_result_to_link('dom')
308 | def fetch_dom(link_dir, link, timeout=TIMEOUT, user_data_dir=CHROME_USER_DATA_DIR):
309 |     """print HTML of site to file using chrome --dump-dom"""
310 |
311 | if link['type'] in ('PDF', 'image'):
312 | return {'output': wget_output_path(link)}
313 |
314 | output_path = os.path.join(link_dir, 'output.html')
315 |
316 | if os.path.exists(output_path):
317 | return {'output': 'output.html', 'status': 'skipped'}
318 |
319 | CMD = [
320 | *chrome_headless(user_data_dir=user_data_dir),
321 | '--dump-dom',
322 | link['url']
323 | ]
324 | end = progress(timeout, prefix=' ')
325 | try:
326 | with open(output_path, 'w+') as f:
327 | result = run(CMD, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout + 1) # output.html
328 | end()
329 | if result.returncode:
330 | print(' ', (result.stderr).decode())
331 | raise Exception('Failed to fetch DOM')
332 | chmod_file('output.html', cwd=link_dir)
333 | output = 'output.html'
334 | except Exception as e:
335 | end()
336 | print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
337 | print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
338 | output = e
339 |
340 | return {
341 | 'cmd': CMD,
342 | 'output': output,
343 | }
344 |
345 | @attach_result_to_link('archive_org')
346 | def archive_dot_org(link_dir, link, timeout=TIMEOUT):
347 | """submit site to archive.org for archiving via their service, save returned archive url"""
348 |
349 | path = os.path.join(link_dir, 'archive.org.txt')
350 | if os.path.exists(path):
351 | archive_org_url = open(path, 'r').read().strip()
352 | return {'output': archive_org_url, 'status': 'skipped'}
353 |
354 | submit_url = 'https://web.archive.org/save/{}'.format(link['url'].split('?', 1)[0])
355 |
356 | success = False
357 | CMD = ['curl', '-I', submit_url]
358 | end = progress(timeout, prefix=' ')
359 | try:
360 | result = run(CMD, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # archive.org.txt
361 | end()
362 |
363 | # Parse archive.org response headers
364 | headers = defaultdict(list)
365 |
366 | # lowercase all the header names and store in dict
367 | for header in result.stdout.splitlines():
368 | if b':' not in header or not header.strip():
369 | continue
370 | name, val = header.decode().split(':', 1)
371 | headers[name.lower().strip()].append(val.strip())
372 |
373 | # Get successful archive url in "content-location" header or any errors
374 | content_location = headers['content-location']
375 | errors = headers['x-archive-wayback-runtime-error']
376 |
377 | if content_location:
378 | saved_url = 'https://web.archive.org{}'.format(content_location[0])
379 | success = True
380 | elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
381 | output = submit_url
382 | # raise Exception('Archive.org denied by {}/robots.txt'.format(link['domain']))
383 | elif errors:
384 | raise Exception(', '.join(errors))
385 | else:
386 | raise Exception('Failed to find "content-location" URL header in Archive.org response.')
387 | except Exception as e:
388 | end()
389 | print(' Visit url to see output:', ' '.join(CMD))
390 | print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
391 | output = e
392 |
393 | if success:
394 | with open(os.path.join(link_dir, 'archive.org.txt'), 'w', encoding='utf-8') as f:
395 | f.write(saved_url)
396 | chmod_file('archive.org.txt', cwd=link_dir)
397 | output = saved_url
398 |
399 | return {
400 | 'cmd': CMD,
401 | 'output': output,
402 | }
403 |
404 | @attach_result_to_link('favicon')
405 | def fetch_favicon(link_dir, link, timeout=TIMEOUT):
406 | """download site favicon from google's favicon api"""
407 |
408 | if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
409 | return {'output': 'favicon.ico', 'status': 'skipped'}
410 |
411 | CMD = ['curl', 'https://www.google.com/s2/favicons?domain={domain}'.format(**link)]
412 | fout = open('{}/favicon.ico'.format(link_dir), 'w')
413 | end = progress(timeout, prefix=' ')
414 | try:
415 | run(CMD, stdout=fout, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # favicon.ico
416 | fout.close()
417 | end()
418 | chmod_file('favicon.ico', cwd=link_dir)
419 | output = 'favicon.ico'
420 | except Exception as e:
421 | fout.close()
422 | end()
423 | print(' Run to see full output:', ' '.join(CMD))
424 | print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
425 | output = e
426 |
427 | return {
428 | 'cmd': CMD,
429 | 'output': output,
430 | }
431 |
432 | # @attach_result_to_link('audio')
433 | # def fetch_audio(link_dir, link, timeout=TIMEOUT):
434 | # """Download audio rip using youtube-dl"""
435 |
436 | # if link['type'] not in ('soundcloud',)\
437 | # and 'audio' not in link['tags']:
438 | # return
439 |
440 | # path = os.path.join(link_dir, 'audio')
441 |
442 | # if not os.path.exists(path) or overwrite:
443 | # print(' - Downloading audio')
444 | # CMD = [
445 | # "youtube-dl -x --audio-format mp3 --audio-quality 0 -o '%(title)s.%(ext)s'",
446 | # link['url'],
447 | # ]
448 | # end = progress(timeout, prefix=' ')
449 | # try:
450 | # result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # audio/audio.mp3
451 | # end()
452 | # if result.returncode:
453 | # print(' ', result.stderr.decode())
454 | # raise Exception('Failed to download audio')
455 | # chmod_file('audio.mp3', cwd=link_dir)
456 | # return 'audio.mp3'
457 | # except Exception as e:
458 | # end()
459 | # print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
460 | # print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
461 | # raise
462 | # else:
463 | # print(' √ Skipping audio download')
464 |
465 | # @attach_result_to_link('video')
466 | # def fetch_video(link_dir, link, timeout=TIMEOUT):
467 | # """Download video rip using youtube-dl"""
468 |
469 | # if link['type'] not in ('youtube', 'youku', 'vimeo')\
470 | # and 'video' not in link['tags']:
471 | # return
472 |
473 | # path = os.path.join(link_dir, 'video')
474 |
475 | # if not os.path.exists(path) or overwrite:
476 | # print(' - Downloading video')
477 | # CMD = [
478 | # "youtube-dl -x --video-format mp4 --audio-quality 0 -o '%(title)s.%(ext)s'",
479 | # link['url'],
480 | # ]
481 | # end = progress(timeout, prefix=' ')
482 | # try:
483 | # result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=link_dir, timeout=timeout + 1) # video/movie.mp4
484 | # end()
485 | # if result.returncode:
486 | # print(' ', result.stderr.decode())
487 | # raise Exception('Failed to download video')
488 | # chmod_file('video.mp4', cwd=link_dir)
489 | # return 'video.mp4'
490 | # except Exception as e:
491 | # end()
492 | # print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
493 | # print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
494 | # raise
495 | # else:
496 | # print(' √ Skipping video download')
497 |
498 |
499 | def chrome_headless(binary=CHROME_BINARY, user_data_dir=CHROME_USER_DATA_DIR):
500 | args = [binary, '--headless'] # '--disable-gpu'
501 | default_profile = os.path.expanduser('~/Library/Application Support/Google/Chrome/Default')
502 | if user_data_dir:
503 | args.append('--user-data-dir={}'.format(user_data_dir))
504 | elif os.path.exists(default_profile):
505 | args.append('--user-data-dir={}'.format(default_profile))
506 | return args
507 |
--------------------------------------------------------------------------------
/archiver/util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sys
4 | import time
5 | import json
6 | import requests
7 |
8 | from datetime import datetime
9 | from subprocess import run, PIPE, DEVNULL
10 | from multiprocessing import Process
11 | from urllib.parse import quote
12 |
13 | from config import (
14 | IS_TTY,
15 | OUTPUT_PERMISSIONS,
16 | REPO_DIR,
17 | SOURCES_DIR,
18 | OUTPUT_DIR,
19 | ARCHIVE_DIR,
20 | TIMEOUT,
21 | TERM_WIDTH,
22 | SHOW_PROGRESS,
23 | ANSI,
24 | CHROME_BINARY,
25 | FETCH_WGET,
26 | FETCH_PDF,
27 | FETCH_SCREENSHOT,
28 | FETCH_DOM,
29 | FETCH_FAVICON,
30 | FETCH_AUDIO,
31 | FETCH_VIDEO,
32 | SUBMIT_ARCHIVE_DOT_ORG,
33 | )
34 |
35 | # URL helpers
36 | without_scheme = lambda url: url.replace('http://', '').replace('https://', '').replace('ftp://', '')
37 | without_query = lambda url: url.split('?', 1)[0]
38 | without_hash = lambda url: url.split('#', 1)[0]
39 | without_path = lambda url: url.split('/', 1)[0]
40 | domain = lambda url: without_hash(without_query(without_path(without_scheme(url))))
41 | base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
42 |
43 | short_ts = lambda ts: ts.split('.')[0]
44 |
45 |
46 | def check_dependencies():
47 | """Check that all necessary dependencies are installed, and have valid versions"""
48 |
49 | python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
50 | if python_vers < 3.5:
51 | print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
52 | print(' See https://github.com/pirate/bookmark-archiver#troubleshooting for help upgrading your Python installation.')
53 | raise SystemExit(1)
54 |
55 | if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
56 | if run(['which', CHROME_BINARY], stdout=DEVNULL).returncode:
57 | print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
58 | print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
59 | print(' See https://github.com/pirate/bookmark-archiver for help.')
60 | raise SystemExit(1)
61 |
62 | # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
63 | try:
64 | result = run([CHROME_BINARY, '--version'], stdout=PIPE)
65 | version_str = result.stdout.decode('utf-8')
66 | version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
67 | version = [l for l in version_lines if l.isdigit()][-1]
68 | if int(version) < 59:
69 | print(version_lines)
70 | print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
71 | print(' See https://github.com/pirate/bookmark-archiver for help.')
72 | raise SystemExit(1)
73 | except (IndexError, TypeError, OSError):
74 | print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
75 | print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format(CHROME_BINARY))
76 | print(' See https://github.com/pirate/bookmark-archiver for help.')
77 | raise SystemExit(1)
78 |
79 | if FETCH_WGET:
80 | if run(['which', 'wget'], stdout=DEVNULL).returncode or run(['wget', '--version'], stdout=DEVNULL).returncode:
81 | print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
82 | print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('wget'))
83 | print(' See https://github.com/pirate/bookmark-archiver for help.')
84 | raise SystemExit(1)
85 |
86 | if FETCH_FAVICON or SUBMIT_ARCHIVE_DOT_ORG:
87 | if run(['which', 'curl'], stdout=DEVNULL).returncode or run(['curl', '--version'], stdout=DEVNULL).returncode:
88 | print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
89 | print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('curl'))
90 | print(' See https://github.com/pirate/bookmark-archiver for help.')
91 | raise SystemExit(1)
92 |
93 | if FETCH_AUDIO or FETCH_VIDEO:
94 | if run(['which', 'youtube-dl'], stdout=DEVNULL).returncode or run(['youtube-dl', '--version'], stdout=DEVNULL).returncode:
95 | print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
96 | print(' Run ./setup.sh, then confirm it was installed with: {} --version'.format('youtube-dl'))
97 | print(' See https://github.com/pirate/bookmark-archiver for help.')
98 | raise SystemExit(1)
99 |
100 |
101 | def chmod_file(path, cwd='.', permissions=OUTPUT_PERMISSIONS, timeout=30):
102 |     """chmod -R <permissions> <cwd>/<path>"""
103 |
104 | if not os.path.exists(os.path.join(cwd, path)):
105 | raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
106 |
107 | chmod_result = run(['chmod', '-R', permissions, path], cwd=cwd, stdout=DEVNULL, stderr=PIPE, timeout=timeout)
108 | if chmod_result.returncode == 1:
109 | print(' ', chmod_result.stderr.decode())
110 | raise Exception('Failed to chmod {}/{}'.format(cwd, path))
111 |
112 |
113 | def progress(seconds=TIMEOUT, prefix=''):
114 | """Show a (subprocess-controlled) progress bar with a timeout,
115 | returns end() function to instantly finish the progress
116 | """
117 |
118 | if not SHOW_PROGRESS:
119 | return lambda: None
120 |
121 | chunk = '█' if sys.stdout.encoding == 'UTF-8' else '#'
122 | chunks = TERM_WIDTH - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
123 |
124 | def progress_bar(seconds=seconds, prefix=prefix):
125 | """show timer in the form of progress bar, with percentage and seconds remaining"""
126 | try:
127 | for s in range(seconds * chunks):
128 | progress = s / chunks / seconds * 100
129 | bar_width = round(progress/(100/chunks))
130 |
131 | # ████████████████████ 0.9% (1/60sec)
132 | sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
133 | prefix,
134 | ANSI['green'],
135 | (chunk * bar_width).ljust(chunks),
136 | ANSI['reset'],
137 | round(progress, 1),
138 | round(s/chunks),
139 | seconds,
140 | ))
141 | sys.stdout.flush()
142 | time.sleep(1 / chunks)
143 |
144 | # ██████████████████████████████████ 100.0% (60/60sec)
145 | sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
146 | prefix,
147 | ANSI['red'],
148 | chunk * chunks,
149 | ANSI['reset'],
150 | 100.0,
151 | seconds,
152 | seconds,
153 | ))
154 | sys.stdout.flush()
155 | except KeyboardInterrupt:
156 | print()
157 | pass
158 |
159 | p = Process(target=progress_bar)
160 | p.start()
161 |
162 | def end():
163 | """immediately finish progress and clear the progressbar line"""
164 | p.terminate()
165 | sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH), ANSI['reset'])) # clear whole terminal line
166 | sys.stdout.flush()
167 |
168 | return end
169 |
170 | def pretty_path(path):
171 | """convert paths like .../bookmark-archiver/archiver/../output/abc into output/abc"""
172 | return path.replace(REPO_DIR + '/', '')
173 |
174 |
175 | def download_url(url):
176 | """download a given url's content into downloads/domain.txt"""
177 |
178 | if not os.path.exists(SOURCES_DIR):
179 | os.makedirs(SOURCES_DIR)
180 |
181 | ts = str(datetime.now().timestamp()).split('.', 1)[0]
182 |
183 | source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
184 |
185 | print('[*] [{}] Downloading {} > {}'.format(
186 | datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
187 | url,
188 | pretty_path(source_path),
189 | ))
190 | end = progress(TIMEOUT, prefix=' ')
191 | try:
192 | downloaded_xml = requests.get(url).content.decode()
193 | end()
194 | except Exception as e:
195 | end()
196 | print('[!] Failed to download {}\n'.format(url))
197 | print(' ', e)
198 | raise SystemExit(1)
199 |
200 | with open(source_path, 'w', encoding='utf-8') as f:
201 | f.write(downloaded_xml)
202 |
203 | return source_path
204 |
205 | def str_between(string, start, end=None):
206 |     """(<abc>12345</def>, <abc>, </def>) -> 12345"""
207 |
208 | content = string.split(start, 1)[-1]
209 | if end is not None:
210 | content = content.rsplit(end, 1)[0]
211 |
212 | return content
213 |
214 | def get_link_type(link):
215 | """Certain types of links need to be handled specially, this figures out when that's the case"""
216 |
217 | if link['base_url'].endswith('.pdf'):
218 | return 'PDF'
219 |     elif link['base_url'].rsplit('.', 1)[-1] in ('pdf', 'png', 'jpg', 'jpeg', 'svg', 'bmp', 'gif', 'tiff', 'webp'):
220 | return 'image'
221 | elif 'wikipedia.org' in link['domain']:
222 | return 'wiki'
223 | elif 'youtube.com' in link['domain']:
224 | return 'youtube'
225 | elif 'soundcloud.com' in link['domain']:
226 | return 'soundcloud'
227 | elif 'youku.com' in link['domain']:
228 | return 'youku'
229 | elif 'vimeo.com' in link['domain']:
230 | return 'vimeo'
231 | return None
232 |
233 | def merge_links(a, b):
234 |     """deterministically merge two links, favoring longer field values over shorter,
235 | and "cleaner" values over worse ones.
236 | """
237 | longer = lambda key: a[key] if len(a[key]) > len(b[key]) else b[key]
238 | earlier = lambda key: a[key] if a[key] < b[key] else b[key]
239 |
240 | url = longer('url')
241 | longest_title = longer('title')
242 | cleanest_title = a['title'] if '://' not in a['title'] else b['title']
243 | link = {
244 | 'timestamp': earlier('timestamp'),
245 | 'url': url,
246 | 'domain': domain(url),
247 | 'base_url': base_url(url),
248 | 'tags': longer('tags'),
249 | 'title': longest_title if '://' not in longest_title else cleanest_title,
250 | 'sources': list(set(a.get('sources', []) + b.get('sources', []))),
251 | }
252 | link['type'] = get_link_type(link)
253 | return link
254 |
255 | def find_link(folder, links):
256 | """for a given archive folder, find the corresponding link object in links"""
257 | url = parse_url(folder)
258 | if url:
259 | for link in links:
260 | if (link['base_url'] in url) or (url in link['url']):
261 | return link
262 |
263 | timestamp = folder.split('.')[0]
264 | for link in links:
265 | if link['timestamp'].startswith(timestamp):
266 | if link['domain'] in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
267 | return link # careful now, this isn't safe for most ppl
268 | if link['domain'] in parse_url(folder):
269 | return link
270 | return None
271 |
272 |
273 | def parse_url(folder):
274 | """for a given archive folder, figure out what url it's for"""
275 | link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
276 | if os.path.exists(link_json):
277 | with open(link_json, 'r') as f:
278 | try:
279 | link_json = f.read().strip()
280 | if link_json:
281 | link = json.loads(link_json)
282 | return link['base_url']
283 | except ValueError:
284 | print('File contains invalid JSON: {}!'.format(link_json))
285 |
286 | archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
287 | if os.path.exists(archive_org_txt):
288 | with open(archive_org_txt, 'r') as f:
289 | original_link = f.read().strip().split('/http', 1)[-1]
290 | with_scheme = 'http{}'.format(original_link)
291 | return with_scheme
292 |
293 | return ''
294 |
295 | def manually_merge_folders(source, target):
296 | """prompt for user input to resolve a conflict between two archive folders"""
297 |
298 | if not IS_TTY:
299 | return
300 |
301 | fname = lambda path: path.split('/')[-1]
302 |
303 | print(' {} and {} have conflicting files, which do you want to keep?'.format(fname(source), fname(target)))
304 | print(' - [enter]: do nothing (keep both)')
305 | print(' - a: prefer files from {}'.format(source))
306 | print(' - b: prefer files from {}'.format(target))
307 | print(' - q: quit and resolve the conflict manually')
308 | try:
309 | answer = input('> ').strip().lower()
310 | except KeyboardInterrupt:
311 | answer = 'q'
312 |
313 | assert answer in ('', 'a', 'b', 'q'), 'Invalid choice.'
314 |
315 | if answer == 'q':
316 | print('\nJust run Bookmark Archiver again to pick up where you left off.')
317 | raise SystemExit(0)
318 | elif answer == '':
319 | return
320 |
321 | files_in_source = set(os.listdir(source))
322 | files_in_target = set(os.listdir(target))
323 | for file in files_in_source:
324 | if file in files_in_target:
325 | to_delete = target if answer == 'a' else source
326 | run(['rm', '-Rf', os.path.join(to_delete, file)])
327 | run(['mv', os.path.join(source, file), os.path.join(target, file)])
328 |
329 | if not set(os.listdir(source)):
330 | run(['rm', '-Rf', source])
331 |
332 | def fix_folder_path(archive_path, link_folder, link):
333 | """given a folder, merge it to the canonical 'correct' path for the given link object"""
334 | source = os.path.join(archive_path, link_folder)
335 | target = os.path.join(archive_path, link['timestamp'])
336 |
337 | url_in_folder = parse_url(source)
338 | if not (url_in_folder in link['base_url']
339 | or link['base_url'] in url_in_folder):
340 | raise ValueError('The link does not match the url for this folder.')
341 |
342 | if not os.path.exists(target):
343 | # target doesn't exist so nothing needs merging, simply move A to B
344 | run(['mv', source, target])
345 | else:
346 | # target folder exists, check for conflicting files and attempt manual merge
347 | files_in_source = set(os.listdir(source))
348 | files_in_target = set(os.listdir(target))
349 | conflicting_files = files_in_source & files_in_target
350 |
351 | if not conflicting_files:
352 | for file in files_in_source:
353 | run(['mv', os.path.join(source, file), os.path.join(target, file)])
354 |
355 | if os.path.exists(source):
356 | files_in_source = set(os.listdir(source))
357 | if files_in_source:
358 | manually_merge_folders(source, target)
359 | else:
360 | run(['rm', '-R', source])
361 |
362 |
363 | def migrate_data():
364 | # migrate old folder to new OUTPUT folder
365 | old_dir = os.path.join(REPO_DIR, 'html')
366 | if os.path.exists(old_dir):
367 | print('[!] WARNING: Moved old output folder "html" to new location: {}'.format(OUTPUT_DIR))
368 | run(['mv', old_dir, OUTPUT_DIR], timeout=10)
369 |
370 |
371 | def cleanup_archive(archive_path, links):
372 | """move any incorrectly named folders to their canonical locations"""
373 |
374 | # for each folder that exists, see if we can match it up with a known good link
375 | # if we can, then merge the two folders (TODO: if not, move it to lost & found)
376 |
377 | unmatched = []
378 | bad_folders = []
379 |
380 | if not os.path.exists(archive_path):
381 | return
382 |
383 | for folder in os.listdir(archive_path):
384 | try:
385 | files = os.listdir(os.path.join(archive_path, folder))
386 | except NotADirectoryError:
387 | continue
388 |
389 | if files:
390 | link = find_link(folder, links)
391 | if link is None:
392 | unmatched.append(folder)
393 | continue
394 |
395 | if folder != link['timestamp']:
396 | bad_folders.append((folder, link))
397 | else:
398 | # delete empty folders
399 | run(['rm', '-R', os.path.join(archive_path, folder)])
400 |
401 | if bad_folders and IS_TTY and input('[!] Cleanup archive? y/[n]: ') == 'y':
402 | print('[!] Fixing {} improperly named folders in archive...'.format(len(bad_folders)))
403 | for folder, link in bad_folders:
404 | fix_folder_path(archive_path, folder, link)
405 | elif bad_folders:
406 | print('[!] Warning! {} folders need to be merged, fix by running bookmark archiver.'.format(len(bad_folders)))
407 |
408 | if unmatched:
409 |         print('[!] Warning! {} unrecognized folders in output/archive/'.format(len(unmatched)))
410 | print(' '+ '\n '.join(unmatched))
411 |
412 |
413 | def wget_output_path(link, look_in=None):
414 | """calculate the path to the wgetted .html file, since wget may
415 | adjust some paths to be different than the base_url path.
416 |
417 | See docs on wget --adjust-extension (-E)
418 | """
419 |
420 | # if we have it stored, always prefer the actual output path to computed one
421 | if link.get('latest', {}).get('wget'):
422 | return link['latest']['wget']
423 |
424 | urlencode = lambda s: quote(s, encoding='utf-8', errors='replace')
425 |
426 | if link['type'] in ('PDF', 'image'):
427 | return urlencode(link['base_url'])
428 |
429 |     # Since the wget algorithm for -E (appending .html) is incredibly complex,
430 | # instead of trying to emulate it here, we just look in the output folder
431 | # to see what html file wget actually created as the output
432 | wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
433 | look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
434 |
435 | if look_in and os.path.exists(look_in):
436 | html_files = [
437 | f for f in os.listdir(look_in)
438 | if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
439 | ]
440 | if html_files:
441 | return urlencode(os.path.join(*wget_folder, html_files[0]))
442 |
443 | return None
444 |
445 | # If finding the actual output file didn't work, fall back to the buggy
446 | # implementation of the wget .html appending algorithm
447 | # split_url = link['url'].split('#', 1)
448 | # query = ('%3F' + link['url'].split('?', 1)[-1]) if '?' in link['url'] else ''
449 |
450 | # if re.search(".+\\.[Hh][Tt][Mm][Ll]?$", split_url[0], re.I | re.M):
451 | # # already ends in .html
452 | # return urlencode(link['base_url'])
453 | # else:
454 | # # .html needs to be appended
455 | # without_scheme = split_url[0].split('://', 1)[-1].split('?', 1)[0]
456 | # if without_scheme.endswith('/'):
457 | # if query:
458 | # return urlencode('#'.join([without_scheme + 'index.html' + query + '.html', *split_url[1:]]))
459 | # return urlencode('#'.join([without_scheme + 'index.html', *split_url[1:]]))
460 | # else:
461 | # if query:
462 | # return urlencode('#'.join([without_scheme + '/index.html' + query + '.html', *split_url[1:]]))
463 | # elif '/' in without_scheme:
464 | # return urlencode('#'.join([without_scheme + '.html', *split_url[1:]]))
465 | # return urlencode(link['base_url'] + '/index.html')
466 |
467 |
468 | def derived_link_info(link):
469 | """extend link info with the archive urls and other derived data"""
470 |
471 | link_info = {
472 | **link,
473 | 'date': datetime.fromtimestamp(float(link['timestamp'])).strftime('%Y-%m-%d %H:%M'),
474 | 'google_favicon_url': 'https://www.google.com/s2/favicons?domain={domain}'.format(**link),
475 | 'favicon_url': 'archive/{timestamp}/favicon.ico'.format(**link),
476 | 'files_url': 'archive/{timestamp}/index.html'.format(**link),
477 | 'archive_url': 'archive/{}/{}'.format(link['timestamp'], wget_output_path(link) or 'index.html'),
478 | 'pdf_link': 'archive/{timestamp}/output.pdf'.format(**link),
479 | 'screenshot_link': 'archive/{timestamp}/screenshot.png'.format(**link),
480 | 'dom_link': 'archive/{timestamp}/output.html'.format(**link),
481 | 'archive_org_url': 'https://web.archive.org/web/{base_url}'.format(**link),
482 | }
483 |
484 | # PDF and images are handled slightly differently
485 | # wget, screenshot, & pdf urls all point to the same file
486 | if link['type'] in ('PDF', 'image'):
487 | link_info.update({
488 | 'archive_url': 'archive/{timestamp}/{base_url}'.format(**link),
489 | 'pdf_link': 'archive/{timestamp}/{base_url}'.format(**link),
490 | 'screenshot_link': 'archive/{timestamp}/{base_url}'.format(**link),
491 | 'dom_link': 'archive/{timestamp}/{base_url}'.format(**link),
492 | 'title': '{title} ({type})'.format(**link),
493 | })
494 | return link_info
495 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bookmark Archiver [](https://github.com/pirate/bookmark-archiver) [](https://twitter.com/thesquashSH)
2 |
3 | "Your own personal Way-Back Machine"
4 |
5 | ▶️ [Quickstart](#quickstart) | [Details](#details) | [Configuration](#configuration) | [Manual Setup](#manual-setup) | [Troubleshooting](#troubleshooting) | [Demo](https://archive.sweeting.me) | [Changelog](#changelog) | [Donate](https://github.com/pirate/bookmark-archiver/blob/master/DONATE.md)
6 |
7 | ---
8 |
9 | Save an archived copy of all websites you bookmark (the actual *content* of each site, not just the list of bookmarks).
10 |
11 | Can import links from:
12 |
13 | - Browser history & bookmarks (Chrome, Firefox, Safari, IE, Opera)
14 | - Pocket
15 | - Pinboard
16 | - RSS or plain text lists
17 | - Shaarli, Delicious, Instapaper, Reddit Saved Posts, Wallabag, Unmark.it, and more!
18 |
19 | For each site, it outputs (configurable):
20 |
21 | - Browsable static HTML archive (wget)
22 | - PDF (Chrome headless)
23 | - Screenshot (Chrome headless)
24 | - DOM dump (Chrome headless)
25 | - Favicon
26 | - Submits URL to archive.org
27 | - Index summary pages: index.html & index.json
28 |
29 | The archiving is additive, so you can schedule `./archive` to run regularly and pull new links into the index.
30 | All the saved content is static and indexed with JSON files, so it lives forever & is easily parseable; it requires no always-running backend.
31 |
32 | [DEMO: archive.sweeting.me](https://archive.sweeting.me)
33 |
34 |
35 |
36 | ## Quickstart
37 |
38 | **1. Get your list of URLs:**
39 |
40 | Follow the links here to find instructions for exporting bookmarks from each service.
41 |
42 | - [Pocket](https://getpocket.com/export)
43 | - [Pinboard](https://pinboard.in/export/)
44 | - [Instapaper](https://www.instapaper.com/user/export)
45 | - [Reddit Saved Posts](https://github.com/csu/export-saved-reddit)
46 | - [Shaarli](http://shaarli.readthedocs.io/en/master/Backup,-restore,-import-and-export/#export-links-as)
47 | - [Unmark.it](http://help.unmark.it/import-export)
48 | - [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html)
49 | - [Chrome Bookmarks](https://support.google.com/chrome/answer/96816?hl=en)
50 | - [Firefox Bookmarks](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer)
51 | - [Safari Bookmarks](http://i.imgur.com/AtcvUZA.png)
52 | - [Opera Bookmarks](http://help.opera.com/Windows/12.10/en/importexport.html)
53 | - [Internet Explorer Bookmarks](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows)
54 | - Chrome History: `./bin/export-browser-history --chrome`
55 | - Firefox History: `./bin/export-browser-history --firefox`
56 | - Other File or URL: (e.g. RSS feed) pass as second argument in the next step
57 |
58 | (If any of these links are broken, please submit an issue and I'll fix it)
59 |
60 | **2. Create your archive:**
61 |
62 | ```bash
63 | git clone https://github.com/pirate/bookmark-archiver
64 | cd bookmark-archiver/
65 | ./setup # install all dependencies
66 |
67 | # add a list of links from a file
68 | ./archive ~/Downloads/bookmark_export.html # replace with the path to your export file or URL from step 1
69 |
70 | # OR add a list of links from remote URL
71 | ./archive "https://getpocket.com/users/yourusername/feed/all" # url to an RSS, html, or json links file
72 |
73 | # OR add all the links from your browser history
74 | ./bin/export-browser-history --chrome # works with --firefox as well, can take path to SQLite history db
75 | ./archive output/sources/chrome_history.json
76 |
77 | # OR just continue archiving the existing links in the index
78 | ./archive # at any point if you just want to continue archiving where you left off, without adding any new links
79 | ```
80 |
81 | **3. Done!**
82 |
83 | You can open `output/index.html` to view your archive. (favicons will appear next to each title once they have finished downloading)
84 |
85 | If you want to host your archive somewhere to share it with other people, see the [Publishing Your Archive](#publishing-your-archive) section below.
86 |
87 | **4. (Optional) Schedule it to run every day**
88 |
89 | You can import links from any local file path or feed url by changing the second argument to `archive.py`.
90 | Bookmark Archiver will ignore links that are imported multiple times; it will keep the earliest version that it's seen.
91 | This means you can add multiple cron jobs to pull links from several different feeds or files each day;
92 | it will keep the index up-to-date without duplicate links.
93 |
94 | This example archives a pocket RSS feed and an export file every 24 hours, and saves the output to a logfile.
95 | ```bash
96 | 0 0 * * * yourusername /opt/bookmark-archiver/archive https://getpocket.com/users/yourusername/feed/all > /var/log/bookmark_archiver_rss.log
97 | 0 0 * * * yourusername /opt/bookmark-archiver/archive /home/darth-vader/Desktop/bookmarks.html > /var/log/bookmark_archiver_firefox.log
98 | ```
99 | (Add the above lines to `/etc/crontab`)
100 |
101 | **Next Steps**
102 |
103 | If you have any trouble, see the [Troubleshooting](#troubleshooting) section at the bottom.
104 | If you'd like to customize options, see the [Configuration](#configuration) section.
105 |
106 | If you want something easier than running programs in the command-line, take a look at [Pocket Premium](https://getpocket.com/premium) (yay Mozilla!) and [Pinboard Pro](https://pinboard.in/upgrade/) (yay independent developer!). Both offer easy-to-use bookmark archiving with full-text-search and other features.
107 |
108 | ## Details
109 |
110 | `archive.py` is a script that takes a [Pocket-format](https://getpocket.com/export), [JSON-format](https://pinboard.in/export/), [Netscape-format](https://msdn.microsoft.com/en-us/library/aa753582(v=vs.85).aspx), or RSS-formatted list of links, and downloads a clone of each linked website to turn into a browsable archive that you can store locally or host online.
111 |
112 | The archiver produces an output folder `output/` containing an `index.html`, `index.json`, and archived copies of all the sites,
113 | organized by timestamp bookmarked. It's powered by [headless](https://developers.google.com/web/updates/2017/04/headless-chrome) Chromium and good ol' `wget`.
114 |
115 | For each site it saves:
116 |
117 | - wget of site, e.g. `en.wikipedia.org/wiki/Example.html` with .html appended if not present
118 | - `output.pdf` Printed PDF of site using headless chrome
119 | - `screenshot.png` 1440x900 screenshot of site using headless chrome
120 | - `output.html` DOM Dump of the HTML after rendering using headless chrome
121 | - `archive.org.txt` A link to the saved site on archive.org
122 | - `audio/` and `video/` for sites like youtube, soundcloud, etc. (using youtube-dl) (WIP)
123 | - `code/` clone of any repository for github, bitbucket, or gitlab links (WIP)
124 | - `index.json` JSON index containing link info and archive details
125 | - `index.html` HTML index containing link info and archive details (optional fancy or simple index)
126 |
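On disk, a fully archived link ends up laid out roughly like the sketch below (illustrative only: the timestamp and wget path are examples, and the exact files depend on which fetch methods are enabled):

```
output/
├── index.html                   # top-level archive index
├── index.json
└── archive/
    └── 1493350273/              # one folder per link, named by its bookmark timestamp
        ├── index.html           # per-link index
        ├── index.json           # link info + archive method history
        ├── favicon.ico
        ├── output.pdf
        ├── screenshot.png
        ├── output.html
        ├── archive.org.txt
        └── en.wikipedia.org/wiki/Example.html   # wget clone of the page
```
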
127 | Wget doesn't work on sites you need to be logged into, but Chrome headless does; see the [Configuration](#configuration) section for `CHROME_USER_DATA_DIR`.
128 |
129 | **Large Exports & Estimated Runtime:**
130 |
131 | I've found it takes about an hour to download 1000 articles, and they'll take up roughly 1GB.
132 | Those numbers are from running it single-threaded on my i5 machine with 50mbps down. YMMV.
133 |
134 | You can run it in parallel by using the `resume` feature, or by manually splitting export.html into multiple files:
135 | ```bash
136 | ./archive export.html 1498800000 & # second argument is timestamp to resume downloading from
137 | ./archive export.html 1498810000 &
138 | ./archive export.html 1498820000 &
139 | ./archive export.html 1498830000 &
140 | ```
141 | Users have reported successfully running it with 50k+ bookmarks (though it will take more RAM while running).
142 |
143 | ## Configuration
144 |
145 | You can tweak parameters via environment variables, or by editing `config.py` directly:
146 | ```bash
147 | env CHROME_BINARY=google-chrome-stable RESOLUTION=1440,900 FETCH_PDF=False ./archive ~/Downloads/bookmarks_export.html
148 | ```
149 |
150 | **Shell Options:**
151 | - colorize console output: `USE_COLOR` value: [`True`]/`False`
152 | - show progress bar: `SHOW_PROGRESS` value: [`True`]/`False`
153 | - archive permissions: `OUTPUT_PERMISSIONS` values: [`755`]/`644`/`...`
154 |
155 | **Dependency Options:**
156 | - path to Chrome: `CHROME_BINARY` values: [`chromium-browser`]/`/usr/local/bin/google-chrome`/`...`
157 | - path to wget: `WGET_BINARY` values: [`wget`]/`/usr/local/bin/wget`/`...`
158 |
159 | **Archive Options:**
160 | - maximum allowed download time per link: `TIMEOUT` values: [`60`]/`30`/`...`
161 | - archive methods (values: [`True`]/`False`):
162 | - fetch page with wget: `FETCH_WGET`
163 | - fetch images/css/js with wget: `FETCH_WGET_REQUISITES` (True is highly recommended)
164 | - print page as PDF: `FETCH_PDF`
165 | - fetch a screenshot of the page: `FETCH_SCREENSHOT`
166 | - fetch a DOM dump of the page: `FETCH_DOM`
167 | - fetch a favicon for the page: `FETCH_FAVICON`
168 | - submit the page to archive.org: `SUBMIT_ARCHIVE_DOT_ORG`
169 | - screenshot: `RESOLUTION` values: [`1440,900`]/`1024,768`/`...`
170 | - user agent: `WGET_USER_AGENT` values: [`Wget/1.19.1`]/`"Mozilla/5.0 ..."`/`...`
171 | - chrome profile: `CHROME_USER_DATA_DIR` values: [`~/Library/Application\ Support/Google/Chrome/Default`]/`/tmp/chrome-profile`/`...`
172 | To capture sites that require a user to be logged in, you must specify a path to a chrome profile (which loads the cookies needed for the user to be logged in). If you don't have an existing chrome profile, create one with `chromium-browser --disable-gpu --user-data-dir=/tmp/chrome-profile`, and log into the sites you need. Then set `CHROME_USER_DATA_DIR=/tmp/chrome-profile` to make Bookmark Archiver use that profile (see the example below).
173 |
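A quick sketch of that login-capture workflow (the `/tmp/chrome-profile` path is just an example):

```bash
# 1. Create a throwaway Chrome profile and log into the sites you want captured
chromium-browser --disable-gpu --user-data-dir=/tmp/chrome-profile

# 2. Point Bookmark Archiver at that profile when archiving
env CHROME_USER_DATA_DIR=/tmp/chrome-profile ./archive ~/Downloads/bookmarks_export.html
```
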
174 | (See defaults & more at the top of `config.py`)
175 |
176 | To tweak the look and feel of the generated HTML index, just edit the HTML files in `archiver/templates/`.
177 |
178 | The chrome/chromium dependency is _optional_ and only required for screenshots, PDF, and DOM dump output; it can be safely ignored if those three methods are disabled.
179 |
180 | ## Publishing Your Archive
181 |
182 | The archive produced by `./archive` is suitable for serving on any provider that can host static html (e.g. github pages!).
183 |
184 | You can also serve it from a home server or VPS by uploading the `output` folder to your web directory, e.g. `/var/www/bookmark-archiver`, and configuring your webserver.
185 |
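For example, one simple way to push the archive up to such a server (the hostname and destination path here are placeholders; adjust them to your setup):

```bash
rsync -av output/ you@your-server.example.com:/var/www/bookmark-archiver/
```
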
186 | Here's a sample nginx configuration that works to serve archive folders:
187 |
188 | ```nginx
189 | location / {
190 | alias /path/to/bookmark-archiver/output/;
191 | index index.html;
192 | autoindex on; # see directory listing upon clicking "The Files" links
193 | try_files $uri $uri/ =404;
194 | }
195 | ```
196 |
197 | Make sure you're not running any content as CGI or PHP; you only want to serve static files!
198 |
199 | URLs look like: `https://archive.example.com/archive/1493350273/en.wikipedia.org/wiki/Dining_philosophers_problem.html`
200 |
201 | **Security WARNING & Content Disclaimer**
202 |
203 | Re-hosting other people's content has security implications for any other sites sharing your hosting domain. Make sure you understand
204 | the dangers of hosting unknown archived CSS & JS files [on your shared domain](https://developer.mozilla.org/en-US/docs/Web/Security/Same-origin_policy).
205 | Due to the security risk of serving some malicious JS you archived by accident, it's best to put this on a domain or subdomain
206 | of its own to keep cookies separate and slightly mitigate [CSRF attacks](https://en.wikipedia.org/wiki/Cross-site_request_forgery) and other nastiness.
207 |
208 | You may also want to blacklist your archive in /robots.txt if you don't want to be publicly associated with all the links you archive via search engine results.
209 |
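For example, a minimal robots.txt that asks crawlers to skip everything (assuming your archive is served from the root of its own domain, as in the nginx config above, it will end up served at `/robots.txt`):

```bash
cat > output/robots.txt <<'EOF'
User-agent: *
Disallow: /
EOF
```
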
210 | Be aware that some sites you archive may not allow you to rehost their content publicly for copyright reasons;
211 | it's up to you to host responsibly and respond to takedown requests appropriately.
212 |
213 | Please modify the `FOOTER_INFO` config variable to add your contact info to the footer of your index.
214 |
215 | ## Info & Motivation
216 |
217 | This is basically an open-source version of [Pocket Premium](https://getpocket.com/premium) (which you should consider paying for!).
218 | I got tired of sites I saved going offline or changing their URLs, so I started
219 | archiving a copy of them locally now, similar to The Way-Back Machine provided
220 | by [archive.org](https://archive.org). Self hosting your own archive allows you to save
221 | PDFs & Screenshots of dynamic sites in addition to static html, something archive.org doesn't do.
222 |
223 | Now I can rest soundly knowing important articles and resources I like won't disappear off the internet.
224 |
225 | My published archive as an example: [archive.sweeting.me](https://archive.sweeting.me).
226 |
227 | ## Manual Setup
228 |
229 | If you don't like running random setup scripts off the internet (:+1:), you can follow these manual setup instructions.
230 |
231 | **1. Install dependencies:** `chromium >= 59`, `wget >= 1.16`, `python3 >= 3.5`  (`google-chrome >= v59` works fine as well)
232 |
233 | If you already have Google Chrome installed, or wish to use that instead of Chromium, follow the [Google Chrome Instructions](#google-chrome-instructions).
234 |
235 | ```bash
236 | # On Mac:
237 | brew cask install chromium # If you already have Google Chrome/Chromium in /Applications/, skip this command
238 | brew install wget python3
239 |
240 | echo -e '#!/bin/bash\n/Applications/Chromium.app/Contents/MacOS/Chromium "$@"' > /usr/local/bin/chromium-browser # see instructions for google-chrome below
241 | chmod +x /usr/local/bin/chromium-browser
242 | ```
243 |
244 | ```bash
245 | # On Ubuntu/Debian:
246 | apt install chromium-browser python3 wget
247 | ```
248 |
249 | ```bash
250 | # Check that everything worked:
251 | chromium-browser --version && which wget && which python3 && which curl && echo "[√] All dependencies installed."
252 | ```
253 |
254 | **2. Get your bookmark export file:**
255 |
256 | Follow the instruction links above in the "Quickstart" section to download your bookmarks export file.
257 |
258 | **3. Run the archive script:**
259 |
260 | 1. Clone this repo `git clone https://github.com/pirate/bookmark-archiver`
261 | 2. `cd bookmark-archiver/`
262 | 3. `./archive ~/Downloads/bookmarks_export.html`
263 |
264 | You may optionally specify a second argument, e.g. `./archive export.html 153242424324`, to resume the archive update at a specific timestamp.
265 |
266 | If you have any trouble, see the [Troubleshooting](#troubleshooting) section at the bottom.
267 |
268 | ### Google Chrome Instructions:
269 |
270 | I recommend Chromium instead of Google Chrome, since it's open source and doesn't send your data to Google.
271 | Chromium may have some issues rendering some sites though, so you're welcome to try Google Chrome instead.
272 | It's also easier to use Google Chrome if you already have it installed, rather than downloading Chromium separately.
273 |
274 | 1. Install & link google-chrome
275 | ```bash
276 | # On Mac:
277 | # If you already have Google Chrome in /Applications/, skip this brew command
278 | brew cask install google-chrome
279 | brew install wget python3
280 |
281 | echo -e '#!/bin/bash\n/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome "$@"' > /usr/local/bin/google-chrome
282 | chmod +x /usr/local/bin/google-chrome
283 | ```
284 |
285 | ```bash
286 | # On Linux:
287 | wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
288 | sudo sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list'
289 | sudo apt update; sudo apt install google-chrome-beta python3 wget
290 | ```
291 |
292 | 2. Set the environment variable `CHROME_BINARY` to `google-chrome` before running:
293 |
294 | ```bash
295 | env CHROME_BINARY=google-chrome ./archive ~/Downloads/bookmarks_export.html
296 | ```
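    |
    | To avoid typing the variable every time, you can optionally export it from your shell profile (a sketch; assumes bash, use `~/.zshrc` for zsh):
    |
    | ```bash
    | echo 'export CHROME_BINARY=google-chrome' >> ~/.bashrc
    | ```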
297 | If you're having any trouble trying to set up Google Chrome or Chromium, see the Troubleshooting section below.
298 |
299 | ## Troubleshooting
300 |
301 | ### Dependencies
302 |
303 | **Python:**
304 |
305 | On some Linux distributions the python3 package might not be recent enough (`>= 3.5` is required).
306 | If that's the case for you, install a more recent version manually, e.g. from a PPA:
307 | ```bash
308 | add-apt-repository ppa:fkrull/deadsnakes && apt update && apt install python3.6
309 | ```
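    |
    | To confirm which version you're running:
    |
    | ```bash
    | python3 --version   # should report 3.5 or higher
    | ```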
310 | If you still need help, [the official Python docs](https://docs.python.org/3.6/using/unix.html) are a good place to start.
311 |
312 | **Chromium/Google Chrome:**
313 |
314 | `archive.py` depends on being able to access a `chromium-browser`/`google-chrome` executable. The executable used
315 | defaults to `chromium-browser` but can be manually specified with the environment variable `CHROME_BINARY`:
316 |
317 | ```bash
318 | env CHROME_BINARY=/usr/local/bin/chromium-browser ./archive ~/Downloads/bookmarks_export.html
319 | ```
320 |
321 | 1. Test to make sure you have Chrome on your `$PATH` with:
322 |
323 | ```bash
324 | which chromium-browser || which google-chrome
325 | ```
326 | If no executable is displayed, follow the setup instructions to install and link one of them.
327 |
328 | 2. If a path is displayed, the next step is to check that it's runnable:
329 |
330 | ```bash
331 | chromium-browser --version || google-chrome --version
332 | ```
333 | If no version is displayed, try the setup instructions again, or confirm that you have permission to execute the Chrome binary.
334 |
335 | 3. If a version is displayed and it's `<59`, upgrade it:
336 |
337 | ```bash
338 | apt upgrade chromium-browser -y
339 | # OR
340 | brew cask upgrade chromium
341 | ```
342 |
343 | 4. If a version is displayed and it's `>=59`, make sure `archive.py` is running the right one:
344 |
345 | ```bash
346 | env CHROME_BINARY=/path/from/step/1/chromium-browser ./archive bookmarks_export.html # replace the path with the one you got from step 1
347 | ```
348 |
349 |
350 | **Wget & Curl:**
351 |
352 | If you're missing `wget` or `curl`, simply install them using `apt` or your package manager of choice.
353 | See the "Manual Setup" instructions for more details.
354 |
355 | If wget times out or randomly fails to download some sites that you have confirmed are online,
356 | upgrade wget to the most recent version with `brew upgrade wget` or `apt upgrade wget`. There is
357 | a bug in versions `<=1.19.1_1` that causes wget to fail on perfectly valid sites.
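    |
    | To check which versions you currently have:
    |
    | ```bash
    | wget --version | head -n1    # should be >= 1.16, and newer than 1.19.1_1 to avoid the bug above
    | curl --version | head -n1
    | ```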
358 |
359 | ### Archiving
360 |
361 | **No links parsed from export file:**
362 |
363 | Please open an [issue](https://github.com/pirate/bookmark-archiver/issues) with a description of where you got the export, and
364 | preferably your export file attached (you can redact the links). We'll fix the parser to support your format.
365 |
366 | **Lots of skipped sites:**
367 |
368 | If you've already run the archiver once, it won't re-download those sites on subsequent runs; it will only download new links.
369 | If you haven't already run it, make sure you have a working internet connection and that the parsed URLs look correct.
370 | You can check the `archive.py` output or `index.html` to see what links it's downloading.
371 |
372 | If you're still having issues, try deleting or moving the `output/archive` folder (back it up first!) and running `./archive` again.
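    |
    | For example (the paths below are the defaults used elsewhere in this README; adjust if yours differ):
    |
    | ```bash
    | mv output/archive output/archive.bak          # move the existing archive out of the way as a backup
    | ./archive ~/Downloads/bookmarks_export.html   # re-run, which re-downloads everything into a fresh archive
    | ```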
373 |
374 | **Lots of errors:**
375 |
376 | Make sure you have all the dependencies installed and that you're able to visit the links from your browser normally.
377 | Open an [issue](https://github.com/pirate/bookmark-archiver/issues) with a description of the errors if you're still having problems.
378 |
379 | **Lots of broken links from the index:**
380 |
381 | Not all sites can be effectively archived with every method; that's why it's best to use a combination of `wget`, PDFs, and screenshots.
382 | If it seems like more than 10-20% of sites in the archive are broken, open an [issue](https://github.com/pirate/bookmark-archiver/issues)
383 | with some of the URLs that failed to be archived and I'll investigate.
384 |
385 | ### Hosting the Archive
386 |
387 | If you're having issues trying to host the archive via nginx, make sure you already have nginx running with SSL.
388 | If you don't, google around; there are plenty of tutorials to help you get that set up. Open an [issue](https://github.com/pirate/bookmark-archiver/issues)
389 | if you have a problem with a particular nginx config.
390 |
391 | ## Roadmap
392 |
393 | If you feel like contributing a PR, some of these tasks are pretty easy. Feel free to open an issue if you need help getting started in any way!
394 |
395 | - download closed-captions text from youtube videos
396 | - body text extraction using [fathom](https://hacks.mozilla.org/2017/04/fathom-a-framework-for-understanding-web-pages/)
397 | - auto-tagging based on important extracted words
398 | - audio & video archiving with `youtube-dl`
399 | - full-text indexing with elasticsearch/elasticlunr/ag
400 | - video closed-caption downloading for full-text indexing video content
401 | - automatic text summaries of articles using a summarization library
402 | - feature image extraction
403 | - http support (from my https-only domain)
404 | - try wgetting dead sites from archive.org (https://github.com/hartator/wayback-machine-downloader)
405 | - live updating from pocket/pinboard
406 |
407 | It's possible to pull links via the Pocket API or public Pocket RSS feeds instead of downloading an HTML export.
408 | Once I write a script to do that, we can stick this in `cron` and have it auto-update on its own.
409 |
410 | For now, you just have to download `ril_export.html` and run `archive.py` each time it updates. The script
411 | runs quickly on subsequent runs because it only downloads new links that haven't been archived already.
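    |
    | In the meantime, a rough sketch of a crontab entry that re-runs the archiver nightly against a saved export (all paths are assumptions; the export file itself still has to be refreshed by hand for now):
    |
    | ```bash
    | # Run every night at 3:00 against the most recently downloaded export
    | 0 3 * * * cd /home/you/bookmark-archiver && ./archive /home/you/Downloads/ril_export.html
    | ```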
412 |
413 | ## Links
414 |
415 | **Similar Projects:**
416 | - [Memex by Worldbrain.io](https://github.com/WorldBrain/Memex) a browser extension that saves all your history and does full-text search
417 | - [Hypothes.is](https://web.hypothes.is/) a web/pdf/ebook annotation tool that also archives content
418 | - [Perkeep](https://perkeep.org/) "Perkeep lets you permanently keep your stuff, for life."
419 | - [Fetching.io](http://fetching.io/) A personal search engine/archiver that lets you search through all archived websites that you've bookmarked
420 | - [Shaarchiver](https://github.com/nodiscc/shaarchiver) very similar project that archives Firefox, Shaarli, or Delicious bookmarks and all linked media, generating a markdown/HTML index
421 | - [Webrecorder.io](https://webrecorder.io/) Save full browsing sessions and archive all the content
422 | - [Wallabag](https://wallabag.org) Save articles you read locally or on your phone
423 |
424 | **Discussions:**
425 | - [Hacker News Discussion](https://news.ycombinator.com/item?id=14272133)
426 | - [Reddit r/selfhosted Discussion](https://www.reddit.com/r/selfhosted/comments/69eoi3/pocket_stream_archive_your_own_personal_wayback/)
427 | - [Reddit r/datahoarder Discussion #1](https://www.reddit.com/r/DataHoarder/comments/69e6i9/archive_a_browseable_copy_of_your_saved_pocket/)
428 | - [Reddit r/datahoarder Discussion #2](https://www.reddit.com/r/DataHoarder/comments/6kepv6/bookmarkarchiver_now_supports_archiving_all_major/)
429 |
430 |
431 | **Tools/Other:**
432 | - https://github.com/ikreymer/webarchiveplayer#auto-load-warcs
433 | - [Sheetsee-Pocket](http://jlord.us/sheetsee-pocket/) project that provides a pretty auto-updating index of your Pocket links (without archiving them)
434 | - [Pocket -> IFTTT -> Dropbox](https://christopher.su/2013/saving-pocket-links-file-day-dropbox-ifttt-launchd/) Post by Christopher Su on his Pocket-saving IFTTT recipe
435 |
436 | ## Changelog
437 |
438 | - v0.1.0 released
439 | - support for browser history exporting added with `./bin/export-browser-history`
440 | - support for chrome `--dump-dom` to output full page HTML after JS executes
441 | - v0.0.3 released
442 | - support for chrome `--user-data-dir` to archive sites that need logins
443 | - fancy individual html & json indexes for each link
444 | - smartly append new links to existing index instead of overwriting
445 | - v0.0.2 released
446 | - proper HTML templating instead of format strings (thanks to https://github.com/bardisty!)
447 | - refactored into separate files, wip audio & video archiving
448 | - v0.0.1 released
449 | - Index links now work without nginx url rewrites, archive can now be hosted on github pages
450 | - added setup.sh script & docstrings & help commands
451 | - made Chromium the default instead of Google Chrome (yay free software)
452 | - added [env-variable](https://github.com/pirate/bookmark-archiver/pull/25) configuration (thanks to https://github.com/hannah98!)
453 | - renamed from **Pocket Archive Stream** -> **Bookmark Archiver**
454 | - added [Netscape-format](https://github.com/pirate/bookmark-archiver/pull/20) export support (thanks to https://github.com/ilvar!)
455 | - added [Pinboard-format](https://github.com/pirate/bookmark-archiver/pull/7) export support (thanks to https://github.com/sconeyard!)
456 | - front-page of HN, oops! apparently I have users to support now :grin:?
457 | - added Pocket-format export support
458 | - v0.0.0 released: created Pocket Archive Stream 2017/05/05
459 |
460 | ## Donations
461 |
462 | This project could really flourish with some more engineering effort, but unless it can support
463 | me financially, I'm unlikely to be able to take it much further alone. It's already pretty
464 | functional and robust, but it deserves to be taken to the next level by a few more
465 | talented engineers. If you or your foundation wants to sponsor this project long-term, contact
466 | me at bookmark-archiver@sweeting.me.
467 |
468 | [Grants / Donations](https://github.com/pirate/bookmark-archiver/blob/master/donate.md)
469 |
--------------------------------------------------------------------------------