├── trawler ├── adapters │ ├── __init__.py │ ├── renesas.py │ ├── usb-if.py │ ├── arm.py │ ├── xilinx.py │ └── zotero.py ├── common.py ├── net.py ├── config.py ├── db.py └── __init__.py ├── requirements.txt ├── .gitignore ├── __main__.py ├── trawler.py ├── setup.py ├── LICENSE └── README.md /trawler/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | tqdm 3 | selenium 4 | orator 5 | beautifulsoup4 6 | lxml 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | _build/ 4 | build/ 5 | .venv/ 6 | .webdriver_profile/ 7 | datasheets/ 8 | *.db 9 | -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | import sys 4 | from trawler import main 5 | 6 | if __name__ == '__main__': 7 | sys.exit(main()) 8 | -------------------------------------------------------------------------------- /trawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | import sys 5 | from pathlib import Path 6 | 7 | trawler_path = Path(sys.argv[0]).resolve() 8 | 9 | if (trawler_path.parent / 'trawler').is_dir(): 10 | sys.path.insert(0, str(trawler_path.parent)) 11 | 12 | from trawler import main 13 | 14 | if __name__ == '__main__': 15 | sys.exit(main()) 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | #!/usr/bin/env python3 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name = 'Trawler', 8 | version = '0.1', 9 | description = 'Bulk scrape and download datasheets from various vendors (insult)', 10 | license = 'BSD-3-Clause', 11 | python_requires = '~=3.7', 12 | install_requires = [ 13 | 'requests', 14 | 'tqdm', 15 | 'selenium', 16 | 'orator', 17 | 'beautifulsoup4', 18 | 'lxml' 19 | ], 20 | entry_points = { 21 | 'console_scripts': [ 22 | 'trawler = trawler:main', 23 | ] 24 | }, 25 | packages = find_packages(), 26 | project_urls = { 27 | 'Source Code': 'https://github.com/bad-alloc-heavy-industries/Trawler', 28 | 'Bug Tracker': 'https://github.com/bad-alloc-heavy-industries/Trawler/issues' 29 | } 30 | ) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 
12 | 13 | 3. Neither the name of ORGANIZATION nor the names of 14 | its contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /trawler/common.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | import sys 3 | import os 4 | import collections.abc 5 | from tqdm import tqdm 6 | 7 | __all__ = ( 8 | 'log', 'err', 'wrn', 'inf', 'dbg', 9 | 'tlog', 'terr', 'twrn', 'tinf', 'tdbg', 10 | 'fixup_title', 11 | 12 | 'EXECUTABLE_EXTS', 'ARCHIVE_EXTS' 13 | ) 14 | 15 | EXECUTABLE_EXTS = ( 16 | 'exe', 'msi', 17 | ) 18 | 19 | ARCHIVE_EXTS = ( 20 | 'tar', 'gz', 'bz2', 'xz', 'zip', 21 | 'lzma', '7z', 22 | ) 23 | 24 | def log(str, end = '\n', file = sys.stdout): 25 | print(f'\x1B[35m[*]\x1B[0m {str}', end = end, file = file) 26 | 27 | def err(str, end = '\n', file = sys.stderr): 28 | print(f'\x1B[31m[!]\x1B[0m {str}', end = end, file = file) 29 | 30 | def wrn(str, end = '\n', file = sys.stderr): 31 | print(f'\x1B[33m[~]\x1B[0m {str}', end = end, file = file) 32 | 33 | def inf(str, end = '\n', file = sys.stdout): 34 | print(f'\x1B[36m[~]\x1B[0m {str}', end = end, file = file) 35 | 36 | def dbg(str, end = '\n', file = sys.stdout): 37 | print(f'\x1B[34m[~]\x1B[0m {str}', end = end, file = file) 38 | 39 | def tlog(str, end = '\n', file = sys.stdout): 40 | tqdm.write(f'\x1B[35m[*]\x1B[0m {str}', end = end, file = file) 41 | 42 | def terr(str, end = '\n', file = sys.stderr): 43 | tqdm.write(f'\x1B[31m[!]\x1B[0m {str}', end = end, file = file) 44 | 45 | def twrn(str, end = '\n', file = sys.stderr): 46 | tqdm.write(f'\x1B[33m[~]\x1B[0m {str}', end = end, file = file) 47 | 48 | def tinf(str, end = '\n', file = sys.stdout): 49 | tqdm.write(f'\x1B[36m[~]\x1B[0m {str}', end = end, file = file) 50 | 51 | def tdbg(str, end = '\n', file = sys.stdout): 52 | tqdm.write(f'\x1B[34m[~]\x1B[0m {str}', end = end, file = file) 53 | 54 | def recusive_zip(d, u): # Recursively merge the mapping u into the dict d 55 | for k, v in u.items(): 56 | if isinstance(v, collections.abc.Mapping): 57 | d[k] = recusive_zip(d.get(k, {}), v) 58 | else: 59 | d[k] = v 60 | return d 61 | 62 | def fixup_title(s): 63 | if len(s) < 18: 64 | return f'{s}{" "*(18 - len(s))}' 65 | else: 66 | return f'{s[:15]}...'
67 | -------------------------------------------------------------------------------- /trawler/net.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | import sys 3 | import time 4 | import re 5 | 6 | from os import path 7 | 8 | import requests 9 | from requests import utils 10 | 11 | from . import config 12 | from .common import * 13 | 14 | 15 | __all__ = ( 16 | 'download_resource', 'get_content' 17 | ) 18 | 19 | def get_content(url, args): 20 | try_count = 0 21 | while try_count < args.retry: 22 | try: 23 | if args.delay > 0: 24 | time.sleep(args.delay) 25 | 26 | with requests.get( 27 | url, 28 | allow_redirects = True, 29 | timeout = args.timeout, 30 | headers = { 31 | 'User-Agent': args.user_agent 32 | } 33 | ) as r: 34 | return r.content 35 | 36 | except KeyboardInterrupt: # Let Ctrl-C abort rather than being swallowed by the retry loop 37 | raise 38 | except Exception: 39 | try_count += 1 40 | 41 | 42 | if try_count != 0: 43 | return False 44 | 45 | def download_resource(dl_dir, ds, args): 46 | tlog(f' => Downloading {ds.title} ({ds.id})') 47 | try_count = 0 48 | while try_count < args.retry: 49 | try: 50 | if args.delay > 0: 51 | time.sleep(args.delay) 52 | 53 | with requests.get( 54 | ds.url, 55 | allow_redirects = True, 56 | timeout = args.timeout, 57 | headers = { 58 | 'User-Agent': args.user_agent 59 | } 60 | ) as r: 61 | fname = '' 62 | if 'content-disposition' in r.headers.keys(): 63 | fname = re.findall('filename=(.*)', r.headers['content-disposition'])[0] 64 | else: 65 | fname = ds.url.split('/')[-1] 66 | 67 | ds.filename = fname 68 | if not ds.dl_location.endswith(fname): 69 | ds.dl_location = path.join(ds.dl_location, fname) 70 | ds.save() 71 | tlog(f' ==> Saving {fname} to {ds.dl_location}') 72 | with open(ds.dl_location, 'wb') as file: 73 | file.write(r.content) 74 | 75 | ds.downloaded = True 76 | ds.save() 77 | return True 78 | except KeyboardInterrupt: 79 | raise 80 | except Exception as e: 81 | twrn(f' => Download failed {e}, retrying') 82 | try_count += 1 83 | 84 | 85 | if try_count != 0: 86 | terr(f' => Unable to download datasheet with id {ds.id}') 87 | return False 88 | 89 | return True 90 | -------------------------------------------------------------------------------- /trawler/config.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | import os 3 | import enum 4 | from enum import Enum 5 | 6 | @enum.unique 7 | class WebdriverBackend(Enum): 8 | Chrome = enum.auto() 9 | FireFox = enum.auto() 10 | 11 | def __str__(self) -> str: 12 | return self.name 13 | 14 | @staticmethod 15 | def from_string(s: str): 16 | try: 17 | return WebdriverBackend[s] 18 | except KeyError: 19 | raise ValueError() 20 | 21 | # ==== Various constants ==== # 22 | TRAWLER_NAME = 'trawler' 23 | TRAWLER_VERSION = 'v0.2' 24 | TRAWLER_SCHEMA_VERSION = 1 25 | 26 | # ==== Directories ==== # 27 | XDG_CACHE_DIR = os.path.join(os.path.expanduser('~'), '.cache') if 'XDG_CACHE_HOME' not in os.environ else os.environ['XDG_CACHE_HOME'] 28 | XDG_DATA_HOME = os.path.join(os.path.expanduser('~'), '.local/share') if 'XDG_DATA_HOME' not in os.environ else os.environ['XDG_DATA_HOME'] 29 | XDG_DOCUMENTS_DIR = os.path.join(os.path.expanduser('~'), 'Documents') if 'XDG_DOCUMENTS_DIR' not in os.environ else os.environ['XDG_DOCUMENTS_DIR'] 30 | 31 | TRAWLER_DATA = os.path.join(XDG_DATA_HOME, TRAWLER_NAME) 32 | TRAWLER_CACHE = os.path.join(XDG_CACHE_DIR, TRAWLER_NAME) 33 | 34 | 
TRAWLER_USER_ADAPTERS = os.path.join(TRAWLER_DATA, 'adapters') 35 | TRAWLER_DL_DIR = os.path.join(XDG_DOCUMENTS_DIR, TRAWLER_NAME) 36 | 37 | # ==== Default Settings ==== # 38 | DEFAULT_OUTPUT_DIR = TRAWLER_DL_DIR 39 | DEFAULT_TIMEOUT = 120 40 | DEFAULT_RETRY_COUNT = 3 41 | DEFAULT_DOWNLOAD_DELAY = 3 42 | DEFAULT_PROFILE_DIRECTORY = os.path.join(TRAWLER_CACHE, '.webdriver_profile') 43 | DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.9999.9999 Safari/537.36' 44 | 45 | DEFAULT_DATABASE = os.path.join(TRAWLER_CACHE, 'datasheets.db') 46 | DEFAULT_WEBDRIVER = WebdriverBackend.Chrome 47 | DEFAULT_WD_HEADLESS = False 48 | DEFAULT_WD_HEADLESS_RES = (1920, 1080) 49 | 50 | # ==== Zotero Stuff ==== # 51 | ZOTERO_ROOT = os.path.join(os.path.expanduser('~'), 'Zotero') 52 | ZOTERO_DB = os.path.join(ZOTERO_ROOT, 'zotero.sqlite') 53 | 54 | # ==== Database Settings ==== # 55 | DATABASE = { 56 | 'default': 'trawler_cache', 57 | 'trawler_cache': { 58 | 'driver': 'sqlite', 59 | 'database': DEFAULT_DATABASE, 60 | }, 61 | 'zotero': { 62 | 'driver': 'sqlite', 63 | 'database': ZOTERO_DB, 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /trawler/adapters/renesas.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | renesas.py 4 | --------- 5 | 6 | This script is designed to scrape all of the datasheets from https://www.renesas.com/us/en/support/document-search 7 | 8 | """ 9 | import sys 10 | import time 11 | import enum 12 | import re 13 | 14 | from enum import Enum, Flag 15 | from os import getcwd, path, mkdir 16 | from datetime import datetime, timedelta 17 | 18 | import requests 19 | from requests import utils 20 | 21 | from tqdm import tqdm 22 | 23 | from ..common import * 24 | from ..net import download_resource, get_content 25 | from ..db import Datasheet, DatasheetTag, Scraper 26 | 27 | from bs4 import BeautifulSoup 28 | 29 | ADAPTER_NAME = 'renesas' 30 | ADAPTER_DESC = 'Renesas datasheet adapter' 31 | 32 | RENESAS_ROOT = 'https://www.renesas.com' 33 | RENESAS_DOCS_ROOT = f'{RENESAS_ROOT}/us/en/support/document-search' 34 | 35 | def collect_datasheets(args, dl_dir): 36 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 37 | log('Collecting datasheets... 
this might take a while') 38 | 39 | page_index = 0 40 | has_next_page = True 41 | datasheets = 0 42 | start_time = datetime.now() 43 | 44 | while has_next_page: 45 | inf(f' => On page index {page_index+1}, total so far {datasheets}') 46 | url = f'{RENESAS_DOCS_ROOT}?page={page_index}' 47 | 48 | content = get_content(url, args) 49 | soup = BeautifulSoup(content, 'lxml') 50 | try: 51 | doc_tab = soup.find('table').find('tbody') 52 | except Exception as e: 53 | if isinstance(e, KeyboardInterrupt): 54 | sys.quit() 55 | else: 56 | has_next_page = False 57 | break 58 | 59 | for doc in tqdm(doc_tab.find_all('tr')): 60 | tds = doc.find_all('td') 61 | is_locked = tds[0].find('span') is not None 62 | url = tds[1].find_all('a')[1]['href'] 63 | title = tds[1].find_all('a')[1].text 64 | 65 | if not is_locked: 66 | try: 67 | ds = Datasheet \ 68 | .where('title', '=', title) \ 69 | .where('scraper_id', '=', sc_id) \ 70 | .first_or_fail() 71 | except: 72 | ds = Datasheet() 73 | ds.scraper_id = sc_id 74 | ds.found = datetime.now() 75 | ds.src = f'{RENESAS_ROOT}/{url}' 76 | ds.url = f'{RENESAS_ROOT}/{url}' 77 | ds.dl_location = dl_dir 78 | ds.title = title 79 | 80 | ds.last_seen = datetime.now() 81 | ds.save() 82 | datasheets += 1 83 | 84 | page_index += 1 85 | 86 | end_time = datetime.now() 87 | 88 | log(f'Found {datasheets} datasheets in {end_time - start_time}') 89 | 90 | 91 | def parser_init(parser): 92 | renesas_options = parser.add_argument_group('Renesas adapter options') 93 | 94 | def adapter_main(args, driver, driver_options, dl_dir): 95 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 96 | 97 | if not args.skip_collect: 98 | collect_datasheets(args, dl_dir) 99 | 100 | if not args.skip_download: 101 | sheets = Datasheet.where('src', '!=', 'NULL').where('scraper_id', '=', sc_id).get() 102 | with tqdm( 103 | miniters = 1, total = len(sheets), 104 | ) as bar: 105 | for ds in sheets: 106 | bar.set_description(fixup_title(ds.title)) 107 | if download_resource(dl_dir, ds, args): 108 | bar.update(1) 109 | 110 | return 0 111 | -------------------------------------------------------------------------------- /trawler/adapters/usb-if.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | usb-if.py 4 | --------- 5 | 6 | This script is designed to scrape all of the datasheets from https://www.usb.org/documents 7 | 8 | """ 9 | import sys 10 | import time 11 | import enum 12 | import re 13 | 14 | from enum import Enum, Flag 15 | from os import getcwd, path, mkdir 16 | from datetime import datetime, timedelta 17 | 18 | import requests 19 | from requests import utils 20 | 21 | from tqdm import tqdm 22 | 23 | from ..common import * 24 | from ..net import download_resource, get_content 25 | from ..db import Datasheet, DatasheetTag, Scraper 26 | 27 | from bs4 import BeautifulSoup 28 | 29 | ADAPTER_NAME = 'usb-if' 30 | ADAPTER_DESC = 'USB-IF datasheet adapter' 31 | 32 | USB_DOCS_ROOT_URL = 'https://www.usb.org/documents' 33 | USB_DOCS_ALL = f'{USB_DOCS_ROOT_URL}?search=&items_per_page=All' 34 | 35 | 36 | def collect_datasheets(args, dl_dir): 37 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 38 | log('Collecting datasheets... 
this might take a while') 39 | datasheets = 0 40 | start_time = datetime.now() 41 | 42 | content = get_content(USB_DOCS_ALL, args) 43 | soup = BeautifulSoup(content, 'lxml') 44 | 45 | # We assume that there is only one table on this page, I know I know 46 | doc_tab = soup.find('table').find('tbody') 47 | for doc in tqdm(doc_tab.find_all('tr')): 48 | tds = doc.find_all('td') 49 | try: 50 | file = tds[0].find('span').find_all('span')[2].find('a')['href'] 51 | except KeyboardInterrupt: 52 | raise 53 | except Exception: 54 | continue 55 | 56 | 57 | title = tds[0].find_all('a')[1].text 58 | 59 | try: 60 | ds = Datasheet \ 61 | .where('title', '=', title) \ 62 | .where('scraper_id', '=', sc_id) \ 63 | .first_or_fail() 64 | except: 65 | ds = Datasheet() 66 | ds.scraper_id = sc_id 67 | ds.found = datetime.now() 68 | ds.src = file 69 | ds.url = file 70 | ds.dl_location = dl_dir 71 | ds.title = title 72 | 73 | ds.last_seen = datetime.now() 74 | ds.save() 75 | datasheets += 1 76 | 77 | # Try to pull out the "tags" 78 | t_tags = list(map(lambda t: t.strip(), tds[4].text.split(','))) + [ 79 | tds[1].text.strip() if tds[1] is not None else '', 80 | tds[2].text.strip() if tds[2] is not None else '' 81 | ] 82 | 83 | for tag in t_tags: 84 | if tag != '': 85 | try: 86 | te = DatasheetTag \ 87 | .where('scraper_id', '=', sc_id) \ 88 | .where('name', '=', tag) \ 89 | .first_or_fail() 90 | except: 91 | te = DatasheetTag() 92 | te.scraper_id = sc_id 93 | te.name = tag 94 | te.save() 95 | 96 | ds.add_tag(te) 97 | 98 | end_time = datetime.now() 99 | log(f'Found {datasheets} datasheets in {end_time - start_time}') 100 | 101 | def parser_init(parser): 102 | usbif_options = parser.add_argument_group('USB-IF adapter options') 103 | 104 | 105 | def adapter_main(args, driver, driver_options, dl_dir): 106 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 107 | if not args.skip_collect: 108 | collect_datasheets(args, dl_dir) 109 | 110 | if not args.skip_download: 111 | sheets = Datasheet.where('src', '!=', 'NULL').where('scraper_id', '=', sc_id).get() 112 | with tqdm( 113 | miniters = 1, total = len(sheets), 114 | ) as bar: 115 | for ds in sheets: 116 | bar.set_description(fixup_title(ds.title)) 117 | if download_resource(dl_dir, ds, args): 118 | bar.update(1) 119 | 120 | return 0 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trawler - Bulk download of datasheets 2 | 3 | Trawler is an application that facilitates bulk downloads of datasheets and other resources from vendor websites. 4 | 5 | It exists because vendors rarely offer bulk downloads of their content, or anything nice like an rsync mirror, so we have to resort to scraping. 6 | 7 | Trawler is built around Selenium, which lets it pretend to be a regular user and let AJAX and other bits of JavaScript run, so it can interact with the vendor website and collect the datasheets. 8 | 9 | 10 | ## Adapters 11 | 12 | Trawler has two types of adapters: source adapters and meta adapters. The source adapters are responsible for collecting and downloading datasheets, whereas the meta adapters interact with the cache data Trawler keeps. A minimal sketch of an adapter module is shown below. 
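Adapters are ordinary Python modules that Trawler discovers in the built-in `trawler/adapters/` package at startup. Based on the built-in adapters, each module exposes an `ADAPTER_NAME`, an `ADAPTER_DESC`, a `parser_init(parser)` hook for its own command line options, and an `adapter_main(...)` entry point; meta adapters additionally set `META_ADAPTER`, and a module can define `DONT_LOAD` to be skipped. The sketch below is illustrative only: the `example` vendor, its URL, and the single hard-coded datasheet are placeholders, not a real adapter shipped with Trawler.

```
# SPDX-License-Identifier: BSD-3-Clause
# Minimal source adapter sketch -- 'example' and EXAMPLE_ROOT are placeholders.
from datetime import datetime

from ..common import log
from ..net import download_resource
from ..db import Datasheet, Scraper

ADAPTER_NAME = 'example'
ADAPTER_DESC = 'Example datasheet adapter'

EXAMPLE_ROOT = 'https://vendor.example/datasheets'

def parser_init(parser):
	# Register any adapter-specific command line options here.
	parser.add_argument_group('Example adapter options')

def adapter_main(args, driver, driver_options, dl_dir):
	sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id

	if not args.skip_collect:
		log('Collecting datasheets...')
		# A real adapter would scrape the vendor site here; this records a single entry.
		try:
			ds = Datasheet \
				.where('title', '=', 'Example datasheet') \
				.where('scraper_id', '=', sc_id) \
				.first_or_fail()
		except:
			ds = Datasheet()
			ds.scraper_id = sc_id
			ds.found = datetime.now()
			ds.title = 'Example datasheet'
			ds.src = f'{EXAMPLE_ROOT}/example.pdf'
			ds.url = f'{EXAMPLE_ROOT}/example.pdf'
			ds.dl_location = dl_dir

		ds.last_seen = datetime.now()
		ds.save()

	if not args.skip_download:
		for ds in Datasheet.where('scraper_id', '=', sc_id).get():
			download_resource(dl_dir, ds, args)

	return 0
```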
13 | 14 | ### Source Adapters 15 | 16 | The following source adapters are included with Trawler: 17 | * `arm` - Download documentation from `https://developer.arm.com/documentation` 18 | * `xilinx` - Download the documentation from the Xilinx DocNav service 19 | * `usb-if` - Download the documentation from `https://www.usb.org/documents` 20 | * `renesas` - Download the documentation from `https://www.renesas.com/us/en/support/document-search` 21 | 22 | The following source adapters are planned: 23 | * `ti` - Download the documentation from Texas Instruments. 24 | * `st` - Download the documentation from ST. 25 | * `microchip` - Download the documentation from Microchip. 26 | * `micron` - Download the documentation from Micron. 27 | 28 | If the adapter you want is not in this list, feel free to open an issue or contribute it yourself! 29 | 30 | ### Meta Adapters 31 | 32 | The following meta adapters are implemented currently: 33 | * `zotero` - Integration and sync with a local [Zotero](https://www.zotero.org/) install 34 | 35 | The following meta adapters are planned: 36 | * `query` - Very trivial datasheet lookup by title / tag 37 | * `export` - Export cache information in various formats 38 | 39 | ## Usage 40 | 41 | In its simplest form, Trawler is used by invoking the adapter for the datasheet source you want, like so: 42 | ``` 43 | trawler arm 44 | ``` 45 | 46 | This causes Trawler to initialize everything it needs and then automatically start the entire acquisition process. This will take a long time, and exactly how long depends heavily on the adapter. 47 | 48 | Each adapter has its own settings and configuration in addition to the global settings; to see the settings an adapter supports, pass `--help` to it: 49 | ``` 50 | trawler arm --help 51 | ``` 52 | 53 | To list the adapters that Trawler knows about, pass `--help` to Trawler by itself and it will list all of the adapters it has: 54 | ``` 55 | trawler --help 56 | ``` 57 | 58 | ### Global Settings 59 | 60 | Trawler supports the following settings globally: 61 | * `--output / -o` - Specify the output directory for Trawler to use. 62 | * `--timeout / -t` - Specify the timeout duration in seconds for network operations. 63 | * `--retry / -r` - Specify the number of times to retry network operations. 64 | * `--delay / -d` - Specify the delay in seconds for network operations. 65 | * `--cache-database / -c` - Specify the location and name of the datasheet cache database Trawler uses. 66 | * `--skip-collect / -C` - Skip the datasheet collection stage for the adapter. 67 | * `--skip-extract / -E` - Skip the extraction stage for the adapter. 68 | * `--skip-download / -D` - Skip the download stage for the adapter. 69 | * `--user-agent / -A` - Specify the user-agent to use when downloading files. 70 | 71 | The following settings are used for the WebDriver, and therefore only affect the adapters / stages that use it (an example invocation follows this list): 72 | * `--profile-directory / -p` - Specify the WebDriver profile directory. 73 | * `--webdriver / -w` - Specify the WebDriver to use. 74 | * `--headless / -H` - Tell the WebDriver to run in headless mode. 75 | * `--headless-width / -X` - Specify the virtual width of the WebDriver instance. 76 | * `--headless-height / -Y` - Specify the virtual height of the WebDriver instance. 
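For example, the following runs the `arm` adapter with a custom output directory, a five second download delay, and a headless Chrome WebDriver sized to the 1920x1080 viewport mentioned in the notes further down (the output path here is just an illustration):
```
trawler --output ~/datasheets --delay 5 --webdriver Chrome --headless --headless-width 1920 --headless-height 1080 arm
```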
77 | 78 | ### ARM Adapter Settings 79 | 80 | The following settings are only applicable to the ARM adapter: 81 | * `--arm-document-type / -A` - Specify the types of documents to collect and download. 82 | 83 | ### Xilinx Adapter Settings 84 | 85 | The following settings are only applicable to the Xilinx adapter: 86 | * `--dont-group / -G` - Don't group Datasheets into categories and groups when downloading. 87 | * `--collect-web-only / -W` - Allow Trawler to collect the web-only content. 88 | 89 | ### Zotero Adapter Settings 90 | 91 | The following settings are only applicable to the Zotero meta adapter: 92 | * `--zotero-db-location` - Specify the location of the Zotero database if it's not the default. 93 | 94 | 95 | The Zotero has the following actions it can take: 96 | * `sync` - Sync the Trawler cache with the Zotero database. 97 | 98 | #### Zotero Sync Settings 99 | 100 | The Zotero sync action has the following settings: 101 | * `--backup` - Backup the Zotero database before performing the sync. 102 | * `--backup-dir` - Set the backup directory for the Zotero database. 103 | 104 | ## Installing 105 | 106 | With pip, all the needed dependencies for Trawler should be pulled in automatically 107 | 108 | To install the current development snapshot, simply run: 109 | ``` 110 | pip3 install --user 'git+https://github.com/bad-alloc-heavy-industries/Trawler.git#egg=Trawler' 111 | ``` 112 | Or to install a local development copy: 113 | ``` 114 | git clone https://github.com/bad-alloc-heavy-industries/Trawler.git 115 | cd Trawler 116 | pip3 install --user --editable '.' 117 | ``` 118 | 119 | **NOTE:** The adapters that need a WebDriver will only work if you have one installed for selenium to use! 120 | 121 | ## Important Notes 122 | 123 | * Some adapters won't work if the WebDriver viewport is smaller than 1920x1080, you can possibly fix this by running the WebDriver headless with the correct virtual size if the WebDriver supports it. 124 | 125 | ## License 126 | Trawler is licensed under the [BSD 3-Clause](https://spdx.org/licenses/BSD-3-Clause.html) license, the full text of which can be found in the [`LICENSE`](LICENSE) file. 127 | -------------------------------------------------------------------------------- /trawler/db.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | from orator import Model, orm 3 | from orator.migrations import Migrator, Migration, DatabaseMigrationRepository 4 | 5 | from . 
import config 6 | from .common import * 7 | 8 | # ==== Models ==== # 9 | 10 | class CacheMetadata(Model): 11 | __connection__ = 'trawler_cache' 12 | __fillable__ = ['name', 'value'] 13 | 14 | class DatasheetTag(Model): 15 | __fillable__ = ['scraper_id', 'name'] 16 | __connection__ = 'trawler_cache' 17 | 18 | @orm.belongs_to_many('tag_links', 'tag_id', 'datasheet_id') 19 | def datasheets(self): 20 | return Datasheet 21 | 22 | class Datasheet(Model): 23 | __fillable__ = [ 24 | 'scraper_id', 'title', 'url', 'filename', 25 | 'dl_location', 'version', 'found', 'last_seen', 26 | 'downloaded', 'src' 27 | ] 28 | __connection__ = 'trawler_cache' 29 | 30 | def has_tag(self, tag): 31 | return self.tags().where('tag_id', '=', tag.id).exists() 32 | 33 | def add_tag(self, tag): 34 | if not self.has_tag(tag): 35 | self.tags().attach(tag) 36 | 37 | def remove_tag(self, tag): 38 | if self.has_tag(tag): 39 | self.tags().detach(tag) 40 | 41 | @orm.belongs_to_many('tag_links', 'datasheet_id', 'tag_id') 42 | def tags(self): 43 | return DatasheetTag 44 | 45 | class Scraper(Model): 46 | __fillable__ = ['name', 'last_run'] 47 | __connection__ = 'trawler_cache' 48 | 49 | @orm.has_many 50 | def datasheets(self): 51 | return Datasheet 52 | 53 | # ==== Migration ==== # 54 | 55 | class CreateDatasheetTable(Migration): 56 | def up(self): 57 | with self.schema.create('datasheets') as table: 58 | table.increments('id').unique() 59 | table.integer('scraper_id').unsigned() 60 | table.foreign('scraper_id').references('id').on('scrapers').on_delete('cascade') 61 | table.string('title').nullable() 62 | table.string('url').nullable() 63 | table.string('src').nullable() 64 | table.string('filename').nullable() 65 | table.string('dl_location').nullable() 66 | table.string('version').nullable() 67 | table.boolean('downloaded').nullable() 68 | table.datetime('last_seen').nullable() 69 | table.datetime('found') 70 | table.timestamps() 71 | 72 | def update(self, from_version): 73 | with self.schema.table('datasheets') as table: 74 | pass 75 | 76 | def down(self): 77 | self.schema.drop('datasheets') 78 | 79 | class CreateScraperTable(Migration): 80 | def up(self): 81 | with self.schema.create('scrapers') as table: 82 | table.increments('id').unique() 83 | table.string('name') 84 | table.datetime('last_run').nullable() 85 | table.boolean('meta').nullable() 86 | table.timestamps() 87 | 88 | def update(self, from_version): 89 | with self.schema.table('scrapers') as table: 90 | # All the fields added in v0.2 91 | if from_version < 1: 92 | table.boolean('meta').nullable() 93 | 94 | def down(self): 95 | self.schema.drop('scrapers') 96 | 97 | class CreateDatasheetTagTable(Migration): 98 | def up(self): 99 | with self.schema.create('datasheet_tags') as table: 100 | table.increments('id').unique() 101 | table.integer('scraper_id').unsigned() 102 | table.foreign('scraper_id').references('id').on('scrapers').on_delete('cascade') 103 | table.string('name') 104 | table.timestamps() 105 | 106 | def update(self, from_version): 107 | with self.schema.table('datasheet_tags') as table: 108 | pass 109 | 110 | def down(self): 111 | self.schema.drop('datasheet_tags') 112 | 113 | class CreateTagLinkTable(Migration): 114 | def up(self): 115 | with self.schema.create('tag_links') as table: 116 | table.increments('id').unique() 117 | table.integer('tag_id').unsigned() 118 | table.foreign('tag_id').references('id').on('datasheet_tags').on_delete('cascade') 119 | table.integer('datasheet_id').unsigned() 120 | 
table.foreign('datasheet_id').references('id').on('datasheets').on_delete('cascade') 121 | table.timestamps() 122 | 123 | def update(self, from_version): 124 | with self.schema.table('tag_links') as table: 125 | pass 126 | 127 | def down(self): 128 | self.schema.drop('tag_links') 129 | 130 | class CreateCacheMetadataTable(Migration): 131 | def up(self): 132 | with self.schema.create('cache_metadata') as table: 133 | table.increments('id').unique() 134 | table.string('name') 135 | table.string('value').nullable() 136 | table.timestamps() 137 | 138 | def update(self, from_version): 139 | with self.schema.table('cache_metadata') as table: 140 | pass 141 | 142 | def down(self): 143 | self.schema.drop('cache_metadata') 144 | 145 | 146 | _MIGRATIONS = ( 147 | CreateDatasheetTable, 148 | CreateScraperTable, 149 | CreateDatasheetTagTable, 150 | CreateTagLinkTable, 151 | CreateCacheMetadataTable, 152 | ) 153 | 154 | def check_schema(dbm): 155 | # If we don't have the cache metadata table, we absolutely need to run migrations 156 | try: 157 | dbm.table('cache_metadata').exists() 158 | except: 159 | wrn('Trawler schema is massively out of date, updating') 160 | run_migration(dbm, CreateCacheMetadataTable) 161 | finally: 162 | try: 163 | sv = CacheMetadata.where('name', '=', 'schema_version').first_or_fail() 164 | except: 165 | sv = CacheMetadata() 166 | sv.name = 'schema_version' 167 | sv.value = 0 # We plan to run an upgrade anyway so 168 | sv.save() 169 | 170 | if int(sv.value) < config.TRAWLER_SCHEMA_VERSION: 171 | # The schema version is out of date, run the update 172 | run_update(dbm, int(sv.value)) 173 | 174 | # The update ran, save the new schema version 175 | sv.value = config.TRAWLER_SCHEMA_VERSION 176 | sv.save() 177 | 178 | 179 | 180 | def run_migration(dbm, m): 181 | dbm_repo = DatabaseMigrationRepository(dbm, 'migrations') 182 | 183 | mi = m() 184 | mi.set_connection(dbm_repo.get_connection()) 185 | 186 | if mi.transactional: 187 | with mi.db.transaction(): 188 | mi.up() 189 | else: 190 | mi.up() 191 | 192 | 193 | def run_migrations(dbm): 194 | dbm_repo = DatabaseMigrationRepository(dbm, 'migrations') 195 | 196 | for m in _MIGRATIONS: 197 | mi = m() 198 | mi.set_connection(dbm_repo.get_connection()) 199 | 200 | if mi.transactional: 201 | with mi.db.transaction(): 202 | mi.up() 203 | else: 204 | mi.up() 205 | 206 | def run_update(dbm, from_version): 207 | dbm_repo = DatabaseMigrationRepository(dbm, 'migrations') 208 | inf(f'Updating Trawler schema from v{from_version} to v{config.TRAWLER_SCHEMA_VERSION}') 209 | 210 | for m in _MIGRATIONS: 211 | mi = m() 212 | mi.set_connection(dbm_repo.get_connection()) 213 | 214 | if mi.transactional: 215 | with mi.db.transaction(): 216 | mi.update(from_version) 217 | else: 218 | mi.update(from_version) 219 | -------------------------------------------------------------------------------- /trawler/adapters/arm.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | arm.py 4 | --------- 5 | 6 | This script is designed to scrape all of the datasheets from https://developer.arm.com/documentation 7 | 8 | """ 9 | import sys 10 | import time 11 | import enum 12 | 13 | from enum import Enum, Flag 14 | from os import getcwd, path, mkdir 15 | from datetime import datetime, timedelta 16 | 17 | import requests 18 | from requests import utils 19 | 20 | from tqdm import tqdm 21 | from selenium import webdriver 22 | 23 | from ..common import * 24 | from ..net import download_resource 
25 | from ..db import Datasheet, DatasheetTag, Scraper 26 | 27 | @enum.unique 28 | class DocumentType(Enum): 29 | ReferenceManual = enum.auto() 30 | Architecture = enum.auto() 31 | Guide = enum.auto() 32 | ApplicationNote = enum.auto() 33 | KnowledgeBaseArticle = enum.auto() 34 | ReleaseNote = enum.auto() 35 | SoftwareErrata = enum.auto() 36 | 37 | def __str__(self) -> str: 38 | return self.name 39 | 40 | @staticmethod 41 | def from_string(s): 42 | try: 43 | return DocumentType[s] 44 | except KeyError: 45 | raise ValueError() 46 | 47 | def filter_name(self): 48 | if self.value == DocumentType.ReferenceManual.value: 49 | return 'Technical%20Reference%20Manual' 50 | if self.value == DocumentType.Architecture.value: 51 | return 'Architecture%20Document' 52 | if self.value == DocumentType.Guide.value: 53 | return 'Guide' 54 | if self.value == DocumentType.ApplicationNote.value: 55 | return 'Application%20Note' 56 | if self.value == DocumentType.KnowledgeBaseArticle.value: 57 | return 'Knowledge%20Base%20Article' 58 | if self.value == DocumentType.ReleaseNote.value: 59 | return 'Release%20Note' 60 | if self.value == DocumentType.SoftwareErrata.value: 61 | return 'Software%20Developer%20Errata%20Notice' 62 | 63 | 64 | ADAPTER_NAME = 'arm' 65 | ADAPTER_DESC = 'arm datasheet adapter' 66 | 67 | ARM_DOCS_ROOT_URL = 'https://developer.arm.com/documentation' 68 | 69 | 70 | def extract_datasheet(driver, ds): 71 | tlog(f' => Extracting datasheet {ds.id} from {ds.src}') 72 | driver.get(ds.src) 73 | time.sleep(3.5) 74 | try: 75 | driver.find_element_by_xpath('/html/body/div/div/div[2]/main/div/div[1]/div/div/div[1]/div/button').click() 76 | dl_loc = driver.find_element_by_xpath('/html/body/div/div/div[2]/main/div/div[1]/div/div/div[1]/div[2]/a') 77 | except Exception as e: 78 | if isinstance(e, KeyboardInterrupt): 79 | sys.quit() 80 | else: 81 | terr(f' => Error: Unable to extract datasheet with id {ds.id}') 82 | return False 83 | 84 | ds.url = dl_loc.get_attribute('href') 85 | ds.save() 86 | return True 87 | 88 | 89 | 90 | def collect_datasheets(driver, doc_types): 91 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 92 | 93 | dt_string = ','.join(map(lambda dt: (DocumentType.from_string(dt)).filter_name(), doc_types)) 94 | doc_url = f'sort=relevancy&f:@navigationhierarchiescontenttype=[{dt_string}]' 95 | has_next_page = True 96 | page_index = 0 97 | datasheets = 0 98 | start_time = datetime.now() 99 | 100 | log('Collecting datasheets... this might take a while') 101 | while has_next_page: 102 | inf(f' => On page index {(page_index//10)+1}, total so far {datasheets}') 103 | # We need to wait because ajax and frames and just bad. 
104 | driver.get(f'{ARM_DOCS_ROOT_URL}/#first={page_index}&{doc_url}') 105 | time.sleep(3) 106 | 107 | doc_list = driver.find_elements_by_xpath('//*[@id="search"]/div[2]/div[2]/div[10]/div/*') 108 | 109 | for doc in doc_list: 110 | card_link = doc.find_element_by_xpath('.//div[1]/div/div[2]/div[1]/div/div/a') 111 | try: 112 | tags_row = doc.find_element_by_css_selector('div.documentTagsContainer') 113 | except: 114 | tags_row = None 115 | 116 | title = card_link.get_attribute('title') 117 | url = card_link.get_attribute('href') 118 | 119 | try: 120 | ds = Datasheet \ 121 | .where('title', '=', title) \ 122 | .where('scraper_id', '=', sc_id) \ 123 | .first_or_fail() 124 | 125 | except: 126 | ds = Datasheet() 127 | ds.scraper_id = sc_id 128 | ds.found = datetime.now() 129 | ds.src = url 130 | ds.title = title 131 | 132 | ds.last_seen = datetime.now() 133 | ds.save() 134 | 135 | if tags_row is not None: 136 | tags = tags_row.find_elements_by_xpath('.//span') 137 | for tag in tags: 138 | for t in tag.text.split(' '): 139 | if t != '': 140 | try: 141 | te = DatasheetTag \ 142 | .where('scraper_id', '=', sc_id) \ 143 | .where('name', '=', t) \ 144 | .first_or_fail() 145 | except: 146 | te = DatasheetTag() 147 | te.scraper_id = sc_id 148 | te.name = t 149 | te.save() 150 | 151 | ds.add_tag(te) 152 | 153 | ds.save() 154 | 155 | datasheets += 1 156 | 157 | try: 158 | driver.find_element_by_xpath('//*[@id="search"]/div[2]/div[2]/div[11]/ul/li[6]') 159 | page_index += 10 160 | except: 161 | has_next_page = False 162 | 163 | end_time = datetime.now() 164 | 165 | log(f'Found {datasheets} datasheets in {end_time - start_time}') 166 | 167 | 168 | def parser_init(parser): 169 | arm_options = parser.add_argument_group('arm adapter options') 170 | 171 | arm_options.add_argument( 172 | '--arm-document-type', '-A', 173 | dest = 'arm_doc_type', 174 | type = DocumentType.from_string, 175 | choices = list(DocumentType), 176 | default = [ 'ReferenceManual', 'Architecture' ], 177 | help = 'ARM Documentation types to download' 178 | ) 179 | 180 | def adapter_main(args, driver, driver_options, dl_dir): 181 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 182 | if not args.skip_collect or not args.skip_extract: 183 | with driver(options = driver_options) as wd: 184 | if not args.skip_collect: 185 | collect_datasheets(wd, args.arm_doc_type) 186 | 187 | if not args.skip_extract: 188 | sheets = Datasheet.where('src', '!=', 'NULL').where('scraper_id', '=', sc_id).get() 189 | with tqdm( 190 | miniters = 1, total = len(sheets), 191 | ) as bar: 192 | for ds in sheets: 193 | bar.set_description(fixup_title(ds.title)) 194 | if extract_datasheet(wd, ds): 195 | bar.update(1) 196 | 197 | if not args.skip_download: 198 | sheets = Datasheet.where('url', '!=', 'NULL').where('scraper_id', '=', sc_id).get() 199 | with tqdm( 200 | miniters = 1, total = len(sheets), 201 | ) as bar: 202 | for ds in sheets: 203 | bar.set_description(fixup_title(ds.title)) 204 | if download_resource(dl_dir, ds, args): 205 | bar.update(1) 206 | 207 | 208 | return 0 209 | -------------------------------------------------------------------------------- /trawler/adapters/xilinx.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | xilinx.py 4 | --------- 5 | 6 | This script is designed to scrape all of the datasheets from xilinx.com 7 | 8 | """ 9 | import sys 10 | import time 11 | import enum 12 | 13 | from enum import Enum, Flag 14 | from os import 
getcwd, path, mkdir 15 | from datetime import datetime, timedelta 16 | 17 | import requests 18 | from requests import utils 19 | 20 | from tqdm import tqdm 21 | from selenium import webdriver 22 | 23 | from ..common import * 24 | from ..net import download_resource, get_content 25 | from ..db import Datasheet, DatasheetTag, Scraper 26 | 27 | from bs4 import BeautifulSoup 28 | 29 | @enum.unique 30 | class DocumentSource(Enum): 31 | DocNav = enum.auto() 32 | Web = enum.auto() 33 | 34 | def __str__(self) -> str: 35 | return self.name 36 | 37 | @staticmethod 38 | def from_string(s): 39 | try: 40 | return DocumentSource[s] 41 | except KeyError: 42 | raise ValueError() 43 | 44 | ADAPTER_NAME = 'xilinx' 45 | ADAPTER_DESC = 'Xilinx datasheet adapter' 46 | 47 | XILINX_DOCNAV_ROOT = 'https://xilinx.com/support/documentation/navigator' 48 | XILINX_HUBS_INDEX = f'{XILINX_DOCNAV_ROOT}/xhubs.xml' 49 | XILINX_DOCS_INDEX = f'{XILINX_DOCNAV_ROOT}/xdocs.xml' 50 | 51 | def extract_datasheet(driver, ds): 52 | tlog(f' => Extracting datasheet {ds.id} from {ds.src}') 53 | 54 | def collect_datasheets(driver, doc_types): 55 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 56 | 57 | 58 | def docnav_collect_docs(args): 59 | log(f'Downloading doc index from {XILINX_DOCS_INDEX}') 60 | catalogs = [] 61 | content = get_content(XILINX_DOCS_INDEX, args) 62 | soup = BeautifulSoup(content, 'lxml') 63 | for catalog in soup.find_all('catalog'): 64 | # ニャ! 65 | cat = { 66 | 'catalog': catalog['label'], 67 | 'product': catalog['productName'] if 'productName' in catalog else catalog['productname'], 68 | 'collection': catalog['collection'], 69 | 'groups': [] 70 | } 71 | 72 | inf(f' => Found catalog {cat["catalog"]}') 73 | for group in catalog.find_all('group'): 74 | grp = { 75 | 'title': group['label'], 76 | 'docs': [] 77 | } 78 | 79 | inf(f' => Found group {grp["title"]}') 80 | for doc in group.find_all('document'): 81 | title = doc.find('title') 82 | loc = doc.find('webLocation') 83 | doc_id = doc.find('docID') 84 | doc_type = doc.find('docType') 85 | desc = doc.find('tooltip') 86 | tags = doc.find('functionTags') 87 | 88 | grp['docs'].append({ 89 | 'title': title.get_text() if title is not None else '', 90 | 'location': loc.get_text() if loc is not None else '', 91 | 'doc_id': doc_id.get_text() if doc_id is not None else '', 92 | 'type': doc_type.get_text() if doc_type is not None else '', 93 | 'desc': desc.get_text() if desc is not None else '', 94 | 'tags': tags.get_text().split(',') if tags is not None else [], 95 | }) 96 | 97 | cat['groups'].append(grp) 98 | catalogs.append(cat) 99 | 100 | return catalogs 101 | 102 | def docnav_populate(args, docs, dl_dir): 103 | inf('Populating datasheet database') 104 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 105 | for cat in docs: 106 | inf(f' => Populating from catalog {cat["catalog"]}') 107 | try: 108 | cat_tag = DatasheetTag \ 109 | .where('scraper_id', '=', sc_id) \ 110 | .where('name', '=', cat['catalog']) \ 111 | .first_or_fail() 112 | except: 113 | cat_tag = DatasheetTag() 114 | cat_tag.scraper_id = sc_id 115 | cat_tag.name = cat['catalog'] 116 | cat_tag.save() 117 | 118 | for grp in cat['groups']: 119 | inf(f' => Populating {len(grp["docs"])} docs from group {cat["catalog"]}/{grp["title"]}') 120 | try: 121 | grp_tag = DatasheetTag \ 122 | .where('scraper_id', '=', sc_id) \ 123 | .where('name', '=', grp['title']) \ 124 | .first_or_fail() 125 | except: 126 | grp_tag = DatasheetTag() 127 | grp_tag.scraper_id = sc_id 128 | 
grp_tag.name = grp['title'] 129 | grp_tag.save() 130 | 131 | for doc in grp['docs']: 132 | try: 133 | ds = Datasheet \ 134 | .where('title', '=', doc['title']) \ 135 | .where('scraper_id', '=', sc_id) \ 136 | .first_or_fail() 137 | 138 | except: 139 | ds = Datasheet() 140 | ds.scraper_id = sc_id 141 | ds.found = datetime.now() 142 | ds.src = doc['location'] 143 | ds.url = doc['location'] 144 | ds.title = doc['title'] 145 | 146 | if not args.xilinx_doc_group: 147 | c_dir = path.join(dl_dir, cat['catalog'].replace('/', '_')) 148 | g_dir = path.join(c_dir, grp['title'].replace('/', '_')) 149 | if not path.exists(c_dir): 150 | log(f' => Catalog {cat["catalog"]} does not exist, creating') 151 | mkdir(c_dir) 152 | 153 | if not path.exists(g_dir): 154 | log(f' => Group {cat["catalog"]}/{grp["title"]} does not exist, creating') 155 | mkdir(g_dir) 156 | 157 | ds.dl_location = g_dir 158 | ds.save() 159 | 160 | ds.add_tag(cat_tag) 161 | ds.add_tag(grp_tag) 162 | for tag in doc['tags']: 163 | if tag != '': 164 | try: 165 | ds_tag = DatasheetTag \ 166 | .where('scraper_id', '=', sc_id) \ 167 | .where('name', '=', tag) \ 168 | .first_or_fail() 169 | except: 170 | ds_tag = DatasheetTag() 171 | ds_tag.scraper_id = sc_id 172 | ds_tag.name = tag 173 | ds_tag.save() 174 | 175 | ds.add_tag(ds_tag) 176 | ds.save() 177 | 178 | def docnav_runner(args, dl_dir): 179 | inf('Downloading datasheets from DocNav') 180 | sc = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail() 181 | 182 | if not args.skip_collect: 183 | docs = docnav_collect_docs(args) 184 | if docs is None: 185 | err('Unable to collect Xilinx hubs') 186 | return 1 187 | 188 | # Populate the datasheet database 189 | docnav_populate(args, docs, dl_dir) 190 | 191 | # Now we have all the datasheets, we can download them 192 | if not args.skip_download: 193 | sheets = Datasheet.where('url', '!=', 'NULL').where('scraper_id', '=', sc.id).get() 194 | with tqdm( 195 | miniters = 1, total = len(sheets), 196 | ) as bar: 197 | for ds in sheets: 198 | bar.set_description(fixup_title(ds.title)) 199 | if ds.url[-4:] == 'html' and args.xilinx_get_web_only: 200 | if download_resource(dl_dir, ds, args): 201 | bar.update(1) 202 | else: 203 | if download_resource(dl_dir, ds, args): 204 | bar.update(1) 205 | 206 | sc.last_run = datetime.now() 207 | sc.save() 208 | 209 | return 0 210 | 211 | def web_runner(args, driver, dl_dir): 212 | inf('Downloading datasheets from web') 213 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 214 | 215 | if not args.skip_collect: 216 | collect_datasheets(driver, args.arm_document_type) 217 | 218 | if not args.skip_extract: 219 | for ds in tqdm(Datasheet.all()): 220 | extract_datasheet(driver, ds) 221 | 222 | if not args.skip_download: 223 | for ds in tqdm(Datasheet.where('url', '!=', 'NULL').get()): 224 | download_datasheet(dl_dir, ds) 225 | 226 | return 0 227 | 228 | def parser_init(parser): 229 | xilinx_options = parser.add_argument_group('Xilinx adapter options') 230 | 231 | xilinx_options.add_argument( 232 | '--document-source', '-d', 233 | dest = 'xilinx_doc_source', 234 | type = DocumentSource.from_string, 235 | choices = list(DocumentSource), 236 | default = 'DocNav', 237 | help = 'Documentation Source' 238 | ) 239 | 240 | xilinx_options.add_argument( 241 | '--dont-group', '-G', 242 | dest = 'xilinx_doc_group', 243 | default = False, 244 | action = 'store_true', 245 | help = 'Don\'t group the datasheets when using DocNav as the document source' 246 | ) 247 | 248 | xilinx_options.add_argument( 249 | 
'--collect-web-only', '-W', 250 | dest = 'xilinx_get_web_only', 251 | default = False, 252 | action = 'store_true', 253 | help = 'Also archive the web-only content and monolithic HTML pages', 254 | ) 255 | 256 | def adapter_main(args, driver, driver_options, dl_dir): 257 | if args.xilinx_doc_source == DocumentSource.DocNav: 258 | return docnav_runner(args, dl_dir) 259 | elif args.xilinx_doc_source == DocumentSource.Web: 260 | with driver(options = driver_options) as wd: 261 | return web_runner(args, wd, dl_dir) 262 | else: 263 | err(f'Unknown Xilinx documentation source {args.xilinx_doc_source}!') 264 | return 1 265 | -------------------------------------------------------------------------------- /trawler/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | 3 | __all__ = ('main',) 4 | 5 | def _init_directories(): 6 | from . import config 7 | 8 | from os import path, mkdir 9 | 10 | dirs = ( 11 | # Core Directories 12 | config.TRAWLER_DATA, 13 | config.TRAWLER_CACHE, 14 | # Sub directories 15 | config.TRAWLER_USER_ADAPTERS, 16 | ) 17 | 18 | for d in dirs: 19 | if not path.exists(d): 20 | mkdir(d) 21 | 22 | def _populate_webdriver_opts(args): 23 | from .config import WebdriverBackend 24 | 25 | from selenium.webdriver import firefox 26 | from selenium.webdriver import chrome 27 | 28 | opts = None 29 | 30 | if args.webdriver == WebdriverBackend.Chrome: 31 | opts = chrome.options.Options() 32 | 33 | 34 | elif args.webdriver == WebdriverBackend.FireFox: 35 | opts = firefox.options.Options() 36 | 37 | return opts 38 | 39 | def _collect_adapters(): 40 | import pkgutil 41 | 42 | from . import db 43 | from . import adapters 44 | 45 | adpts = [] 46 | # Load the built-in internal adapters 47 | for _, name, is_pkg in pkgutil.iter_modules(path = getattr(adapters, '__path__')): 48 | if not is_pkg: 49 | __import__(f'{getattr(adapters, "__name__")}.{name}') 50 | if not hasattr(getattr(adapters, name), 'DONT_LOAD'): 51 | adpts.append({ 52 | 'name': getattr(adapters, name).ADAPTER_NAME, 53 | 'description': getattr(adapters, name).ADAPTER_DESC, 54 | 'parser_init': getattr(adapters, name).parser_init, 55 | 'main': getattr(adapters, name).adapter_main, 56 | 'is_meta': hasattr(getattr(adapters, name), 'META_ADAPTER'), 57 | }) 58 | # Load the adapters from the share 59 | # TODO: this 60 | 61 | return adpts 62 | 63 | 64 | def main(): 65 | from . import config 66 | from . 
import db 67 | from .common import log, err, wrn, inf, dbg 68 | 69 | import os 70 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 71 | 72 | from orator import DatabaseManager, Model 73 | from selenium import webdriver 74 | 75 | _init_directories() 76 | 77 | ADAPTERS = _collect_adapters() 78 | 79 | parser = ArgumentParser(formatter_class = ArgumentDefaultsHelpFormatter, description = 'Trawler datasheet scraper') 80 | 81 | scraper_options = parser.add_argument_group('Global scraper options') 82 | 83 | scraper_options.add_argument( 84 | '--output', '-o', 85 | type = str, 86 | default = config.DEFAULT_OUTPUT_DIR, 87 | help = 'Datasheet download root' 88 | ) 89 | 90 | scraper_options.add_argument( 91 | '--timeout', '-t', 92 | type = int, 93 | default = config.DEFAULT_TIMEOUT, 94 | help = 'Entry timeout in seconds' 95 | ) 96 | 97 | scraper_options.add_argument( 98 | '--retry', '-r', 99 | type = int, 100 | default = config.DEFAULT_RETRY_COUNT, 101 | help = 'Download retry count' 102 | ) 103 | 104 | scraper_options.add_argument( 105 | '--delay', '-d', 106 | type = int, 107 | default = config.DEFAULT_DOWNLOAD_DELAY, 108 | help = 'Download delay in seconds' 109 | ) 110 | 111 | scraper_options.add_argument( 112 | '--cache-database', '-c', 113 | type = str, 114 | default = config.DEFAULT_DATABASE, 115 | help = 'Cache database' 116 | ) 117 | 118 | scraper_options.add_argument( 119 | '--skip-collect', '-C', 120 | default = False, 121 | action = 'store_true', 122 | help = 'Skip the datasheet collection stage' 123 | ) 124 | 125 | scraper_options.add_argument( 126 | '--skip-extract', '-E', 127 | default = False, 128 | action = 'store_true', 129 | help = 'Skip the datasheet extraction stage' 130 | ) 131 | 132 | scraper_options.add_argument( 133 | '--skip-download', '-D', 134 | default = False, 135 | action = 'store_true', 136 | help = 'Skip the datasheet download stage' 137 | ) 138 | 139 | scraper_options.add_argument( 140 | '--skip-archives', 141 | default = False, 142 | action = 'store_true', 143 | help = 'Skip downloading archives' 144 | ) 145 | 146 | scraper_options.add_argument( 147 | '--extract-archives', 148 | default = False, 149 | action = 'store_true', 150 | help = 'Extract downloaded archives and attempt to index them' 151 | ) 152 | 153 | scraper_options.add_argument( 154 | '--skip-executables', 155 | default = False, 156 | action = 'store_true', 157 | help = 'Skip downloading executables' 158 | ) 159 | 160 | scraper_options.add_argument( 161 | '--user-agent', '-A', 162 | type = str, 163 | default = config.DEFAULT_USER_AGENT, 164 | help = 'Specify the user-agent to use' 165 | ) 166 | 167 | wd_options = parser.add_argument_group('Selenium WebDriver Settings') 168 | 169 | wd_options.add_argument( 170 | '--profile-directory', '-p', 171 | type = str, 172 | default = config.DEFAULT_PROFILE_DIRECTORY, 173 | help = 'Selenium WebDriver profile directory' 174 | ) 175 | 176 | wd_options.add_argument( 177 | '--webdriver', '-w', 178 | type = config.WebdriverBackend.from_string, 179 | choices = list(config.WebdriverBackend), 180 | default = config.DEFAULT_WEBDRIVER, 181 | help = 'Selenium WebDriver to use' 182 | ) 183 | 184 | wd_options.add_argument( 185 | '--headless', '-H', 186 | default = config.DEFAULT_WD_HEADLESS, 187 | action = 'store_true', 188 | help = 'Run the Selenium WebDriver headlessly' 189 | ) 190 | 191 | wd_options.add_argument( 192 | '--headless-height', '-Y', 193 | type = str, 194 | default = config.DEFAULT_WD_HEADLESS_RES[1], 195 | help = 'Specify the height of the 
headless window' 196 | ) 197 | 198 | wd_options.add_argument( 199 | '--headless-width', '-X', 200 | type = str, 201 | default = config.DEFAULT_WD_HEADLESS_RES[0], 202 | help = 'Specify the width of the headless window' 203 | ) 204 | 205 | # Maybe one day 206 | # wd_options.add_argument( 207 | # '--proxy', '-P', 208 | # type = str, 209 | # default = None, 210 | # help = 'Proxy to use for the Selenium WebDriver', 211 | # ) 212 | 213 | adapter_parser = parser.add_subparsers( 214 | dest = 'adapter', 215 | required = True 216 | ) 217 | 218 | # Add the adapter settings 219 | for adpt in ADAPTERS: 220 | ap = adapter_parser.add_parser( 221 | adpt['name'], 222 | help = adpt['description'] 223 | ) 224 | adpt['parser_init'](ap) 225 | 226 | # Actually parse the arguments 227 | args = parser.parse_args() 228 | 229 | # Initialize the download directory if not done so 230 | if not os.path.exists(args.output): 231 | wrn(f'Output directory {args.output} does not exist, creating') 232 | os.mkdir(args.output) 233 | 234 | # Initialize the Database 235 | dbc = config.DATABASE 236 | dbc['trawler_cache']['database'] = args.cache_database 237 | 238 | if args.adapter == 'zotero': 239 | dbc['zotero']['database'] = args.zotero_db_loc 240 | 241 | dbm = DatabaseManager(config.DATABASE) 242 | Model.set_connection_resolver(dbm) 243 | 244 | if not os.path.exists(args.cache_database): 245 | wrn('Cache database does not exist, creating') 246 | db.run_migrations(dbm) 247 | else: 248 | # Check the DB schema and update it if need be 249 | db.check_schema(dbm) 250 | 251 | inf(f'Cache database located at {args.cache_database}') 252 | 253 | # Initialize the datasheet directory 254 | if not os.path.exists(args.output): 255 | log(f'Datasheet download directory {args.output} does not exist, creating') 256 | os.mkdir(args.output) 257 | 258 | # Initialize the profile directory 259 | if not os.path.exists(args.profile_directory): 260 | log(f'WebDriver profile \'{args.profile_directory}\' does not exist, creating') 261 | os.mkdir(args.profile_directory) 262 | 263 | # WebDriver Initialization 264 | inf(f'Using the {args.webdriver} WebDriver') 265 | if args.webdriver == config.WebdriverBackend.Chrome: 266 | wd_opts = webdriver.chrome.options.Options() 267 | wd = webdriver.Chrome 268 | 269 | wd_opts.add_argument(f'user-data-dir={args.profile_directory}') 270 | 271 | if args.headless: 272 | wd_opts.add_argument('--headless') 273 | wd_opts.add_argument(f'--window-size={args.headless_width},{args.headless_height}') 274 | 275 | 276 | elif args.webdriver == config.WebdriverBackend.FireFox: 277 | wd = webdriver.Firefox 278 | wd_opts = webdriver.firefox.options.Options() 279 | 280 | wd_profile = webdriver.firefox.firefox_profile.FirefoxProfile(args.profile_directory) 281 | wd_opts.profile = wd_profile 282 | 283 | if args.headless: 284 | wd_opts.headless = True 285 | 286 | else: 287 | err('Unknown WebDriver, what?') 288 | return 1 289 | 290 | # Ensure the database is properly populated w/ known adapters 291 | for adapter in ADAPTERS: 292 | try: 293 | db.Scraper.where('name', '=', adapter['name']).first_or_fail() 294 | except: 295 | s = db.Scraper() 296 | s.name = adapter['name'] 297 | if adapter['is_meta']: 298 | s.meta = True 299 | s.save() 300 | 301 | # Get the adapter we need to run 302 | if args.adapter not in map(lambda a: a['name'], ADAPTERS): 303 | err(f'Unknown adapter {args.adapter}') 304 | err(f'Known adapters: {", ".join(map(lambda a: a["name"], ADAPTERS))}') 305 | return 1 306 | else: 307 | adpt = list(filter(lambda a: 
a['name'] == args.adapter, ADAPTERS))[0] 308 | 309 | # Initialize the adapter download directory 310 | dl_dir = os.path.join(args.output, adpt['name']) 311 | if not os.path.exists(dl_dir) and not adpt['is_meta']: 312 | wrn(f'Adapter datasheet directory {dl_dir} does not exist, creating...') 313 | os.mkdir(dl_dir) 314 | 315 | # Actually run the adapter 316 | if adpt['is_meta']: 317 | return adpt['main'](args, dl_dir) 318 | else: 319 | return adpt['main'](args, wd, wd_opts, dl_dir) 320 | -------------------------------------------------------------------------------- /trawler/adapters/zotero.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | zotero.py 4 | --------- 5 | 6 | This is the zotero meta adapter 7 | 8 | """ 9 | import sys 10 | import os 11 | 12 | from shutil import copyfile 13 | from tempfile import gettempdir 14 | from datetime import datetime 15 | 16 | from orator import Model, orm 17 | 18 | from tqdm import tqdm 19 | 20 | from .. import config 21 | from ..common import * 22 | from ..net import download_resource 23 | from ..db import Datasheet, DatasheetTag, Scraper 24 | 25 | 26 | META_ADAPTER = 0 27 | 28 | ADAPTER_NAME = 'zotero' 29 | ADAPTER_DESC = 'Trawler zotero meta adapter' 30 | 31 | ZOTERO_BACKUP_DIR = gettempdir() 32 | ZOTERO_TRAWLER_ROOT_COLLECTION = 'Trawler' 33 | 34 | def gen_key(key_len = 8): 35 | from random import choice 36 | from string import ascii_uppercase, digits 37 | return ''.join(choice(ascii_uppercase + digits) for i in range(key_len)) 38 | 39 | # ==== Zotero DB Models ==== # 40 | 41 | class ZCollectionItem(Model): 42 | __connection__ = 'zotero' 43 | __timestamps__ = False 44 | __table__ = 'collectionItems' 45 | __primary_key__ = 'collectionID' 46 | 47 | 48 | class ZCollection(Model): 49 | __connection__ = 'zotero' 50 | __timestamps__ = False 51 | __table__ = 'collections' 52 | __primary_key__ = 'collectionID' 53 | 54 | @orm.belongs_to_many('collectionItems', 'collectionID', 'itemID') 55 | def items(self): 56 | return ZItem 57 | 58 | 59 | class ZItemType(Model): 60 | __connection__ = 'zotero' 61 | __timestamps__ = False 62 | __table__ = 'itemTypes' 63 | __primary_key__ = 'itemTypeID' 64 | 65 | 66 | @orm.belongs_to_many('items', 'itemTypeID', 'itemID') 67 | def items(self): 68 | return ZItem 69 | 70 | class ZTag(Model): 71 | __connection__ = 'zotero' 72 | __timestamps__ = False 73 | __table__ = 'tags' 74 | __primary_key__ = 'tagID' 75 | 76 | 77 | @orm.belongs_to_many('itemTags', 'tagID', 'itemID') 78 | def items(self): 79 | return ZItem 80 | 81 | class ZItem(Model): 82 | __connection__ = 'zotero' 83 | __timestamps__ = False 84 | __table__ = 'items' 85 | __primary_key__ = 'itemID' 86 | 87 | @orm.belongs_to_many('itemTags', 'itemID', 'tagID') 88 | def tags(self): 89 | return ZTag 90 | 91 | def has_tag(self, tag): 92 | return self.tags().where('tagID', '=', tag.tagID).exists() 93 | 94 | def add_tag(self, tag): 95 | if not self.has_tag(tag): 96 | self.tags().attach(tag) 97 | 98 | def remove_tag(self, tag): 99 | if self.has_tag(tag): 100 | self.tags().detach(tag) 101 | 102 | @orm.belongs_to_many('itemData', 'itemID', 'valueID') 103 | def item_data(self): 104 | return ZItemDataValue 105 | 106 | def has_item_data(self): 107 | return self.item_data().where('itemID', '=', self.itemID) 108 | 109 | def add_item_data(self, item_data): 110 | self.item_data().attach(item_data) 111 | 112 | def remove_item_data(self, item_data): 113 | self.item_data().detach(item_data) 114 | 115 | class 
ZItemTag(Model): 116 | __connection__ = 'zotero' 117 | __timestamps__ = False 118 | __table__ = 'itemTags' 119 | __primary_key__ = 'itemID' 120 | 121 | class ZFieldFormat(Model): 122 | __connection__ = 'zotero' 123 | __timestamps__ = False 124 | __table__ = 'fieldFormats' 125 | __primary_key__ = 'fieldFormatID' 126 | 127 | class ZField(Model): 128 | __connection__ = 'zotero' 129 | __timestamps__ = False 130 | __table__ = 'fields' 131 | __primary_key__ = 'fieldID' 132 | 133 | class ZItemAttachment(Model): 134 | __connection__ = 'zotero' 135 | __timestamps__ = False 136 | __table__ = 'itemAttachments' 137 | __primary_key__ = 'itemID' 138 | 139 | @orm.belongs_to('itemID') 140 | def item(self): 141 | return ZItem 142 | 143 | class ZItemData(Model): 144 | __connection__ = 'zotero' 145 | __timestamps__ = False 146 | __table__ = 'itemData' 147 | __primary_key__ = 'itemID' 148 | 149 | class ZItemDataValue(Model): 150 | __connection__ = 'zotero' 151 | __timestamps__ = False 152 | __table__ = 'itemDataValues' 153 | __primary_key__ = 'valueID' 154 | 155 | @orm.belongs_to('valueID', 'valueID') 156 | def item_data(self): 157 | return ZItem 158 | 159 | def has_data(self): 160 | return self.item_data().where('valueID', '=', self.valueID).exists() 161 | 162 | 163 | # ==== Adapter Methods ==== # 164 | 165 | def sync_scraper(tcol, sc): 166 | log(f' => Syncing datasheets from {sc.name}') 167 | date_added = datetime.now() 168 | # Fetch the scraper's collection if it already exists, otherwise create it 169 | try: 170 | scol = ZCollection \ 171 | .where('collectionName', '=', sc.name) \ 172 | .where('parentCollectionID', '=', tcol.collectionID) \ 173 | .first_or_fail() 174 | except Exception: 175 | scol = ZCollection() 176 | scol.collectionName = sc.name 177 | scol.parentCollectionID = tcol.collectionID 178 | scol.libraryID = tcol.libraryID 179 | scol.key = gen_key() 180 | scol.save() 181 | 182 | for ds in tqdm(Datasheet.where('scraper_id', '=', sc.id).get(), desc = f'Zotero sync: {sc.name}'): 183 | tlog(f' ==> Adding datasheet {ds.title}') 184 | 185 | try: 186 | name = ZItemDataValue.where('value', '=', ds.filename).first_or_fail() 187 | except Exception: 188 | name = ZItemDataValue() 189 | name.value = ds.filename 190 | name.save() 191 | 192 | try: 193 | title = ZItemDataValue.where('value', '=', ds.title).first_or_fail() 194 | except Exception: 195 | title = ZItemDataValue() 196 | title.value = ds.title 197 | title.save() 198 | 199 | try: 200 | url = ZItemDataValue.where('value', '=', ds.url).first_or_fail() 201 | except Exception: 202 | url = ZItemDataValue() 203 | url.value = ds.url 204 | url.save() 205 | 206 | try: 207 | da = ZItemDataValue.where('value', '=', date_added).first_or_fail() 208 | except Exception: 209 | da = ZItemDataValue() 210 | da.value = date_added 211 | da.save() 212 | 213 | # Check whether the data values we just created are already linked to an item 214 | if not ZItemData.where('valueID', '=', name.valueID).exists(): 215 | # If not, then we can assume that the item doesn't exist, so we create one 216 | item = ZItem() 217 | item.itemTypeID = 12 218 | item.libraryID = scol.libraryID 219 | item.key = gen_key() 220 | item.save() 221 | 222 | i_title = ZItemData() 223 | i_title.itemID = item.itemID 224 | i_title.valueID = title.valueID 225 | i_title.fieldID = 1 226 | i_title.save() 227 | 228 | i_url = ZItemData() 229 | i_url.itemID = item.itemID 230 | i_url.valueID = url.valueID 231 | i_url.fieldID = 13 232 | i_url.save() 233 | 234 | i_da = ZItemData() 235 | i_da.itemID = item.itemID 236 | i_da.valueID = da.valueID 237 | i_da.fieldID = 6 238 |
i_da.save() 239 | 240 | # Attach the datasheet's tags to the item (the tags themselves are created in sync_database) 241 | for tag in ds.tags().get(): 242 | if tag.name != '': 243 | zt = ZTag.where('name', '=', tag.name).first_or_fail() 244 | 245 | zit = ZItemTag() 246 | zit.itemID = item.itemID 247 | zit.tagID = zt.tagID 248 | zit.type = 0 249 | zit.save() 250 | 251 | # Create the attachment item and then the attachment link 252 | aitm = ZItem() 253 | aitm.itemTypeID = 2 254 | aitm.libraryID = scol.libraryID 255 | aitm.key = gen_key() 256 | aitm.save() 257 | 258 | att_name = ZItemData() 259 | att_name.itemID = aitm.itemID 260 | att_name.valueID = name.valueID 261 | att_name.fieldID = 1 262 | att_name.save() 263 | 264 | att = ZItemAttachment() 265 | att.itemID = aitm.itemID 266 | att.parentItemID = item.itemID 267 | att.linkMode = 2 268 | att.path = ds.dl_location 269 | att.contentType = 'application/pdf' 270 | att.save() 271 | 272 | # Finally link it to the collection 273 | col = ZCollectionItem() 274 | col.collectionID = scol.collectionID 275 | col.itemID = item.itemID 276 | col.save() 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | def sync_database(args, dl_dir): 285 | inf('Syncing Zotero with Trawler cache') 286 | 287 | if args.zotero_sync_backup: 288 | backup_file = os.path.join(args.zotero_sync_backup_dir, 'zotero_backup.sqlite') 289 | 290 | log(f' => Backing up Zotero db to {backup_file}') 291 | copyfile(args.zotero_db_loc, backup_file) 292 | 293 | # Get the Trawler root Zotero collection, or create it if it does not exist 294 | try: 295 | tcol = ZCollection.where('collectionName', '=', ZOTERO_TRAWLER_ROOT_COLLECTION).first_or_fail() 296 | except Exception: 297 | tcol = ZCollection() 298 | tcol.collectionName = ZOTERO_TRAWLER_ROOT_COLLECTION 299 | tcol.key = gen_key() 300 | tcol.libraryID = 1 301 | tcol.save() 302 | 303 | # Sync the tags 304 | log(' => Syncing Trawler tags to Zotero') 305 | for tag in tqdm(DatasheetTag.all(), desc = 'Zotero sync: tags'): 306 | if tag.name != '': 307 | try: 308 | zt = ZTag.where('name', '=', tag.name).first_or_fail() 309 | except Exception: 310 | zt = ZTag() 311 | zt.name = tag.name 312 | zt.save() 313 | 314 | for sc in Scraper.all(): 315 | if not sc.meta: 316 | sync_scraper(tcol, sc) 317 | 318 | ZOTERO_ACTIONS = { 319 | 'sync': sync_database 320 | } 321 | 322 | def parser_init(parser): 323 | zotero_options = parser.add_argument_group('zotero meta adapter options') 324 | 325 | zotero_options.add_argument( 326 | '--zotero-db-location', 327 | dest = 'zotero_db_loc', 328 | type = str, 329 | default = config.ZOTERO_DB, 330 | help = 'The location of the Zotero SQLite database' 331 | ) 332 | 333 | zotero_actions = parser.add_subparsers( 334 | dest = 'zotero_action', 335 | required = True 336 | ) 337 | 338 | zsync = zotero_actions.add_parser('sync', help = 'Sync Zotero with Trawler') 339 | 340 | zsync.add_argument( 341 | '--backup', 342 | dest = 'zotero_sync_backup', 343 | default = False, 344 | action = 'store_true', 345 | help = 'Create a backup of the Zotero database before syncing' 346 | ) 347 | 348 | zsync.add_argument( 349 | '--backup-dir', 350 | dest = 'zotero_sync_backup_dir', 351 | type = str, 352 | default = ZOTERO_BACKUP_DIR, 353 | help = 'Specify the location for the Zotero backup file' 354 | ) 355 | 356 | def adapter_main(args, dl_dir): 357 | if not os.path.exists(args.zotero_db_loc): 358 | err(f'Unable to find the Zotero database at {args.zotero_db_loc}') 359 | err('To override the default lookup location use \'--zotero-db-location\'') 360 | return 1 361 | 362 | wrn('This will lock the Zotero database; ensure that Zotero is not being used!') 363
| 364 | # Invoke the requested action; the fallback stub accepts the same arguments and just reports failure 365 | act = ZOTERO_ACTIONS.get(args.zotero_action, lambda *_: 1) 366 | return act(args, dl_dir) 367 | 368 | --------------------------------------------------------------------------------