├── trawler ├── adapters │ ├── __init__.py │ ├── renesas.py │ ├── usb-if.py │ ├── arm.py │ ├── xilinx.py │ └── zotero.py ├── common.py ├── net.py ├── config.py ├── db.py └── __init__.py ├── requirements.txt ├── .gitignore ├── __main__.py ├── trawler.py ├── setup.py ├── LICENSE └── README.md /trawler/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | tqdm 3 | selenium 4 | orator 5 | beautifulsoup4 6 | lxml 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__/ 3 | _build/ 4 | build/ 5 | .venv/ 6 | .webdriver_profile/ 7 | datasheets/ 8 | *.db 9 | -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | import sys 4 | from trawler import main 5 | 6 | if __name__ == '__main__': 7 | sys.exit(main()) 8 | -------------------------------------------------------------------------------- /trawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | 4 | import sys 5 | from pathlib import Path 6 | 7 | trawler_path = Path(sys.argv[0]).resolve() 8 | 9 | if (trawler_path.parent / 'trawler').is_dir(): 10 | sys.path.insert(0, str(trawler_path.parent)) 11 | 12 | from trawler import main 13 | 14 | if __name__ == '__main__': 15 | sys.exit(main()) 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | #!/usr/bin/env python3 3 | 4 | from setuptools import setup, find_packages 5 | 6 | setup( 7 | name = 'Trawler', 8 | version = '0.1', 9 | description = 'Bulk scrape and download datasheets from various vendors (insult)', 10 | license = 'BSD-3-Clause', 11 | python_requires = '~=3.7', 12 | install_requires = [ 13 | 'requests', 14 | 'tqdm', 15 | 'selenium', 16 | 'orator', 17 | 'beautifulsoup4', 18 | 'lxml' 19 | ], 20 | entry_points = { 21 | 'console_scripts': [ 22 | 'trawler = trawler:main', 23 | ] 24 | }, 25 | packages = find_packages(), 26 | project_urls = { 27 | 'Source Code': 'https://github.com/bad-alloc-heavy-industries/Trawler', 28 | 'Bug Tracker': 'https://github.com/bad-alloc-heavy-industries/Trawler/issues' 29 | } 30 | ) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 
12 | 13 | 3. Neither the name of ORGANIZATION nor the names of 14 | its contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /trawler/common.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | import sys 3 | import os 4 | import collections.abc 5 | from tqdm import tqdm 6 | 7 | __all__ = ( 8 | 'log', 'err', 'wrn', 'inf', 'dbg', 9 | 'tlog', 'terr', 'twrn', 'tinf', 'tdbg', 10 | 'fixup_title', 11 | 12 | 'EXECUTABLE_EXTS', 'ARCHIVE_EXTS' 13 | ) 14 | 15 | EXECUTABLE_EXTS = ( 16 | 'exe', 'msi', 17 | ) 18 | 19 | ARCHIVE_EXTS = ( 20 | 'tar', 'gz', 'bz2', 'xz', 'zip', 21 | 'lzma', '7z', 22 | ) 23 | 24 | def log(str, end = '\n', file = sys.stdout): 25 | print(f'\x1B[35m[*]\x1B[0m {str}', end = end, file = file) 26 | 27 | def err(str, end = '\n', file = sys.stderr): 28 | print(f'\x1B[31m[!]\x1B[0m {str}', end = end, file = file) 29 | 30 | def wrn(str, end = '\n', file = sys.stderr): 31 | print(f'\x1B[33m[~]\x1B[0m {str}', end = end, file = file) 32 | 33 | def inf(str, end = '\n', file = sys.stdout): 34 | print(f'\x1B[36m[~]\x1B[0m {str}', end = end, file = file) 35 | 36 | def dbg(str, end = '\n', file = sys.stdout): 37 | print(f'\x1B[34m[~]\x1B[0m {str}', end = end, file = file) 38 | 39 | def tlog(str, end = '\n', file = sys.stdout): 40 | tqdm.write(f'\x1B[35m[*]\x1B[0m {str}', end = end, file = file) 41 | 42 | def terr(str, end = '\n', file = sys.stderr): 43 | tqdm.write(f'\x1B[31m[!]\x1B[0m {str}', end = end, file = file) 44 | 45 | def twrn(str, end = '\n', file = sys.stderr): 46 | tqdm.write(f'\x1B[33m[~]\x1B[0m {str}', end = end, file = file) 47 | 48 | def tinf(str, end = '\n', file = sys.stdout): 49 | tqdm.write(f'\x1B[36m[~]\x1B[0m {str}', end = end, file = file) 50 | 51 | def tdbg(str, end = '\n', file = sys.stdout): 52 | tqdm.write(f'\x1B[34m[~]\x1B[0m {str}', end = end, file = file) 53 | 54 | def recusive_zip(d, u): # Recursively merge the mapping u into the dict d 55 | for k, v in u.items(): 56 | if isinstance(v, collections.abc.Mapping): 57 | d[k] = recusive_zip(d.get(k, {}), v) 58 | else: 59 | d[k] = v 60 | return d 61 | 62 | def fixup_title(s): 63 | if len(s) < 18: 64 | return f'{s}{" "*(18 - len(s))}' 65 | else: 66 | return f'{s[:15]}...'
67 | -------------------------------------------------------------------------------- /trawler/net.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | import sys 3 | import time 4 | import re 5 | 6 | from os import path 7 | 8 | import requests 9 | from requests import utils 10 | 11 | from . import config 12 | from .common import * 13 | 14 | 15 | __all__ = ( 16 | 'download_resource', 'get_content' 17 | ) 18 | 19 | def get_content(url, args): 20 | try_count = 0 21 | while try_count < args.retry: 22 | try: 23 | if args.delay > 0: 24 | time.sleep(args.delay) 25 | 26 | with requests.get( 27 | url, 28 | allow_redirects = True, 29 | timeout = args.timeout, 30 | headers = { 31 | 'User-Agent': args.user_agent 32 | } 33 | ) as r: 34 | return r.content 35 | 36 | except KeyboardInterrupt: # Let Ctrl-C abort rather than being swallowed by the retry loop 37 | raise 38 | except Exception: 39 | try_count += 1 40 | 41 | 42 | if try_count != 0: 43 | return False 44 | 45 | def download_resource(dl_dir, ds, args): 46 | tlog(f' => Downloading {ds.title} ({ds.id})') 47 | try_count = 0 48 | while try_count < args.retry: 49 | try: 50 | if args.delay > 0: 51 | time.sleep(args.delay) 52 | 53 | with requests.get( 54 | ds.url, 55 | allow_redirects = True, 56 | timeout = args.timeout, 57 | headers = { 58 | 'User-Agent': args.user_agent 59 | } 60 | ) as r: 61 | fname = '' 62 | if 'content-disposition' in r.headers.keys(): 63 | fname = re.findall('filename=(.*)', r.headers['content-disposition'])[0] 64 | else: 65 | fname = ds.url.split('/')[-1] 66 | 67 | ds.filename = fname 68 | if not ds.dl_location.endswith(fname): 69 | ds.dl_location = path.join(ds.dl_location, fname) 70 | ds.save() 71 | tlog(f' ==> Saving {fname} to {ds.dl_location}') 72 | with open(ds.dl_location, 'wb') as file: 73 | file.write(r.content) 74 | 75 | ds.downloaded = True 76 | ds.save() 77 | return True 78 | except KeyboardInterrupt: 79 | raise 80 | except Exception as e: 81 | twrn(f' => Download failed {e}, retrying') 82 | try_count += 1 83 | 84 | 85 | if try_count != 0: 86 | terr(f' => Unable to download datasheet with id {ds.id}') 87 | return False 88 | 89 | return True 90 | -------------------------------------------------------------------------------- /trawler/config.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | import os 3 | import enum 4 | from enum import Enum 5 | 6 | @enum.unique 7 | class WebdriverBackend(Enum): 8 | Chrome = enum.auto() 9 | FireFox = enum.auto() 10 | 11 | def __str__(self) -> str: 12 | return self.name 13 | 14 | @staticmethod 15 | def from_string(s: str): 16 | try: 17 | return WebdriverBackend[s] 18 | except KeyError: 19 | raise ValueError() 20 | 21 | # ==== Various constants ==== # 22 | TRAWLER_NAME = 'trawler' 23 | TRAWLER_VERSION = 'v0.2' 24 | TRAWLER_SCHEMA_VERSION = 1 25 | 26 | # ==== Directories ==== # 27 | XDG_CACHE_DIR = os.path.join(os.path.expanduser('~'), '.cache') if 'XDG_CACHE_HOME' not in os.environ else os.environ['XDG_CACHE_HOME'] 28 | XDG_DATA_HOME = os.path.join(os.path.expanduser('~'), '.local/share') if 'XDG_DATA_HOME' not in os.environ else os.environ['XDG_DATA_HOME'] 29 | XDG_DOCUMENTS_DIR = os.path.join(os.path.expanduser('~'), 'Documents') if 'XDG_DOCUMENTS_DIR' not in os.environ else os.environ['XDG_DOCUMENTS_DIR'] 30 | 31 | TRAWLER_DATA = os.path.join(XDG_DATA_HOME, TRAWLER_NAME) 32 | TRAWLER_CACHE = os.path.join(XDG_CACHE_DIR, TRAWLER_NAME) 33 | 34 | 
TRAWLER_USER_ADAPTERS = os.path.join(TRAWLER_DATA, 'adapters') 35 | TRAWLER_DL_DIR = os.path.join(XDG_DOCUMENTS_DIR, TRAWLER_NAME) 36 | 37 | # ==== Default Settings ==== # 38 | DEFAULT_OUTPUT_DIR = TRAWLER_DL_DIR 39 | DEFAULT_TIMEOUT = 120 40 | DEFAULT_RETRY_COUNT = 3 41 | DEFAULT_DOWNLOAD_DELAY = 3 42 | DEFAULT_PROFILE_DIRECTORY = os.path.join(TRAWLER_CACHE, '.webdriver_profile') 43 | DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.9999.9999 Safari/537.36' 44 | 45 | DEFAULT_DATABASE = os.path.join(TRAWLER_CACHE, 'datasheets.db') 46 | DEFAULT_WEBDRIVER = WebdriverBackend.Chrome 47 | DEFAULT_WD_HEADLESS = False 48 | DEFAULT_WD_HEADLESS_RES = (1920, 1080) 49 | 50 | # ==== Zotero Stuff ==== # 51 | ZOTERO_ROOT = os.path.join(os.path.expanduser('~'), 'Zotero') 52 | ZOTERO_DB = os.path.join(ZOTERO_ROOT, 'zotero.sqlite') 53 | 54 | # ==== Database Settings ==== # 55 | DATABASE = { 56 | 'default': 'trawler_cache', 57 | 'trawler_cache': { 58 | 'driver': 'sqlite', 59 | 'database': DEFAULT_DATABASE, 60 | }, 61 | 'zotero': { 62 | 'driver': 'sqlite', 63 | 'database': ZOTERO_DB, 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /trawler/adapters/renesas.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | renesas.py 4 | --------- 5 | 6 | This script is designed to scrape all of the datasheets from https://www.renesas.com/us/en/support/document-search 7 | 8 | """ 9 | import sys 10 | import time 11 | import enum 12 | import re 13 | 14 | from enum import Enum, Flag 15 | from os import getcwd, path, mkdir 16 | from datetime import datetime, timedelta 17 | 18 | import requests 19 | from requests import utils 20 | 21 | from tqdm import tqdm 22 | 23 | from ..common import * 24 | from ..net import download_resource, get_content 25 | from ..db import Datasheet, DatasheetTag, Scraper 26 | 27 | from bs4 import BeautifulSoup 28 | 29 | ADAPTER_NAME = 'renesas' 30 | ADAPTER_DESC = 'Renesas datasheet adapter' 31 | 32 | RENESAS_ROOT = 'https://www.renesas.com' 33 | RENESAS_DOCS_ROOT = f'{RENESAS_ROOT}/us/en/support/document-search' 34 | 35 | def collect_datasheets(args, dl_dir): 36 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 37 | log('Collecting datasheets... 
this might take a while') 38 | 39 | page_index = 0 40 | has_next_page = True 41 | datasheets = 0 42 | start_time = datetime.now() 43 | 44 | while has_next_page: 45 | inf(f' => On page index {page_index+1}, total so far {datasheets}') 46 | url = f'{RENESAS_DOCS_ROOT}?page={page_index}' 47 | 48 | content = get_content(url, args) 49 | soup = BeautifulSoup(content, 'lxml') 50 | try: 51 | doc_tab = soup.find('table').find('tbody') 52 | except Exception as e: 53 | if isinstance(e, KeyboardInterrupt): 54 | sys.quit() 55 | else: 56 | has_next_page = False 57 | break 58 | 59 | for doc in tqdm(doc_tab.find_all('tr')): 60 | tds = doc.find_all('td') 61 | is_locked = tds[0].find('span') is not None 62 | url = tds[1].find_all('a')[1]['href'] 63 | title = tds[1].find_all('a')[1].text 64 | 65 | if not is_locked: 66 | try: 67 | ds = Datasheet \ 68 | .where('title', '=', title) \ 69 | .where('scraper_id', '=', sc_id) \ 70 | .first_or_fail() 71 | except: 72 | ds = Datasheet() 73 | ds.scraper_id = sc_id 74 | ds.found = datetime.now() 75 | ds.src = f'{RENESAS_ROOT}/{url}' 76 | ds.url = f'{RENESAS_ROOT}/{url}' 77 | ds.dl_location = dl_dir 78 | ds.title = title 79 | 80 | ds.last_seen = datetime.now() 81 | ds.save() 82 | datasheets += 1 83 | 84 | page_index += 1 85 | 86 | end_time = datetime.now() 87 | 88 | log(f'Found {datasheets} datasheets in {end_time - start_time}') 89 | 90 | 91 | def parser_init(parser): 92 | renesas_options = parser.add_argument_group('Renesas adapter options') 93 | 94 | def adapter_main(args, driver, driver_options, dl_dir): 95 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 96 | 97 | if not args.skip_collect: 98 | collect_datasheets(args, dl_dir) 99 | 100 | if not args.skip_download: 101 | sheets = Datasheet.where('src', '!=', 'NULL').where('scraper_id', '=', sc_id).get() 102 | with tqdm( 103 | miniters = 1, total = len(sheets), 104 | ) as bar: 105 | for ds in sheets: 106 | bar.set_description(fixup_title(ds.title)) 107 | if download_resource(dl_dir, ds, args): 108 | bar.update(1) 109 | 110 | return 0 111 | -------------------------------------------------------------------------------- /trawler/adapters/usb-if.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | usb-if.py 4 | --------- 5 | 6 | This script is designed to scrape all of the datasheets from https://www.usb.org/documents 7 | 8 | """ 9 | import sys 10 | import time 11 | import enum 12 | import re 13 | 14 | from enum import Enum, Flag 15 | from os import getcwd, path, mkdir 16 | from datetime import datetime, timedelta 17 | 18 | import requests 19 | from requests import utils 20 | 21 | from tqdm import tqdm 22 | 23 | from ..common import * 24 | from ..net import download_resource, get_content 25 | from ..db import Datasheet, DatasheetTag, Scraper 26 | 27 | from bs4 import BeautifulSoup 28 | 29 | ADAPTER_NAME = 'usb-if' 30 | ADAPTER_DESC = 'USB-IF datasheet adapter' 31 | 32 | USB_DOCS_ROOT_URL = 'https://www.usb.org/documents' 33 | USB_DOCS_ALL = f'{USB_DOCS_ROOT_URL}?search=&items_per_page=All' 34 | 35 | 36 | def collect_datasheets(args, dl_dir): 37 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 38 | log('Collecting datasheets... 
this might take a while') 39 | datasheets = 0 40 | start_time = datetime.now() 41 | 42 | content = get_content(USB_DOCS_ALL, args) 43 | soup = BeautifulSoup(content, 'lxml') 44 | 45 | # We assume that there is only one table on this page, I know I know 46 | doc_tab = soup.find('table').find('tbody') 47 | for doc in tqdm(doc_tab.find_all('tr')): 48 | tds = doc.find_all('td') 49 | try: 50 | file = tds[0].find('span').find_all('span')[2].find('a')['href'] 51 | except KeyboardInterrupt: 52 | raise 53 | except Exception: 54 | continue 55 | 56 | 57 | title = tds[0].find_all('a')[1].text 58 | 59 | try: 60 | ds = Datasheet \ 61 | .where('title', '=', title) \ 62 | .where('scraper_id', '=', sc_id) \ 63 | .first_or_fail() 64 | except: 65 | ds = Datasheet() 66 | ds.scraper_id = sc_id 67 | ds.found = datetime.now() 68 | ds.src = file 69 | ds.url = file 70 | ds.dl_location = dl_dir 71 | ds.title = title 72 | 73 | ds.last_seen = datetime.now() 74 | ds.save() 75 | datasheets += 1 76 | 77 | # Try to pull out the "tags" 78 | t_tags = list(map(lambda t: t.strip(), tds[4].text.split(','))) + [ 79 | tds[1].text.strip() if tds[1] is not None else '', 80 | tds[2].text.strip() if tds[2] is not None else '' 81 | ] 82 | 83 | for tag in t_tags: 84 | if tag != '': 85 | try: 86 | te = DatasheetTag \ 87 | .where('scraper_id', '=', sc_id) \ 88 | .where('name', '=', tag) \ 89 | .first_or_fail() 90 | except: 91 | te = DatasheetTag() 92 | te.scraper_id = sc_id 93 | te.name = tag 94 | te.save() 95 | 96 | ds.add_tag(te) 97 | 98 | end_time = datetime.now() 99 | log(f'Found {datasheets} datasheets in {end_time - start_time}') 100 | 101 | def parser_init(parser): 102 | usbif_options = parser.add_argument_group('USB-IF adapter options') 103 | 104 | 105 | def adapter_main(args, driver, driver_options, dl_dir): 106 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 107 | if not args.skip_collect: 108 | collect_datasheets(args, dl_dir) 109 | 110 | if not args.skip_download: 111 | sheets = Datasheet.where('src', '!=', 'NULL').where('scraper_id', '=', sc_id).get() 112 | with tqdm( 113 | miniters = 1, total = len(sheets), 114 | ) as bar: 115 | for ds in sheets: 116 | bar.set_description(fixup_title(ds.title)) 117 | if download_resource(dl_dir, ds, args): 118 | bar.update(1) 119 | 120 | return 0 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trawler - Bulk download of datasheets 2 | 3 | Trawler is an application that facilitates bulk downloads of datasheets and other resources from vendor websites. 4 | 5 | It exists because vendors rarely offer bulk downloads of their content, or anything nice like an rsync mirror, so we have to resort to scraping. 6 | 7 | Trawler is built around Selenium, which lets it pretend to be a regular user and let AJAX and other bits of JavaScript run, so it can interact with the vendor website and collect the datasheets. 8 | 9 | 10 | ## Adapters 11 | 12 | Trawler has two types of adapters: source adapters and meta adapters. The source adapters are responsible for collecting and downloading datasheets, whereas the meta adapters interact with the cache data Trawler keeps. A minimal sketch of an adapter module is shown below. 
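Adapters are ordinary Python modules that Trawler discovers in the built-in `trawler/adapters/` package at startup. Based on the built-in adapters, each module exposes an `ADAPTER_NAME`, an `ADAPTER_DESC`, a `parser_init(parser)` hook for its own command line options, and an `adapter_main(...)` entry point; meta adapters additionally set `META_ADAPTER`, and a module can define `DONT_LOAD` to be skipped. The sketch below is illustrative only: the `example` vendor, its URL, and the single hard-coded datasheet are placeholders, not a real adapter shipped with Trawler.

```
# SPDX-License-Identifier: BSD-3-Clause
# Minimal source adapter sketch -- 'example' and EXAMPLE_ROOT are placeholders.
from datetime import datetime

from ..common import log
from ..net import download_resource
from ..db import Datasheet, Scraper

ADAPTER_NAME = 'example'
ADAPTER_DESC = 'Example datasheet adapter'

EXAMPLE_ROOT = 'https://vendor.example/datasheets'

def parser_init(parser):
	# Register any adapter-specific command line options here.
	parser.add_argument_group('Example adapter options')

def adapter_main(args, driver, driver_options, dl_dir):
	sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id

	if not args.skip_collect:
		log('Collecting datasheets...')
		# A real adapter would scrape the vendor site here; this records a single entry.
		try:
			ds = Datasheet \
				.where('title', '=', 'Example datasheet') \
				.where('scraper_id', '=', sc_id) \
				.first_or_fail()
		except:
			ds = Datasheet()
			ds.scraper_id = sc_id
			ds.found = datetime.now()
			ds.title = 'Example datasheet'
			ds.src = f'{EXAMPLE_ROOT}/example.pdf'
			ds.url = f'{EXAMPLE_ROOT}/example.pdf'
			ds.dl_location = dl_dir

		ds.last_seen = datetime.now()
		ds.save()

	if not args.skip_download:
		for ds in Datasheet.where('scraper_id', '=', sc_id).get():
			download_resource(dl_dir, ds, args)

	return 0
```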
13 | 14 | ### Source Adapters 15 | 16 | The following source adapters are included with Trawler: 17 | * `arm` - Download documentation from `https://developer.arm.com/documentation` 18 | * `xilinx` - Download the documentation from the Xilinx DocNav service 19 | * `usb-if` - Download the documentation from `https://www.usb.org/documents` 20 | * `renesas` - Download the documentation from `https://www.renesas.com/us/en/support/document-search` 21 | 22 | The following source adapters are planned: 23 | * `ti` - Download the documentation from Texas Instruments. 24 | * `st` - Download the documentation from ST. 25 | * `microchip` - Download the documentation from Microchip. 26 | * `micron` - Download the documentation from Micron. 27 | 28 | If the adapter you want is not in this list, feel free to open an issue or contribute it yourself! 29 | 30 | ### Meta Adapters 31 | 32 | The following meta adapters are implemented currently: 33 | * `zotero` - Integration and sync with a local [Zotero](https://www.zotero.org/) install 34 | 35 | The following meta adapters are planned: 36 | * `query` - Very trivial datasheet lookup by title / tag 37 | * `export` - Export cache information in various formats 38 | 39 | ## Usage 40 | 41 | In its simplest form, Trawler is used by invoking the adapter for the datasheet source you want, like so: 42 | ``` 43 | trawler arm 44 | ``` 45 | 46 | This causes Trawler to initialize everything it needs and then automatically start the entire acquisition process. This will take a long time, and exactly how long depends heavily on the adapter. 47 | 48 | Each adapter has its own settings and configuration in addition to the global settings; to see the settings an adapter supports, pass `--help` to it: 49 | ``` 50 | trawler arm --help 51 | ``` 52 | 53 | To list the adapters that Trawler knows about, pass `--help` to Trawler by itself and it will list all of the adapters it has: 54 | ``` 55 | trawler --help 56 | ``` 57 | 58 | ### Global Settings 59 | 60 | Trawler supports the following settings globally: 61 | * `--output / -o` - Specify the output directory for Trawler to use. 62 | * `--timeout / -t` - Specify the timeout duration in seconds for network operations. 63 | * `--retry / -r` - Specify the number of times to retry network operations. 64 | * `--delay / -d` - Specify the delay in seconds for network operations. 65 | * `--cache-database / -c` - Specify the location and name of the datasheet cache database Trawler uses. 66 | * `--skip-collect / -C` - Skip the datasheet collection stage for the adapter. 67 | * `--skip-extract / -E` - Skip the extraction stage for the adapter. 68 | * `--skip-download / -D` - Skip the download stage for the adapter. 69 | * `--user-agent / -A` - Specify the user-agent to use when downloading files. 70 | 71 | The following settings are used for the WebDriver, and therefore only affect the adapters / stages that use it (an example invocation follows this list): 72 | * `--profile-directory / -p` - Specify the WebDriver profile directory. 73 | * `--webdriver / -w` - Specify the WebDriver to use. 74 | * `--headless / -H` - Tell the WebDriver to run in headless mode. 75 | * `--headless-width / -X` - Specify the virtual width of the WebDriver instance. 76 | * `--headless-height / -Y` - Specify the virtual height of the WebDriver instance. 
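For example, the following runs the `arm` adapter with a custom output directory, a five second download delay, and a headless Chrome WebDriver sized to the 1920x1080 viewport mentioned in the notes further down (the output path here is just an illustration):
```
trawler --output ~/datasheets --delay 5 --webdriver Chrome --headless --headless-width 1920 --headless-height 1080 arm
```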
77 | 78 | ### ARM Adapter Settings 79 | 80 | The following settings are only applicable to the ARM adapter: 81 | * `--arm-document-type / -A` - Specify the types of documents to collect and download. 82 | 83 | ### Xilinx Adapter Settings 84 | 85 | The following settings are only applicable to the Xilinx adapter: 86 | * `--dont-group / -G` - Don't group Datasheets into categories and groups when downloading. 87 | * `--collect-web-only / -W` - Allow Trawler to collect the web-only content. 88 | 89 | ### Zotero Adapter Settings 90 | 91 | The following settings are only applicable to the Zotero meta adapter: 92 | * `--zotero-db-location` - Specify the location of the Zotero database if it's not the default. 93 | 94 | 95 | The Zotero has the following actions it can take: 96 | * `sync` - Sync the Trawler cache with the Zotero database. 97 | 98 | #### Zotero Sync Settings 99 | 100 | The Zotero sync action has the following settings: 101 | * `--backup` - Backup the Zotero database before performing the sync. 102 | * `--backup-dir` - Set the backup directory for the Zotero database. 103 | 104 | ## Installing 105 | 106 | With pip, all the needed dependencies for Trawler should be pulled in automatically 107 | 108 | To install the current development snapshot, simply run: 109 | ``` 110 | pip3 install --user 'git+https://github.com/bad-alloc-heavy-industries/Trawler.git#egg=Trawler' 111 | ``` 112 | Or to install a local development copy: 113 | ``` 114 | git clone https://github.com/bad-alloc-heavy-industries/Trawler.git 115 | cd Trawler 116 | pip3 install --user --editable '.' 117 | ``` 118 | 119 | **NOTE:** The adapters that need a WebDriver will only work if you have one installed for selenium to use! 120 | 121 | ## Important Notes 122 | 123 | * Some adapters won't work if the WebDriver viewport is smaller than 1920x1080, you can possibly fix this by running the WebDriver headless with the correct virtual size if the WebDriver supports it. 124 | 125 | ## License 126 | Trawler is licensed under the [BSD 3-Clause](https://spdx.org/licenses/BSD-3-Clause.html) license, the full text of which can be found in the [`LICENSE`](LICENSE) file. 127 | -------------------------------------------------------------------------------- /trawler/db.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | from orator import Model, orm 3 | from orator.migrations import Migrator, Migration, DatabaseMigrationRepository 4 | 5 | from . 
import config 6 | from .common import * 7 | 8 | # ==== Models ==== # 9 | 10 | class CacheMetadata(Model): 11 | __connection__ = 'trawler_cache' 12 | __fillable__ = ['name', 'value'] 13 | 14 | class DatasheetTag(Model): 15 | __fillable__ = ['scraper_id', 'name'] 16 | __connection__ = 'trawler_cache' 17 | 18 | @orm.belongs_to_many('tag_links', 'tag_id', 'datasheet_id') 19 | def datasheets(self): 20 | return Datasheet 21 | 22 | class Datasheet(Model): 23 | __fillable__ = [ 24 | 'scraper_id', 'title', 'url', 'filename', 25 | 'dl_location', 'version', 'found', 'last_seen', 26 | 'downloaded', 'src' 27 | ] 28 | __connection__ = 'trawler_cache' 29 | 30 | def has_tag(self, tag): 31 | return self.tags().where('tag_id', '=', tag.id).exists() 32 | 33 | def add_tag(self, tag): 34 | if not self.has_tag(tag): 35 | self.tags().attach(tag) 36 | 37 | def remove_tag(self, tag): 38 | if self.has_tag(tag): 39 | self.tags().detach(tag) 40 | 41 | @orm.belongs_to_many('tag_links', 'datasheet_id', 'tag_id') 42 | def tags(self): 43 | return DatasheetTag 44 | 45 | class Scraper(Model): 46 | __fillable__ = ['name', 'last_run'] 47 | __connection__ = 'trawler_cache' 48 | 49 | @orm.has_many 50 | def datasheets(self): 51 | return Datasheet 52 | 53 | # ==== Migration ==== # 54 | 55 | class CreateDatasheetTable(Migration): 56 | def up(self): 57 | with self.schema.create('datasheets') as table: 58 | table.increments('id').unique() 59 | table.integer('scraper_id').unsigned() 60 | table.foreign('scraper_id').references('id').on('scrapers').on_delete('cascade') 61 | table.string('title').nullable() 62 | table.string('url').nullable() 63 | table.string('src').nullable() 64 | table.string('filename').nullable() 65 | table.string('dl_location').nullable() 66 | table.string('version').nullable() 67 | table.boolean('downloaded').nullable() 68 | table.datetime('last_seen').nullable() 69 | table.datetime('found') 70 | table.timestamps() 71 | 72 | def update(self, from_version): 73 | with self.schema.table('datasheets') as table: 74 | pass 75 | 76 | def down(self): 77 | self.schema.drop('datasheets') 78 | 79 | class CreateScraperTable(Migration): 80 | def up(self): 81 | with self.schema.create('scrapers') as table: 82 | table.increments('id').unique() 83 | table.string('name') 84 | table.datetime('last_run').nullable() 85 | table.boolean('meta').nullable() 86 | table.timestamps() 87 | 88 | def update(self, from_version): 89 | with self.schema.table('scrapers') as table: 90 | # All the fields added in v0.2 91 | if from_version < 1: 92 | table.boolean('meta').nullable() 93 | 94 | def down(self): 95 | self.schema.drop('scrapers') 96 | 97 | class CreateDatasheetTagTable(Migration): 98 | def up(self): 99 | with self.schema.create('datasheet_tags') as table: 100 | table.increments('id').unique() 101 | table.integer('scraper_id').unsigned() 102 | table.foreign('scraper_id').references('id').on('scrapers').on_delete('cascade') 103 | table.string('name') 104 | table.timestamps() 105 | 106 | def update(self, from_version): 107 | with self.schema.table('datasheet_tags') as table: 108 | pass 109 | 110 | def down(self): 111 | self.schema.drop('datasheet_tags') 112 | 113 | class CreateTagLinkTable(Migration): 114 | def up(self): 115 | with self.schema.create('tag_links') as table: 116 | table.increments('id').unique() 117 | table.integer('tag_id').unsigned() 118 | table.foreign('tag_id').references('id').on('datasheet_tags').on_delete('cascade') 119 | table.integer('datasheet_id').unsigned() 120 | 
table.foreign('datasheet_id').references('id').on('datasheets').on_delete('cascade') 121 | table.timestamps() 122 | 123 | def update(self, from_version): 124 | with self.schema.table('tag_links') as table: 125 | pass 126 | 127 | def down(self): 128 | self.schema.drop('tag_links') 129 | 130 | class CreateCacheMetadataTable(Migration): 131 | def up(self): 132 | with self.schema.create('cache_metadata') as table: 133 | table.increments('id').unique() 134 | table.string('name') 135 | table.string('value').nullable() 136 | table.timestamps() 137 | 138 | def update(self, from_version): 139 | with self.schema.table('cache_metadata') as table: 140 | pass 141 | 142 | def down(self): 143 | self.schema.drop('cache_metadata') 144 | 145 | 146 | _MIGRATIONS = ( 147 | CreateDatasheetTable, 148 | CreateScraperTable, 149 | CreateDatasheetTagTable, 150 | CreateTagLinkTable, 151 | CreateCacheMetadataTable, 152 | ) 153 | 154 | def check_schema(dbm): 155 | # If we don't have the cache metadata table, we absolutely need to run migrations 156 | try: 157 | dbm.table('cache_metadata').exists() 158 | except: 159 | wrn('Trawler schema is massively out of date, updating') 160 | run_migration(dbm, CreateCacheMetadataTable) 161 | finally: 162 | try: 163 | sv = CacheMetadata.where('name', '=', 'schema_version').first_or_fail() 164 | except: 165 | sv = CacheMetadata() 166 | sv.name = 'schema_version' 167 | sv.value = 0 # We plan to run an upgrade anyway so 168 | sv.save() 169 | 170 | if int(sv.value) < config.TRAWLER_SCHEMA_VERSION: 171 | # The schema version is out of date, run the update 172 | run_update(dbm, int(sv.value)) 173 | 174 | # The update ran, save the new schema version 175 | sv.value = config.TRAWLER_SCHEMA_VERSION 176 | sv.save() 177 | 178 | 179 | 180 | def run_migration(dbm, m): 181 | dbm_repo = DatabaseMigrationRepository(dbm, 'migrations') 182 | 183 | mi = m() 184 | mi.set_connection(dbm_repo.get_connection()) 185 | 186 | if mi.transactional: 187 | with mi.db.transaction(): 188 | mi.up() 189 | else: 190 | mi.up() 191 | 192 | 193 | def run_migrations(dbm): 194 | dbm_repo = DatabaseMigrationRepository(dbm, 'migrations') 195 | 196 | for m in _MIGRATIONS: 197 | mi = m() 198 | mi.set_connection(dbm_repo.get_connection()) 199 | 200 | if mi.transactional: 201 | with mi.db.transaction(): 202 | mi.up() 203 | else: 204 | mi.up() 205 | 206 | def run_update(dbm, from_version): 207 | dbm_repo = DatabaseMigrationRepository(dbm, 'migrations') 208 | inf(f'Updating Trawler schema from v{from_version} to v{config.TRAWLER_SCHEMA_VERSION}') 209 | 210 | for m in _MIGRATIONS: 211 | mi = m() 212 | mi.set_connection(dbm_repo.get_connection()) 213 | 214 | if mi.transactional: 215 | with mi.db.transaction(): 216 | mi.update(from_version) 217 | else: 218 | mi.update(from_version) 219 | -------------------------------------------------------------------------------- /trawler/adapters/arm.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | arm.py 4 | --------- 5 | 6 | This script is designed to scrape all of the datasheets from https://developer.arm.com/documentation 7 | 8 | """ 9 | import sys 10 | import time 11 | import enum 12 | 13 | from enum import Enum, Flag 14 | from os import getcwd, path, mkdir 15 | from datetime import datetime, timedelta 16 | 17 | import requests 18 | from requests import utils 19 | 20 | from tqdm import tqdm 21 | from selenium import webdriver 22 | 23 | from ..common import * 24 | from ..net import download_resource 
25 | from ..db import Datasheet, DatasheetTag, Scraper 26 | 27 | @enum.unique 28 | class DocumentType(Enum): 29 | ReferenceManual = enum.auto() 30 | Architecture = enum.auto() 31 | Guide = enum.auto() 32 | ApplicationNote = enum.auto() 33 | KnowledgeBaseArticle = enum.auto() 34 | ReleaseNote = enum.auto() 35 | SoftwareErrata = enum.auto() 36 | 37 | def __str__(self) -> str: 38 | return self.name 39 | 40 | @staticmethod 41 | def from_string(s): 42 | try: 43 | return DocumentType[s] 44 | except KeyError: 45 | raise ValueError() 46 | 47 | def filter_name(self): 48 | if self.value == DocumentType.ReferenceManual.value: 49 | return 'Technical%20Reference%20Manual' 50 | if self.value == DocumentType.Architecture.value: 51 | return 'Architecture%20Document' 52 | if self.value == DocumentType.Guide.value: 53 | return 'Guide' 54 | if self.value == DocumentType.ApplicationNote.value: 55 | return 'Application%20Note' 56 | if self.value == DocumentType.KnowledgeBaseArticle.value: 57 | return 'Knowledge%20Base%20Article' 58 | if self.value == DocumentType.ReleaseNote.value: 59 | return 'Release%20Note' 60 | if self.value == DocumentType.SoftwareErrata.value: 61 | return 'Software%20Developer%20Errata%20Notice' 62 | 63 | 64 | ADAPTER_NAME = 'arm' 65 | ADAPTER_DESC = 'arm datasheet adapter' 66 | 67 | ARM_DOCS_ROOT_URL = 'https://developer.arm.com/documentation' 68 | 69 | 70 | def extract_datasheet(driver, ds): 71 | tlog(f' => Extracting datasheet {ds.id} from {ds.src}') 72 | driver.get(ds.src) 73 | time.sleep(3.5) 74 | try: 75 | driver.find_element_by_xpath('/html/body/div/div/div[2]/main/div/div[1]/div/div/div[1]/div/button').click() 76 | dl_loc = driver.find_element_by_xpath('/html/body/div/div/div[2]/main/div/div[1]/div/div/div[1]/div[2]/a') 77 | except Exception as e: 78 | if isinstance(e, KeyboardInterrupt): 79 | sys.quit() 80 | else: 81 | terr(f' => Error: Unable to extract datasheet with id {ds.id}') 82 | return False 83 | 84 | ds.url = dl_loc.get_attribute('href') 85 | ds.save() 86 | return True 87 | 88 | 89 | 90 | def collect_datasheets(driver, doc_types): 91 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 92 | 93 | dt_string = ','.join(map(lambda dt: (DocumentType.from_string(dt)).filter_name(), doc_types)) 94 | doc_url = f'sort=relevancy&f:@navigationhierarchiescontenttype=[{dt_string}]' 95 | has_next_page = True 96 | page_index = 0 97 | datasheets = 0 98 | start_time = datetime.now() 99 | 100 | log('Collecting datasheets... this might take a while') 101 | while has_next_page: 102 | inf(f' => On page index {(page_index//10)+1}, total so far {datasheets}') 103 | # We need to wait because ajax and frames and just bad. 
104 | driver.get(f'{ARM_DOCS_ROOT_URL}/#first={page_index}&{doc_url}') 105 | time.sleep(3) 106 | 107 | doc_list = driver.find_elements_by_xpath('//*[@id="search"]/div[2]/div[2]/div[10]/div/*') 108 | 109 | for doc in doc_list: 110 | card_link = doc.find_element_by_xpath('.//div[1]/div/div[2]/div[1]/div/div/a') 111 | try: 112 | tags_row = doc.find_element_by_css_selector('div.documentTagsContainer') 113 | except: 114 | tags_row = None 115 | 116 | title = card_link.get_attribute('title') 117 | url = card_link.get_attribute('href') 118 | 119 | try: 120 | ds = Datasheet \ 121 | .where('title', '=', title) \ 122 | .where('scraper_id', '=', sc_id) \ 123 | .first_or_fail() 124 | 125 | except: 126 | ds = Datasheet() 127 | ds.scraper_id = sc_id 128 | ds.found = datetime.now() 129 | ds.src = url 130 | ds.title = title 131 | 132 | ds.last_seen = datetime.now() 133 | ds.save() 134 | 135 | if tags_row is not None: 136 | tags = tags_row.find_elements_by_xpath('.//span') 137 | for tag in tags: 138 | for t in tag.text.split(' '): 139 | if t != '': 140 | try: 141 | te = DatasheetTag \ 142 | .where('scraper_id', '=', sc_id) \ 143 | .where('name', '=', t) \ 144 | .first_or_fail() 145 | except: 146 | te = DatasheetTag() 147 | te.scraper_id = sc_id 148 | te.name = t 149 | te.save() 150 | 151 | ds.add_tag(te) 152 | 153 | ds.save() 154 | 155 | datasheets += 1 156 | 157 | try: 158 | driver.find_element_by_xpath('//*[@id="search"]/div[2]/div[2]/div[11]/ul/li[6]') 159 | page_index += 10 160 | except: 161 | has_next_page = False 162 | 163 | end_time = datetime.now() 164 | 165 | log(f'Found {datasheets} datasheets in {end_time - start_time}') 166 | 167 | 168 | def parser_init(parser): 169 | arm_options = parser.add_argument_group('arm adapter options') 170 | 171 | arm_options.add_argument( 172 | '--arm-document-type', '-A', 173 | dest = 'arm_doc_type', 174 | type = DocumentType.from_string, 175 | choices = list(DocumentType), 176 | default = [ 'ReferenceManual', 'Architecture' ], 177 | help = 'ARM Documentation types to download' 178 | ) 179 | 180 | def adapter_main(args, driver, driver_options, dl_dir): 181 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 182 | if not args.skip_collect or not args.skip_extract: 183 | with driver(options = driver_options) as wd: 184 | if not args.skip_collect: 185 | collect_datasheets(wd, args.arm_doc_type) 186 | 187 | if not args.skip_extract: 188 | sheets = Datasheet.where('src', '!=', 'NULL').where('scraper_id', '=', sc_id).get() 189 | with tqdm( 190 | miniters = 1, total = len(sheets), 191 | ) as bar: 192 | for ds in sheets: 193 | bar.set_description(fixup_title(ds.title)) 194 | if extract_datasheet(wd, ds): 195 | bar.update(1) 196 | 197 | if not args.skip_download: 198 | sheets = Datasheet.where('url', '!=', 'NULL').where('scraper_id', '=', sc_id).get() 199 | with tqdm( 200 | miniters = 1, total = len(sheets), 201 | ) as bar: 202 | for ds in sheets: 203 | bar.set_description(fixup_title(ds.title)) 204 | if download_resource(dl_dir, ds, args): 205 | bar.update(1) 206 | 207 | 208 | return 0 209 | -------------------------------------------------------------------------------- /trawler/adapters/xilinx.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | xilinx.py 4 | --------- 5 | 6 | This script is designed to scrape all of the datasheets from xilinx.com 7 | 8 | """ 9 | import sys 10 | import time 11 | import enum 12 | 13 | from enum import Enum, Flag 14 | from os import 
getcwd, path, mkdir 15 | from datetime import datetime, timedelta 16 | 17 | import requests 18 | from requests import utils 19 | 20 | from tqdm import tqdm 21 | from selenium import webdriver 22 | 23 | from ..common import * 24 | from ..net import download_resource, get_content 25 | from ..db import Datasheet, DatasheetTag, Scraper 26 | 27 | from bs4 import BeautifulSoup 28 | 29 | @enum.unique 30 | class DocumentSource(Enum): 31 | DocNav = enum.auto() 32 | Web = enum.auto() 33 | 34 | def __str__(self) -> str: 35 | return self.name 36 | 37 | @staticmethod 38 | def from_string(s): 39 | try: 40 | return DocumentSource[s] 41 | except KeyError: 42 | raise ValueError() 43 | 44 | ADAPTER_NAME = 'xilinx' 45 | ADAPTER_DESC = 'Xilinx datasheet adapter' 46 | 47 | XILINX_DOCNAV_ROOT = 'https://xilinx.com/support/documentation/navigator' 48 | XILINX_HUBS_INDEX = f'{XILINX_DOCNAV_ROOT}/xhubs.xml' 49 | XILINX_DOCS_INDEX = f'{XILINX_DOCNAV_ROOT}/xdocs.xml' 50 | 51 | def extract_datasheet(driver, ds): 52 | tlog(f' => Extracting datasheet {ds.id} from {ds.src}') 53 | 54 | def collect_datasheets(driver, doc_types): 55 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 56 | 57 | 58 | def docnav_collect_docs(args): 59 | log(f'Downloading doc index from {XILINX_DOCS_INDEX}') 60 | catalogs = [] 61 | content = get_content(XILINX_DOCS_INDEX, args) 62 | soup = BeautifulSoup(content, 'lxml') 63 | for catalog in soup.find_all('catalog'): 64 | # ニャ! 65 | cat = { 66 | 'catalog': catalog['label'], 67 | 'product': catalog['productName'] if 'productName' in catalog else catalog['productname'], 68 | 'collection': catalog['collection'], 69 | 'groups': [] 70 | } 71 | 72 | inf(f' => Found catalog {cat["catalog"]}') 73 | for group in catalog.find_all('group'): 74 | grp = { 75 | 'title': group['label'], 76 | 'docs': [] 77 | } 78 | 79 | inf(f' => Found group {grp["title"]}') 80 | for doc in group.find_all('document'): 81 | title = doc.find('title') 82 | loc = doc.find('webLocation') 83 | doc_id = doc.find('docID') 84 | doc_type = doc.find('docType') 85 | desc = doc.find('tooltip') 86 | tags = doc.find('functionTags') 87 | 88 | grp['docs'].append({ 89 | 'title': title.get_text() if title is not None else '', 90 | 'location': loc.get_text() if loc is not None else '', 91 | 'doc_id': doc_id.get_text() if doc_id is not None else '', 92 | 'type': doc_type.get_text() if doc_type is not None else '', 93 | 'desc': desc.get_text() if desc is not None else '', 94 | 'tags': tags.get_text().split(',') if tags is not None else [], 95 | }) 96 | 97 | cat['groups'].append(grp) 98 | catalogs.append(cat) 99 | 100 | return catalogs 101 | 102 | def docnav_populate(args, docs, dl_dir): 103 | inf('Populating datasheet database') 104 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 105 | for cat in docs: 106 | inf(f' => Populating from catalog {cat["catalog"]}') 107 | try: 108 | cat_tag = DatasheetTag \ 109 | .where('scraper_id', '=', sc_id) \ 110 | .where('name', '=', cat['catalog']) \ 111 | .first_or_fail() 112 | except: 113 | cat_tag = DatasheetTag() 114 | cat_tag.scraper_id = sc_id 115 | cat_tag.name = cat['catalog'] 116 | cat_tag.save() 117 | 118 | for grp in cat['groups']: 119 | inf(f' => Populating {len(grp["docs"])} docs from group {cat["catalog"]}/{grp["title"]}') 120 | try: 121 | grp_tag = DatasheetTag \ 122 | .where('scraper_id', '=', sc_id) \ 123 | .where('name', '=', grp['title']) \ 124 | .first_or_fail() 125 | except: 126 | grp_tag = DatasheetTag() 127 | grp_tag.scraper_id = sc_id 128 | 
grp_tag.name = grp['title'] 129 | grp_tag.save() 130 | 131 | for doc in grp['docs']: 132 | try: 133 | ds = Datasheet \ 134 | .where('title', '=', doc['title']) \ 135 | .where('scraper_id', '=', sc_id) \ 136 | .first_or_fail() 137 | 138 | except: 139 | ds = Datasheet() 140 | ds.scraper_id = sc_id 141 | ds.found = datetime.now() 142 | ds.src = doc['location'] 143 | ds.url = doc['location'] 144 | ds.title = doc['title'] 145 | 146 | if not args.xilinx_doc_group: 147 | c_dir = path.join(dl_dir, cat['catalog'].replace('/', '_')) 148 | g_dir = path.join(c_dir, grp['title'].replace('/', '_')) 149 | if not path.exists(c_dir): 150 | log(f' => Catalog {cat["catalog"]} does not exist, creating') 151 | mkdir(c_dir) 152 | 153 | if not path.exists(g_dir): 154 | log(f' => Group {cat["catalog"]}/{grp["title"]} does not exist, creating') 155 | mkdir(g_dir) 156 | 157 | ds.dl_location = g_dir 158 | ds.save() 159 | 160 | ds.add_tag(cat_tag) 161 | ds.add_tag(grp_tag) 162 | for tag in doc['tags']: 163 | if tag != '': 164 | try: 165 | ds_tag = DatasheetTag \ 166 | .where('scraper_id', '=', sc_id) \ 167 | .where('name', '=', tag) \ 168 | .first_or_fail() 169 | except: 170 | ds_tag = DatasheetTag() 171 | ds_tag.scraper_id = sc_id 172 | ds_tag.name = tag 173 | ds_tag.save() 174 | 175 | ds.add_tag(ds_tag) 176 | ds.save() 177 | 178 | def docnav_runner(args, dl_dir): 179 | inf('Downloading datasheets from DocNav') 180 | sc = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail() 181 | 182 | if not args.skip_collect: 183 | docs = docnav_collect_docs(args) 184 | if docs is None: 185 | err('Unable to collect Xilinx hubs') 186 | return 1 187 | 188 | # Populate the datasheet database 189 | docnav_populate(args, docs, dl_dir) 190 | 191 | # Now we have all the datasheets, we can download them 192 | if not args.skip_download: 193 | sheets = Datasheet.where('url', '!=', 'NULL').where('scraper_id', '=', sc.id).get() 194 | with tqdm( 195 | miniters = 1, total = len(sheets), 196 | ) as bar: 197 | for ds in sheets: 198 | bar.set_description(fixup_title(ds.title)) 199 | if ds.url[-4:] == 'html' and args.xilinx_get_web_only: 200 | if download_resource(dl_dir, ds, args): 201 | bar.update(1) 202 | else: 203 | if download_resource(dl_dir, ds, args): 204 | bar.update(1) 205 | 206 | sc.last_run = datetime.now() 207 | sc.save() 208 | 209 | return 0 210 | 211 | def web_runner(args, driver, dl_dir): 212 | inf('Downloading datasheets from web') 213 | sc_id = Scraper.where('name', '=', ADAPTER_NAME).first_or_fail().id 214 | 215 | if not args.skip_collect: 216 | collect_datasheets(driver, args.arm_document_type) 217 | 218 | if not args.skip_extract: 219 | for ds in tqdm(Datasheet.all()): 220 | extract_datasheet(driver, ds) 221 | 222 | if not args.skip_download: 223 | for ds in tqdm(Datasheet.where('url', '!=', 'NULL').get()): 224 | download_datasheet(dl_dir, ds) 225 | 226 | return 0 227 | 228 | def parser_init(parser): 229 | xilinx_options = parser.add_argument_group('Xilinx adapter options') 230 | 231 | xilinx_options.add_argument( 232 | '--document-source', '-d', 233 | dest = 'xilinx_doc_source', 234 | type = DocumentSource.from_string, 235 | choices = list(DocumentSource), 236 | default = 'DocNav', 237 | help = 'Documentation Source' 238 | ) 239 | 240 | xilinx_options.add_argument( 241 | '--dont-group', '-G', 242 | dest = 'xilinx_doc_group', 243 | default = False, 244 | action = 'store_true', 245 | help = 'Don\'t group the datasheets when using DocNav as the document source' 246 | ) 247 | 248 | xilinx_options.add_argument( 249 | 
'--collect-web-only', '-W', 250 | dest = 'xilinx_get_web_only', 251 | default = False, 252 | action = 'store_true', 253 | help = 'Also archive the web-only content and monolithic HTML pages', 254 | ) 255 | 256 | def adapter_main(args, driver, driver_options, dl_dir): 257 | if args.xilinx_doc_source == DocumentSource.DocNav: 258 | return docnav_runner(args, dl_dir) 259 | elif args.xilinx_doc_source == DocumentSource.Web: 260 | with driver(options = driver_options) as wd: 261 | return web_runner(args, wd, dl_dir) 262 | else: 263 | err(f'Unknown Xilinx documentation source {args.xilinx_doc_source}!') 264 | return 1 265 | -------------------------------------------------------------------------------- /trawler/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | 3 | __all__ = ('main',) 4 | 5 | def _init_directories(): 6 | from . import config 7 | 8 | from os import path, mkdir 9 | 10 | dirs = ( 11 | # Core Directories 12 | config.TRAWLER_DATA, 13 | config.TRAWLER_CACHE, 14 | # Sub directories 15 | config.TRAWLER_USER_ADAPTERS, 16 | ) 17 | 18 | for d in dirs: 19 | if not path.exists(d): 20 | mkdir(d) 21 | 22 | def _populate_webdriver_opts(args): 23 | from .config import WebdriverBackend 24 | 25 | from selenium.webdriver import firefox 26 | from selenium.webdriver import chrome 27 | 28 | opts = None 29 | 30 | if args.webdriver == WebdriverBackend.Chrome: 31 | opts = chrome.options.Options() 32 | 33 | 34 | elif args.webdriver == WebdriverBackend.FireFox: 35 | opts = firefox.options.Options() 36 | 37 | return opts 38 | 39 | def _collect_adapters(): 40 | import pkgutil 41 | 42 | from . import db 43 | from . import adapters 44 | 45 | adpts = [] 46 | # Load the built-in internal adapters 47 | for _, name, is_pkg in pkgutil.iter_modules(path = getattr(adapters, '__path__')): 48 | if not is_pkg: 49 | __import__(f'{getattr(adapters, "__name__")}.{name}') 50 | if not hasattr(getattr(adapters, name), 'DONT_LOAD'): 51 | adpts.append({ 52 | 'name': getattr(adapters, name).ADAPTER_NAME, 53 | 'description': getattr(adapters, name).ADAPTER_DESC, 54 | 'parser_init': getattr(adapters, name).parser_init, 55 | 'main': getattr(adapters, name).adapter_main, 56 | 'is_meta': hasattr(getattr(adapters, name), 'META_ADAPTER'), 57 | }) 58 | # Load the adapters from the share 59 | # TODO: this 60 | 61 | return adpts 62 | 63 | 64 | def main(): 65 | from . import config 66 | from . 
import db 67 | from .common import log, err, wrn, inf, dbg 68 | 69 | import os 70 | from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter 71 | 72 | from orator import DatabaseManager, Model 73 | from selenium import webdriver 74 | 75 | _init_directories() 76 | 77 | ADAPTERS = _collect_adapters() 78 | 79 | parser = ArgumentParser(formatter_class = ArgumentDefaultsHelpFormatter, description = 'Trawler datasheet scraper') 80 | 81 | scraper_options = parser.add_argument_group('Global scraper options') 82 | 83 | scraper_options.add_argument( 84 | '--output', '-o', 85 | type = str, 86 | default = config.DEFAULT_OUTPUT_DIR, 87 | help = 'Datasheet download root' 88 | ) 89 | 90 | scraper_options.add_argument( 91 | '--timeout', '-t', 92 | type = int, 93 | default = config.DEFAULT_TIMEOUT, 94 | help = 'Entry timeout in seconds' 95 | ) 96 | 97 | scraper_options.add_argument( 98 | '--retry', '-r', 99 | type = int, 100 | default = config.DEFAULT_RETRY_COUNT, 101 | help = 'Download retry count' 102 | ) 103 | 104 | scraper_options.add_argument( 105 | '--delay', '-d', 106 | type = int, 107 | default = config.DEFAULT_DOWNLOAD_DELAY, 108 | help = 'Download delay in seconds' 109 | ) 110 | 111 | scraper_options.add_argument( 112 | '--cache-database', '-c', 113 | type = str, 114 | default = config.DEFAULT_DATABASE, 115 | help = 'Cache database' 116 | ) 117 | 118 | scraper_options.add_argument( 119 | '--skip-collect', '-C', 120 | default = False, 121 | action = 'store_true', 122 | help = 'Skip the datasheet collection stage' 123 | ) 124 | 125 | scraper_options.add_argument( 126 | '--skip-extract', '-E', 127 | default = False, 128 | action = 'store_true', 129 | help = 'Skip the datasheet extraction stage' 130 | ) 131 | 132 | scraper_options.add_argument( 133 | '--skip-download', '-D', 134 | default = False, 135 | action = 'store_true', 136 | help = 'Skip the datasheet download stage' 137 | ) 138 | 139 | scraper_options.add_argument( 140 | '--skip-archives', 141 | default = False, 142 | action = 'store_true', 143 | help = 'Skip downloading archives' 144 | ) 145 | 146 | scraper_options.add_argument( 147 | '--extract-archives', 148 | default = False, 149 | action = 'store_true', 150 | help = 'Extract downloaded archives and attempt to index them' 151 | ) 152 | 153 | scraper_options.add_argument( 154 | '--skip-executables', 155 | default = False, 156 | action = 'store_true', 157 | help = 'Skip downloading executables' 158 | ) 159 | 160 | scraper_options.add_argument( 161 | '--user-agent', '-A', 162 | type = str, 163 | default = config.DEFAULT_USER_AGENT, 164 | help = 'Specify the user-agent to use' 165 | ) 166 | 167 | wd_options = parser.add_argument_group('Selenium WebDriver Settings') 168 | 169 | wd_options.add_argument( 170 | '--profile-directory', '-p', 171 | type = str, 172 | default = config.DEFAULT_PROFILE_DIRECTORY, 173 | help = 'Selenium WebDriver profile directory' 174 | ) 175 | 176 | wd_options.add_argument( 177 | '--webdriver', '-w', 178 | type = config.WebdriverBackend.from_string, 179 | choices = list(config.WebdriverBackend), 180 | default = config.DEFAULT_WEBDRIVER, 181 | help = 'Selenium WebDriver to use' 182 | ) 183 | 184 | wd_options.add_argument( 185 | '--headless', '-H', 186 | default = config.DEFAULT_WD_HEADLESS, 187 | action = 'store_true', 188 | help = 'Run the Selenium WebDriver headlessly' 189 | ) 190 | 191 | wd_options.add_argument( 192 | '--headless-height', '-Y', 193 | type = str, 194 | default = config.DEFAULT_WD_HEADLESS_RES[1], 195 | help = 'Specify the height of the 
headless window' 196 | ) 197 | 198 | wd_options.add_argument( 199 | '--headless-width', '-X', 200 | type = str, 201 | default = config.DEFAULT_WD_HEADLESS_RES[0], 202 | help = 'Specify the width of the headless window' 203 | ) 204 | 205 | # Maybe one day 206 | # wd_options.add_argument( 207 | # '--proxy', '-P', 208 | # type = str, 209 | # default = None, 210 | # help = 'Proxy to use for the Selenium WebDriver', 211 | # ) 212 | 213 | adapter_parser = parser.add_subparsers( 214 | dest = 'adapter', 215 | required = True 216 | ) 217 | 218 | # Add the adapter settings 219 | for adpt in ADAPTERS: 220 | ap = adapter_parser.add_parser( 221 | adpt['name'], 222 | help = adpt['description'] 223 | ) 224 | adpt['parser_init'](ap) 225 | 226 | # Actually parse the arguments 227 | args = parser.parse_args() 228 | 229 | # Initialize the download directory if not done so 230 | if not os.path.exists(args.output): 231 | wrn(f'Output directory {args.output} does not exist, creating') 232 | os.mkdir(args.output) 233 | 234 | # Initialize the Database 235 | dbc = config.DATABASE 236 | dbc['trawler_cache']['database'] = args.cache_database 237 | 238 | if args.adapter == 'zotero': 239 | dbc['zotero']['database'] = args.zotero_db_loc 240 | 241 | dbm = DatabaseManager(config.DATABASE) 242 | Model.set_connection_resolver(dbm) 243 | 244 | if not os.path.exists(args.cache_database): 245 | wrn('Cache database does not exist, creating') 246 | db.run_migrations(dbm) 247 | else: 248 | # Check the DB schema and update it if need be 249 | db.check_schema(dbm) 250 | 251 | inf(f'Cache database located at {args.cache_database}') 252 | 253 | # Initialize the datasheet directory 254 | if not os.path.exists(args.output): 255 | log(f'Datasheet download directory {args.output} does not exist, creating') 256 | os.mkdir(args.output) 257 | 258 | # Initialize the profile directory 259 | if not os.path.exists(args.profile_directory): 260 | log(f'WebDriver profile \'{args.profile_directory}\' does not exist, creating') 261 | os.mkdir(args.profile_directory) 262 | 263 | # WebDriver Initialization 264 | inf(f'Using the {args.webdriver} WebDriver') 265 | if args.webdriver == config.WebdriverBackend.Chrome: 266 | wd_opts = webdriver.chrome.options.Options() 267 | wd = webdriver.Chrome 268 | 269 | wd_opts.add_argument(f'user-data-dir={args.profile_directory}') 270 | 271 | if args.headless: 272 | wd_opts.add_argument('--headless') 273 | wd_opts.add_argument(f'--window-size={args.headless_width},{args.headless_height}') 274 | 275 | 276 | elif args.webdriver == config.WebdriverBackend.FireFox: 277 | wd = webdriver.Firefox 278 | wd_opts = webdriver.firefox.options.Options() 279 | 280 | wd_profile = webdriver.firefox.firefox_profile.FirefoxProfile(args.profile_directory) 281 | wd_opts.profile = wd_profile 282 | 283 | if args.headless: 284 | wd_opts.headless = True 285 | 286 | else: 287 | err('Unknown WebDriver, what?') 288 | return 1 289 | 290 | # Ensure the database is properly populated w/ known adapters 291 | for adapter in ADAPTERS: 292 | try: 293 | db.Scraper.where('name', '=', adapter['name']).first_or_fail() 294 | except: 295 | s = db.Scraper() 296 | s.name = adapter['name'] 297 | if adapter['is_meta']: 298 | s.meta = True 299 | s.save() 300 | 301 | # Get the adapter we need to run 302 | if args.adapter not in map(lambda a: a['name'], ADAPTERS): 303 | err(f'Unknown adapter {args.adapter}') 304 | err(f'Known adapters: {", ".join(map(lambda a: a["name"], ADAPTERS))}') 305 | return 1 306 | else: 307 | adpt = list(filter(lambda a: 
a['name'] == args.adapter, ADAPTERS))[0] 308 | 309 | # Initialize the adapter download directory 310 | dl_dir = os.path.join(args.output, adpt['name']) 311 | if not os.path.exists(dl_dir) and not adpt['is_meta']: 312 | wrn(f'Adapter datasheet directory {dl_dir} does not exist, creating...') 313 | os.mkdir(dl_dir) 314 | 315 | # Actually run the adapter 316 | if adpt['is_meta']: 317 | return adpt['main'](args, dl_dir) 318 | else: 319 | return adpt['main'](args, wd, wd_opts, dl_dir) 320 | -------------------------------------------------------------------------------- /trawler/adapters/zotero.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: BSD-3-Clause 2 | """ 3 | zotero.py 4 | --------- 5 | 6 | This is the zotero meta adapter 7 | 8 | """ 9 | import sys 10 | import os 11 | 12 | from shutil import copyfile 13 | from tempfile import gettempdir 14 | from datetime import datetime 15 | 16 | from orator import Model, orm 17 | 18 | from tqdm import tqdm 19 | 20 | from .. import config 21 | from ..common import * 22 | from ..net import download_resource 23 | from ..db import Datasheet, DatasheetTag, Scraper 24 | 25 | 26 | META_ADAPTER = 0 27 | 28 | ADAPTER_NAME = 'zotero' 29 | ADAPTER_DESC = 'Trawler zotero meta adapter' 30 | 31 | ZOTERO_BACKUP_DIR = gettempdir() 32 | ZOTERO_TRAWLER_ROOT_COLLECTION = 'Trawler' 33 | 34 | def gen_key(key_len = 8): 35 | from random import choice 36 | from string import ascii_uppercase, digits 37 | return ''.join(choice(ascii_uppercase + digits) for i in range(key_len)) 38 | 39 | # ==== Zotero DB Models ==== # 40 | 41 | class ZCollectionItem(Model): 42 | __connection__ = 'zotero' 43 | __timestamps__ = False 44 | __table__ = 'collectionItems' 45 | __primary_key__ = 'collectionID' 46 | 47 | 48 | class ZCollection(Model): 49 | __connection__ = 'zotero' 50 | __timestamps__ = False 51 | __table__ = 'collections' 52 | __primary_key__ = 'collectionID' 53 | 54 | @orm.belongs_to_many('collectionItems', 'collectionID', 'itemID') 55 | def items(self): 56 | return ZItem 57 | 58 | 59 | class ZItemType(Model): 60 | __connection__ = 'zotero' 61 | __timestamps__ = False 62 | __table__ = 'itemTypes' 63 | __primary_key__ = 'itemTypeID' 64 | 65 | 66 | @orm.belongs_to_many('items', 'itemTypeID', 'itemID') 67 | def items(self): 68 | return ZItem 69 | 70 | class ZTag(Model): 71 | __connection__ = 'zotero' 72 | __timestamps__ = False 73 | __table__ = 'tags' 74 | __primary_key__ = 'tagID' 75 | 76 | 77 | @orm.belongs_to_many('itemTags', 'tagID', 'itemID') 78 | def items(self): 79 | return ZItem 80 | 81 | class ZItem(Model): 82 | __connection__ = 'zotero' 83 | __timestamps__ = False 84 | __table__ = 'items' 85 | __primary_key__ = 'itemID' 86 | 87 | @orm.belongs_to_many('itemTags', 'itemID', 'tagID') 88 | def tags(self): 89 | return ZTag 90 | 91 | def has_tag(self, tag): 92 | return self.tags().where('tagID', '=', tag.tagID).exists() 93 | 94 | def add_tag(self, tag): 95 | if not self.has_tag(tag): 96 | self.tags().attach(tag) 97 | 98 | def remove_tag(self, tag): 99 | if self.has_tag(tag): 100 | self.tags().detach(tag) 101 | 102 | @orm.belongs_to_many('itemData', 'itemID', 'valueID') 103 | def item_data(self): 104 | return ZItemDataValue 105 | 106 | def has_item_data(self): 107 | return self.item_data().where('itemID', '=', self.itemID) 108 | 109 | def add_item_data(self, item_data): 110 | self.item_data().attach(item_data) 111 | 112 | def remove_item_data(self, item_data): 113 | self.item_data().detach(item_data) 114 | 115 | class 
ZItemTag(Model): 116 | __connection__ = 'zotero' 117 | __timestamps__ = False 118 | __table__ = 'itemTags' 119 | __primary_key__ = 'itemID' 120 | 121 | class ZFieldFormat(Model): 122 | __connection__ = 'zotero' 123 | __timestamps__ = False 124 | __table__ = 'fieldFormats' 125 | __primary_key__ = 'fieldFormatID' 126 | 127 | class ZField(Model): 128 | __connection__ = 'zotero' 129 | __timestamps__ = False 130 | __table__ = 'fields' 131 | __primary_key__ = 'fieldID' 132 | 133 | class ZItemAttachment(Model): 134 | __connection__ = 'zotero' 135 | __timestamps__ = False 136 | __table__ = 'itemAttachments' 137 | __primary_key__ = 'itemID' 138 | 139 | @orm.belongs_to('itemID') 140 | def item(self): 141 | return ZItem 142 | 143 | class ZItemData(Model): 144 | __connection__ = 'zotero' 145 | __timestamps__ = False 146 | __table__ = 'itemData' 147 | __primary_key__ = 'itemID' 148 | 149 | class ZItemDataValue(Model): 150 | __connection__ = 'zotero' 151 | __timestamps__ = False 152 | __table__ = 'itemDataValues' 153 | __primary_key__ = 'valueID' 154 | 155 | @orm.belongs_to('valueID', 'valueID') 156 | def item_data(self): 157 | return ZItem 158 | 159 | def has_data(self): 160 | return self.item_data().where('valueID', '=', self.valueID).exists() 161 | 162 | 163 | # ==== Adapter Methods ==== # 164 | 165 | def sync_scraper(tcol, sc): 166 | log(f' => Syncing datasheets from {sc.name}') 167 | date_added = datetime.now() 168 | # Fetch the scraper's collection if it already exists, otherwise create it 169 | try: 170 | scol = ZCollection \ 171 | .where('collectionName', '=', sc.name) \ 172 | .where('parentCollectionID', '=', tcol.collectionID) \ 173 | .first_or_fail() 174 | except Exception: 175 | scol = ZCollection() 176 | scol.collectionName = sc.name 177 | scol.parentCollectionID = tcol.collectionID 178 | scol.libraryID = tcol.libraryID 179 | scol.key = gen_key() 180 | scol.save() 181 | 182 | for ds in tqdm(Datasheet.where('scraper_id', '=', sc.id).get(), desc = f'Zotero sync: {sc.name}'): 183 | tlog(f' ==> Adding datasheet {ds.title}') 184 | 185 | try: 186 | name = ZItemDataValue.where('value', '=', ds.filename).first_or_fail() 187 | except Exception: 188 | name = ZItemDataValue() 189 | name.value = ds.filename 190 | name.save() 191 | 192 | try: 193 | title = ZItemDataValue.where('value', '=', ds.title).first_or_fail() 194 | except Exception: 195 | title = ZItemDataValue() 196 | title.value = ds.title 197 | title.save() 198 | 199 | try: 200 | url = ZItemDataValue.where('value', '=', ds.url).first_or_fail() 201 | except Exception: 202 | url = ZItemDataValue() 203 | url.value = ds.url 204 | url.save() 205 | 206 | try: 207 | da = ZItemDataValue.where('value', '=', date_added).first_or_fail() 208 | except Exception: 209 | da = ZItemDataValue() 210 | da.value = date_added 211 | da.save() 212 | 213 | # Check whether the data values we just created are already linked to an item 214 | if not ZItemData.where('valueID', '=', name.valueID).exists(): 215 | # If not, then we can assume that the item doesn't exist, so we create one 216 | item = ZItem() 217 | item.itemTypeID = 12 218 | item.libraryID = scol.libraryID 219 | item.key = gen_key() 220 | item.save() 221 | 222 | i_title = ZItemData() 223 | i_title.itemID = item.itemID 224 | i_title.valueID = title.valueID 225 | i_title.fieldID = 1 226 | i_title.save() 227 | 228 | i_url = ZItemData() 229 | i_url.itemID = item.itemID 230 | i_url.valueID = url.valueID 231 | i_url.fieldID = 13 232 | i_url.save() 233 | 234 | i_da = ZItemData() 235 | i_da.itemID = item.itemID 236 | i_da.valueID = da.valueID 237 | i_da.fieldID = 6 238 |
i_da.save() 239 | 240 | # Attach the datasheet's tags to the item (the tags themselves are created in sync_database) 241 | for tag in ds.tags().get(): 242 | if tag.name != '': 243 | zt = ZTag.where('name', '=', tag.name).first_or_fail() 244 | 245 | zit = ZItemTag() 246 | zit.itemID = item.itemID 247 | zit.tagID = zt.tagID 248 | zit.type = 0 249 | zit.save() 250 | 251 | # Create the attachment item and then the attachment link 252 | aitm = ZItem() 253 | aitm.itemTypeID = 2 254 | aitm.libraryID = scol.libraryID 255 | aitm.key = gen_key() 256 | aitm.save() 257 | 258 | att_name = ZItemData() 259 | att_name.itemID = aitm.itemID 260 | att_name.valueID = name.valueID 261 | att_name.fieldID = 1 262 | att_name.save() 263 | 264 | att = ZItemAttachment() 265 | att.itemID = aitm.itemID 266 | att.parentItemID = item.itemID 267 | att.linkMode = 2 268 | att.path = ds.dl_location 269 | att.contentType = 'application/pdf' 270 | att.save() 271 | 272 | # Finally link it to the collection 273 | col = ZCollectionItem() 274 | col.collectionID = scol.collectionID 275 | col.itemID = item.itemID 276 | col.save() 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | def sync_database(args, dl_dir): 285 | inf('Syncing Zotero with Trawler cache') 286 | 287 | if args.zotero_sync_backup: 288 | backup_file = os.path.join(args.zotero_sync_backup_dir, 'zotero_backup.sqlite') 289 | 290 | log(f' => Backing up Zotero db to {backup_file}') 291 | copyfile(args.zotero_db_loc, backup_file) 292 | 293 | # Get the Trawler root Zotero collection, or create it if it does not exist 294 | try: 295 | tcol = ZCollection.where('collectionName', '=', ZOTERO_TRAWLER_ROOT_COLLECTION).first_or_fail() 296 | except Exception: 297 | tcol = ZCollection() 298 | tcol.collectionName = ZOTERO_TRAWLER_ROOT_COLLECTION 299 | tcol.key = gen_key() 300 | tcol.libraryID = 1 301 | tcol.save() 302 | 303 | # Sync the tags 304 | log(' => Syncing Trawler tags to Zotero') 305 | for tag in tqdm(DatasheetTag.all(), desc = 'Zotero sync: tags'): 306 | if tag.name != '': 307 | try: 308 | zt = ZTag.where('name', '=', tag.name).first_or_fail() 309 | except Exception: 310 | zt = ZTag() 311 | zt.name = tag.name 312 | zt.save() 313 | 314 | for sc in Scraper.all(): 315 | if not sc.meta: 316 | sync_scraper(tcol, sc) 317 | 318 | ZOTERO_ACTIONS = { 319 | 'sync': sync_database 320 | } 321 | 322 | def parser_init(parser): 323 | zotero_options = parser.add_argument_group('zotero meta adapter options') 324 | 325 | zotero_options.add_argument( 326 | '--zotero-db-location', 327 | dest = 'zotero_db_loc', 328 | type = str, 329 | default = config.ZOTERO_DB, 330 | help = 'The location of the Zotero SQLite database' 331 | ) 332 | 333 | zotero_actions = parser.add_subparsers( 334 | dest = 'zotero_action', 335 | required = True 336 | ) 337 | 338 | zsync = zotero_actions.add_parser('sync', help = 'Sync Zotero with Trawler') 339 | 340 | zsync.add_argument( 341 | '--backup', 342 | dest = 'zotero_sync_backup', 343 | default = False, 344 | action = 'store_true', 345 | help = 'Create a backup of the Zotero database before syncing' 346 | ) 347 | 348 | zsync.add_argument( 349 | '--backup-dir', 350 | dest = 'zotero_sync_backup_dir', 351 | type = str, 352 | default = ZOTERO_BACKUP_DIR, 353 | help = 'Specify the location for the Zotero backup file' 354 | ) 355 | 356 | def adapter_main(args, dl_dir): 357 | if not os.path.exists(args.zotero_db_loc): 358 | err(f'Unable to find the Zotero database at {args.zotero_db_loc}') 359 | err('To override the default lookup location use \'--zotero-db-location\'') 360 | return 1 361 | 362 | wrn('This will lock the Zotero database; ensure that Zotero is not being used!') 363
| 364 | # Invoke the requested action; the fallback stub accepts the same arguments and just reports failure 365 | act = ZOTERO_ACTIONS.get(args.zotero_action, lambda *_: 1) 366 | return act(args, dl_dir) 367 | 368 | --------------------------------------------------------------------------------