├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── feedme ├── __init__.py ├── __main__.py ├── cli.py ├── commands │ ├── check-db.py │ ├── cleanup.py │ ├── commands.py │ ├── crawl.py │ ├── export.py │ ├── merge-db.py │ ├── options.py │ ├── resume.py │ ├── upgrade-db.py │ └── utils.py ├── contrib │ ├── __init__.py │ ├── browser.py │ ├── filters.py │ └── tumblr.py ├── datastructures.py ├── docs.py ├── exporters │ ├── __init__.py │ ├── exporters.py │ ├── graph.py │ ├── uncharted.py │ ├── urls.py │ └── utils.py ├── extensions.py ├── feedly.py ├── logger.py ├── middlewares.py ├── pipelines.py ├── requests.py ├── settings.py ├── signals.py ├── spiders │ ├── __init__.py │ ├── base.py │ ├── cluster.py │ ├── feed.py │ └── settings.py ├── sql │ ├── __init__.py │ ├── cli.py │ ├── db.json │ ├── db.py │ ├── describe.py │ ├── factory.py │ ├── functions.py │ ├── migrations │ │ ├── 0.10.3_0.10.5.sql │ │ ├── 0.10.3_0.10.6.sql │ │ ├── 0.10.5_0.10.6.sql │ │ └── 0.10_0.10.3.sql │ ├── schema.py │ ├── stream.py │ └── utils.py ├── urlkit.py └── utils.py ├── poetry.lock ├── presets ├── _autoload.py ├── keywords.py ├── livejournal.py ├── tumblr.py └── wordpress.py ├── pyproject.toml └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # Environments
101 | *.env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | # mkdocs documentation
117 | /site
118 |
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 |
124 | # Pyre type checker
125 | .pyre/
126 |
127 | # VSCode
128 | .vscode/
129 | .env.vscode
130 |
131 | .flake8
132 |
133 | .temp/
134 |
135 | /*.crawl.*
136 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## Changelog
2 |
3 | - **v0.10.11**
4 |   - Accessibility update:
5 |     - New command `scrapy wizard`: an interactive command that can perform simple tasks such as scraping and
6 |       exporting.
7 |     - Batch/shell scripts for setting up the program.
8 |     - New command `scrapy resume`.
9 |     - Module-level commands are now available as `scrapy` commands.
10 |   - `multiprocessing` now uses `spawn` on Windows, `forkserver` on macOS (Darwin), and `fork` on Linux.
11 | - **v0.10.10**
12 |   - Architectural update:
13 |     - Signal-based request persistence and restoration.
14 |     - Accept and log stats from any component.
15 |     - Accept and persist state info from any component.
16 |     - More thread-based I/O for better and more robust performance.
17 |   - NEW exporter _uncharted_: Export a list of websites that are "uncharted" — websites that were not scraped
18 |     as RSS feeds during a crawl, but were recorded in the database because other feeds mentioned them.
19 |   - NEW middleware `KeywordPrioritizer`: Adjust the priority of a request based on the frequency of specified
20 |     keywords in its text content.
21 |   - NEW option `CONTRIB_SPIDER_MIDDLEWARE`: Use additional spider middlewares together with those defined
22 |     in the settings. Suitable for defining custom filtering/prioritizing logic. (The use of custom functions
23 |     in presets as seen in older versions is no longer supported.)
24 |   - NEW option `EXPANSION_THRESHOLD`: An integer. Instead of treating every new website it encounters as a
25 |     potential new feed, the cluster spider will only start crawling a new feed if the number of times a website
26 |     was seen crosses this threshold. Setting this higher than 1 makes feeds in the resulting cluster more
27 |     related to each other, since they mention each other more.
28 |   - NEW command `python -m feedme customizations`: a manual of supported options that can be specified in
29 |     a preset.
30 | - **v0.10.6**
31 |   - Performance update.
32 |   - Fixed memory leak issues with the request persistence module.
33 |   - **![#f06073](https://placehold.it/12/f06073/000000?text=+) This version introduces API-breaking changes.**
34 |     - Package name has changed.
35 |     - Spider options must now be specified using the `-s` command-line option, and not `-a`.
36 | - **v0.10.5**
37 |   - _On-the-fly persistence:_ Instead of writing scraped data to a temporary file and then digesting that file once crawling is finished,
38 |     the program now writes to databases while scraping, using a separate process.
39 |   - _Crawl dead feeds only:_ A new option `FEED_STATE_SELECT` that allows the selection/prioritization of dead/living feeds.
40 |   - ![#e5c07b](https://placehold.it/12/e5c07b/000000?text=+) This version introduces database schema changes. Databases from
41 |     v0.10.1 onwards can be upgraded to this version.
42 | - **v0.10.3**
43 |   - _Optimization:_ Persisting data to the database now requires less memory (with a slight time trade-off).
44 |   - ![#e5c07b](https://placehold.it/12/e5c07b/000000?text=+) This version introduces database schema changes. Databases from
45 |     v0.10.1 onwards can be upgraded to this version.
46 | - **v0.10.2**
47 |   - _Cluster spider algorithm:_ The cluster spider now does breadth-first crawls, meaning it will crawl feeds closer to the starting feed
48 |     to completion before crawling feeds that are further away.
49 |   - _Persistence:_ Now uses pickle to persist requests to achieve more accurate resumption.
50 | - **v0.10.1**
51 |   - **![#f06073](https://placehold.it/12/f06073/000000?text=+) This version introduces API-breaking changes.**
52 |   - _Command change:_ The commands for both crawling and exporting have changed. See the above sections for details.
53 |   - _Output:_
54 |     - All spiders now require the output path to be an available directory.
55 |     - All spiders now persist scraped data using SQLite databases.
56 |     - It is possible to run any of the spiders multiple times on the same output directory; scraped data are automatically
57 |       merged and deduplicated.
58 |   - _Presets:_ You can now use presets to maintain different sets of crawling options. Since presets are Python files, you can
59 |     also specify complex settings, such as custom URL filtering functions, that cannot be specified on the command line.
60 |   - _URL templates:_ The search function introduced in v0.3 is now off by default, because it was discovered that Feedly's Search API
61 |     is a lot more sensitive to high-volume requests. Instead of relying on search, you can specify URL templates that allow the spiders
62 |     to attempt different variations of feed URLs.
63 |   - _New cluster spider:_ A new spider that, instead of crawling a single feed, also attempts to crawl any website mentioned in the feed's
64 |     content that might itself be an RSS feed, resulting in a network of sites being crawled. (It's like a search engine spider, but for RSS feeds.)
65 |   - _Export sorting and format:_ The revamped export module lets you select and sort URLs into different files. You may now export in
66 |     both plain-text lines and CSV format.
67 |   - _Graph export:_ You may now export link data as GraphML graphs, useful for visualization and network analysis. _Requires `python-igraph`._
68 |     _Install with `pip install -r requirements-graph.txt`._
69 | - **v0.3**
70 |   - _Fuzzy search:_ It's no longer necessary to specify the full URL to the RSS feed data. The spider now uses Feedly's Search API to
71 |     determine the correct URL. This means that you can simply specify e.g. the website's domain name, and Feedly will resolve it for you.
72 |     In case there are multiple matches, they will be printed so that you can choose one and try again.
73 | - **v0.1**
74 |   - _URL filtering:_ You can now specify which URLs to include/exclude when running the `collect-urls` command. For example:
75 |     `--include tag=a --exclude domain=secure.bank.com` will print out all URLs found on HTML `<a>` tags, except for those whose
76 |     domains or parent domains contain "secure.bank.com".
77 |   - _Feedly keywords:_ Feedly keyword data are now included in the crawl data, which you can use for filtering when running `collect-urls`,
78 |     using the `feedly_keyword=` filter.
Additionally, there is a new `collect-keywords` command that lists all keywords found in a crawl. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 tonyzbf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # feedly-link-feedme 2 | 3 | A Scrapy project for collecting hyperlinks from RSS feeds using Feedly's [Streams API](https://developer.feedly.com/v3/streams/). 4 | 5 | **Note⚠: This project provides a way to quickly aggregate resources such as images in an RSS feed** 6 | **for purposes such as archival work. If you are only looking to browse a feed and/or download a few things,** 7 | **it's more appropriate (and friendly) to use [Feedly](https://feedly.com) directly.** 8 | 9 | ## Requirement 10 | 11 | Requires Python 3.8+ 12 | 13 | ## Quick usage 14 | 15 | > This section describes basic usage of this program that requires little knowledge of Python or even command lines in general. 16 | > 17 | > See the [next section](#setup) for more advanced usage of this program as a command-line tool. 18 | 19 | ![#96c475](https://via.placeholder.com/12/96c475/000000?text=+) [Download](https://github.com/monotony113/feedly-link-feedme/archive/master.zip) the archive and 20 | extract it somewhere, then: 21 | 22 | - On Windows, run `start.bat`. This will start an interactive program that can perform the most common tasks. 23 | - On macOS/Linux, run `start.sh` (you may need to fix the permission first, and you may need to know how to start it 24 | from the terminal if launching it in your file manager doesn't work). 25 | 26 | ## Contents 27 | 28 | - [Documentation](#documentation) 29 | - [Setup](#setup) 30 | - [Crawling](#crawling) 31 | - [Presets](#presets) 32 | - [Exporting](#exporting) 33 | - [Cluster spider](#cluster-spider) 34 | - [Changelog](./CHANGELOG.md) 35 | - [Notes](#notes) 36 | 37 | ## Documentation 38 | 39 | ### Setup 40 | 41 | Using a virtual environment is highly recommended. 42 | 43 | ```bash 44 | > python3 -m pip install -r requirements.txt 45 | ``` 46 | 47 | > Note that this command will fail on Windows if Visual C++ build tools are not installed. 
The recommended way
48 | > to install dependencies on Windows is to use the `install.bat` script.
49 |
50 | ### Crawling
51 |
52 | ```bash
53 | > scrapy crawl <spider> '<feed url>' -o '<output directory>' [-s additional options...]
54 | ```
55 |
56 | > If this command complains that scrapy cannot be found, your Python packages are not on your PATH.
57 | > You may either add them to your PATH, or begin the command with `python -m scrapy`.
58 |
59 | Currently available spiders are `feed` and `cluster`. `feed` crawls a single feed; [`cluster`](#cluster-spider)
60 | begins with a single feed but attempts to further explore websites that are mentioned in the beginning feed.
61 |
62 | Each spider option is specified using the `-s` option followed by a `key=value` pair.
63 |
64 | Example:
65 |
66 | ```bash
67 | > scrapy crawl feed http://xkcd.com/atom.xml -o xkcd -s download_order=newest
68 | ```
69 |
70 | ### Presets
71 |
72 | In addition to specifying options via the command line, you can also specify a preset.
73 |
74 | ```bash
75 | > scrapy crawl <spider> -s preset='<path to preset>'
76 | ```
77 |
78 | A preset is just a Python script whose top-level variable names and values are used as key-value pairs to populate
79 | the spider config:
80 |
81 | ```python
82 | from datetime import datetime
83 | RSS = 'https://xkcd.com/atom.xml'
84 | OUTPUT = f'instance/xkcd-{datetime.now()}'
85 | ...
86 | ```
87 |
88 | Only variables whose names contain only uppercase letters, numbers and underscores will be used.
89 |
90 | Presets also let you define more complex behaviors, such as URL filtering, since you can define functions and mappings.
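
For instance, a preset for the cluster spider could mix simple values with richer Python structures. The sketch below is only illustrative: the file name and values are made up, and the option names (`DEPTH_LIMIT`, `ENABLE_SEARCH`, `RSS_TEMPLATES`) are the ones documented in the [Cluster spider](#cluster-spider) section; run `scrapy options` for the authoritative list.

```python
# cluster-preset.py (name is arbitrary); only ALL-CAPS variables are picked up as options.
from datetime import datetime

RSS = 'https://xkcd.com/atom.xml'
OUTPUT = f'instance/xkcd-{datetime.now():%Y%m%d-%H%M%S}'

DEPTH_LIMIT = 2        # stop expanding the cluster beyond 2 degrees of separation
ENABLE_SEARCH = False  # keep Feedly search off to avoid rate limiting

# Because a preset is plain Python, values can also be dicts, sets, functions, etc.
RSS_TEMPLATES = {
    r'.*\.wordpress\.com.*': {           # for WordPress sites...
        'http://%(netloc)s/feed/': 100,  # ...try this endpoint first
    },
}
```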
91 |
92 | For a list of supported options, run `scrapy options`. Options that are
93 | simple string/integer values can also be specified on the command line with a case-insensitive key, in which case they take
94 | precedence over the ones defined in a preset.
95 |
96 | ### Exporting
97 |
98 | ```bash
99 | > scrapy export <topic> -i '<input directory>'
100 | ```
101 |
102 | > Previous versions of this program used a different command `python -m feedme` for tasks unrelated to Scrapy,
103 | > which is still supported.
104 | >
105 | > However, the `scrapy` command now supports running those commands as well, and is
106 | > recommended for uniformity.
107 | >
108 | > For a list of all available commands, run `scrapy`.
109 |
110 | Currently `<topic>` can be
111 |
112 | - `urls`: Export URLs as plain-text or CSV files.
113 | - `graph`: Represent URLs and their relations using a graph data structure (exported as GraphML files).
114 |
115 | **![#56b6c2](https://via.placeholder.com/12/56b6c2/000000?text=+) Example: Tumblr GIFs**
116 |
117 | ```bash
118 | scrapy export urls -i data \
119 |     --include tag is img \
120 |     --include source:netloc under tumblr.com \
121 |     --include target:netloc under media.tumblr.com \
122 |     --include target:path endswith .gif \
123 |     --include published:year lt 2017 \
124 |     --output "%(feed:netloc)s/%(published:year)d%(published:month)02d.txt"
125 | ```
126 |
127 | This command will select
128 |
129 | - all image URLs that end with `.gif`
130 | - pointing to domains under `media.tumblr.com` (Tumblr CDN servers)
131 | - from posts before 2017
132 | - found on all crawled subdomains of `tumblr.com` (such as `staff.tumblr.com`),
133 |
134 | export them, and sort them into folders and files based on
135 |
136 | - the source domain name (i.e. the blog website)
137 | - followed by the year and month of the date the post was published,
138 |
139 | resulting in a folder structure that looks like
140 |
141 |     ./data/out/
142 |         staff.tumblr.com/
143 |             201602.txt
144 |             201603.txt
145 |             ...
146 |         .../
147 |
148 | ----
149 |
150 | For the `urls` exporter, the following features are available. Use the `-h`/`--help` option for complete documentation:
151 | `scrapy export urls --help`.
152 |
153 | #### Output template
154 |
155 | Instead of specifying a regular file name for the output file with the `-o` option, you can use a Python %-formatted
156 | template string:
157 |
158 | ```python
159 | -o "%(target:netloc).6s-%(published:year)d.txt"
160 | ```
161 |
162 | This way, you can sort URLs that come from different sources or have different values (such as domain names) into different
163 | files and even folders to your liking (a short illustration of the formatting mechanics follows at the end of this section).
164 |
165 | For example, with scraped data from the feed [`https://xkcd.com/atom.xml`](https://xkcd.com/atom.xml), an export command
166 |
167 | ```bash
168 | > scrapy export urls -i data -o "%(feed:title)s/%(tag)s/%(target:netloc)s.csv"
169 | ```
170 |
171 | could generate the following directory structure:
172 |
173 |     ./data/out/
174 |         xkcd.com/
175 |             img/
176 |                 imgs.xkcd.com.csv
177 |                 xkcd.com.csv
178 |                 ...
179 |             a/
180 |                 itunes.apple.com.csv
181 |                 www.barnesandnoble.com.csv
182 |                 ...
183 |
184 | For a list of available placeholders, see the command help: `scrapy export urls --help`.
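
Under the hood these templates are resolved with ordinary Python %-style formatting against a mapping of placeholder values, so the usual conversion flags (width, precision, zero-padding) behave as expected. A quick, self-contained illustration with made-up values:

```python
# Plain %-formatting against a mapping; conceptually, the exporter fills in
# one such mapping for every URL it exports (the values below are invented).
row = {'feed:netloc': 'staff.tumblr.com', 'published:year': 2016, 'published:month': 3}

print('%(feed:netloc)s/%(published:year)d%(published:month)02d.txt' % row)
# staff.tumblr.com/201603.txt

print('%(feed:netloc).6s-%(published:year)d.txt' % row)
# staff.-2016.txt  (".6s" keeps only the first six characters)
```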
185 |
186 | #### Filtering
187 |
188 | Use the `--include`/`--exclude` options (shorthands `+f`/`-f`) to specify filters:
189 |
190 | ```bash
191 | +f source:netloc is "xkcd.com"
192 | # URLs that are found in markups from xkcd.com
193 | -f target:netloc is "google.com"
194 | # URLs that are NOT pointing to google.com
195 | +f target:path startswith "/wp-content"
196 | # URLs whose path components begin with "/wp-content".
197 | ```
198 |
199 | Filter options can be specified multiple times to enable multiple filters. Only URLs that pass _all_ filters are exported.
200 |
201 | You can filter on URL components, feed and post titles, and dates published. For a list of filterable attributes (they are the
202 | same as the naming template placeholders), see the command help: `scrapy export urls --help`.
203 |
204 | ### Cluster spider
205 |
206 | Version v0.10 introduces a new spider called `cluster`. As the name suggests, this spider crawls not a single feed, but a cluster of feeds.
207 |
208 | How it works:
209 |
210 | 1. The spider begins with a single feed, specified through the `RSS` option.
211 | 2. As it crawls through the beginning feed, it parses the HTML markup snippets provided by Feedly, extracting URLs from them.
212 | 3. For each website it encounters, it checks whether that website exists as a valid RSS feed on Feedly, and if so,
213 |    it will start crawling that website too.
214 | 4. This process continues until either
215 |    - a depth limit is hit (specified with `-s depth_limit=`, or in a preset file as `DEPTH_LIMIT`), in which case it will finish crawling the feeds that are
216 |      `depth + 1` degrees removed from the starting feed, but will not expand beyond them; or
217 |    - the spider is interrupted.
218 |
219 | How many sites the spider can crawl depends on whether it can discover a valid RSS feed URL from just a domain name. There are two ways to make this possible:
220 | - Provide feed templates via a preset file. For example, knowing that WordPress sites provide RSS feeds through
221 |   [fixed endpoints such as `/?rss=rss` and `/feed/`](https://wordpress.org/support/article/wordpress-feeds/#finding-your-feed-url),
222 |   you can define your templates like so:
223 |
224 |   ```python
225 |   RSS_TEMPLATES = {
226 |       r'.*\.wordpress\.com.*': {  # will match *.wordpress.com
227 |           'http://%(netloc)s/?rss=rss': 100,  # number denotes precedence
228 |           'http://%(netloc)s/?rss=rss2': 200,
229 |           'http://%(netloc)s/?rss=atom': 300,
230 |           'http://%(netloc)s/feed/': 400,
231 |           'http://%(netloc)s/feed/rdf/': 500,
232 |           ...
233 |       },
234 |       ...
235 |   }
236 |   ```
237 |
238 |   Then, if a WordPress site mentions another WordPress site, the spider will try each variation until it hits a valid feed on Feedly.
239 |
240 | - Or, you may also enable the search function (`-s enable_search=True`, or in a preset: `ENABLE_SEARCH = True`). This will let the spider search Feedly
241 |   for each domain name it encounters, and crawl all returned feeds.
242 |
243 |   ![#e5c07b](https://via.placeholder.com/12/e5c07b/000000?text=+) **Warning: This is not recommended as the spider can quickly get rate-limited by Feedly.**
244 |
245 | The cluster spider works best for sites that have predefined endpoints for RSS feeds, such as WordPress and Tumblr blogs (for which a
246 | [preset](./presets/tumblr.py) is provided). Of course, if you can provide enough feed templates, it can work with many other sites as well.
247 |
248 | ## Notes
249 |
250 | - `feedly.com` has a `robots.txt` policy that disallows bots. Therefore, this crawler is set to disobey `robots.txt` (even though
251 |   what it is doing isn't crawling so much as it is consuming data from a publicly available API).
252 | - The availability of the scraped data depends on Feedly. If no one has ever subscribed to the RSS feed you are
253 |   trying to crawl on Feedly, then your crawl may not yield any results.
254 | - Similarly, the data you can crawl from Feedly are only as complete as how much of your RSS feed Feedly has scraped.
255 | - Explore the Feedly Cloud API at [developer.feedly.com](https://developer.feedly.com).
256 |
257 | ## Motivation
258 |
259 | I started this project because I found out that Feedly caches a significant amount of data from dead Tumblr blogs :)
260 |
261 | Basically:
262 |
263 | 1. As you may already know, Tumblr did not actually delete most of the media files in the Great Tumblr Purge,
264 |    but rather merely removed the posts containing them, meaning those media files are still available on the internet,
265 |    albeit obscured behind their CDN URLs (the `**.media.tumblr.com` links).
266 | 2. Feedly differs from ordinary RSS readers in that it caches data from RSS feeds, so that people who subscribe to the same
267 |    RSS feed receive data from Feedly first, instead of directly from the RSS provider, when they are using Feedly.
268 | 3. Among the data that Feedly caches are HTML snippets of each page in the RSS feed, which include our Tumblr media links
269 |    –– and _Feedly doesn't seem to delete them even when the original posts are no longer available._
270 |
271 | And so, effectively, Feedly has been acting as a huge Tumblr cache for as long as it has implemented such
272 | a content-delivery strategy and people have been using it to subscribe to Tumblr blogs ;)
273 |
274 | This project is, however, usable for any RSS feed that Feedly has ever scraped (e.g.
[`https://xkcd.com/atom.xml`](https://xkcd.com/atom.xml)), 275 | or even other Feedly APIs (see their Streams API for details). 276 | -------------------------------------------------------------------------------- /feedme/__init__.py: -------------------------------------------------------------------------------- 1 | def walk_package(path=None, name=__name__): 2 | import pkgutil 3 | from importlib.util import module_from_spec 4 | from pathlib import Path 5 | if not path: 6 | path = Path(__file__).parent 7 | for loader, module_name, is_pkg in pkgutil.walk_packages([str(path)]): 8 | pkg_name = f'{name}.{module_name}' 9 | if not is_pkg: 10 | spec = loader.find_spec(pkg_name) 11 | mod = module_from_spec(spec) 12 | try: 13 | spec.loader.exec_module(mod) 14 | except (ImportError, ModuleNotFoundError): 15 | continue 16 | yield mod 17 | else: 18 | yield from walk_package(path / module_name, pkg_name) 19 | 20 | 21 | def _config_logging(config=None, *args, **kwargs): 22 | import logging 23 | import sys 24 | from logging.config import dictConfig 25 | 26 | from .logger import make_logging_config 27 | 28 | if config and not config.getbool('LOG_ENABLED'): 29 | return 30 | 31 | if config: 32 | kwargs = { 33 | 'level': config.get('LOG_LEVEL', logging.INFO), 34 | 'colored': True, 35 | } 36 | overrides = [] 37 | 38 | if config.get('LOG_FILE'): 39 | kwargs['logfile'] = config['LOG_FILE'] 40 | 41 | if config.get('LOG_DATEFORMAT'): 42 | kwargs['datefmt'] = config['LOG_DATEFORMAT'] 43 | 44 | if config.get('LOG_STDOUT'): 45 | from scrapy.utils.log import StreamLogger 46 | sys.stdout = StreamLogger(logging.getLogger('stdout')) 47 | 48 | if config.get('LOG_SHORT_NAMES'): 49 | from scrapy.utils.log import TopLevelFormatter 50 | overrides.append({ 51 | 'filters': { 52 | 'tlfmt': { 53 | '()': TopLevelFormatter, 54 | 'loggers': ['scrapy', 'main', 'worker'], 55 | }}}) 56 | 57 | overrides += config.get('LOGGING_OVERRIDES', []) 58 | # logging.basicConfig(force=True) 59 | dictConfig(make_logging_config('feedly', *overrides, **kwargs)) 60 | return 61 | 62 | dictConfig(make_logging_config('feedly', *args, **kwargs)) 63 | -------------------------------------------------------------------------------- /feedme/__main__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | from .cli import cli 24 | 25 | if __name__ == '__main__': 26 | cli(prog_name='python -m feedme') 27 | -------------------------------------------------------------------------------- /feedme/cli.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | from importlib import import_module 25 | from pathlib import Path 26 | from textwrap import dedent 27 | 28 | import click 29 | from scrapy.crawler import CrawlerProcess 30 | from scrapy.utils.project import get_project_settings 31 | 32 | from . import _config_logging, exporters 33 | from .docs import markdown_inline, numpydoc2click 34 | from .sql.cli import check, leftovers, merge, migrate 35 | 36 | get_help_gen = markdown_inline(lambda ctx: (yield ctx.get_help())) 37 | 38 | 39 | def get_help(ctx): 40 | return next(get_help_gen(ctx)) 41 | 42 | 43 | @click.group() 44 | @click.option('--debug', is_flag=True) 45 | @click.pass_context 46 | def cli(ctx, debug=False): 47 | level = logging.DEBUG if debug else logging.INFO 48 | _config_logging(level=level) 49 | ctx.ensure_object(dict) 50 | ctx.obj['DEBUG'] = debug 51 | 52 | 53 | def export_load_exporter(ctx: click.Context, param, value): 54 | not_found = False 55 | try: 56 | exporter = import_module(f'.{value}', exporters.__name__) 57 | assert exporter.export 58 | except (AttributeError, ModuleNotFoundError, AssertionError): 59 | not_found = True 60 | exporter = export 61 | if ctx.params.get('help') or value == 'help': 62 | ctx.meta['topic_name'] = value 63 | ctx.invoke(help_export, ctx, None, exporter) 64 | elif not_found: 65 | click.secho(str(ValueError(f"No exporter found for topic '{value}'")), fg='red') 66 | ctx.exit(1) 67 | return exporter 68 | 69 | 70 | def help_export(ctx: click.Context, param, exporter): 71 | if not exporter or ctx.resilient_parsing: 72 | return 73 | if exporter is True: 74 | return True 75 | if exporter is export: 76 | click.echo(get_help(ctx)) 77 | ctx.exit() 78 | 79 | @markdown_inline 80 | def help_subcommand(): 81 | yield from [ 82 | click.style('Data Exporter Help\n\n'.upper(), fg='black', bg='white', bold=True), 83 | 'For help on the syntax of the ~export~ command itself, use `export --help`.\n\n', 84 | click.style(ctx.meta['topic_name'], fg='black', bg='magenta', bold=True), 85 | ] 86 | doc = numpydoc2click(exporter.help_text) 
87 |         yield click.style(' - ' + next(doc), fg='black', bg='magenta', bold=True)
88 |         yield from doc
89 |     click.echo_via_pager(help_subcommand())
90 |     ctx.exit()
91 |
92 |
93 | def ensure_index_db(path):
94 |     path = Path(path)
95 |     if path.is_dir():
96 |         return path / 'index.db'
97 |     return path
98 |
99 |
100 | @cli.command()
101 | @click.argument('topic', callback=export_load_exporter, default='help', metavar='topic')
102 | @click.option('-h', '--help', callback=help_export, is_flag=True, is_eager=True,
103 |               help="""
104 |               Show this help and exit.\n
105 |               Use `export <topic> --help` to see more info for a particular exporter.
106 |               """)
107 | @click.option('-i', '--input', 'wd', required=True, type=click.Path(exists=True, file_okay=False),
108 |               help="""
109 |               Path to the directory containing scraped data.
110 |               """)
111 | @click.option('-o', '--output', 'fmt', type=click.Path(writable=True, dir_okay=False),
112 |               help="""
113 |               Path to which exported data is written. Will always be under an `out/` directory inside the input directory.\n
114 |               Some exporters support output path templates, see their help for more info.
115 |               """)
116 | @click.option('+f', '--include', nargs=3, multiple=True, default=None, metavar='EXPR', help='')
117 | @click.option('-f', '--exclude', nargs=3, multiple=True, default=None, metavar='EXPR',
118 |               help="""
119 |               Filter results based on the expression EXPR, specified with 3 values _attr predicate value_,
120 |               such as `source:url is example.org`.\n
121 |               Expressions themselves should not be quoted.\n
122 |               Each exporter supports a different set of filters, and some do not support filtering at all.
123 |               See their help for more info.
124 |               """)
125 | @click.argument('exporter-args', nargs=-1, type=click.UNPROCESSED, metavar='additional-params')
126 | def export(topic, exporter_args, **kwargs):
127 |     """
128 |     Export items from scraped data.
129 |
130 |     `topic` is the kind of information to export. Currently 2 topics are available:
131 |
132 |     \b
133 |     _urls_: Export URLs in formats such as plain-text lines or CSV.
134 |     _graph_: Export scraped data as GraphML graphs.
135 |
136 |     Some exporters accept additional parameters not listed below, which can be specified as
137 |     a list of _key=value_ pairs after other options.
138 |
139 |     For more info on each exporter, specify the topic and the `-h/--help` option,
140 |     such as `export urls --help`.
141 |     """
142 |     options = dict([a.split('=', 1) for a in exporter_args])
143 |     kwargs = {k: v for k, v in kwargs.items() if v is not None}
144 |     topic.export(**kwargs, **options)
145 |
146 |
147 | @cli.command(hidden=True)
148 | @click.option('-s', 'spider')
149 | @click.option('-p', 'preset')
150 | def run_spider(spider, preset, **kwargs):
151 |     settings = get_project_settings()
152 |     settings['PRESET'] = preset
153 |     process = CrawlerProcess(settings, install_root_handler=False)
154 |     process.crawl(spider)
155 |     process.start(stop_after_crawl=True)
156 |
157 |
158 | @cli.command()
159 | @click.option('-i', '--input', 'db_path', required=True, type=click.Path(exists=True),
160 |               help='Path to the database.')
161 | @click.option('-d', '--sql-debug', 'debug', type=click.Path(exists=False, dir_okay=False),
162 |               help='Optional file to write executed SQL statements to.')
163 | @click.pass_context
164 | def check_db(ctx, db_path, debug=False, **kwargs):
165 |     """Check a database for potential problems and inconsistencies."""
166 |
167 |     ctx.exit(check(ensure_index_db(db_path), debug=debug))
168 |
169 |
170 | @cli.command()
171 | @click.option('-i', '--input', 'db_path', required=True, type=click.Path(exists=True),
172 |               help='Path to the database.')
173 | @click.option('-d', '--sql-debug', 'debug', type=click.Path(exists=False, dir_okay=False),
174 |               help='Optional file to write executed SQL statements to.')
175 | @click.pass_context
176 | def upgrade_db(ctx, db_path, debug=False, **kwargs):
177 |     """Upgrade an older database to the latest schema version."""
178 |
179 |     ctx.exit(migrate(ensure_index_db(db_path), debug=debug))
180 |
181 |
182 | @cli.command()
183 | @click.option('-i', '--input', 'db_paths', multiple=True, required=True, type=click.Path(exists=True, dir_okay=False),
184 |               help='Path to the database to be merged. Can be specified multiple times.')
185 | @click.option('-o', '--output', 'output', required=True, type=click.Path(exists=False, dir_okay=False),
186 |               help='Path to the new database that will contain the merged data.')
187 | @click.option('-d', '--sql-debug', 'debug', type=click.Path(exists=False, dir_okay=False),
188 |               help='Optional file to write executed SQL statements to.')
189 | @click.pass_context
190 | def merge_db(ctx, *, db_paths, output, debug=False, **kwargs):
191 |     """Merge multiple databases into a new database."""
192 |
193 |     ctx.exit(merge(output, *db_paths, debug=debug))
194 |
195 |
196 | @cli.command()
197 | @click.option('-i', '--input', 'wd', required=True, type=click.Path(exists=True, file_okay=False),
198 |               help='Path to the directory containing scraped data.')
199 | @click.option('-d', '--sql-debug', 'debug', type=click.Path(exists=False, dir_okay=False),
200 |               help='Optional file to write executed SQL statements to.')
201 | @click.pass_context
202 | def cleanup(ctx, wd, debug=False, **kwargs):
203 |     """Find all temporary databases and attempt to merge them into the main database."""
204 |
205 |     ctx.exit(leftovers(wd, debug=debug))
206 |
207 |
208 | @cli.command()
209 | def options():
210 |     """List available spider options."""
211 |
212 |     from .
import walk_package 213 | for _ in walk_package(): 214 | pass 215 | 216 | from .docs import OptionsContributor 217 | click.echo_via_pager(OptionsContributor.format_docs()) 218 | 219 | 220 | @cli.command(hidden=True) 221 | def gen_commands(): 222 | template = dedent(""" 223 | from scrapy.commands import ScrapyCommand 224 | 225 | from .utils import _ClickCommand 226 | 227 | 228 | class Command(_ClickCommand, ScrapyCommand): 229 | def click_command(self): 230 | return __name__.split('.')[-1].replace('-', '_') 231 | """).lstrip().rstrip(' ') 232 | 233 | path = Path(__file__).with_name('commands') 234 | for p in path.iterdir(): 235 | if not p.is_file(): 236 | continue 237 | content = open(p).read() 238 | if content == template: 239 | p.unlink() 240 | for k, v in cli.commands.items(): 241 | if not v.hidden: 242 | with open(path / f'{k}.py', 'w+') as f: 243 | f.write(template) 244 | 245 | 246 | @cli.command(hidden=True) 247 | @click.option('-i', '--input', 'wd', required=True, type=click.Path(exists=True, file_okay=False)) 248 | @click.option('-p', '--pattern', required=False, default='tumblr.com') 249 | def browser(wd, pattern): 250 | from .contrib.browser import run_app 251 | run_app(wd, pattern) 252 | -------------------------------------------------------------------------------- /feedme/commands/check-db.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/cleanup.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/commands.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | 4 | class Command(ScrapyCommand): 5 | def short_desc(self): 6 | return 'List available commands' 7 | 8 | def add_options(self, parser): 9 | import logging 10 | 11 | logging.getLogger('scrapy.utils.log').disabled = True 12 | super().add_options(parser) 13 | 14 | def run(self, *args, **kwargs): 15 | from ..cli import cli 16 | cli(['--help'], prog_name='scrapy') 17 | -------------------------------------------------------------------------------- /feedme/commands/crawl.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from scrapy.commands import ScrapyCommand 24 | from scrapy.commands.crawl import Command 25 | 26 | from .utils import _LoggingMixin 27 | 28 | 29 | class CrawlCommand(Command, _LoggingMixin): 30 | def add_options(self, parser): 31 | super().add_options(parser) 32 | parser.add_option('-v', '--verbose', action='store_true', 33 | help='Log more information') 34 | parser.remove_option('-a') 35 | parser.remove_option('-t') 36 | 37 | def process_options(self, args, opts): 38 | ScrapyCommand.process_options(self, args, opts) 39 | 40 | opts.spargs = {} 41 | 42 | if len(args) == 2: 43 | self.settings['RSS'] = args.pop() 44 | 45 | self._takeover_logging() 46 | 47 | if opts.output: 48 | self.settings['OUTPUT'] = opts.output[0] 49 | self.settings.pop('FEEDS') 50 | 51 | self.settings['CMDLINE_ARGS'] = {'args': args, 'opts': vars(opts)} 52 | 53 | if opts.verbose: 54 | self.settings['VERBOSE'] = True 55 | self.settings.set('LOG_VIOLATIONS', True, priority='cmdline') 56 | self.settings.set('STATS_DUMP', True, priority='cmdline') 57 | -------------------------------------------------------------------------------- /feedme/commands/export.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/merge-db.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/options.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/resume.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 
12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from pathlib import Path 24 | 25 | import simplejson as json 26 | from scrapy.commands import ScrapyCommand 27 | from scrapy.exceptions import UsageError 28 | 29 | from .crawl import CrawlCommand 30 | 31 | 32 | class ResumeCrawlCommand(CrawlCommand): 33 | def syntax(self): 34 | return '' 35 | 36 | def short_desc(self): 37 | return 'Continue an existing crawl' 38 | 39 | def add_options(self, parser): 40 | ScrapyCommand.add_options(self, parser) 41 | 42 | def process_options(self, args, opts): 43 | if len(args) < 1: 44 | raise UsageError() 45 | 46 | datadir = Path(args[0]) 47 | 48 | if not datadir.exists(): 49 | raise UsageError(f'Directory `{datadir}` does not exist.') 50 | if not datadir.is_dir(): 51 | raise UsageError(f'{datadir} is not a directory.', print_help=False) 52 | 53 | try: 54 | with open(datadir / 'options.json') as f: 55 | options = json.load(f) 56 | except (OSError, json.JSONDecodeError): 57 | raise UsageError(f'{datadir} does not contain a valid "options.json" file.\n' 58 | 'Cannot restore command line arguments used to initiate the program.') 59 | 60 | args.clear() 61 | args.extend(options['args']) 62 | for k, v in options['opts'].items(): 63 | setattr(opts, k, v) 64 | 65 | super().process_options(args, opts) 66 | -------------------------------------------------------------------------------- /feedme/commands/upgrade-db.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/utils.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from pathlib import Path 24 | 25 | 26 | def dir_validator(path): 27 | path_ = Path(path) 28 | if path_.exists() and not path_.is_dir(): 29 | raise ValueError(f'{path_} exists and is not a directory.') 30 | return path 31 | 32 | 33 | def exists_validator(path): 34 | path_ = Path(path) 35 | if not path_.exists(): 36 | raise ValueError(f'{path_} does not exist.') 37 | return path 38 | 39 | 40 | def _restore_sigint(): 41 | import signal 42 | 43 | signal.signal(signal.SIGINT, signal.default_int_handler) 44 | signal.signal(signal.SIGTERM, signal.SIG_DFL) 45 | 46 | 47 | def _disable_initial_log(): 48 | import logging 49 | 50 | logging.getLogger('scrapy.utils.log').disabled = True 51 | 52 | 53 | class _LoggingMixin: 54 | def _takeover_logging(self, force=False): 55 | from scrapy.utils.log import configure_logging 56 | 57 | from .. import _config_logging 58 | 59 | enabled = (self.settings.getbool('LOG_ENABLED') 60 | and self.settings.getbool('CUSTOM_LOGGING_ENABLED', True)) 61 | if not force and not enabled: 62 | return 63 | 64 | settings = self.settings 65 | configure_logging(install_root_handler=False) 66 | _config_logging(settings) 67 | 68 | settings['CUSTOM_LOGGING_ENABLED'] = True 69 | settings.set('LOG_ENABLED', False, priority=9999) 70 | 71 | if 'LOG_FILE' in settings: 72 | settings['_LOG_FILE'] = settings['LOG_FILE'] 73 | del settings['LOG_FILE'] 74 | 75 | 76 | class _ClickCommand: 77 | def click_command(self): 78 | raise NotImplementedError 79 | 80 | def _get_command(self): 81 | from .. 
import cli 82 | return getattr(cli, self.click_command()) 83 | 84 | def add_options(self, parser): 85 | from click import Option 86 | 87 | _disable_initial_log() 88 | super().add_options(parser) 89 | parser.add_option('-h', '--help', action='store_true') 90 | 91 | command = self._get_command() 92 | for param in command.params: 93 | if isinstance(param, Option): 94 | opts = [o for o in param.opts if o[0] == '-'] 95 | parser.add_option(*opts, action='store_true') 96 | 97 | def short_desc(self): 98 | command = self._get_command() 99 | help_ = command.help or '' 100 | return help_.split('\n')[0] 101 | 102 | def long_desc(self): 103 | command = self._get_command() 104 | return command.help 105 | 106 | def run(self, args, opts): 107 | import sys 108 | 109 | from ..cli import cli 110 | 111 | _restore_sigint() 112 | cli(sys.argv, prog_name='scrapy') 113 | -------------------------------------------------------------------------------- /feedme/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonywu7/feedly-link-aggregator/7c32b45fbfa1794b081e90add033403308a2da85/feedme/contrib/__init__.py -------------------------------------------------------------------------------- /feedme/contrib/browser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import re 5 | import sqlite3 6 | from pathlib import Path 7 | 8 | import aiofiles 9 | import aiohttp 10 | from aiohttp import web 11 | from more_itertools import chunked 12 | 13 | from ..exporters.urls import CTE, SELECT, build_ctes, build_where_clause 14 | from ..sql.db import db 15 | from ..sql.functions import register_all 16 | from ..sql.utils import offset_fetch 17 | 18 | SUFFIX = re.compile(r'_\d+\.(jpg|png|gif)$', re.IGNORECASE) 19 | 20 | 21 | class ResourceIterator: 22 | def __init__(self, conn: sqlite3.Connection, pattern: str): 23 | self.conn = conn 24 | self.log = logging.getLogger('iterator') 25 | self.pattern = pattern 26 | 27 | def get_row_iterator(self): 28 | cte, column_maps = build_ctes(CTE) 29 | keys = ('target:url',) 30 | includes = [('tag', 'is', 'img'), 31 | ('source:netloc', 'contains', self.pattern), 32 | ('target:netloc', 'under', 'media.tumblr.com')] 33 | where, values, _ = build_where_clause(includes, []) 34 | columns = ', '.join([f'{v} AS "{k}"' for k, v in column_maps.items()]) 35 | column_keys = ', '.join([f'"{k}"' for k in keys]) 36 | 37 | select = SELECT % {'columns': columns} 38 | select = f'{cte}{select} WHERE %(offset)s AND {where} GROUP BY {column_keys}' 39 | 40 | fetch = offset_fetch(self.conn, select, 'hyperlink', values=values, log=self.log, size=200000) 41 | return fetch 42 | 43 | def __iter__(self): 44 | while True: 45 | fetch = self.get_row_iterator() 46 | for chunk in chunked(fetch, 10000): 47 | random.shuffle(chunk) 48 | yield from chunk 49 | 50 | 51 | class ResourceIteratorApp(web.Application): 52 | def __init__(self, *args, index: Path, pattern: str = 'tumblr.com', **kwargs): 53 | super().__init__(*args, **kwargs) 54 | 55 | db_path = index / 'index.db' 56 | conn = sqlite3.connect(db_path, isolation_level=None) 57 | conn.row_factory = sqlite3.Row 58 | db.verify_version(conn) 59 | register_all(conn) 60 | self.iterator = iter(ResourceIterator(conn, pattern)) 61 | 62 | self.output = index / 'cache' 63 | os.makedirs(self.output, exist_ok=True) 64 | 65 | self.add_routes([ 66 | web.get('/', self.index), 67 | ]) 68 | 69 | self.client = 
aiohttp.ClientSession(headers={'User-Agent': 'curl/7.64.1'}) 70 | self.on_cleanup.append(self.close) 71 | 72 | async def index(self, req: web.Request): 73 | row = next(self.iterator) 74 | url = row['target:url'] 75 | url = re.sub(SUFFIX, r'_1280.\g<1>', url) 76 | url = url.replace('http://', 'https://') 77 | async with self.client.get(url) as res: 78 | data = await res.read() 79 | output = self.output / f'{row["source:netloc"]}/{row["target:path"]}' 80 | os.makedirs(output.parent, exist_ok=True) 81 | async with aiofiles.open(output, 'wb+') as f: 82 | await f.write(data) 83 | return web.Response(body=data, content_type=res.content_type) 84 | 85 | async def close(self, *args, **kwargs): 86 | await self.client.close() 87 | 88 | 89 | def run_app(index, pattern): 90 | index = Path(index) 91 | app = ResourceIteratorApp(index=index, pattern=pattern) 92 | web.run_app(app, host='0.0.0.0', port=5000) 93 | -------------------------------------------------------------------------------- /feedme/contrib/filters.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import re 25 | from math import inf 26 | from urllib.parse import urlsplit 27 | 28 | from scrapy.exceptions import NotConfigured 29 | 30 | from ..docs import OptionsContributor 31 | from ..requests import ProbeFeed 32 | from ..signals import register_state, start_from_scratch 33 | 34 | 35 | class KeywordPrioritizer(OptionsContributor, _doc_order=-5): 36 | """ 37 | Enable this Spider Middleware to (de)prioritize certain feeds based on keywords. 38 | 39 | When using the cluster spider, changing the priorities of requests will shift 40 | the overall direction the spider is going, by causing some feeds to be crawled sooner 41 | than others. 
42 | """ 43 | 44 | @classmethod 45 | def from_crawler(cls, crawler): 46 | instance = cls(crawler.settings) 47 | crawler.signals.send_catch_log( 48 | register_state, obj=instance, 49 | namespace='kwprioritizer', attrs=['priorities'], 50 | ) 51 | crawler.signals.connect(instance.clear_state_info, start_from_scratch) 52 | return instance 53 | 54 | def __init__(self, settings): 55 | self.log = logging.getLogger('contrib.keywordprioritizer') 56 | weighted_kws = settings.get('PRIORITIZED_KEYWORDS', {}) 57 | if not weighted_kws: 58 | raise NotConfigured() 59 | 60 | self.keywords = {p: re.compile(r'(?:%s)' % '|'.join(kws), re.IGNORECASE) 61 | for p, kws in weighted_kws.items()} 62 | self.keywords_fullword = {p: re.compile(r'\b(?:%s)\b' % '|'.join(kws), re.IGNORECASE) 63 | for p, kws in weighted_kws.items()} 64 | 65 | self.priorities = {} 66 | self.starting_weight = 0 67 | 68 | def clear_state_info(self): 69 | self.priorities.clear() 70 | 71 | def update_priority(self, item, source, target): 72 | prios = self.priorities 73 | starting = self.priorities.setdefault(source, self.starting_weight) 74 | prio = self.priorities.setdefault(target, self.starting_weight + starting) 75 | if prio is None: 76 | return True 77 | delta = 0 78 | 79 | for p, r in self.keywords.items(): 80 | s = r.search(target) 81 | if not s: 82 | continue 83 | delta += p 84 | self.log.debug(f'{source} {target} {s.group(0)} {p}') 85 | if delta == -inf: 86 | break 87 | 88 | if delta == -inf: 89 | prios[target] = -inf 90 | return 91 | 92 | phrases = list(item.keywords) 93 | phrases.extend([item.markup.get('summary', ''), item.title]) 94 | phrases = ' '.join(phrases) 95 | for p, r in self.keywords_fullword.items(): 96 | s = r.search(phrases) 97 | if not s: 98 | continue 99 | delta += p 100 | self.log.debug(f'{source} {target} {s.group(0)} {p}') 101 | if delta == -inf: 102 | break 103 | 104 | prios[target] = prio + delta 105 | 106 | def process_spider_output(self, response, result, spider): 107 | for res in result: 108 | if not isinstance(res, ProbeFeed): 109 | yield res 110 | continue 111 | 112 | item = res.meta.get('source_item') 113 | feed_url = res.meta.get('feed_url') 114 | if not item or not feed_url: 115 | yield res 116 | continue 117 | 118 | source = urlsplit(item.url).netloc 119 | target = urlsplit(feed_url).netloc 120 | self.update_priority(item, source, target) 121 | 122 | prio = self.priorities.get(target, 0) 123 | if prio == -inf: 124 | continue 125 | if not prio: 126 | yield res 127 | continue 128 | 129 | yield res.replace(priority=res.priority + prio) 130 | 131 | @staticmethod 132 | def _help_options(): 133 | return { 134 | 'PRIORITIZED_KEYWORDS': """ 135 | A mapping of weights to a list of keywords. 136 | 137 | Before a new feed is crawled, the crawling request is processed here. This 138 | middleware will then search the text content from which this new feed is 139 | discovered, such as keywords and HTML markups, and adjust the priority of 140 | the request accordingly. 141 | 142 | A ~positive~ weight will increase the priority, causing the feed to be crawled 143 | sooner. A ~negative~ weight will decrease the priority. 144 | 145 | If you use the special `-inf` (negative infinity) value, the new feed will 146 | be dropped. (`inf` can be imported from `math` or specified as `float('inf')`). 
147 | 148 | **Example** 149 | 150 | `PRIORITIZED_KEYWORDS = {` 151 | ` 10: ['cats', 'kitties'],` 152 | ` 5: ['dogs', 'puppies'],` 153 | ` -5: ['goldfish'],` 154 | ` -float('inf'): ['rat'],` 155 | `}` 156 | """, 157 | } 158 | -------------------------------------------------------------------------------- /feedme/contrib/tumblr.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlsplit 2 | 3 | from scrapy import Request 4 | from scrapy.exceptions import NotConfigured 5 | 6 | from ..docs import OptionsContributor 7 | 8 | 9 | class TumblrFilter(OptionsContributor, _doc_order=-5): 10 | @classmethod 11 | def from_crawler(cls, crawler): 12 | return cls(crawler.settings) 13 | 14 | def __init__(self, settings): 15 | self.domains = settings.get('TUMBLR_IGNORE') 16 | if not self.domains: 17 | raise NotConfigured() 18 | 19 | def process_spider_output(self, response, result, spider): 20 | for r in result: 21 | if not isinstance(r, Request): 22 | yield r 23 | continue 24 | 25 | feed_url = r.meta.get('feed_url') 26 | if not feed_url: 27 | yield r 28 | continue 29 | 30 | domain = urlsplit(feed_url).netloc 31 | if domain in self.domains: 32 | continue 33 | if domain[-16:] == 'media.tumblr.com': 34 | continue 35 | yield r 36 | 37 | @staticmethod 38 | def _help_options(): 39 | return { 40 | 'TUMBLR_IGNORE': """ 41 | A list of Tumblr sites to ignore. 42 | 43 | **Example** 44 | 45 | `TUMBLR_IGNORE = {` 46 | ` 'www.tumblr.com', 'staff.tumblr.com', 'tumblr.com',` 47 | ` 'engineering.tumblr.com', 'support.tumblr.com',` 48 | ` 'assets.tumblr.com',` 49 | `}` 50 | """, 51 | } 52 | -------------------------------------------------------------------------------- /feedme/datastructures.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | from __future__ import annotations 24 | 25 | from collections.abc import (Hashable, MutableMapping, MutableSequence, 26 | MutableSet) 27 | from collections.abc import Set as SetCollection 28 | from typing import Dict, Set, Tuple 29 | 30 | Keywords = Set[Hashable] 31 | KeywordCollection = Dict[Hashable, Hashable] 32 | 33 | 34 | def compose_mappings(*mappings): 35 | base = {} 36 | base.update(mappings[0]) 37 | for m in mappings[1:]: 38 | for k, v in m.items(): 39 | if k in base and type(base[k]) is type(v): 40 | if isinstance(v, MutableMapping): 41 | base[k] = compose_mappings(base[k], v) 42 | elif isinstance(v, MutableSet): 43 | base[k] |= v 44 | elif isinstance(v, MutableSequence): 45 | base[k].extend(v) 46 | else: 47 | base[k] = v 48 | else: 49 | base[k] = v 50 | return base 51 | 52 | 53 | class KeywordStore: 54 | def __init__(self): 55 | self._index: Dict[int, Hashable] = {} 56 | self._taggings: Dict[int, KeywordCollection] = {} 57 | 58 | def _get_hashes(self, **kws: Dict[Hashable, Hashable]) -> int: 59 | for hash_, keywords in self._taggings.items(): 60 | match = True 61 | for category, keyword in kws.items(): 62 | if category[0] == '_': 63 | if category[1:] not in keywords: 64 | match = False 65 | break 66 | elif keyword not in keywords.get(category, {}): 67 | match = False 68 | break 69 | if match: 70 | yield hash_ 71 | 72 | def all(self, **kws: Dict[Hashable, Hashable]) -> Hashable: 73 | for hash_ in self._get_hashes(**kws): 74 | yield self._index[hash_] 75 | 76 | def keywords(self, item): 77 | return self._taggings.get(hash(item), {}) 78 | 79 | def items(self, **kws: Dict[Hashable, Hashable]) -> Tuple[Hashable, KeywordCollection]: 80 | for hash_ in self._get_hashes(**kws): 81 | yield self._index[hash_], self._taggings[hash_] 82 | 83 | def put(self, item: Hashable, **kws: KeywordCollection): 84 | hash_ = hash(item) 85 | self._index[hash_] = item 86 | taggings = self._taggings.setdefault(hash_, {}) 87 | for category, kwset in kws.items(): 88 | if not isinstance(kwset, SetCollection): 89 | kwset = {kwset} 90 | if category[0] == '_': 91 | raise ValueError('Keys that begin with _ are reserved') 92 | keywords = taggings.setdefault(category, set()) 93 | keywords |= kwset 94 | 95 | def __len__(self) -> int: 96 | return len(self._index) 97 | 98 | def __and__(self, other: KeywordStore) -> KeywordStore: 99 | if not isinstance(other, KeywordStore): 100 | return NotImplemented 101 | new = KeywordStore() 102 | common_keys = self._index.keys() & other._index.keys() 103 | taggings = {} 104 | for k in common_keys: 105 | this = self._taggings[k] 106 | that = other._taggings[k] 107 | tagging = {t: this[t] & that[t] for t in this.keys() & that.keys()} 108 | tagging = {k: v for k, v in tagging.items() if v} 109 | taggings[k] = tagging 110 | index = {k: self._index[k] for k in taggings} 111 | new._index = index 112 | new._taggings = taggings 113 | return new 114 | 115 | def __or__(self, other: KeywordStore) -> KeywordStore: 116 | if not isinstance(other, KeywordStore): 117 | return NotImplemented 118 | new = KeywordStore() 119 | index = {**self._index, **other._index} 120 | taggings = {} 121 | for k in index: 122 | this = self._taggings.get(k, {}) 123 | that = other._taggings.get(k, {}) 124 | tagging = {t: this.get(t, set()) | that.get(t, set()) for t in this.keys() & that.keys()} 125 | tagging.update({t: this[t] for t in this.keys() - that.keys()}) 126 | tagging.update({t: that[t] for t in that.keys() - this.keys()}) 127 | taggings[k] = tagging 128 | new._index = index 129 | new._taggings = 
taggings 130 | return new 131 | 132 | def __sub__(self, other: KeywordStore) -> KeywordStore: 133 | if not isinstance(other, KeywordStore): 134 | return NotImplemented 135 | new = KeywordStore() 136 | taggings = {} 137 | for k in self._index: 138 | this = self._taggings[k] 139 | that = other._taggings.get(k, {}) 140 | tagging = {t: this[t] - that.get(t, set()) for t in this} 141 | tagging = {k: v for k, v in tagging.items() if v} 142 | taggings[k] = tagging 143 | index = {k: self._index[k] for k in taggings} 144 | new._index = index 145 | new._taggings = taggings 146 | return new 147 | 148 | def __str__(self): 149 | return str(self.for_json()) 150 | 151 | def __repr__(self): 152 | return repr(self.for_json()) 153 | 154 | def for_json(self): 155 | return {item: self._taggings[hash_] for hash_, item in self._index.items()} 156 | 157 | 158 | def labeled_sequence(seq, key=True, start=0, as_str=False): 159 | r = range(start, len(seq) + start) 160 | if key: 161 | z = zip(r, seq) 162 | else: 163 | z = zip(seq, r) 164 | if as_str: 165 | return {str(k): v for k, v in z} 166 | return {k: v for k, v in z} 167 | -------------------------------------------------------------------------------- /feedme/docs.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
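
A minimal usage sketch for the KeywordStore class defined in feedme/datastructures.py above (illustrative only, not part of the source tree; the feed URLs are made up, and the calls mirror the put/all/keywords methods shown in that module):

from feedme.datastructures import KeywordStore

store = KeywordStore()
# Tag two (made-up) feed URLs; put() wraps scalar keywords into sets.
store.put('https://xkcd.com/atom.xml', topic={'comics', 'webcomics'}, language='en')
store.put('https://example.com/feed', topic='news')

list(store.all(topic='comics'))             # -> ['https://xkcd.com/atom.xml']
list(store.all(_language=True))             # items carrying any 'language' tag at all
store.keywords('https://example.com/feed')  # -> {'topic': {'news'}}
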
22 | 23 | import re 24 | from functools import wraps 25 | from textwrap import dedent, indent 26 | 27 | import click 28 | 29 | docs = [] 30 | 31 | 32 | def stylize(pattern, **styles): 33 | def wrapper(func): 34 | @wraps(func) 35 | def wrapped(*args, **kwargs): 36 | for s in func(*args, **kwargs): 37 | yield re.sub(pattern, lambda m: click.style(m.group(1), **styles), s) 38 | return wrapped 39 | return wrapper 40 | 41 | 42 | def markdown_inline(func): 43 | @stylize(re.compile(r'`(.*?)`'), fg='green') 44 | @stylize(re.compile(r'~(.*?)~'), fg='blue', underline=True) 45 | @stylize(re.compile(r'\*\*(.*?)\*\*'), fg='yellow', bold=True) 46 | def f(*args, **kwargs): 47 | yield from func(*args, **kwargs) 48 | return f 49 | 50 | 51 | def numpydoc2click(doc: str): 52 | PARA = re.compile(r'((?:.+\n)+)') 53 | PARA_WITH_HEADER = re.compile(r'(^ *)(.+)\n(?:\s*(?:-+|=+))\n((?:.+\n)+)') 54 | paragraphs = list(PARA.findall(dedent(doc))) 55 | yield paragraphs[0] + '\n' 56 | for i in range(1, len(paragraphs)): 57 | p = paragraphs[i] 58 | match = PARA_WITH_HEADER.match(p) 59 | if match: 60 | indentation, header, p = match.group(1), match.group(2), match.group(3) 61 | if not indentation: 62 | header = header.upper() 63 | yield indent(click.style(header, bold=True), indentation) 64 | yield '\n' 65 | yield indent(p, ' ') 66 | yield '\n' 67 | 68 | 69 | class OptionsContributor: 70 | _subclassed = set() 71 | 72 | @classmethod 73 | def __init_subclass__(cls, _doc_order=0): 74 | for c in cls.mro(): 75 | if c.__qualname__ in cls._subclassed: 76 | return 77 | 78 | cls._subclassed.add(cls.__qualname__) 79 | docs.append((cls, cls._help_options(), _doc_order)) 80 | 81 | @staticmethod 82 | @markdown_inline 83 | def format_docs(): 84 | yield from [ 85 | click.style('feedme Customization Manual\n\n'.upper(), fg='black', bg='white', bold=True), 86 | 'This program supports the use of presets, which lets you define \n' 87 | 'options for different scenarios.\n', 88 | '\n', 89 | 'A preset works like a Scrapy settings file: you simply declare your options as \n' 90 | 'uppercase-only top level variables, such as `FOLLOW_DOMAINS = ["abc.xyz"]`.\n' 91 | '\n', 92 | 'Then, run Scrapy with the command-line option ~-s PRESET=~.\n', 93 | '\n', 94 | 'You may also specify options directly on the command line with the ~-s~ option:\n', 95 | 'such as `-s RSS=http://xkcd.com/atom.xml`, in which case those declared on the\n', 96 | 'command line take precedence over those in a preset.\n', 97 | '\n', 98 | 'Some example presets are located in the `presets/` directory.\n', 99 | '\n', 100 | 'The following is the list of supported options, grouped by the components they\n', 101 | 'belong to.\n' 102 | '\n', 103 | click.style('============*============\n', fg='white', bold=True), 104 | '\n', 105 | ] 106 | 107 | for cls, options, _ in sorted(docs, key=lambda t: t[2], reverse=True): 108 | yield click.style('-------------\n', fg='black', bold=True) 109 | yield f'**{repr(cls)}**\n' 110 | yield '\n' 111 | 112 | docstring = cls.__doc__ 113 | if docstring: 114 | yield from numpydoc2click(dedent(docstring)) 115 | 116 | for opt, doc in options.items(): 117 | yield f' ~{opt}~\n' 118 | yield f'{indent(dedent(doc), " ")}\n' 119 | 120 | yield '\n\n' 121 | -------------------------------------------------------------------------------- /feedme/exporters/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tonywu7/feedly-link-aggregator/7c32b45fbfa1794b081e90add033403308a2da85/feedme/exporters/__init__.py -------------------------------------------------------------------------------- /feedme/exporters/exporters.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import csv 24 | import logging 25 | import os 26 | from abc import ABC, abstractmethod 27 | from pathlib import Path 28 | from typing import Callable, Set 29 | 30 | import simplejson as json 31 | 32 | from ..utils import JSONDict 33 | 34 | 35 | class MappingExporter(ABC): 36 | def __init__(self, output: Path, filename: str, escape: Callable[[str], str] = None): 37 | self.output = output 38 | self.filename = filename 39 | self.ext = ''.join(Path(filename).suffixes) 40 | self.escape = escape or (lambda s: s) 41 | self.files = {} 42 | self.logger = logging.getLogger('exporter') 43 | self.opened = 0 44 | 45 | @abstractmethod 46 | def format(self, item: JSONDict): 47 | return item 48 | 49 | def get_file(self, item: JSONDict): 50 | if self.opened > 200: 51 | for f in self.files.values(): 52 | f.close() 53 | self.opened = 0 54 | 55 | filename = self.escape(self.filename % item) 56 | if filename[-1] == '/': 57 | filename = f'{filename}index{self.ext}' 58 | if filename == '.': 59 | filename = '-.' 60 | if filename == '..': 61 | filename = '-..' 
62 | path = self.output / filename 63 | 64 | f, new = self.open_file(path) 65 | return f, path, new 66 | 67 | def open_file(self, path): 68 | out = self.files.get(path) 69 | is_newfile = out is None 70 | if not out or out.closed: 71 | os.makedirs(path.parent, exist_ok=True) 72 | if is_newfile: 73 | self.logger.info(f'New file {path}') 74 | self.files[path] = out = open(path, 'a+') 75 | self.opened += 1 76 | return out, is_newfile 77 | 78 | def write(self, item: JSONDict): 79 | out, _, _ = self.get_file(item) 80 | out.write(f'{self.format(item)}\n') 81 | 82 | def close(self): 83 | if not self.files: 84 | self.logger.warning('Exported nothing!') 85 | for f in self.files.values(): 86 | f.close() 87 | 88 | def __enter__(self): 89 | return self 90 | 91 | def __exit__(self, typ, val=None, tb=None): 92 | self.close() 93 | if not typ: 94 | return True 95 | if val is None: 96 | if tb is None: 97 | raise typ 98 | val = typ() 99 | if tb is not None: 100 | val = val.with_traceback(tb) 101 | raise val 102 | 103 | 104 | class MappingJSONExporter(MappingExporter): 105 | def __init__(self, key: str, *args, **kwargs): 106 | super().__init__(*args, **kwargs) 107 | self.key = key 108 | self.storage = {} 109 | 110 | def format(self, item: JSONDict): 111 | return super().format(item) 112 | 113 | def write(self, item: JSONDict): 114 | _, fn, _ = self.get_file(item) 115 | s = self.storage.setdefault(fn, {}) 116 | s[item[self.key]] = item 117 | 118 | def close(self): 119 | for k in self.files: 120 | f, _ = self.open_file(k) 121 | json.dump(self.storage[k], f) 122 | return super().close() 123 | 124 | 125 | class MappingLineExporter(MappingExporter): 126 | def __init__(self, key: str, *args, **kwargs): 127 | super().__init__(*args, **kwargs) 128 | self.key = key 129 | 130 | def format(self, item): 131 | return item[self.key] 132 | 133 | 134 | class MappingCSVExporter(MappingExporter): 135 | def __init__(self, fieldnames: Set[str], *args, **kwargs): 136 | super().__init__(*args, **kwargs) 137 | self.writers = {} 138 | self.fieldnames = fieldnames 139 | 140 | def format(self, item: JSONDict): 141 | return super().format(item) 142 | 143 | def get_file(self, item: JSONDict): 144 | if len(self.writers) > 200: 145 | for k in self.writers: 146 | self.writers[k] = False 147 | 148 | f, fn, new = super().get_file(item) 149 | if not self.fieldnames: 150 | self.fieldnames = tuple(item.keys()) 151 | writer = self.writers.get(fn) 152 | if not writer or not new: 153 | writer = self.writers[fn] = csv.DictWriter(f, self.fieldnames, extrasaction='ignore') 154 | if new: 155 | writer.writeheader() 156 | return writer 157 | 158 | def write(self, item: JSONDict): 159 | self.get_file(item).writerow({**item}) 160 | -------------------------------------------------------------------------------- /feedme/exporters/graph.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or 
substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import sqlite3 25 | from collections import defaultdict 26 | from pathlib import Path 27 | 28 | import igraph 29 | 30 | from ..datastructures import labeled_sequence 31 | from ..sql.utils import offset_fetch 32 | from .utils import filter_by_domains, with_db 33 | 34 | log = logging.getLogger('exporter.graph') 35 | 36 | 37 | def filter_vertices(g, vertex_ids, include, exclude): 38 | if include or exclude: 39 | log.info('Filtering graph') 40 | vertex_ids = {f'http://{k}': i for k, i in vertex_ids.items()} 41 | if include: 42 | vertex_ids = {k: i for k, i in vertex_ids.items() 43 | if filter_by_domains(include)(k)} 44 | if exclude: 45 | vertex_ids = {k: i for k, i in vertex_ids.items() 46 | if filter_by_domains(exclude, True)(k)} 47 | g = g.subgraph(vertex_ids.values()) 48 | return g 49 | 50 | 51 | def create_hyperlink_graph(db: sqlite3.Connection, include=None, exclude=None): 52 | SELECT = """ 53 | SELECT 54 | source.url AS "source", 55 | target.url AS "target", 56 | hyperlink.element AS "tag", 57 | item.published AS "timestamp" 58 | FROM 59 | hyperlink 60 | JOIN url AS source ON source.id == hyperlink.source_id 61 | JOIN url AS target ON target.id == hyperlink.target_id 62 | JOIN item ON hyperlink.source_id == item.url 63 | WHERE 64 | %(offset)s 65 | """ 66 | vertices = {} 67 | edges = {} 68 | log.debug(SELECT) 69 | 70 | log.info('Reading database') 71 | for row in offset_fetch(db, SELECT, 'hyperlink', log=log): 72 | src = row['source'] 73 | dst = row['target'] 74 | vertices[src] = True 75 | vertices[dst] = True 76 | edges[(src, dst)] = (row['tag'], row['timestamp']) 77 | log.info('Finished reading database') 78 | 79 | log.info('Creating graph') 80 | g = igraph.Graph(directed=True) 81 | vertex_ids = labeled_sequence(vertices, key=False) 82 | edges = {(vertex_ids[t[0]], vertex_ids[t[1]]): v for t, v in edges.items()} 83 | g.add_vertices(len(vertices)) 84 | g.add_edges(edges) 85 | g.vs['name'] = list(vertices) 86 | g.es['type'], g.es['timestamp'] = tuple(zip(*edges.values())) 87 | log.info(f'|V| = {g.vcount()}; |E| = {g.ecount()}') 88 | g = filter_vertices(g, vertex_ids, include, exclude) 89 | return g 90 | 91 | 92 | def create_domain_graph(db: sqlite3.Connection, include=None, exclude=None): 93 | temp = """ 94 | CREATE TEMP TABLE domains (id INTEGER, domain VARCHAR) 95 | """ 96 | index = """ 97 | CREATE INDEX temp_ix_domains ON domains (id) 98 | """ 99 | insert_domains = """ 100 | INSERT INTO domains 101 | SELECT url.id AS id, urlsplit(url.url, 'netloc') AS domain 102 | FROM url 103 | """ 104 | count_domains = """ 105 | SELECT domains.domain, count(domains.domain) 106 | FROM domains 107 | GROUP BY domains.domain 108 | """ 109 | 110 | select_pairs = """ 111 | SELECT 112 | src.domain AS source, 113 | dst.domain AS target, 114 | hyperlink.element AS tag, 115 | count(hyperlink.element) AS count 116 | FROM 117 | hyperlink 118 | JOIN domains AS src ON hyperlink.source_id == src.id 119 | JOIN domains AS dst ON 
hyperlink.target_id == dst.id 120 | WHERE %(offset)s 121 | GROUP BY 122 | source, 123 | target, 124 | tag 125 | """ 126 | db.execute('BEGIN EXCLUSIVE') 127 | db.execute(temp) 128 | 129 | log.info('Building domain list') 130 | db.execute(insert_domains) 131 | db.execute(index) 132 | 133 | vertices = {} 134 | edges = defaultdict(lambda: defaultdict(int)) 135 | attrs = set() 136 | 137 | log.info('Counting domains') 138 | for domain, count in db.execute(count_domains): 139 | vertices[domain] = count 140 | 141 | log.info('Fetching hyperlinks') 142 | for row in offset_fetch(db, select_pairs, 'hyperlink', size=500000, log=log): 143 | src = row['source'] 144 | dst = row['target'] 145 | tag = row['tag'] 146 | attrs.add(tag) 147 | edges[(src, dst)][tag] += row['count'] 148 | 149 | db.rollback() 150 | 151 | log.info('Creating graph') 152 | g = igraph.Graph(directed=True) 153 | vertex_ids = labeled_sequence(vertices, key=False) 154 | edges = {(vertex_ids[t[0]], vertex_ids[t[1]]): v for t, v in edges.items()} 155 | g.add_vertices(len(vertices)) 156 | g.add_edges(edges) 157 | g.vs['name'] = list(vertices) 158 | g.vs['weight'] = list(vertices.values()) 159 | attrs = {a: tuple(v.get(a, 0) for v in edges.values()) for a in attrs} 160 | for k, t in attrs.items(): 161 | g.es[k] = t 162 | g = filter_vertices(g, vertex_ids, include, exclude) 163 | log.info(f'|V| = {g.vcount()}; |E| = {g.ecount()}') 164 | return g 165 | 166 | 167 | @with_db 168 | def export(conn: sqlite3.Connection, wd: Path, output: Path, 169 | fmt='index.graphml', graphtype='hyperlink', 170 | include=None, exclude=None, **kwargs): 171 | 172 | reader = { 173 | 'hyperlink': create_hyperlink_graph, 174 | 'domain': create_domain_graph, 175 | }[graphtype] 176 | g = reader(conn, include, exclude) 177 | log.info('Writing') 178 | with open(output / fmt, 'w+') as f: 179 | g.save(f, format='graphml') 180 | log.info('Done.') 181 | 182 | 183 | help_text = """ 184 | Export feed data as graph data. 185 | 186 | Synopsis 187 | -------- 188 | export ~graph~ -i -o [name] [**graphtype=**~hyperlink|domain~] 189 | 190 | Description 191 | ----------- 192 | This exporter lets you represent scraped URL data using graph data structure. 193 | 194 | **Requires igraph. You must install ~requirements-optional.txt~.** 195 | 196 | Currently this exports graphs in ~GraphML~ format only. 197 | 198 | This exporter does not support name templates. 199 | 200 | Filters 201 | ------- 202 | 203 | ~domain~ ~under~ ... 204 | 205 | Include/exclude websites/hyperlinks whose domain name is under the specified 206 | domain. 207 | 208 | Options 209 | ------- 210 | ~graphtype=[hyperlink|domain]~ 211 | 212 | **~hyperlink~** 213 | **Directed, self-loop allowed** 214 | **Vertices** 215 | Each ~source~ or ~target~ URL (representing a file on a website); 216 | **Attributes** 217 | ~name~: The URL 218 | **Edges** 219 | Each hyperlink found in ~source~ pointing to ~target~; 220 | **Attributes** 221 | ~type~: The HTML element 222 | ~timestamp~: UTC date and time when ~source~ was published, 223 | in ISO-8601 format 224 | 225 | **~domain~** 226 | **Directed, self-loop allowed** 227 | **Vertices** 228 | Domains of each URL 229 | **Attributes** 230 | ~name~: Domain name 231 | ~weight~: The number of files found under the domain 232 | **Edges** 233 | Each hyperlink found in ~source~ pointing to ~target~ creates an 234 | edge from ~source:domain~ to ~target:domain~; not repeated. 
235 | **Attributes** 236 | ~~: Each hyperlink in ~source~ pointing to 237 | ~target~ that is found on a particular HTML tag increases 238 | the ~~ attribute by 1. 239 | """ 240 | -------------------------------------------------------------------------------- /feedme/exporters/uncharted.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import sqlite3 25 | from collections import defaultdict 26 | from pathlib import Path 27 | 28 | import simplejson as json 29 | 30 | from .utils import filter_by_domains, with_db 31 | 32 | log = logging.getLogger('exporter.uncharted') 33 | 34 | 35 | @with_db 36 | def export(conn: sqlite3.Connection, wd: Path, output: Path, 37 | include=None, exclude=None, 38 | fmt='uncharted.json', **kwargs): 39 | 40 | temp = """ 41 | CREATE TEMP TABLE domains (id INTEGER, domain VARCHAR) 42 | """ 43 | index = """ 44 | CREATE INDEX temp_ix_domains ON domains (id) 45 | """ 46 | insert_domains = """ 47 | INSERT INTO domains 48 | SELECT url.id AS id, 'http://' || urlsplit(url.url, 'netloc') AS domain 49 | FROM url 50 | """ 51 | count_domains = """ 52 | SELECT domains.domain, count(domains.domain) 53 | FROM domains 54 | GROUP BY domains.domain 55 | """ 56 | select_feeds = """ 57 | SELECT domains.domain 58 | FROM feed 59 | JOIN domains ON feed.url_id == domains.id 60 | GROUP BY domains.domain 61 | """ 62 | select_keywords = """ 63 | SELECT domains.domain, keyword.keyword, count(keyword.keyword) 64 | FROM tagging 65 | JOIN domains ON tagging.url_id == domains.id 66 | JOIN keyword ON tagging.keyword_id == keyword.id 67 | GROUP BY domains.domain, keyword.keyword 68 | """ 69 | select_hyperlinks = """ 70 | SELECT src.domain, dst.domain, count(src.domain) 71 | FROM hyperlink 72 | JOIN domains AS src ON hyperlink.source_id == src.id 73 | JOIN domains AS dst ON hyperlink.target_id == dst.id 74 | GROUP BY src.domain, dst.domain 75 | """ 76 | 77 | conn.execute('BEGIN EXCLUSIVE') 78 | conn.execute(temp) 79 | 80 | log.info('Building domain list') 81 | conn.execute(insert_domains) 82 | conn.execute(index) 83 | 84 | domains = defaultdict(lambda: { 85 | 'page_count': 0, 86 | 'keywords': defaultdict(int), 87 | 'referrers': defaultdict(int), 88 | }) 89 | log.info('Counting domains') 90 | for domain, count in conn.execute(count_domains): 91 | 
domains[domain]['page_count'] = count 92 | 93 | log.info('Counting keywords') 94 | for domain, keyword, count in conn.execute(select_keywords): 95 | domains[domain]['keywords'][keyword] += count 96 | 97 | log.info('Counting referrers') 98 | for src, dst, count in conn.execute(select_hyperlinks): 99 | domains[dst]['referrers'][src] += count 100 | 101 | log.info('Filtering') 102 | for feed in conn.execute(select_feeds): 103 | del domains[feed[0]] 104 | 105 | if include: 106 | domains = {k: v for k, v in domains.items() 107 | if filter_by_domains(include)(k)} 108 | 109 | if exclude: 110 | domains = {k: v for k, v in domains.items() 111 | if filter_by_domains(exclude, True)(k)} 112 | 113 | with open(output / fmt, 'w+') as f: 114 | json.dump(domains, f) 115 | 116 | conn.rollback() 117 | log.info('Done.') 118 | 119 | 120 | help_text = """ 121 | Export a list of websites that are "uncharted" — websites that were not scraped 122 | as RSS feeds during a crawl, but were recorded in the database because other 123 | feeds mentioned them. 124 | 125 | That is, if you are using the cluster spider, this exporter will export the list 126 | of websites that are not crawled due to the spider hitting the depth limit 127 | — the outermost nodes. 128 | 129 | Note that websites that are "uncharted" because they were filtered out by the 130 | domain filter (instead of hitting the depth limit) or because they were not 131 | RSS feeds in the first place will also be included here. To only export sites 132 | under a certain domain, use ~+f domain under ...~, to exclude a certain domain, 133 | use ~-f domain under ...~. 134 | """ 135 | -------------------------------------------------------------------------------- /feedme/exporters/utils.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
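
A minimal sketch (not part of the repository) of consuming the uncharted exporter's output described above. It assumes the JSON layout written by export() — a mapping of domain to page_count / keywords / referrers — saved as out/uncharted.json inside the crawl directory; the crawl-directory path and the top-20 cutoff are illustrative assumptions:

import json
from pathlib import Path

# Rank uncharted domains by how many hyperlinks point at them, to pick
# candidates for a follow-up crawl (layout assumed from export() above).
data = json.loads(Path('my.crawl/out/uncharted.json').read_text())
ranked = sorted(data.items(),
                key=lambda kv: sum(kv[1]['referrers'].values()),
                reverse=True)
for domain, info in ranked[:20]:
    print(domain, info['page_count'], sum(info['referrers'].values()))
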
22 | 23 | import logging 24 | import operator 25 | import os 26 | import sqlite3 27 | from functools import wraps 28 | from pathlib import Path 29 | 30 | from scrapy.utils.url import url_is_from_any_domain 31 | 32 | from ..datastructures import labeled_sequence 33 | from ..sql.db import db 34 | from ..sql.functions import register_all 35 | 36 | log = logging.getLogger('exporter.utils') 37 | 38 | 39 | def subdomain(x, y): 40 | return x == y or x[-(len(y) + 1):] == f'.{y}' 41 | 42 | 43 | filter_ops = { 44 | 'is': operator.eq, 45 | 'under': subdomain, 46 | 'startswith': str.startswith, 47 | 'endswith': str.endswith, 48 | 'contains': operator.contains, 49 | 'gt': operator.gt, 50 | 'ge': operator.ge, 51 | 'lt': operator.lt, 52 | 'le': operator.le, 53 | } 54 | sql_ops = { 55 | ('is', 'None'): ('"%(column)s" IS NULL', '%s'), 56 | ('is', 'True'): ('"%(column)s" == 1', '%s'), 57 | ('is', 'False'): ('"%(column)s" == 0', '%s'), 58 | 'is': ('"%(column)s" == :%(id)d', '%s'), 59 | 'under': ('subdomain("%(column)s", :%(id)d)', '%s'), 60 | 'startswith': ('"%(column)s" LIKE :%(id)d', '%s%%'), 61 | 'endswith': ('"%(column)s" LIKE :%(id)d', '%%%s'), 62 | 'contains': ('"%(column)s" LIKE :%(id)d', '%%%s%%'), 63 | 'gt': ('"%(column)s" > :%(id)d', '%s'), 64 | 'ge': ('"%(column)s" >= :%(id)d', '%s'), 65 | 'lt': ('"%(column)s" < :%(id)d', '%s'), 66 | 'le': ('"%(column)s" <= :%(id)d', '%s'), 67 | } 68 | equivalencies = [('==', 'is'), ('in', 'contains'), ('>', 'gt'), ('<', 'lt'), ('>=', 'ge'), ('<=', 'le')] 69 | for k, v in equivalencies: 70 | filter_ops[k] = filter_ops[v] 71 | sql_ops[k] = sql_ops[v] 72 | 73 | 74 | def build_where_clause(includes=None, excludes=None): 75 | if not includes and not excludes: 76 | return '1', (), set() 77 | values = [] 78 | includes = includes or [] 79 | excludes = excludes or [] 80 | clauses = [] 81 | required_columns = set() 82 | for prefix, criteria in (('', includes), ('NOT ', excludes)): 83 | for key, op, val in criteria: 84 | required_columns.add(key) 85 | op = sql_ops.get((op, val), sql_ops[op]) 86 | values.append(op[1] % (val,)) 87 | value_id = len(values) 88 | clauses.append(prefix + op[0] % {'column': key, 'id': value_id}) 89 | clauses = ' AND '.join(clauses) 90 | values = labeled_sequence(values, start=1, as_str=True) 91 | return clauses, values, required_columns 92 | 93 | 94 | class MappingFilter: 95 | def __init__(self): 96 | self.filters = [] 97 | 98 | def includes(self, key, op, val): 99 | self.filters.append(lambda row, x=key, y=val, op=filter_ops[op]: op(row[x], y)) 100 | 101 | def excludes(self, key, op, val): 102 | self.filters.append(lambda row, x=key, y=val, op=filter_ops[op]: not op(row[x], y)) 103 | 104 | def __call__(self, item): 105 | return all(f(item) for f in self.filters) 106 | 107 | 108 | def with_db(exporter): 109 | @wraps(exporter) 110 | def e(wd, *args, **kwargs): 111 | wd = Path(wd) 112 | output = wd / 'out' 113 | os.makedirs(output, exist_ok=True) 114 | 115 | db_path = wd / 'index.db' 116 | if not db_path.exists(): 117 | raise FileNotFoundError(f'index.db not found in {wd}') 118 | 119 | conn = sqlite3.connect(db_path, isolation_level=None) 120 | if db.is_locked(conn): 121 | log.error('Database was left in a partially consistent state.') 122 | log.error('Run `python -m feedme check-db` to fix it first.') 123 | return 1 124 | 125 | conn.row_factory = sqlite3.Row 126 | db.verify_version(conn) 127 | register_all(conn) 128 | 129 | try: 130 | exporter(conn, wd, output, *args, **kwargs) 131 | finally: 132 | conn.close() 133 | return e 134 | 135 | 136 | def 
filter_by_domains(ls, exclude=False): 137 | domains = [] 138 | for key, op, val in ls: 139 | if key != 'domain' or op != 'under': 140 | log.warning(f'Unknown filter {key} {op}') 141 | continue 142 | domains.append(val) 143 | return lambda u: url_is_from_any_domain(u, domains) ^ exclude 144 | -------------------------------------------------------------------------------- /feedme/feedly.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from __future__ import annotations 24 | 25 | from datetime import datetime 26 | from typing import Any, Dict, Optional 27 | from urllib.parse import SplitResult, quote, urlsplit 28 | 29 | import attr 30 | from attr.converters import optional 31 | from attr.validators import instance_of 32 | 33 | from . 
import utils 34 | from .datastructures import Keywords 35 | from .utils import HyperlinkStore, JSONDict 36 | 37 | API_BASE = { 38 | 'scheme': 'https', 39 | 'netloc': 'cloud.feedly.com', 40 | 'fragment': '', 41 | } 42 | API_ENDPOINTS = { 43 | 'streams': '/v3/streams/contents', 44 | 'search': '/v3/search/feeds', 45 | 'feeds': '/v3/feeds/', 46 | } 47 | 48 | 49 | def build_api_url(endpoint, argument='', **params): 50 | if endpoint not in API_ENDPOINTS: 51 | raise ValueError(f'{endpoint} API is not supported') 52 | url = {**API_BASE, 'path': API_ENDPOINTS[endpoint] + quote(argument, safe='')} 53 | url['query'] = '&'.join([f'{quote(k)}={quote(str(v))}' for k, v in params.items()]) 54 | return SplitResult(**url).geturl() 55 | 56 | 57 | def get_feed_uri(s): 58 | return s.split('/', 1)[1] 59 | 60 | 61 | def lowercase_set(iterable=None): 62 | if not iterable: 63 | return set() 64 | return {str(k).lower() for k in iterable} 65 | 66 | 67 | @attr.s(kw_only=True, frozen=True) 68 | class FeedlyEntry: 69 | url: str = attr.ib(validator=instance_of(str)) 70 | source: str = attr.ib(repr=False) 71 | published: datetime = attr.ib(converter=utils.datetime_converters) 72 | updated: datetime = attr.ib(default=None, converter=optional(utils.datetime_converters), repr=False) 73 | 74 | keywords: Keywords = attr.ib(converter=utils.ensure_collection(lowercase_set), factory=lowercase_set, repr=False) 75 | author: Optional[str] = attr.ib(default='', repr=False) 76 | title: Optional[str] = attr.ib(default='', repr=False) 77 | 78 | markup: Dict[str, str] = attr.ib(factory=dict, repr=False) 79 | hyperlinks: HyperlinkStore = attr.ib(factory=HyperlinkStore, repr=False) 80 | 81 | @classmethod 82 | def from_upstream(cls, item: JSONDict) -> FeedlyEntry: 83 | data = {} 84 | for name in attr.fields_dict(cls): 85 | value = item.get(name) 86 | if value: 87 | data[name] = value 88 | data['url'] = cls._get_page_url(item) 89 | data['source'] = cls._get_source_url(item) 90 | entry = cls(**data) 91 | cls._set_markup(entry, item) 92 | return entry 93 | 94 | @staticmethod 95 | def _get_page_url(item): 96 | url = urlsplit(item.get('originId', '')) 97 | if url.netloc: 98 | url = url.geturl() 99 | else: 100 | url = '' 101 | alt = item.get('alternate') 102 | if alt and alt != 'none': 103 | url = alt[0]['href'] 104 | return url 105 | 106 | @staticmethod 107 | def _get_source_url(item): 108 | source = item.get('origin') 109 | if source: 110 | return get_feed_uri(source.get('streamId', '/')) 111 | return '' 112 | 113 | @staticmethod 114 | def _set_markup(entry, item): 115 | content = item.get('content', item.get('summary')) 116 | if content: 117 | content = content.get('content') 118 | if content: 119 | entry.add_markup('summary', content) 120 | 121 | @staticmethod 122 | def _filter_attrib(attrib: attr.Attribute, value: Any) -> bool: 123 | return attrib.name[0] != '_' 124 | 125 | def add_markup(self, name, markup): 126 | self.markup[name] = markup 127 | self.hyperlinks.parse_html(self.url, markup) 128 | 129 | def for_json(self) -> JSONDict: 130 | dict_ = attr.asdict(self, filter=self._filter_attrib) 131 | return dict_ 132 | -------------------------------------------------------------------------------- /feedme/logger.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the 
Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import sys 25 | from typing import Dict, Union 26 | 27 | try: 28 | import termcolor 29 | _ = termcolor.colored 30 | except ImportError: 31 | _ = None 32 | 33 | 34 | def compose_mappings(*mappings): 35 | base = {} 36 | base.update(mappings[0]) 37 | for m in mappings[1:]: 38 | for k, v in m.items(): 39 | if k in base and type(base[k]) is type(v): 40 | if isinstance(v, dict): 41 | base[k] = compose_mappings(base[k], v) 42 | elif isinstance(v, set): 43 | base[k] |= v 44 | elif isinstance(v, list): 45 | base[k].extend(v) 46 | else: 47 | base[k] = v 48 | else: 49 | base[k] = v 50 | return base 51 | 52 | 53 | class _LogContainer: 54 | pass 55 | 56 | 57 | class _ColoredFormatter(logging.Formatter): 58 | def __init__(self, fmt=None, datefmt=None, style='%', *, color='white'): 59 | super().__init__(fmt, datefmt, style) 60 | self.termcolor_args = lambda self, record: () 61 | if isinstance(color, str): 62 | self.termcolor_args = lambda self, record: (color,) 63 | elif isinstance(color, tuple): 64 | self.termcolor_args = lambda self, record: color 65 | elif callable(color): 66 | self.termcolor_args = color 67 | 68 | def format(self, record): 69 | color_args = self.termcolor_args(self, record) 70 | return _(super().format(record), *color_args) 71 | 72 | 73 | class _CascadingFormatter(logging.Formatter): 74 | def __init__( 75 | self, sections: str, 76 | stylesheet: Dict[str, Union[str, logging.Formatter]], 77 | style='%', stacktrace=None, datefmt=None, 78 | ): 79 | self.stylesheet = {} 80 | for section, fmt in stylesheet.items(): 81 | formatter = logging.Formatter(fmt) if isinstance(fmt, str) else fmt 82 | if section != stacktrace: 83 | formatter.formatException = lambda info: '' 84 | formatter.formatStack = lambda info: '' 85 | self.stylesheet[section] = formatter 86 | super().__init__(sections, datefmt, style) 87 | 88 | def format(self, record): 89 | parent = _LogContainer() 90 | for child, fmt in self.stylesheet.items(): 91 | setattr(parent, child, fmt.format(record)) 92 | return super().formatMessage(parent) 93 | 94 | @classmethod 95 | def from_config(cls, *, sections, stylesheet, **kwargs): 96 | stylesheet_ = {} 97 | for k, fmt in stylesheet.items(): 98 | if isinstance(fmt, str): 99 | stylesheet_[k] = fmt 100 | continue 101 | f_kwargs = {} 102 | f_kwargs.update(fmt) 103 | factory = f_kwargs.pop('()', logging.Formatter) 104 | stylesheet_[k] = factory(**f_kwargs) 105 | return cls(sections, stylesheet_, **kwargs) 106 | 107 | 108 | LOG_LEVEL_PREFIX_COLORS = { 109 | 'DEBUG': ('magenta', None, ['bold']), 110 | 'INFO': ('white', None, 
['bold']), 111 | 'WARNING': ('yellow', None, ['bold']), 112 | 'ERROR': ('red', None, ['bold']), 113 | 'CRITICAL': ('grey', 'on_red', ['bold']), 114 | } 115 | LOG_LEVEL_PREFIX_COLORS_DEBUG = { 116 | **LOG_LEVEL_PREFIX_COLORS, 117 | 'INFO': ('blue', None, ['bold']), 118 | } 119 | 120 | 121 | def _color_stacktrace(self, record: logging.LogRecord): 122 | return ('red',) if record.exc_info else ('white',) 123 | 124 | 125 | def _conditional_color(field, rules, default=('white',)): 126 | def fn(self, record): 127 | return rules.get(getattr(record, field), default) 128 | return fn 129 | 130 | 131 | FMT_PREFIX = '%(asctime)s %(levelname)8s' 132 | FMT_LOGGER = '[%(processName)s:%(name)s]' 133 | FMT_SOURCE = '(%(module)s.%(funcName)s:%(lineno)d)' 134 | 135 | formatter_styles = { 136 | 'standard': { 137 | 'normal': { 138 | 'format': f'{FMT_PREFIX} {FMT_LOGGER} %(message)s', 139 | }, 140 | 'colored': { 141 | '()': _CascadingFormatter.from_config, 142 | 'sections': '%(prefix)s %(name)s %(message)s', 143 | 'stylesheet': { 144 | 'prefix': { 145 | '()': _ColoredFormatter, 146 | 'fmt': FMT_PREFIX, 147 | 'color': _conditional_color('levelname', LOG_LEVEL_PREFIX_COLORS), 148 | }, 149 | 'name': { 150 | '()': _ColoredFormatter, 151 | 'fmt': FMT_LOGGER, 152 | 'color': 'blue', 153 | }, 154 | 'message': { 155 | '()': _ColoredFormatter, 156 | 'fmt': '%(message)s', 157 | 'color': _color_stacktrace, 158 | }, 159 | }, 160 | 'stacktrace': 'message', 161 | }, 162 | }, 163 | 'debug': { 164 | 'normal': { 165 | 'format': f'{FMT_PREFIX} {FMT_LOGGER}{FMT_SOURCE} %(message)s', 166 | }, 167 | 'colored': { 168 | '()': _CascadingFormatter.from_config, 169 | 'sections': '%(prefix)s %(name)s%(source)s %(message)s', 170 | 'stylesheet': { 171 | 'prefix': { 172 | '()': _ColoredFormatter, 173 | 'fmt': FMT_PREFIX, 174 | 'color': _conditional_color('levelname', LOG_LEVEL_PREFIX_COLORS_DEBUG), 175 | }, 176 | 'name': { 177 | '()': _ColoredFormatter, 178 | 'fmt': FMT_LOGGER, 179 | 'color': 'blue', 180 | }, 181 | 'source': { 182 | '()': _ColoredFormatter, 183 | 'fmt': FMT_SOURCE, 184 | 'color': 'cyan', 185 | }, 186 | 'message': { 187 | '()': _ColoredFormatter, 188 | 'fmt': '%(message)s', 189 | 'color': _color_stacktrace, 190 | }, 191 | }, 192 | 'stacktrace': 'message', 193 | }, 194 | }, 195 | } 196 | 197 | logging_config_template = { 198 | 'disable_existing_loggers': False, 199 | 'version': 1, 200 | 'handlers': { 201 | 'console': { 202 | 'class': 'logging.StreamHandler', 203 | 'stream': sys.stderr, 204 | }, 205 | }, 206 | 'loggers': { 207 | 'main': { 208 | 'level': logging.NOTSET, 209 | }, 210 | 'scrapy.core': { 211 | 'level': logging.NOTSET, 212 | }, 213 | 'scrapy.core.engine': { 214 | 'level': logging.NOTSET, 215 | }, 216 | 'scrapy.crawler': { 217 | 'level': logging.WARNING, 218 | }, 219 | 'scrapy.middleware': { 220 | 'level': logging.WARNING, 221 | }, 222 | 'twisted': { 223 | 'level': logging.ERROR, 224 | }, 225 | }, 226 | 'root': { 227 | 'handlers': ['console'], 228 | }, 229 | } 230 | 231 | 232 | def make_logging_config( 233 | app_name, *overrides, level=logging.INFO, 234 | style='standard', colored=True, datefmt=None, 235 | logfile=None, **kwargs, 236 | ): 237 | color_mode = 'colored' if colored and _ else 'normal' 238 | if style in formatter_styles: 239 | formatter = formatter_styles[style][color_mode] 240 | else: 241 | formatter = style 242 | 243 | app_logging_config = { 244 | 'formatters': { 245 | 'default_fmt': formatter, 246 | }, 247 | 'handlers': { 248 | 'console': { 249 | 'formatter': 'default_fmt', 250 | 'level': level, 251 | 
}, 252 | }, 253 | 'loggers': { 254 | f'{app_name}': { 255 | 'level': logging.NOTSET, 256 | }, 257 | }, 258 | 'root': { 259 | 'level': level, 260 | }, 261 | } 262 | 263 | file_handler_config = {} 264 | if logfile: 265 | file_handler_config = { 266 | 'formatters': { 267 | 'no_color': (formatter_styles[style]['normal'] 268 | if style in formatter_styles else style), 269 | }, 270 | 'handlers': { 271 | 'file': { 272 | 'class': 'logging.FileHandler', 273 | 'filename': logfile, 274 | 'formatter': 'no_color', 275 | }, 276 | }, 277 | 'root': { 278 | 'handlers': ['file'], 279 | }, 280 | } 281 | 282 | datefmt_config = {} 283 | if datefmt: 284 | datefmt_config = { 285 | 'formatters': { 286 | 'default_fmt': { 287 | 'datefmt': datefmt, 288 | }, 289 | 'no_color': { 290 | 'datefmt': datefmt, 291 | }, 292 | }, 293 | } 294 | 295 | log_config = compose_mappings( 296 | logging_config_template, 297 | app_logging_config, 298 | file_handler_config, 299 | datefmt_config, 300 | *overrides, 301 | ) 302 | return log_config 303 | 304 | 305 | class _LoggingParticipant: 306 | def __init__(self, *args, _logger=None, **kwargs): 307 | if _logger: 308 | self.log: logging.Logger = _logger 309 | elif isinstance(getattr(self, '_logger_name', None), str): 310 | self.log: logging.Logger = logging.getLogger(self._logger_name) 311 | self.log.disabled = True 312 | else: 313 | raise NotImplementedError('_logger_name is not defined') 314 | -------------------------------------------------------------------------------- /feedme/requests.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
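
A minimal sketch (not part of the repository) of applying the configuration built by make_logging_config() in feedme/logger.py above. The returned mapping follows the standard library's dictConfig schema; the app name, level, style, and logfile values here are illustrative assumptions:

import logging
import logging.config

from feedme.logger import make_logging_config

# Build a config for the 'feedme' logger hierarchy at DEBUG verbosity,
# using the 'debug' formatter style plus a plain-text logfile, then hand
# it to the stdlib dictConfig machinery.
config = make_logging_config('feedme', level=logging.DEBUG,
                             style='debug', logfile='crawl.log')
logging.config.dictConfig(config)
logging.getLogger('feedme.demo').info('logging configured')
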
22 | 23 | from scrapy import Request 24 | 25 | 26 | class ProbeFeed(Request): 27 | def __init__(self, *, url, callback, source=None, **kwargs): 28 | meta = kwargs.pop('meta', {}) 29 | meta['feed_url'] = url 30 | meta['is_probe'] = True 31 | meta['pkey'] = (url, 'search') 32 | super().__init__(url=url, callback=callback, meta=meta, **kwargs) 33 | self.priority = source.priority + 5 if source else self.priority + 5 34 | 35 | 36 | def reconstruct_request(cls, instance, **kwargs): 37 | callback = kwargs.pop('callback') 38 | callback = getattr(instance, callback) 39 | return cls(callback=callback, **kwargs) 40 | -------------------------------------------------------------------------------- /feedme/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for feedly project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | __version__ = '0.10.11' 11 | 12 | BOT_NAME = 'feedly' 13 | 14 | SPIDER_MODULES = ['feedme.spiders'] 15 | NEWSPIDER_MODULE = 'feedme.spiders' 16 | 17 | LOG_ENABLED = True 18 | LOG_LEVEL = 20 19 | 20 | LOGSTATS_INTERVAL = 60.0 21 | METRICS_CALC_INTERVAL = 20.0 22 | 23 | LOG_VIOLATIONS = False 24 | STATS_DUMP = False 25 | 26 | COMMANDS_MODULE = 'feedme.commands' 27 | 28 | # This program uses a custom logging config (see __init__.py) 29 | # To give control of logging back to Scrapy, set this to False 30 | CUSTOM_LOGGING_ENABLED = True 31 | 32 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 33 | USER_AGENT = f'Mozilla/5.0 (compatible; hyperlinkfeedme/{__version__}; +https://github.com/monotony113/feedly-link-feedme)' 34 | 35 | # Obey robots.txt rules 36 | ROBOTSTXT_OBEY = True 37 | 38 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 39 | # CONCURRENT_REQUESTS = 32 40 | 41 | # Configure a delay for requests for the same website (default: 0) 42 | # See https://docs.scrapy.org/en/latest/topics/settings.html# download-delay 43 | # See also autothrottle settings and docs 44 | # DOWNLOAD_DELAY = 3 45 | # The download delay setting will honor only one of: 46 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 47 | # CONCURRENT_REQUESTS_PER_IP = 16 48 | 49 | # Disable cookies (enabled by default) 50 | COOKIES_ENABLED = False 51 | 52 | # Disable Telnet Console (enabled by default) 53 | # TELNETCONSOLE_ENABLED = False 54 | 55 | # Override the default request headers: 56 | DEFAULT_REQUEST_HEADERS = { 57 | 'Accept': 'application/json;q=0.9;text/html,application/xhtml+xml,application/xml;q=0.8,*/*;q=0.7', 58 | } 59 | 60 | # Enable or disable spider middlewares 61 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 62 | SPIDER_MIDDLEWARES = { 63 | 'scrapy.spidermiddlewares.depth.DepthMiddleware': None, 64 | 'feedme.middlewares.RequestDefrosterSpiderMiddleware': 100, 65 | 'feedme.middlewares.DerefItemSpiderMiddleware': 101, 66 | 'feedme.middlewares.OffsiteFeedSpiderMiddleware': 500, 67 | 'feedme.middlewares.ConditionalDepthSpiderMiddleware': 550, 68 | 'feedme.middlewares.FetchSourceSpiderMiddleware': 600, 69 | 'feedme.middlewares.CrawledItemSpiderMiddleware': 800, 70 | 'feedme.spiders.cluster.ExplorationSpiderMiddleware': 900, 71 | } 72 | 73 
| HTTPERROR_ALLOWED_CODES = [403, 404] 74 | 75 | # Enable or disable downloader middlewares 76 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 77 | DOWNLOADER_MIDDLEWARES = { 78 | 'feedme.middlewares.RequestPersistenceDownloaderMiddleware': 150, 79 | 'feedme.middlewares.FeedProbingDownloaderMiddleware': 200, 80 | 'feedme.middlewares.HTTPErrorDownloaderMiddleware': 500, 81 | 'feedme.middlewares.AuthorizationDownloaderMiddleware': 600, 82 | } 83 | 84 | # Enable or disable extensions 85 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 86 | EXTENSIONS = { 87 | 'scrapy.extensions.logstats.LogStats': None, 88 | 'feedme.extensions._LoggingHelper': 99, 89 | 'feedme.extensions.PresetLoader': 100, 90 | 'feedme.extensions.SettingsLoader': 101, 91 | 'feedme.extensions.LogStatsExtended': 102, 92 | 'feedme.extensions.RequestMetrics': 102, 93 | 'feedme.extensions.ContribMiddleware': 200, 94 | 'feedme.extensions.GlobalPersistence': 999, 95 | } 96 | 97 | # Configure item pipelines 98 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 99 | ITEM_PIPELINES = { 100 | # 'feedme.pipelines.CompressedStreamExportPipeline': 900, 101 | # 'feedme.pipelines.SQLiteExportPipeline': 900, 102 | 'feedme.pipelines.SQLiteExportProcessPipeline': 900, 103 | } 104 | 105 | AUTO_LOAD_PREDEFINED_PRESETS = True 106 | 107 | # Enable and configure the AutoThrottle extension (disabled by default) 108 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 109 | AUTOTHROTTLE_ENABLED = True 110 | # The initial download delay 111 | AUTOTHROTTLE_START_DELAY = 0 112 | # The maximum download delay to be set in case of high latencies 113 | AUTOTHROTTLE_MAX_DELAY = 60 114 | # The average number of requests Scrapy should be sending in parallel to 115 | # each remote server 116 | AUTOTHROTTLE_TARGET_CONCURRENCY = 1.5 117 | # Enable showing throttling stats for every response received: 118 | AUTOTHROTTLE_DEBUG = False 119 | 120 | # Enable and configure HTTP caching (disabled by default) 121 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html# httpcache-middleware-settings 122 | # HTTPCACHE_ENABLED = True 123 | # HTTPCACHE_EXPIRATION_SECS = 0 124 | # HTTPCACHE_DIR = 'httpcache' 125 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 126 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 127 | 128 | DEPTH_LIMIT = 1 129 | -------------------------------------------------------------------------------- /feedme/signals.py: -------------------------------------------------------------------------------- 1 | register_state = object() 2 | show_stats = object() 3 | 4 | start_from_scratch = object() 5 | resume_requests = object() 6 | request_finished = object() 7 | -------------------------------------------------------------------------------- /feedme/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonywu7/feedly-link-aggregator/7c32b45fbfa1794b081e90add033403308a2da85/feedme/spiders/__init__.py -------------------------------------------------------------------------------- /feedme/spiders/base.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including 
without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from __future__ import annotations 24 | 25 | import os 26 | import time 27 | from abc import ABC, abstractmethod 28 | from datetime import datetime 29 | from pprint import pformat 30 | from typing import Optional, Union 31 | 32 | from scrapy import Spider 33 | from scrapy.exceptions import CloseSpider 34 | from scrapy.http import Request, TextResponse 35 | from scrapy.signals import spider_opened 36 | 37 | from ..feedly import FeedlyEntry, build_api_url, get_feed_uri 38 | from ..requests import ProbeFeed 39 | from ..signals import (request_finished, resume_requests, show_stats, 40 | start_from_scratch) 41 | from ..urlkit import build_urls, select_templates 42 | from ..utils import LOG_LISTENER, JSONDict 43 | from ..utils import colored as _ 44 | from ..utils import guard_json 45 | 46 | 47 | class FeedlyRSSSpider(Spider, ABC): 48 | custom_settings = { 49 | 'ROBOTSTXT_OBEY': False, 50 | } 51 | 52 | class SpiderConfig: 53 | OUTPUT = f'./crawl.{datetime.now().strftime("%Y%m%d%H%M%S")}' 54 | 55 | RSS = 'https://xkcd.com/atom.xml' 56 | RSS_TEMPLATES = {} 57 | 58 | DOWNLOAD_ORDER = 'oldest' 59 | DOWNLOAD_PER_BATCH = 1000 60 | 61 | ENABLE_SEARCH = False 62 | ACCESS_TOKEN = None 63 | 64 | STREAM_ID_PREFIX = 'feed/' 65 | 66 | DATABASE_CACHE_SIZE = 100000 67 | 68 | SELECTION_STRATS = { 69 | 'dead': {None: 1, True: 1, False: 0}, 70 | 'alive': {None: 1, True: 0, False: 1}, 71 | 'dead+': {None: 1, True: 1, False: -128}, 72 | 'alive+': {None: 1, True: -128, False: 1}, 73 | 'all': {None: 1, True: 1, False: 1}, 74 | } 75 | 76 | @classmethod 77 | def from_crawler(cls, crawler, *args, **kwargs): 78 | spider: FeedlyRSSSpider = super().from_crawler(crawler, *args, config=crawler.settings, **kwargs) 79 | spider.stats = crawler.stats 80 | spider.signals = crawler.signals 81 | crawler.signals.connect(spider.open_spider, spider_opened) 82 | crawler.signals.connect(spider.resume_crawl, resume_requests) 83 | crawler.signals.send_catch_log(show_stats, names=['rss/page_count']) 84 | return spider 85 | 86 | def __init__(self, *, name=None, config, **kwargs): 87 | super().__init__(name=name, **kwargs) 88 | self.config = config 89 | self.item_limit = config.getint('DOWNLOAD_LIMIT', 0) 90 | 91 | output_dir = config['OUTPUT'] 92 | os.makedirs(output_dir, exist_ok=True) 93 | 94 | self.api_base_params = { 95 | 'count': int(config['DOWNLOAD_PER_BATCH']), 96 | 'ranked': config['DOWNLOAD_ORDER'], 97 | 'similar': 'true', 98 | 'unreadOnly': 'false', 99 | } 100 | 101 | self.freezer = None 102 | self.resume_iter = None 103 | 104 | def open_spider(self, spider): 105 | conf = self.config['SPIDER_CONFIG'] 106 | 
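        # SPIDER_CONFIG holds the merged spider options (the SpiderConfig
        # defaults above plus preset and -s overrides; an inference from the
        # surrounding code rather than something documented here), and
        # copy_to_dict() turns it into a plain dict for the log line below.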
self.logger.info(f'Spider parameters:\n{pformat(conf.copy_to_dict())}') 107 | 108 | @abstractmethod 109 | def start_requests(self): 110 | self.signals.send_catch_log(resume_requests, spider=self) 111 | yield from self.resume_crawl() 112 | 113 | def resume_crawl(self): 114 | freezer = self.freezer 115 | may_resume = False 116 | if freezer is not None: 117 | requests = freezer.defrost(self) 118 | try: 119 | req = next(requests) 120 | except StopIteration: 121 | pass 122 | else: 123 | may_resume = self.ask_if_resume(freezer) 124 | 125 | if not may_resume: 126 | feed = self.config['RSS'] 127 | freezer.dump_info({'crawling': feed}) 128 | self.signals.send_catch_log(start_from_scratch) 129 | yield self.probe_feed(feed, meta={'reason': 'user_specified', 'depth': 1}) 130 | return 131 | 132 | self.logger.info(_('Resuming crawl.', color='cyan')) 133 | self.resume_iter = requests 134 | yield req 135 | 136 | def ask_if_resume(self, freezer): 137 | feed = self.config['RSS'] 138 | resume_feed = freezer.load_info().get('crawling') 139 | if resume_feed != feed: 140 | self.logger.info(_('Found unfinished crawl job:', color='cyan')) 141 | self.logger.info(_(f"Continue crawling '{resume_feed}'?", color='cyan')) 142 | self.logger.info(_(f"Start new crawl with '{feed}'?", color='cyan')) 143 | self.logger.info(_('Or exit?', color='cyan')) 144 | action = 'x' 145 | else: 146 | action = 'c' 147 | 148 | LOG_LISTENER.stop() 149 | while action not in 'cse': 150 | action = input('(continue/start/exit) [c]: ')[:1] 151 | LOG_LISTENER.start() 152 | 153 | if action == 'e': 154 | raise CloseSpider() 155 | if action == 's': 156 | freezer.clear() 157 | freezer.dump_info({'crawling': feed}) 158 | return False 159 | return True 160 | 161 | def get_streams_url(self, feed_id: str, **params) -> str: 162 | params = {**self.api_base_params, **params} 163 | return build_api_url('streams', streamId=feed_id, **params) 164 | 165 | def probe_feed(self, query: str, derive: bool = True, source: Optional[Request] = None, **kwargs): 166 | templates = self.config['RSS_TEMPLATES'] 167 | if derive and templates: 168 | try: 169 | urls = build_urls(query, *select_templates(query, templates)) 170 | except ValueError: 171 | self.logger.debug(f'No template for {query}') 172 | urls = [query] 173 | else: 174 | urls = [query] 175 | 176 | prefix = self.config['STREAM_ID_PREFIX'] 177 | meta = kwargs.pop('meta', {}) 178 | meta['try_feeds'] = {f'{prefix}{u}': None for u in urls} 179 | return ProbeFeed(url=query, callback=self.start_feeds, meta=meta, source=source, **kwargs) 180 | 181 | def start_feeds(self, response: TextResponse): 182 | meta = response.meta 183 | self.signals.send_catch_log(request_finished, request=response.request.copy()) 184 | 185 | del meta['is_probe'] 186 | feeds = meta.get('valid_feeds') 187 | if feeds is None: 188 | feeds = meta.get('try_feeds', {}) 189 | if not feeds and meta['reason'] == 'user_specified': 190 | self.logger.info(f'No valid RSS feed can be found using `{meta["feed_url"]}` and available feed templates.') 191 | self.logger.critical('No feed to crawl!') 192 | 193 | yield from self.filter_feeds(feeds, meta) 194 | yield from self.get_feed_info(feeds, meta) 195 | 196 | def filter_feeds(self, feeds, meta): 197 | for feed in feeds: 198 | yield self.next_page({'id': feed}, meta=meta, initial=True) 199 | 200 | def get_feed_info(self, feeds, meta): 201 | feed_info = meta.get('feed_info', {}) 202 | for feed, info in feed_info.items(): 203 | yield {'source': info, 'dead': feeds.get(feed)} 204 | 205 | def next_page(self, data: 
JSONDict, response: Optional[TextResponse] = None, initial: bool = False, **kwargs) -> Union[JSONDict, Request]: 206 | feed = data['id'] 207 | 208 | if response: 209 | meta = {**response.meta} 210 | else: 211 | meta = {} 212 | meta.update(kwargs.pop('meta', {})) 213 | if not initial: 214 | meta['no_filter'] = True 215 | meta.pop('inc_depth', None) 216 | 217 | feed_url = get_feed_uri(feed) 218 | meta['feed_url'] = feed_url 219 | 220 | meta['pkey'] = (feed_url, 'main') 221 | 222 | params = {} 223 | cont = data.get('continuation') 224 | if cont: 225 | params['continuation'] = cont 226 | meta['reason'] = 'continuation' 227 | elif not initial: 228 | self.logger.info(f'Exhausted: {feed_url}') 229 | self.signals.send_catch_log(request_finished, request=response.request.copy()) 230 | return 231 | 232 | depth = meta.get('depth') 233 | reason = meta.get('reason') 234 | self.logger.debug(f'initial={initial} depth={depth} reason={reason} {feed}') 235 | 236 | url = self.get_streams_url(feed, **params) 237 | if response: 238 | return response.request.replace(url=url, meta=meta, **kwargs) 239 | return Request(url, callback=self.parse_feed, meta=meta, **kwargs) 240 | 241 | def parse_feed(self, response: TextResponse): 242 | if not response: 243 | return 244 | 245 | data = guard_json(response.text) 246 | items = data.get('items') 247 | source = response.meta['feed_url'] 248 | if items: 249 | response.meta['valid_feed'] = True 250 | if response.meta.get('reason') != 'continuation': 251 | self.logger.info(_(f'Got new feed: {source}', color='green')) 252 | 253 | count = response.meta.get('item_scraped', 0) 254 | for item in items: 255 | entry = FeedlyEntry.from_upstream(item) 256 | if not entry.source: 257 | entry.source = {'feed': source} 258 | if not entry: 259 | continue 260 | 261 | self.stats.inc_value('rss/page_count') 262 | 263 | depth = response.meta.get('depth', 0) 264 | yield { 265 | 'item': entry, 266 | 'depth': depth, 267 | 'time_crawled': time.time(), 268 | } 269 | count += 1 270 | response.meta['item_scraped'] = count 271 | 272 | if self.item_limit and count >= self.item_limit: 273 | return 274 | 275 | next_page = self.next_page(data, response=response) 276 | if next_page: 277 | yield next_page 278 | -------------------------------------------------------------------------------- /feedme/spiders/cluster.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | from collections import defaultdict 25 | from urllib.parse import urlsplit 26 | 27 | from scrapy.crawler import Crawler 28 | from scrapy.exceptions import NotConfigured 29 | from scrapy.http import Request, TextResponse 30 | 31 | from ..datastructures import compose_mappings 32 | from ..docs import OptionsContributor 33 | from ..feedly import FeedlyEntry 34 | from ..signals import (register_state, request_finished, show_stats, 35 | start_from_scratch) 36 | from ..utils import SpiderOutput 37 | from ..utils import colored as _ 38 | from .base import FeedlyRSSSpider 39 | 40 | 41 | class ExplorationSpiderMiddleware: 42 | @classmethod 43 | def from_crawler(cls, crawler): 44 | if crawler.spidercls is not FeedClusterSpider: 45 | raise NotConfigured() 46 | 47 | crawler.signals.send_catch_log(show_stats, names=[ 48 | 'rss/hyperlink_count', 49 | 'cluster/1_discovered_nodes', 50 | 'cluster/2_scheduled_nodes', 51 | 'cluster/3_finished_nodes', 52 | 'cluster/4_explored', 53 | ]) 54 | return cls(crawler) 55 | 56 | def __init__(self, crawler: Crawler): 57 | self.stats = crawler.stats 58 | self.logger = logging.getLogger('explore') 59 | self._depth_limit = crawler.settings.getint('DEPTH_LIMIT', 1) 60 | self._threshold = crawler.settings.getint('EXPANSION_THRESHOLD', 0) 61 | self._discovered = defaultdict(int) 62 | self._scheduled = set() 63 | self._finished = set() 64 | 65 | crawler.signals.connect(self.clear_state_info, start_from_scratch) 66 | crawler.signals.connect(self.update_finished, request_finished) 67 | crawler.signals.send_catch_log( 68 | register_state, obj=self, namespace='explore', 69 | attrs=['_discovered', '_scheduled', '_finished'], 70 | ) 71 | 72 | def process_spider_output(self, response: TextResponse, result: SpiderOutput, spider): 73 | depth = response.meta.get('depth', 0) 74 | for data in result: 75 | if isinstance(data, Request): 76 | yield data 77 | continue 78 | if 'item' in data: 79 | item = data['item'] 80 | self.stats.inc_value('rss/page_count') 81 | yield from self.process_item(response, item, depth, spider) 82 | yield data 83 | 84 | def process_item( 85 | self, response: TextResponse, 86 | item: FeedlyEntry, depth: int, 87 | spider, 88 | ): 89 | dest = {urlsplit(k): v for k, v in item.hyperlinks.items()} 90 | dest = {k: v for k, v in dest.items() if k.netloc} 91 | self.stats.inc_value('rss/hyperlink_count', len(dest)) 92 | 93 | for u in dest: 94 | self._discovered[f'{u.scheme}://{u.netloc}'] += 1 95 | 96 | if not self._depth_limit or depth < self._depth_limit: 97 | yield from self.schedule_new_nodes(item, depth, response.request, spider) 98 | 99 | self.update_ratio() 100 | 101 | def schedule_new_nodes(self, item, depth, request, spider): 102 | sites = ({u for u, v in self._discovered.items() if v > self._threshold} 103 | - self._scheduled) 104 | self._scheduled |= sites 105 | self.logger.debug(f'depth={depth}; +{len(sites)}') 106 | 107 | for url in sites: 108 | self.logger.debug(f'{url} (depth={depth})') 109 | yield spider.probe_feed( 110 | url, source=request, 111 | meta={ 112 | 'inc_depth': 1, 113 | 'depth': depth, 114 | 'reason': 'newly_discovered', 115 | 'source_item': item, 116 | }) 117 | 118 | def update_finished(self, request: Request): 119 | if 'is_probe' 
in request.meta: 120 | return 121 | feed_url = request.meta.get('feed_url') 122 | if not feed_url: 123 | return 124 | self._finished.add(urlsplit(feed_url).netloc) 125 | self.stats.set_value('cluster/3_finished_nodes', len(self._finished)) 126 | self.update_ratio() 127 | 128 | def update_ratio(self): 129 | scheduled = len(self._scheduled) 130 | self.stats.set_value('cluster/1_discovered_nodes', len(self._discovered)) 131 | self.stats.set_value('cluster/2_scheduled_nodes', scheduled) 132 | finished = self.stats.get_value('cluster/3_finished_nodes', 0) 133 | if not scheduled: 134 | return 135 | ratio = finished / scheduled 136 | self.stats.set_value('cluster/4_explored', f'{ratio * 100:.2f}%') 137 | 138 | def clear_state_info(self): 139 | self._discovered.clear() 140 | self._scheduled.clear() 141 | self._finished.clear() 142 | 143 | 144 | class FeedClusterSpider(FeedlyRSSSpider, OptionsContributor, _doc_order=9): 145 | """ 146 | Spider to crawl a group of feeds. 147 | 148 | It works by recursively trying to crawl websites found in the contents of a feed, 149 | until it hits the depth limit, or until no more crawlable website can be found. 150 | 151 | Usage 152 | ----- 153 | `scrapy crawl cluster -s OPTIONS=... ...` 154 | 155 | This spider supports all options supported by the single feed spider. 156 | """ 157 | 158 | name = 'cluster' 159 | 160 | custom_settings = compose_mappings(FeedlyRSSSpider.custom_settings, { 161 | 'DEPTH_PRIORITY': 1, 162 | 'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue', 163 | 'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue', 164 | }) 165 | 166 | class SpiderConfig(FeedlyRSSSpider.SpiderConfig): 167 | FOLLOW_DOMAINS = None 168 | DEPTH_LIMIT = 1 169 | 170 | def start_requests(self): 171 | return super().start_requests() 172 | 173 | def filter_feeds(self, feeds, meta): 174 | if meta['reason'] == 'user_specified': 175 | for feed in feeds: 176 | yield self.next_page({'id': feed}, meta=meta, initial=True) 177 | return 178 | 179 | select = self.config.get('SELECT_FEED_STATE', 'all') 180 | for feed, dead in feeds.items(): 181 | prio = self.SELECTION_STRATS[select][dead] 182 | if not prio: 183 | self.logger.info(_(f'Dropped {"dead" if dead else "living"} feed {feed[5:]}', color='grey')) 184 | else: 185 | yield self.next_page({'id': feed}, meta=meta, initial=True, priority=prio) 186 | 187 | @staticmethod 188 | def _help_options(): 189 | return { 190 | 'EXPANSION_THRESHOLD': """ 191 | Number of times a website must be mentioned by a feed before it will be scheduled. 192 | 193 | Set to a number > 1 to filter out sites that are only mentioned a few times. 194 | """, 195 | 'FOLLOW_DOMAINS': """ 196 | Only nodes whose domains or parent domains are included here will be expanded upon. 197 | 198 | Value should be a collection of domains. (Other nodes are still recorded, 199 | but are not used to find new feeds). 200 | 201 | If set to None, spider will not filter nodes based on domains. 202 | 203 | **Example** 204 | 205 | `FOLLOW_DOMAINS = ['tumblr.com', 'wordpress.com']` 206 | """, 207 | 'DEPTH_LIMIT': """ 208 | How much the spider will expand the cluster. Value should be an integer. 209 | 210 | (This is the same settings as the one used by the built-in ~DepthMiddleware~.) 211 | 212 | Nodes that are more than `depth + 1` degree removed from the starting feed 213 | will not be expanded upon. 214 | 215 | If set to ~1~, only the starting feed will be crawled. 216 | If set to ~0~ or ~None~, spider will keep crawling until manually stopped. 
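
            **Example** (illustrative, not from the original help text)

            `DEPTH_LIMIT = 2` should let the spider expand into feeds discovered
            in the starting feed, but stop before feeds that are only found in
            those.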
217 | """, 218 | 'SELECT_FEED_STATE': """ 219 | Only crawl feeds that are of a certain `state`. 220 | 221 | A feed can be in one of two states: 222 | `dead` - The feed URL is unreachable (e.g. timed out); or a HEAD request 223 | returns a status code other than `200 OK`, `206 Partial`, or 224 | `405 Method Not Allowed`; 225 | or the responded MIME type is anything other than that of a 226 | valid RSS feed `(text/xml, application/xml, application/rss+xml,` 227 | `application/rdf+xml, application/atom+xml)`. 228 | `alive` - All other feeds are considered alive. 229 | 230 | This option accepts the following values: 231 | ~all~ - Do not filter feeds based on their state 232 | ~dead~ - Only crawl dead feeds 233 | ~alive~ - Only crawl living feeds 234 | ~dead+~ - Crawl all feeds, but dead feeds receive a higher priority 235 | ~alive+~ - Crawl all feeds, but living feeds receive a higher priority 236 | 237 | Note that values other than `all` cause the spider to send a HEAD request to 238 | each feed URL about to be crawled, which will add a slight overhead to the running time. 239 | """, 240 | } 241 | -------------------------------------------------------------------------------- /feedme/spiders/feed.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from __future__ import annotations 24 | 25 | from ..docs import OptionsContributor 26 | from .base import FeedlyRSSSpider 27 | 28 | 29 | class FeedSpider(FeedlyRSSSpider, OptionsContributor, _doc_order=10): 30 | """ 31 | Spider to crawl a single feed. 32 | 33 | Usage 34 | ----- 35 | `scrapy crawl feed -s OPTIONS=... ...` 36 | """ 37 | 38 | name = 'feed' 39 | 40 | def start_requests(self): 41 | return super().start_requests() 42 | 43 | @staticmethod 44 | def _help_options(): 45 | return { 46 | 'OUTPUT': """ 47 | Path where scraped data will be saved; will be a directory 48 | If an existing directory with scraped data is specified, newly gathered 49 | data will be merged with existing one. 50 | """, 51 | 'RSS': """ 52 | URL to the RSS feed you would like to scrape. 53 | Must contain the protocol part of the URL, e.g. `http://`. 54 | """, 55 | 'DOWNLOAD_ORDER': """ 56 | The part of the feed to download first: either `oldest` or `newest` 57 | """, 58 | 'DOWNLOAD_PER_BATCH': """ 59 | Number of entries to download per API request. 
The minimum is 1 and the maximum is 1000. 60 | """, 61 | 'DOWNLOAD_LIMIT': """ 62 | Number of items to scrape from one RSS feed source before stopping. 63 | 64 | Note that one website may have multiple sources e.g. if using feed templates. This number 65 | applies to each source. 66 | """, 67 | 'RSS_TEMPLATES': """ 68 | Templates to generate different versions of RSS URLs based on the value of the RSS setting. 69 | 70 | Because Feedly sometimes store an RSS feed's source URL with slight variations (e.g. using 71 | HTTP instead of HTTPS), the URL that you provide above may yield incomplete results 72 | (sometimes no result at all). 73 | 74 | If you know how the URLs could vary, this option allows you to define URL templates, 75 | so that Scrapy can try different versions of URLs to increase the chance of finding the 76 | correct feed ID on Feedly. 77 | 78 | This option should be a mapping (a dict), where the key should be a valid regular 79 | expression that matches the URLs you wish to apply the corresponding the templates, and the 80 | value should be either another mapping, where the key is a %-format string with named placeholders, 81 | which will be formatted into the final URL and the value is a number that denotes the priority of the 82 | template: templates with a lower number are tried first (similar to how Scrapy middlewares are ordered). 83 | 84 | Or it could also be a callable, in which case it is passed the matched URL as a `urlsplit` tuple, 85 | and the regex match object, and it should return an iterable. 86 | 87 | Note that only the templates under the first matching pattern are used. Since dicts are ordered 88 | you should place more specific patterns at the top of the mapping. 89 | 90 | Available placeholders are: 91 | **The components of a urllib.parse.urlsplit named tuple:** 92 | ~%(scheme)s~ - Network protocol (usually `http` or `https`) 93 | ~%(netloc)s~ - Domain name 94 | ~%(path)s~ - Path of the URL, with leading / and without the query string 95 | ~%(query)s~ - Query string, without the question mark (`key1=value1&key2=value2...`) 96 | **Plus some convenient values:** 97 | ~%(original)s~ - The original string, unchanged 98 | ~%(network_path)s~ - URL minus the protocol part, equivalent to `//%(netloc)s/%(path)s?%(query)s` 99 | ~%(path_query)s~ - URL minus protocol and domain name, equivalent to `/%(path)s?%(query)s` 100 | **If you define capture groups in your pattern:** 101 | ~%(key)s ...~ - Named groups 102 | ~%(1)s, %(2)s~ - Numbered groups 103 | 104 | **Example**: 105 | 106 | `RSS_TEMPLATES = {` 107 | ` r'.*': { # This regular expression will match any strings` 108 | ` 'http:%(network_path)s': 997,` 109 | ` 'https:%(network_path)s': 998,` 110 | ` '%(original)s': 999,` 111 | ` },` 112 | `}` 113 | """, 114 | 'ENABLE_SEARCH': """ 115 | Whether or not to enable the search function 116 | If enabled, when the feed URL you provided above does not yield any result from Feedly, 117 | Scrapy will use Feedly's Search API to try to find the correct URL. 118 | 119 | It is recommended that you disable search when using the cluster spider, because it could generate 120 | a large number of search requests, and Feedly's Search API is a lot more sensitive to 121 | high volume requests than its Streams API, meaning you may quickly run into rate-limiting issues. 122 | """, 123 | 'ACCESS_TOKEN': """ 124 | If you have a developer access token, you can provide it here. 
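
            **Example** (the token shown is a made-up placeholder)

            `ACCESS_TOKEN = 'A0T3xampleT0ken'`

            When set, the token is presumably attached to Feedly API requests by
            the `AuthorizationDownloaderMiddleware` enabled in the settings.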
125 | """, 126 | } 127 | -------------------------------------------------------------------------------- /feedme/spiders/settings.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import re 24 | from pathlib import Path 25 | from typing import List 26 | from urllib.parse import unquote 27 | 28 | from scrapy.utils.url import add_http_if_no_scheme 29 | 30 | 31 | def single_item(f): 32 | def wrapped(*args, **kwargs): 33 | return {f.__name__.upper(): f(*args, **kwargs)} 34 | return wrapped 35 | 36 | 37 | class SettingsAdapter: 38 | @staticmethod 39 | def output(v): 40 | p = Path(v) 41 | return {'OUTPUT': p, 'JOBDIR': p / 'scheduled/jobs'} 42 | 43 | @staticmethod 44 | @single_item 45 | def rss(v): 46 | return add_http_if_no_scheme(unquote(v)) 47 | 48 | @staticmethod 49 | @single_item 50 | def rss_templates(conf): 51 | return {re.compile(k): v for k, v in conf.items()} 52 | 53 | @staticmethod 54 | @single_item 55 | def follow_domains(domains): 56 | if isinstance(domains, str): 57 | domains = set(domains.split(' ')) 58 | elif isinstance(domains, List): 59 | domains = set(domains) 60 | return domains 61 | -------------------------------------------------------------------------------- /feedme/sql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonywu7/feedly-link-aggregator/7c32b45fbfa1794b081e90add033403308a2da85/feedme/sql/__init__.py -------------------------------------------------------------------------------- /feedme/sql/cli.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import os 25 | import re 26 | import shutil 27 | import sqlite3 28 | from functools import reduce 29 | from pathlib import Path 30 | 31 | from setuptools.version import pkg_resources 32 | 33 | from ..utils import colored as _ 34 | from ..utils import findpath, randstr 35 | from .db import db 36 | from .factory import DatabaseVersionError 37 | from .stream import DatabaseWriter 38 | 39 | MIGRATIONS = Path(Path(__file__).with_name('migrations')).resolve(True) 40 | Version = pkg_resources.parse_version 41 | 42 | 43 | def check(db_path, debug=False): 44 | log = logging.getLogger('db.check') 45 | try: 46 | writer = DatabaseWriter(db_path, db, debug=debug, cache_path=':memory:') 47 | writer._verify(writer._main) 48 | writer.close() 49 | log.info(_('Database is OK.', color='green')) 50 | except DatabaseVersionError as exc: 51 | log.critical(exc) 52 | log.error(_('Run `python -m feedme upgrade-db` to upgrade it to the current version.', color='cyan')) 53 | return 1 54 | except Exception as exc: 55 | log.critical(exc, exc_info=True) 56 | log.error(_('Database has irrecoverable inconsistencies.', color='red')) 57 | return 1 58 | else: 59 | return 0 60 | 61 | 62 | def merge(output, *db_paths, debug=False): 63 | log = logging.getLogger('db.merge') 64 | for path in db_paths: 65 | log.info(_(f'Checking database at {path}', color='cyan')) 66 | exc = check(path, debug) 67 | if exc: 68 | return exc 69 | output = Path(output) 70 | db_paths = [Path(p) for p in db_paths] 71 | initial = db_paths[0] 72 | log.info(_(f'Copying initial database {initial}', color='cyan')) 73 | shutil.copyfile(initial, output) 74 | out = DatabaseWriter(output, db, debug=debug, cache_path=':memory:') 75 | for path in db_paths[1:]: 76 | log.info(_(f'Copying database {path}', color='cyan')) 77 | cp = output.with_name(randstr(8) + '.db') 78 | shutil.copyfile(path, cp) 79 | log.info(_(f'Merging {path}', color='cyan')) 80 | out._merge_other(other=cp) 81 | cp.unlink() 82 | cp.with_suffix('.db-shm').unlink() 83 | cp.with_suffix('.db-wal').unlink() 84 | out.report() 85 | out.close() 86 | return 0 87 | 88 | 89 | def migrate(db_path, debug=False, version=db.version): 90 | conn = sqlite3.Connection(db_path, isolation_level=None) 91 | log = logging.getLogger('db.migrate') 92 | if debug: 93 | conn.set_trace_callback(log.debug) 94 | 95 | if db.is_locked(conn): 96 | log.error('Database was left in a partially consistent state.') 97 | log.error('Run `python -m feedme check-db` to fix it first.') 98 | return 1 99 | 100 | outdated = False 101 | try: 102 | db.verify_version(conn) 103 | except DatabaseVersionError as e: 104 | outdated = e.db 105 | 106 | if not outdated: 107 | log.info(_('Database version is already up-to-date.', color='green')) 108 | return 0 109 | 110 | source_ver = Version(outdated) 111 | target_ver = Version(version) 112 | versions = {} 113 | for cmd in os.listdir(MIGRATIONS): 114 | from_, to_ = cmd[:-4].split('_') 115 | from_ = Version(from_) 116 | to_ = Version(to_) 117 | to_versions = 
versions.setdefault(from_, set()) 118 | to_versions.add(to_) 119 | 120 | path = [] 121 | scripts = [] 122 | if findpath(source_ver, target_ver, versions, path): 123 | reduce(lambda x, y: scripts.append((x, y, f'{x}_{y}.sql')) or y, path) 124 | else: 125 | log.error(f'This version of the program no longer supports migrating from {source_ver} to {target_ver}') 126 | return 1 127 | 128 | for old, new, cmd in scripts: 129 | log.info(f'Upgrading database schema from v{old} to v{new}. This may take a long time.') 130 | with open(MIGRATIONS / cmd) as f: 131 | try: 132 | conn.executescript(f.read()) 133 | except sqlite3.OperationalError as e: 134 | log.error(e, exc_info=True) 135 | log.error('Failed to upgrade database. Undoing.') 136 | conn.rollback() 137 | conn.close() 138 | return 1 139 | else: 140 | conn.commit() 141 | 142 | log.info(_('Compacting database... This may take a long time.', color='cyan')) 143 | conn.execute('VACUUM;') 144 | log.info(_('Done.', color='green')) 145 | return 0 146 | 147 | 148 | def leftovers(wd, debug=False): 149 | log = logging.getLogger('db.leftovers') 150 | main = Path(wd) / 'index.db' 151 | tmp_pattern = re.compile(r'.*~tmp-[0-9a-f]{8}\.db$') 152 | for temp in os.listdir(wd): 153 | if tmp_pattern.match(temp): 154 | temp = main.with_name(temp) 155 | log.info(f'Found unmerged temp database {temp}') 156 | writer = DatabaseWriter(main, db, debug=debug, cache_path=temp) 157 | writer.merge() 158 | writer.close() 159 | writer.cleanup() 160 | log.info(_('All temporary databases have been merged.', color='green')) 161 | -------------------------------------------------------------------------------- /feedme/sql/db.json: -------------------------------------------------------------------------------- 1 | {"order": ["url", "keyword", "item", "hyperlink", "feed", "tagging", "summary", "webpage"], "version": "0.10.6", "versioning": {"create": "CREATE TABLE IF NOT EXISTS __version__ (\n\tversion VARCHAR NOT NULL, \n\tCONSTRAINT pk___version__ PRIMARY KEY (version)\n)", "insert": "INSERT OR REPLACE INTO __version__ (version) VALUES (?)"}, "init": ["PRAGMA foreign_keys = ON", "PRAGMA journal_mode = WAL"], "tables": {"url": "CREATE TABLE IF NOT EXISTS url (\n\tid INTEGER NOT NULL, \n\turl VARCHAR NOT NULL, \n\tCONSTRAINT pk_url PRIMARY KEY (id)\n)", "keyword": "CREATE TABLE IF NOT EXISTS keyword (\n\tid INTEGER NOT NULL, \n\tkeyword VARCHAR NOT NULL, \n\tCONSTRAINT pk_keyword PRIMARY KEY (id)\n)", "item": "CREATE TABLE IF NOT EXISTS item (\n\tid INTEGER NOT NULL, \n\turl INTEGER NOT NULL, \n\tsource INTEGER NOT NULL, \n\ttitle VARCHAR, \n\tauthor VARCHAR, \n\tpublished DATETIME, \n\tupdated DATETIME, \n\tcrawled FLOAT, \n\tCONSTRAINT pk_item PRIMARY KEY (id), \n\tCONSTRAINT fk_item_url_url FOREIGN KEY(url) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, \n\tCONSTRAINT fk_item_source_url FOREIGN KEY(source) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)", "hyperlink": "CREATE TABLE IF NOT EXISTS hyperlink (\n\tid INTEGER NOT NULL, \n\tsource_id INTEGER NOT NULL, \n\ttarget_id INTEGER NOT NULL, \n\telement VARCHAR NOT NULL, \n\tCONSTRAINT pk_hyperlink PRIMARY KEY (id), \n\tCONSTRAINT fk_hyperlink_source_id_url FOREIGN KEY(source_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, \n\tCONSTRAINT fk_hyperlink_target_id_url FOREIGN KEY(target_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)", "feed": "CREATE TABLE IF NOT EXISTS feed (\n\tid INTEGER NOT NULL, \n\turl_id INTEGER NOT NULL, \n\ttitle TEXT NOT NULL, \n\tdead BOOLEAN, 
\n\tCONSTRAINT pk_feed PRIMARY KEY (id), \n\tCONSTRAINT fk_feed_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, \n\tCONSTRAINT ck_feed_dead CHECK (dead IN (0, 1))\n)", "tagging": "CREATE TABLE IF NOT EXISTS tagging (\n\tid INTEGER NOT NULL, \n\turl_id INTEGER NOT NULL, \n\tkeyword_id INTEGER NOT NULL, \n\tCONSTRAINT pk_tagging PRIMARY KEY (id), \n\tCONSTRAINT fk_tagging_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, \n\tCONSTRAINT fk_tagging_keyword_id_keyword FOREIGN KEY(keyword_id) REFERENCES keyword (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)", "summary": "CREATE TABLE IF NOT EXISTS summary (\n\tid INTEGER NOT NULL, \n\turl_id INTEGER NOT NULL, \n\tmarkup TEXT NOT NULL, \n\tCONSTRAINT pk_summary PRIMARY KEY (id), \n\tCONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)", "webpage": "CREATE TABLE IF NOT EXISTS webpage (\n\tid INTEGER NOT NULL, \n\turl_id INTEGER NOT NULL, \n\tmarkup TEXT NOT NULL, \n\tCONSTRAINT pk_webpage PRIMARY KEY (id), \n\tCONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)"}, "indices": {"ix_url_url": "CREATE UNIQUE INDEX IF NOT EXISTS ix_url_url ON url (url)", "ix_keyword_keyword": "CREATE UNIQUE INDEX IF NOT EXISTS ix_keyword_keyword ON keyword (keyword)", "ix_item_url": "CREATE UNIQUE INDEX IF NOT EXISTS ix_item_url ON item (url)", "ix_hyperlink_source_id_target_id_element": "CREATE UNIQUE INDEX IF NOT EXISTS ix_hyperlink_source_id_target_id_element ON hyperlink (source_id, target_id, element)", "ix_feed_url_id": "CREATE UNIQUE INDEX IF NOT EXISTS ix_feed_url_id ON feed (url_id)", "ix_tagging_url_id_keyword_id": "CREATE UNIQUE INDEX IF NOT EXISTS ix_tagging_url_id_keyword_id ON tagging (url_id, keyword_id)", "ix_summary_url_id": "CREATE UNIQUE INDEX IF NOT EXISTS ix_summary_url_id ON summary (url_id)", "ix_webpage_url_id": "CREATE UNIQUE INDEX IF NOT EXISTS ix_webpage_url_id ON webpage (url_id)"}, "models": {"url": {"columns": {"id": {}, "url": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url"]], "foreign_keys": []}, "keyword": {"columns": {"id": {}, "keyword": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["keyword"]], "foreign_keys": []}, "item": {"columns": {"id": {}, "url": {}, "source": {}, "title": {}, "author": {}, "published": {}, "updated": {}, "crawled": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url"]], "foreign_keys": [["url", "url", "id"], ["source", "url", "id"]]}, "hyperlink": {"columns": {"id": {}, "source_id": {}, "target_id": {}, "element": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["element", "source_id", "target_id"]], "foreign_keys": [["target_id", "url", "id"], ["source_id", "url", "id"]]}, "feed": {"columns": {"id": {}, "url_id": {}, "title": {}, "dead": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url_id"]], "foreign_keys": [["url_id", "url", "id"]]}, "tagging": {"columns": {"id": {}, "url_id": {}, "keyword_id": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["keyword_id", "url_id"]], "foreign_keys": [["url_id", "url", "id"], ["keyword_id", "keyword", "id"]]}, "summary": {"columns": {"id": {}, "url_id": {}, "markup": {}}, "info": {"dedup": "max", "onconflict": "REPLACE"}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url_id"]], 
"foreign_keys": [["url_id", "url", "id"]]}, "webpage": {"columns": {"id": {}, "url_id": {}, "markup": {}}, "info": {"dedup": "max", "onconflict": "REPLACE"}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url_id"]], "foreign_keys": [["url_id", "url", "id"]]}}} -------------------------------------------------------------------------------- /feedme/sql/db.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import simplejson as json 4 | 5 | from .factory import Database 6 | 7 | with open(Path(__file__).with_name('db.json')) as f: 8 | db = Database(json.load(f)) 9 | -------------------------------------------------------------------------------- /feedme/sql/describe.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | import sys 24 | from importlib.util import module_from_spec, spec_from_file_location 25 | from typing import Dict, List 26 | 27 | import simplejson as json 28 | from sqlalchemy import Table 29 | from sqlalchemy.dialects import sqlite 30 | from sqlalchemy.schema import (CreateIndex, CreateTable, Index, 31 | PrimaryKeyConstraint, UniqueConstraint) 32 | 33 | 34 | def describe_model(table): 35 | config = { 36 | 'columns': {}, 37 | 'info': table.info, 38 | 'autoincrement': [None], 39 | 'primary_key': set(), 40 | 'unique': set(), 41 | 'foreign_keys': set(), 42 | } 43 | for name, column in table.columns.items(): 44 | config['columns'][name] = column.info 45 | if column.autoincrement is True: 46 | config['autoincrement'] = (name,) 47 | for constraint in table.constraints: 48 | cols = tuple(sorted(c.name for c in constraint.columns)) 49 | if isinstance(constraint, PrimaryKeyConstraint): 50 | config['primary_key'] = cols 51 | if isinstance(constraint, UniqueConstraint): 52 | s = config.setdefault('unique', set()) 53 | s.add(cols) 54 | for index in table.indexes: 55 | if index.unique: 56 | cols = tuple(sorted(c.name for c in index.columns)) 57 | s = config.setdefault('unique', set()) 58 | s.add(cols) 59 | fks = [] 60 | for constraint in table.foreign_key_constraints: 61 | column = constraint.column_keys[0] 62 | foreign_column = list(constraint.columns[column].foreign_keys)[0].column 63 | fks.append((column, foreign_column.table.name, foreign_column.name)) 64 | config['foreign_keys'] = fks 65 | return config 66 | 67 | 68 | def create_table(table: Table): 69 | stmt = CreateTable(table).compile(dialect=sqlite.dialect()) 70 | stmt = str(stmt).replace('TABLE', 'TABLE IF NOT EXISTS').strip() 71 | return stmt 72 | 73 | 74 | def create_index(table: Table): 75 | creates = {} 76 | for index in table.indexes: 77 | index: Index 78 | stmt = CreateIndex(index).compile(dialect=sqlite.dialect()) 79 | stmt = str(stmt).replace('INDEX', 'INDEX IF NOT EXISTS').strip() 80 | creates[index.name] = stmt 81 | return creates 82 | 83 | 84 | def describe_database(path, out): 85 | spec = spec_from_file_location('schema', path) 86 | schema = module_from_spec(spec) 87 | spec.loader.exec_module(schema) 88 | 89 | tables: List[Table] = schema.tables 90 | tablemap: Dict[str, Table] = {t.name: t for t in tables} 91 | meta = {} 92 | meta['order'] = [t.name for t in tables] 93 | 94 | version = schema.version 95 | meta['version'] = version 96 | vers = meta['versioning'] = {} 97 | vers['create'] = create_table(schema.__Version__.__table__) 98 | vers['insert'] = 'INSERT OR REPLACE INTO __version__ (version) VALUES (?)' 99 | 100 | init: List[str] = schema.init 101 | tables_create = {} 102 | indices_create = {} 103 | for t in tables: 104 | tables_create[t.name] = create_table(t) 105 | indices_create.update(create_index(t)) 106 | meta['init'] = init 107 | meta['tables'] = tables_create 108 | meta['indices'] = indices_create 109 | 110 | models = meta['models'] = {} 111 | for name, table in tablemap.items(): 112 | models[name] = describe_model(table) 113 | 114 | with open(out, 'w+') as f: 115 | json.dump(meta, f, iterable_as_array=True) 116 | 117 | 118 | if __name__ == '__main__': 119 | describe_database(sys.argv[1], sys.argv[2]) 120 | -------------------------------------------------------------------------------- /feedme/sql/functions.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to 
any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from urllib.parse import urlsplit 24 | 25 | functions = {} 26 | 27 | 28 | def sqlitefunc(name, num_params): 29 | def decorate(func): 30 | functions[func] = (name, num_params, func) 31 | return func 32 | return decorate 33 | 34 | 35 | @sqlitefunc('urlsplit', 2) 36 | def urlsplitf(url, key): 37 | return getattr(urlsplit(url), key, None) 38 | 39 | 40 | @sqlitefunc('subdomain', 2) 41 | def subdomain(x, y): 42 | return x == y or x[-(len(y) + 1):] == f'.{y}' 43 | 44 | 45 | def register_all(conn): 46 | for f in functions.values(): 47 | conn.create_function(*f) 48 | -------------------------------------------------------------------------------- /feedme/sql/migrations/0.10.3_0.10.5.sql: -------------------------------------------------------------------------------- 1 | BEGIN EXCLUSIVE; 2 | 3 | DROP INDEX IF EXISTS ix_url_id_url; 4 | 5 | DROP INDEX IF EXISTS ix_item_id_hash; 6 | 7 | ALTER TABLE 8 | url RENAME TO tmp; 9 | 10 | CREATE TABLE url ( 11 | id INTEGER NOT NULL, 12 | url VARCHAR NOT NULL, 13 | CONSTRAINT pk_url PRIMARY KEY (id) 14 | ); 15 | 16 | INSERT INTO 17 | url (id, url) 18 | SELECT 19 | * 20 | FROM 21 | tmp; 22 | 23 | DROP TABLE tmp; 24 | 25 | ALTER TABLE 26 | keyword RENAME TO tmp; 27 | 28 | CREATE TABLE keyword ( 29 | id INTEGER NOT NULL, 30 | keyword VARCHAR NOT NULL, 31 | CONSTRAINT pk_keyword PRIMARY KEY (id) 32 | ); 33 | 34 | INSERT INTO 35 | keyword (id, keyword) 36 | SELECT 37 | * 38 | FROM 39 | tmp; 40 | 41 | DROP TABLE tmp; 42 | 43 | CREATE TABLE tmp_tagging (url_id INTEGER, keyword_id INTEGER); 44 | 45 | INSERT INTO 46 | tmp_tagging (url_id, keyword_id) 47 | SELECT 48 | item.url AS url_id, 49 | tagging.keyword_id AS keyword_id 50 | FROM 51 | tagging 52 | JOIN item ON tagging.item_id == item.id; 53 | 54 | DELETE FROM 55 | tmp_tagging 56 | WHERE 57 | rowid NOT IN ( 58 | SELECT 59 | min(rowid) 60 | FROM 61 | tmp_tagging 62 | GROUP BY 63 | url_id, 64 | keyword_id 65 | ); 66 | 67 | DROP TABLE tagging; 68 | 69 | CREATE TABLE tagging ( 70 | id INTEGER NOT NULL, 71 | url_id INTEGER NOT NULL, 72 | keyword_id INTEGER NOT NULL, 73 | CONSTRAINT pk_tagging PRIMARY KEY (id), 74 | CONSTRAINT fk_tagging_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT, 75 | CONSTRAINT fk_tagging_keyword_id_keyword FOREIGN KEY(keyword_id) REFERENCES keyword (id) ON DELETE RESTRICT ON UPDATE RESTRICT 76 | ); 77 | 78 | INSERT INTO 79 | tagging (url_id, keyword_id) 80 | SELECT 81 | * 82 | FROM 83 | 
tmp_tagging; 84 | 85 | DROP TABLE tmp_tagging; 86 | 87 | ALTER TABLE 88 | item RENAME TO tmp; 89 | 90 | DELETE FROM 91 | tmp 92 | WHERE 93 | rowid NOT IN ( 94 | SELECT 95 | min(rowid) 96 | FROM 97 | tmp 98 | GROUP BY 99 | url 100 | ); 101 | 102 | CREATE TABLE item ( 103 | id INTEGER NOT NULL, 104 | url INTEGER NOT NULL, 105 | source INTEGER NOT NULL, 106 | title VARCHAR, 107 | author VARCHAR, 108 | published DATETIME, 109 | updated DATETIME, 110 | crawled FLOAT, 111 | CONSTRAINT pk_item PRIMARY KEY (id), 112 | CONSTRAINT fk_item_url_url FOREIGN KEY(url) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT, 113 | CONSTRAINT fk_item_source_url FOREIGN KEY(source) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT 114 | ); 115 | 116 | INSERT INTO 117 | item ( 118 | id, 119 | url, 120 | source, 121 | title, 122 | author, 123 | published, 124 | updated, 125 | crawled 126 | ) 127 | SELECT 128 | id, 129 | url, 130 | source, 131 | title, 132 | author, 133 | published, 134 | updated, 135 | crawled 136 | FROM 137 | tmp; 138 | 139 | DROP TABLE tmp; 140 | 141 | ALTER TABLE 142 | hyperlink RENAME TO tmp; 143 | 144 | CREATE TABLE hyperlink ( 145 | id INTEGER NOT NULL, 146 | source_id INTEGER NOT NULL, 147 | target_id INTEGER NOT NULL, 148 | element VARCHAR NOT NULL, 149 | CONSTRAINT pk_hyperlink PRIMARY KEY (id), 150 | CONSTRAINT fk_hyperlink_source_id_url FOREIGN KEY(source_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT, 151 | CONSTRAINT fk_hyperlink_target_id_url FOREIGN KEY(target_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT 152 | ); 153 | 154 | INSERT INTO 155 | hyperlink (source_id, target_id, element) 156 | SELECT 157 | * 158 | FROM 159 | tmp; 160 | 161 | DROP TABLE tmp; 162 | 163 | ALTER TABLE 164 | feed RENAME TO tmp; 165 | 166 | CREATE TABLE feed ( 167 | id INTEGER NOT NULL, 168 | url_id INTEGER NOT NULL, 169 | title TEXT NOT NULL, 170 | dead BOOLEAN, 171 | CONSTRAINT pk_feed PRIMARY KEY (id), 172 | CONSTRAINT fk_feed_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT, 173 | CONSTRAINT ck_feed_dead CHECK (dead IN (0, 1)) 174 | ); 175 | 176 | INSERT INTO 177 | feed (url_id, title) 178 | SELECT 179 | * 180 | FROM 181 | tmp; 182 | 183 | DROP TABLE tmp; 184 | 185 | ALTER TABLE 186 | summary RENAME TO tmp; 187 | 188 | CREATE TABLE summary ( 189 | id INTEGER NOT NULL, 190 | url_id INTEGER NOT NULL, 191 | markup TEXT NOT NULL, 192 | CONSTRAINT pk_summary PRIMARY KEY (id), 193 | CONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT 194 | ); 195 | 196 | INSERT INTO 197 | summary (url_id, markup) 198 | SELECT 199 | * 200 | FROM 201 | tmp; 202 | 203 | DROP TABLE tmp; 204 | 205 | ALTER TABLE 206 | webpage RENAME TO tmp; 207 | 208 | CREATE TABLE webpage ( 209 | id INTEGER NOT NULL, 210 | url_id INTEGER NOT NULL, 211 | markup TEXT NOT NULL, 212 | CONSTRAINT pk_webpage PRIMARY KEY (id), 213 | CONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT 214 | ); 215 | 216 | INSERT INTO 217 | webpage (url_id, markup) 218 | SELECT 219 | * 220 | FROM 221 | tmp; 222 | 223 | DROP TABLE tmp; 224 | 225 | CREATE UNIQUE INDEX IF NOT EXISTS ix_url_url ON url (url); 226 | 227 | CREATE UNIQUE INDEX IF NOT EXISTS ix_keyword_keyword ON keyword (keyword); 228 | 229 | CREATE UNIQUE INDEX IF NOT EXISTS ix_item_url ON item (url); 230 | 231 | CREATE UNIQUE INDEX IF NOT EXISTS ix_hyperlink_source_id_target_id_element ON hyperlink (source_id, 
target_id, element); 232 | 233 | CREATE UNIQUE INDEX IF NOT EXISTS ix_feed_url_id ON feed (url_id); 234 | 235 | CREATE UNIQUE INDEX IF NOT EXISTS ix_tagging_url_id_keyword_id ON tagging (url_id, keyword_id); 236 | 237 | CREATE UNIQUE INDEX IF NOT EXISTS ix_summary_url_id ON summary (url_id); 238 | 239 | CREATE UNIQUE INDEX IF NOT EXISTS ix_webpage_url_id ON webpage (url_id); 240 | 241 | UPDATE 242 | __version__ 243 | SET 244 | version = '0.10.5'; 245 | 246 | COMMIT; -------------------------------------------------------------------------------- /feedme/sql/migrations/0.10.3_0.10.6.sql: -------------------------------------------------------------------------------- 1 | BEGIN EXCLUSIVE; 2 | 3 | DROP INDEX IF EXISTS ix_url_id_url; 4 | 5 | DROP INDEX IF EXISTS ix_item_id_hash; 6 | 7 | ALTER TABLE 8 | url RENAME TO tmp; 9 | 10 | CREATE TABLE url ( 11 | id INTEGER NOT NULL, 12 | url VARCHAR NOT NULL, 13 | CONSTRAINT pk_url PRIMARY KEY (id) 14 | ); 15 | 16 | INSERT INTO 17 | url (id, url) 18 | SELECT 19 | * 20 | FROM 21 | tmp; 22 | 23 | DROP TABLE tmp; 24 | 25 | ALTER TABLE 26 | keyword RENAME TO tmp; 27 | 28 | CREATE TABLE keyword ( 29 | id INTEGER NOT NULL, 30 | keyword VARCHAR NOT NULL, 31 | CONSTRAINT pk_keyword PRIMARY KEY (id) 32 | ); 33 | 34 | INSERT INTO 35 | keyword (id, keyword) 36 | SELECT 37 | * 38 | FROM 39 | tmp; 40 | 41 | DROP TABLE tmp; 42 | 43 | CREATE TABLE tmp_tagging (url_id INTEGER, keyword_id INTEGER); 44 | 45 | INSERT INTO 46 | tmp_tagging (url_id, keyword_id) 47 | SELECT 48 | item.url AS url_id, 49 | tagging.keyword_id AS keyword_id 50 | FROM 51 | tagging 52 | JOIN item ON tagging.item_id == item.id; 53 | 54 | DELETE FROM 55 | tmp_tagging 56 | WHERE 57 | rowid NOT IN ( 58 | SELECT 59 | min(rowid) 60 | FROM 61 | tmp_tagging 62 | GROUP BY 63 | url_id, 64 | keyword_id 65 | ); 66 | 67 | DROP TABLE tagging; 68 | 69 | CREATE TABLE tagging ( 70 | id INTEGER NOT NULL, 71 | url_id INTEGER NOT NULL, 72 | keyword_id INTEGER NOT NULL, 73 | CONSTRAINT pk_tagging PRIMARY KEY (id), 74 | CONSTRAINT fk_tagging_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 75 | CONSTRAINT fk_tagging_keyword_id_keyword FOREIGN KEY(keyword_id) REFERENCES keyword (id) ON DELETE RESTRICT ON UPDATE CASCADE 76 | ); 77 | 78 | INSERT INTO 79 | tagging (url_id, keyword_id) 80 | SELECT 81 | * 82 | FROM 83 | tmp_tagging; 84 | 85 | DROP TABLE tmp_tagging; 86 | 87 | ALTER TABLE 88 | item RENAME TO tmp; 89 | 90 | DELETE FROM 91 | tmp 92 | WHERE 93 | rowid NOT IN ( 94 | SELECT 95 | min(rowid) 96 | FROM 97 | tmp 98 | GROUP BY 99 | url 100 | ); 101 | 102 | CREATE TABLE item ( 103 | id INTEGER NOT NULL, 104 | url INTEGER NOT NULL, 105 | source INTEGER NOT NULL, 106 | title VARCHAR, 107 | author VARCHAR, 108 | published DATETIME, 109 | updated DATETIME, 110 | crawled FLOAT, 111 | CONSTRAINT pk_item PRIMARY KEY (id), 112 | CONSTRAINT fk_item_url_url FOREIGN KEY(url) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 113 | CONSTRAINT fk_item_source_url FOREIGN KEY(source) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 114 | ); 115 | 116 | INSERT INTO 117 | item ( 118 | id, 119 | url, 120 | source, 121 | title, 122 | author, 123 | published, 124 | updated, 125 | crawled 126 | ) 127 | SELECT 128 | id, 129 | url, 130 | source, 131 | title, 132 | author, 133 | published, 134 | updated, 135 | crawled 136 | FROM 137 | tmp; 138 | 139 | DROP TABLE tmp; 140 | 141 | ALTER TABLE 142 | hyperlink RENAME TO tmp; 143 | 144 | CREATE TABLE hyperlink ( 145 | id INTEGER NOT NULL, 146 | 
source_id INTEGER NOT NULL, 147 | target_id INTEGER NOT NULL, 148 | element VARCHAR NOT NULL, 149 | CONSTRAINT pk_hyperlink PRIMARY KEY (id), 150 | CONSTRAINT fk_hyperlink_source_id_url FOREIGN KEY(source_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 151 | CONSTRAINT fk_hyperlink_target_id_url FOREIGN KEY(target_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 152 | ); 153 | 154 | INSERT INTO 155 | hyperlink (source_id, target_id, element) 156 | SELECT 157 | * 158 | FROM 159 | tmp; 160 | 161 | DROP TABLE tmp; 162 | 163 | ALTER TABLE 164 | feed RENAME TO tmp; 165 | 166 | CREATE TABLE feed ( 167 | id INTEGER NOT NULL, 168 | url_id INTEGER NOT NULL, 169 | title TEXT NOT NULL, 170 | dead BOOLEAN, 171 | CONSTRAINT pk_feed PRIMARY KEY (id), 172 | CONSTRAINT fk_feed_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 173 | CONSTRAINT ck_feed_dead CHECK (dead IN (0, 1)) 174 | ); 175 | 176 | INSERT INTO 177 | feed (url_id, title) 178 | SELECT 179 | * 180 | FROM 181 | tmp; 182 | 183 | DROP TABLE tmp; 184 | 185 | ALTER TABLE 186 | summary RENAME TO tmp; 187 | 188 | CREATE TABLE summary ( 189 | id INTEGER NOT NULL, 190 | url_id INTEGER NOT NULL, 191 | markup TEXT NOT NULL, 192 | CONSTRAINT pk_summary PRIMARY KEY (id), 193 | CONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 194 | ); 195 | 196 | INSERT INTO 197 | summary (url_id, markup) 198 | SELECT 199 | * 200 | FROM 201 | tmp; 202 | 203 | DROP TABLE tmp; 204 | 205 | ALTER TABLE 206 | webpage RENAME TO tmp; 207 | 208 | CREATE TABLE webpage ( 209 | id INTEGER NOT NULL, 210 | url_id INTEGER NOT NULL, 211 | markup TEXT NOT NULL, 212 | CONSTRAINT pk_webpage PRIMARY KEY (id), 213 | CONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 214 | ); 215 | 216 | INSERT INTO 217 | webpage (url_id, markup) 218 | SELECT 219 | * 220 | FROM 221 | tmp; 222 | 223 | DROP TABLE tmp; 224 | 225 | CREATE UNIQUE INDEX IF NOT EXISTS ix_url_url ON url (url); 226 | 227 | CREATE UNIQUE INDEX IF NOT EXISTS ix_keyword_keyword ON keyword (keyword); 228 | 229 | CREATE UNIQUE INDEX IF NOT EXISTS ix_item_url ON item (url); 230 | 231 | CREATE UNIQUE INDEX IF NOT EXISTS ix_hyperlink_source_id_target_id_element ON hyperlink (source_id, target_id, element); 232 | 233 | CREATE UNIQUE INDEX IF NOT EXISTS ix_feed_url_id ON feed (url_id); 234 | 235 | CREATE UNIQUE INDEX IF NOT EXISTS ix_tagging_url_id_keyword_id ON tagging (url_id, keyword_id); 236 | 237 | CREATE UNIQUE INDEX IF NOT EXISTS ix_summary_url_id ON summary (url_id); 238 | 239 | CREATE UNIQUE INDEX IF NOT EXISTS ix_webpage_url_id ON webpage (url_id); 240 | 241 | UPDATE 242 | __version__ 243 | SET 244 | version = '0.10.6'; 245 | 246 | COMMIT; -------------------------------------------------------------------------------- /feedme/sql/migrations/0.10.5_0.10.6.sql: -------------------------------------------------------------------------------- 1 | PRAGMA foreign_keys = OFF; 2 | 3 | BEGIN EXCLUSIVE; 4 | 5 | ALTER TABLE 6 | item RENAME TO tmp; 7 | 8 | CREATE TABLE item ( 9 | id INTEGER NOT NULL, 10 | url INTEGER NOT NULL, 11 | source INTEGER NOT NULL, 12 | title VARCHAR, 13 | author VARCHAR, 14 | published DATETIME, 15 | updated DATETIME, 16 | crawled FLOAT, 17 | CONSTRAINT pk_item PRIMARY KEY (id), 18 | CONSTRAINT fk_item_url_url FOREIGN KEY(url) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 19 | CONSTRAINT fk_item_source_url FOREIGN KEY(source) 
REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 20 | ); 21 | 22 | INSERT INTO 23 | item 24 | SELECT 25 | * 26 | FROM 27 | tmp; 28 | 29 | DROP TABLE tmp; 30 | 31 | ALTER TABLE 32 | hyperlink RENAME TO tmp; 33 | 34 | CREATE TABLE hyperlink ( 35 | id INTEGER NOT NULL, 36 | source_id INTEGER NOT NULL, 37 | target_id INTEGER NOT NULL, 38 | element VARCHAR NOT NULL, 39 | CONSTRAINT pk_hyperlink PRIMARY KEY (id), 40 | CONSTRAINT fk_hyperlink_source_id_url FOREIGN KEY(source_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 41 | CONSTRAINT fk_hyperlink_target_id_url FOREIGN KEY(target_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 42 | ); 43 | 44 | INSERT INTO 45 | hyperlink 46 | SELECT 47 | * 48 | FROM 49 | tmp; 50 | 51 | DROP TABLE tmp; 52 | 53 | ALTER TABLE 54 | feed RENAME TO tmp; 55 | 56 | CREATE TABLE feed ( 57 | id INTEGER NOT NULL, 58 | url_id INTEGER NOT NULL, 59 | title TEXT NOT NULL, 60 | dead BOOLEAN, 61 | CONSTRAINT pk_feed PRIMARY KEY (id), 62 | CONSTRAINT fk_feed_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 63 | CONSTRAINT ck_feed_dead CHECK (dead IN (0, 1)) 64 | ); 65 | 66 | INSERT INTO 67 | feed 68 | SELECT 69 | * 70 | FROM 71 | tmp; 72 | 73 | DROP TABLE tmp; 74 | 75 | ALTER TABLE 76 | tagging RENAME TO tmp; 77 | 78 | CREATE TABLE tagging ( 79 | id INTEGER NOT NULL, 80 | url_id INTEGER NOT NULL, 81 | keyword_id INTEGER NOT NULL, 82 | CONSTRAINT pk_tagging PRIMARY KEY (id), 83 | CONSTRAINT fk_tagging_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 84 | CONSTRAINT fk_tagging_keyword_id_keyword FOREIGN KEY(keyword_id) REFERENCES keyword (id) ON DELETE RESTRICT ON UPDATE CASCADE 85 | ); 86 | 87 | INSERT INTO 88 | tagging 89 | SELECT 90 | * 91 | FROM 92 | tmp; 93 | 94 | DROP TABLE tmp; 95 | 96 | ALTER TABLE 97 | summary RENAME TO tmp; 98 | 99 | CREATE TABLE summary ( 100 | id INTEGER NOT NULL, 101 | url_id INTEGER NOT NULL, 102 | markup TEXT NOT NULL, 103 | CONSTRAINT pk_summary PRIMARY KEY (id), 104 | CONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 105 | ); 106 | 107 | INSERT INTO 108 | summary 109 | SELECT 110 | * 111 | FROM 112 | tmp; 113 | 114 | DROP TABLE tmp; 115 | 116 | ALTER TABLE 117 | webpage RENAME TO tmp; 118 | 119 | CREATE TABLE webpage ( 120 | id INTEGER NOT NULL, 121 | url_id INTEGER NOT NULL, 122 | markup TEXT NOT NULL, 123 | CONSTRAINT pk_webpage PRIMARY KEY (id), 124 | CONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 125 | ); 126 | 127 | INSERT INTO 128 | webpage 129 | SELECT 130 | * 131 | FROM 132 | tmp; 133 | 134 | DROP TABLE tmp; 135 | 136 | CREATE UNIQUE INDEX IF NOT EXISTS ix_url_url ON url (url); 137 | 138 | CREATE UNIQUE INDEX IF NOT EXISTS ix_keyword_keyword ON keyword (keyword); 139 | 140 | CREATE UNIQUE INDEX IF NOT EXISTS ix_item_url ON item (url); 141 | 142 | CREATE UNIQUE INDEX IF NOT EXISTS ix_hyperlink_source_id_target_id_element ON hyperlink (source_id, target_id, element); 143 | 144 | CREATE UNIQUE INDEX IF NOT EXISTS ix_feed_url_id ON feed (url_id); 145 | 146 | CREATE UNIQUE INDEX IF NOT EXISTS ix_tagging_url_id_keyword_id ON tagging (url_id, keyword_id); 147 | 148 | CREATE UNIQUE INDEX IF NOT EXISTS ix_summary_url_id ON summary (url_id); 149 | 150 | CREATE UNIQUE INDEX IF NOT EXISTS ix_webpage_url_id ON webpage (url_id); 151 | 152 | UPDATE 153 | __version__ 154 | SET 155 | version = '0.10.6'; 156 | 157 | 
COMMIT; 158 | 159 | PRAGMA foreign_keys = ON; 160 | -------------------------------------------------------------------------------- /feedme/sql/migrations/0.10_0.10.3.sql: -------------------------------------------------------------------------------- 1 | BEGIN EXCLUSIVE; 2 | 3 | ALTER TABLE 4 | markup RENAME TO tmp; 5 | 6 | CREATE TABLE IF NOT EXISTS summary ( 7 | url_id INTEGER NOT NULL, 8 | markup TEXT NOT NULL, 9 | CONSTRAINT pk_summary PRIMARY KEY (url_id), 10 | CONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) 11 | ); 12 | 13 | CREATE TABLE IF NOT EXISTS webpage ( 14 | url_id INTEGER NOT NULL, 15 | markup TEXT NOT NULL, 16 | CONSTRAINT pk_webpage PRIMARY KEY (url_id), 17 | CONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) 18 | ); 19 | 20 | INSERT INTO 21 | summary (url_id, markup) 22 | SELECT 23 | url.id AS url_id, 24 | tmp.markup AS markup 25 | FROM 26 | tmp 27 | JOIN item ON tmp.item_id == item.id 28 | JOIN url ON item.url == url.id 29 | WHERE 30 | tmp.type == 'summary' 31 | GROUP BY 32 | url_id; 33 | 34 | INSERT 35 | OR REPLACE INTO summary (url_id, markup) 36 | SELECT 37 | url.id AS url_id, 38 | tmp.markup AS markup 39 | FROM 40 | tmp 41 | JOIN item ON tmp.item_id == item.id 42 | JOIN url ON item.url == url.id 43 | WHERE 44 | tmp.type == 'content' 45 | GROUP BY 46 | url_id; 47 | 48 | DROP TABLE tmp; 49 | 50 | UPDATE 51 | __version__ 52 | SET 53 | version = '0.10.3'; 54 | 55 | COMMIT; -------------------------------------------------------------------------------- /feedme/sql/schema.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | from sqlalchemy import MetaData, types 24 | from sqlalchemy.ext.declarative import declarative_base, declared_attr 25 | from sqlalchemy.schema import Column, ForeignKey, Index 26 | 27 | SCHEMA_VERSION = '0.10.6' 28 | 29 | metadata = MetaData( 30 | naming_convention={ 31 | 'ix': 'ix_%(table_name)s_%(column_0_N_name)s', 32 | 'uq': 'uq_%(table_name)s_%(column_0_N_name)s', 33 | 'ck': 'ck_%(table_name)s_%(column_0_N_name)s', 34 | 'fk': 'fk_%(table_name)s_%(column_0_N_name)s_%(referred_table_name)s', 35 | 'pk': 'pk_%(table_name)s', 36 | }, 37 | ) 38 | RESTRICT = 'RESTRICT' 39 | CASCADE = 'CASCADE' 40 | 41 | 42 | class BaseDefaults: 43 | @declared_attr 44 | def __tablename__(self): 45 | return self.__name__.lower() 46 | 47 | 48 | Base = declarative_base(metadata=metadata, cls=BaseDefaults) 49 | 50 | 51 | class __Version__(Base): 52 | version = Column(types.String(), primary_key=True) 53 | 54 | 55 | class URL(Base): 56 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 57 | url = Column(types.String(), nullable=False) 58 | 59 | @declared_attr 60 | def __table_args__(self): 61 | return (Index(None, 'url', unique=True),) 62 | 63 | 64 | class Keyword(Base): 65 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 66 | keyword = Column(types.String(), nullable=False) 67 | 68 | @declared_attr 69 | def __table_args__(self): 70 | return (Index(None, 'keyword', unique=True),) 71 | 72 | 73 | class Item(Base): 74 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 75 | 76 | url = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 77 | source = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 78 | 79 | title = Column(types.String()) 80 | author = Column(types.String()) 81 | published = Column(types.DateTime()) 82 | updated = Column(types.DateTime()) 83 | crawled = Column(types.Float()) 84 | 85 | @declared_attr 86 | def __table_args__(self): 87 | return (Index(None, 'url', unique=True),) 88 | 89 | 90 | class Hyperlink(Base): 91 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 92 | source_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 93 | target_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 94 | element = Column(types.String(), nullable=False) 95 | 96 | @declared_attr 97 | def __table_args__(self): 98 | return (Index(None, 'source_id', 'target_id', 'element', unique=True),) 99 | 100 | 101 | class Feed(Base): 102 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 103 | url_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 104 | title = Column(types.Text(), nullable=False) 105 | dead = Column(types.Boolean()) 106 | 107 | @declared_attr 108 | def __table_args__(self): 109 | return (Index(None, 'url_id', unique=True),) 110 | 111 | 112 | class Tagging(Base): 113 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 114 | url_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 115 | keyword_id = Column(ForeignKey('keyword.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 116 | 117 | @declared_attr 118 | def __table_args__(self): 119 | return (Index(None, 'url_id', 'keyword_id', unique=True),) 120 | 121 | 122 | class Summary(Base): 123 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 124 | url_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), 
nullable=False) 125 | markup = Column(types.Text(), nullable=False) 126 | 127 | @declared_attr 128 | def __table_args__(self): 129 | return ( 130 | Index(None, 'url_id', unique=True), 131 | {'info': {'dedup': 'max', 'onconflict': 'REPLACE'}}, 132 | ) 133 | 134 | 135 | class Webpage(Base): 136 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 137 | url_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 138 | markup = Column(types.Text(), nullable=False) 139 | 140 | @declared_attr 141 | def __table_args__(self): 142 | return ( 143 | Index(None, 'url_id', unique=True), 144 | {'info': {'dedup': 'max', 'onconflict': 'REPLACE'}}, 145 | ) 146 | 147 | 148 | models = [URL, Keyword, Item, Hyperlink, Feed, Tagging, Summary, Webpage] 149 | tables = [m.__table__ for m in models] 150 | 151 | version = SCHEMA_VERSION 152 | init = ['PRAGMA foreign_keys = ON', 'PRAGMA journal_mode = WAL'] 153 | -------------------------------------------------------------------------------- /feedme/sql/stream.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | import logging 24 | import sqlite3 25 | from collections import deque 26 | from contextlib import suppress 27 | from pathlib import Path 28 | from threading import Lock 29 | from typing import Union 30 | 31 | from ..utils import append_stem, randstr, watch_for_timing 32 | from .factory import Database 33 | 34 | _PathLike = Union[str, Path] 35 | 36 | 37 | class DatabaseWriter: 38 | def __init__(self, path: _PathLike, database: Database, 39 | debug=False, cache_path=None, silent=False): 40 | self.log = logging.getLogger('db.writer') 41 | if silent: 42 | self.log.setLevel(logging.WARNING) 43 | 44 | main_db = Path(path) 45 | cache_db = Path(cache_path) if cache_path else append_stem(path, f'~tmp-{randstr(8)}') 46 | 47 | self.db = database 48 | self._queues = {t: deque() for t in database.tablemap} 49 | self._flush_lock = Lock() 50 | 51 | self._main = self._connect(main_db, 'main', debug) 52 | self._cache = self._connect(cache_db, 'temp', debug) 53 | self._paths = {self._main: main_db, self._cache: cache_db} 54 | 55 | self._corked = True 56 | self._closed = False 57 | self._rowcounts = {conn: {t: None for t in database.tablemap} 58 | for conn in (self._main, self._cache)} 59 | 60 | self._bind_tables() 61 | self.report() 62 | self.uncork() 63 | self.flush() 64 | 65 | def _connect(self, path: _PathLike, name=None, debug=False): 66 | conn = sqlite3.connect(path, isolation_level=None, timeout=30, 67 | check_same_thread=False) 68 | conn.row_factory = sqlite3.Row 69 | if debug: 70 | self._setup_debug(conn, name, debug) 71 | 72 | self.db.verify_version(conn) 73 | self.db.set_version(conn) 74 | self.db.create_all(conn) 75 | return conn 76 | 77 | @property 78 | def record_count(self): 79 | return sum(len(q) for q in self._queues.values()) 80 | 81 | def _lock_db(self, conn: sqlite3.Connection): 82 | self.log.debug(f'Locking database {self._paths[conn]}') 83 | if self.db.is_locked(conn): 84 | self.log.warning('Database lock table exists') 85 | self.log.warning('Previous crawler did not exit properly') 86 | self.db.mark_as_locked(conn) 87 | 88 | def _unlock_db(self, conn: sqlite3.Connection): 89 | self.log.debug(f'Unlocking database {self._paths[conn]}') 90 | self.db.mark_as_unlocked(conn) 91 | 92 | def _setup_debug(self, conn: sqlite3.Connection, name, debug_out): 93 | sql_log = logging.getLogger(f'db.sql.{name}') 94 | sql_log.setLevel(logging.DEBUG) 95 | conn.set_trace_callback(sql_log.debug) 96 | if not isinstance(debug_out, bool): 97 | sql_log.propagate = False 98 | path = append_stem(Path(debug_out), f'-{name}') 99 | file = open(path, 'w+') 100 | handler = logging.StreamHandler(file) 101 | sql_log.addHandler(handler) 102 | 103 | def _bind_tables(self): 104 | for table in self.db.tables: 105 | table.bind_foreign_key(self._cache) 106 | table.bind_offset(self._main) 107 | 108 | def _foreign_key_off(self, conn: sqlite3.Connection): 109 | conn.execute('PRAGMA foreign_keys = OFF') 110 | self.log.debug(f'Foreign key is OFF for {self._paths[conn]}') 111 | 112 | def _foreign_key_on(self, conn: sqlite3.Connection): 113 | conn.execute('PRAGMA foreign_keys = ON') 114 | self.log.debug(f'Foreign key is ON for {self._paths[conn]}') 115 | 116 | def _rebuild_index(self, conn: sqlite3.Connection): 117 | self.log.info('Rebuilding index') 118 | self.db.create_indices(conn) 119 | 120 | def _begin(self, conn: sqlite3.Connection): 121 | try: 122 | conn.execute('BEGIN') 123 | self.log.debug(f'Began new transaction on {self._paths[conn]}') 124 | except sqlite3.OperationalError: 125 | pass 126 | 127 | def 
_begin_exclusive(self, conn: sqlite3.Connection): 128 | while True: 129 | try: 130 | conn.execute('BEGIN EXCLUSIVE') 131 | self.log.debug('Began exclusive transaction' 132 | f' on {self._paths[conn]}') 133 | except sqlite3.OperationalError: 134 | self.log.warning('Cannot acquire exclusive write access') 135 | self.log.warning('Another program is writing to the database') 136 | self.log.warning('Retrying...') 137 | else: 138 | return 139 | 140 | def _apply_changes(self): 141 | queues = self._queues 142 | self._queues = {t: deque() for t in self.db.tablemap} 143 | cache = self._cache 144 | for name, table in self.db.tablemap.items(): 145 | q = queues[name] 146 | if not q: 147 | continue 148 | 149 | try: 150 | table.insert(cache, q) 151 | except sqlite3.IntegrityError: 152 | cache.rollback() 153 | for k, v in queues.items(): 154 | self._queues[k].appendleft(v) 155 | raise 156 | else: 157 | cache.commit() 158 | del queues[name] 159 | 160 | def _verify(self, conn: sqlite3.Connection): 161 | self._foreign_key_off(conn) 162 | for table in self.db.tables: 163 | table.drop_proxy(conn) 164 | table.restore_original(conn) 165 | conn.commit() 166 | self.reconcile(conn) 167 | self.deduplicate(conn) 168 | for table in self.db.tables: 169 | table.drop_temp_index(conn) 170 | self._rebuild_index(conn) 171 | self._foreign_key_on(conn) 172 | self._optimize(conn) 173 | 174 | def _optimize(self, conn: sqlite3.Connection): 175 | self.log.debug(f'Optimizing {self._paths[conn]}') 176 | conn.execute('PRAGMA optimize') 177 | 178 | def _merge_other(self, other=None, discard=False): 179 | main = self._main 180 | if not other: 181 | other_db = self._cache 182 | other = str(self._paths[other_db]) 183 | else: 184 | other = str(other) 185 | other_db = sqlite3.connect(other, isolation_level=None) 186 | max_rowids = self.db.get_max_rowids(main) 187 | self._foreign_key_off(main) 188 | self._begin_exclusive(main) 189 | self._lock_db(main) 190 | self.db.attach(main, other) 191 | self.log.debug(f'Attached {other} to {self._paths[main]}') 192 | 193 | try: 194 | self.log.debug('Matching existing records') 195 | with watch_for_timing('Matching'): 196 | for table in self.db.tables: 197 | self.log.debug(f'Matching {table}') 198 | table.match_primary_keys(main) 199 | table.match_foreign_keys(main) 200 | 201 | self.log.debug('Dropping indices') 202 | self.db.drop_indices(main) 203 | 204 | self.log.debug('Merging into main database') 205 | with watch_for_timing('Merging'): 206 | for table in self.db.tables: 207 | self.log.debug(f'Merging {table}') 208 | table.dedup_primary_keys(main) 209 | table.merge_attached(main) 210 | 211 | self.log.debug('Deduplicating records') 212 | with watch_for_timing('Deduplicating'): 213 | for table in self.db.tables: 214 | self.log.debug(f'Deduplicating {table}') 215 | table.dedup(main, max_rowids[table.name]) 216 | 217 | except sqlite3.IntegrityError: 218 | main.rollback() 219 | raise 220 | 221 | else: 222 | self.log.debug('Committing changes') 223 | main.commit() 224 | self.db.detach(main) 225 | self._foreign_key_on(main) 226 | self._optimize(main) 227 | self.log.debug('Finalizing merge') 228 | 229 | finally: 230 | if not discard: 231 | self.log.debug('Removing transcient data') 232 | with watch_for_timing('Restoring'): 233 | for table in self.db.tables: 234 | table.restore_original(other_db) 235 | self._rebuild_index(main) 236 | self._unlock_db(main) 237 | 238 | def uncork(self): 239 | if not self._corked: 240 | return 241 | conn = self._cache 242 | 243 | self._lock_db(conn) 244 | 
self.db.drop_indices(conn) 245 | self._foreign_key_off(conn) 246 | for table in self.db.tables: 247 | table.create_proxy(conn) 248 | self._corked = False 249 | 250 | def cork(self): 251 | if self._corked: 252 | return 253 | self.flush() 254 | conn = self._cache 255 | self._verify(conn) 256 | self._unlock_db(conn) 257 | self._corked = True 258 | 259 | def write(self, table, item): 260 | self._queues[table].append(item) 261 | 262 | def flush(self): 263 | with self._flush_lock: 264 | if self._corked: 265 | return 266 | 267 | count = self.record_count 268 | if count: 269 | self.log.info(f'Saving {count} records') 270 | with watch_for_timing('Flushing'): 271 | self._apply_changes() 272 | 273 | self._cache.commit() 274 | self._begin(self._cache) 275 | 276 | def deduplicate(self, conn=None): 277 | self.log.info('Deduplicating database records') 278 | conn = conn or self._cache 279 | conn.commit() 280 | self._begin_exclusive(conn) 281 | try: 282 | with watch_for_timing('Deduplicating'): 283 | for table in self.db.tables: 284 | table.fast_dedup(conn) 285 | except sqlite3.IntegrityError: 286 | conn.rollback() 287 | raise 288 | finally: 289 | conn.commit() 290 | 291 | def reconcile(self, conn=None): 292 | self.log.info('Enforcing internal references') 293 | conn = conn or self._cache 294 | conn.commit() 295 | self._begin_exclusive(conn) 296 | try: 297 | with watch_for_timing('Fixing foreign keys'): 298 | mismatches = conn.execute('PRAGMA foreign_key_check') 299 | for table, rowid, parent, fkid in mismatches: 300 | self.db.tablemap[table].update_fk(conn, fkid, rowid) 301 | except sqlite3.IntegrityError: 302 | conn.rollback() 303 | raise 304 | else: 305 | conn.commit() 306 | 307 | def merge(self): 308 | self.cork() 309 | self.log.info('Merging new data into main database') 310 | self._merge_other(discard=True) 311 | self.report() 312 | 313 | def close(self): 314 | self._main.close() 315 | self._cache.close() 316 | self._corked = True 317 | self._closed = True 318 | 319 | def interrupt(self): 320 | self._main.interrupt() 321 | self._cache.interrupt() 322 | 323 | def cleanup(self): 324 | cache = self._paths[self._cache] 325 | shm = cache.with_suffix('.db-shm') 326 | wal = cache.with_suffix('.db-wal') 327 | with suppress(FileNotFoundError): 328 | cache.unlink() 329 | shm.unlink() 330 | wal.unlink() 331 | 332 | def finish(self, merge=True): 333 | if not merge: 334 | self.cork() 335 | self.close() 336 | return 337 | self.merge() 338 | self.close() 339 | self.cleanup() 340 | 341 | def _tally(self, conn): 342 | count = self.db.count_rows(conn) 343 | diff = {t: v is not None and count[t] - v for t, v in self._rowcounts[conn].items()} 344 | msg = ['Database stats:'] 345 | for table in self.db.tablemap: 346 | if diff[table] is not False: 347 | msg.append(f' {table}: {count[table]} ({diff[table]:+})') 348 | else: 349 | msg.append(f' {table}: {count[table]}') 350 | self._rowcounts[conn].update(count) 351 | return msg 352 | 353 | def report(self): 354 | for line in self._tally(self._main): 355 | self.log.info(line) 356 | 357 | def __enter__(self): 358 | return self 359 | 360 | def __exit__(self, typ, val=None, tb=None): 361 | self.close() 362 | if not typ: 363 | return True 364 | if val is None: 365 | if tb is None: 366 | raise typ 367 | val = typ() 368 | if tb is not None: 369 | val = val.with_traceback(tb) 370 | raise val 371 | -------------------------------------------------------------------------------- /feedme/sql/utils.py: -------------------------------------------------------------------------------- 1 | 
# MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | def bulk_fetch(cur, size=100000, log=None): 25 | i = 0 26 | rows = cur.fetchmany(size) 27 | while rows: 28 | for row in rows: 29 | i += 1 30 | yield row 31 | if log: 32 | log.info(f'Fetched {i} rows.') 33 | rows = cur.fetchmany(size) 34 | 35 | 36 | def offset_fetch(conn, stmt, table, *, values=(), size=100000, log=None): 37 | i = 0 38 | offset = 0 39 | max_id = conn.execute(f'SELECT max(rowid) FROM {table}').fetchone()[0] 40 | if not max_id: 41 | return 42 | while offset <= max_id: 43 | limited = stmt % {'offset': ( 44 | f'{table}.rowid IN ' 45 | f'(SELECT rowid FROM {table} ' 46 | f'ORDER BY rowid LIMIT {size} OFFSET {offset})' 47 | )} 48 | rows = conn.execute(limited, values) 49 | for row in rows: 50 | i += 1 51 | yield row 52 | if log and i: 53 | log.info(f'Fetched {i} rows.') 54 | offset += size 55 | -------------------------------------------------------------------------------- /feedme/urlkit.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE.
22 | 23 | from typing import Tuple 24 | from urllib.parse import SplitResult, urlsplit 25 | 26 | from .datastructures import labeled_sequence 27 | 28 | 29 | def is_http(u): 30 | return isinstance(u, str) and urlsplit(u).scheme in {'http', 'https'} 31 | 32 | 33 | def is_absolute_http(u): 34 | if not isinstance(u, str): 35 | return False 36 | s = urlsplit(u) 37 | return s.scheme in {'http', 'https'} or s.scheme == '' and s.netloc 38 | 39 | 40 | def ensure_protocol(u, protocol='http'): 41 | s = urlsplit(u) 42 | return u if s.scheme else f'{protocol}:{u}' 43 | 44 | 45 | def domain_parents(domain: str) -> Tuple[str]: 46 | parts = domain.split('.') 47 | return tuple('.'.join(parts[-i:]) for i in range(len(parts), 1, -1)) 48 | 49 | 50 | def no_scheme(url: SplitResult) -> str: 51 | return url.geturl()[len(f'{url.scheme}:'):] 52 | 53 | 54 | def path_only(url: SplitResult) -> str: 55 | return url.geturl()[len(f'{url.scheme}://{url.netloc}'):] 56 | 57 | 58 | def select_templates(query, template_tree): 59 | matches = {r: r.match(query) for r in template_tree} 60 | matches = [(r, m) for r, m in matches.items() if m] 61 | if not matches: 62 | raise ValueError('No template provider') 63 | pattern, match = matches[0] 64 | templates = template_tree[pattern] 65 | if not callable(templates): 66 | templates = [t[0] for t in templates.items()] 67 | return match, templates 68 | 69 | 70 | def build_urls(base, match, templates): 71 | parsed = urlsplit(base) 72 | if callable(templates): 73 | return templates(parsed, match) 74 | specifiers = { 75 | **parsed._asdict(), 76 | 'network_path': no_scheme(parsed), 77 | 'path_query': path_only(parsed), 78 | 'original': parsed.geturl(), 79 | **match.groupdict(), 80 | **labeled_sequence(match.groups(), start=1, as_str=True), 81 | } 82 | return [t % specifiers for t in templates] 83 | -------------------------------------------------------------------------------- /feedme/utils.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | from __future__ import annotations 24 | 25 | import logging 26 | import pickle 27 | import random 28 | import string 29 | import time 30 | from contextlib import contextmanager 31 | from datetime import datetime, timezone 32 | from hashlib import sha1 33 | from logging.handlers import QueueListener 34 | from multiprocessing import Queue 35 | from operator import gt 36 | from typing import Any, Dict, List, Set, TypeVar, Union 37 | from urllib.parse import urlsplit 38 | 39 | import simplejson as json 40 | from scrapy.http import Request, TextResponse 41 | 42 | from .datastructures import KeywordCollection, KeywordStore 43 | from .urlkit import domain_parents, ensure_protocol, is_absolute_http 44 | 45 | try: 46 | from termcolor import colored 47 | except ImportError: 48 | def colored(t, *args, **kwargs): 49 | return t 50 | 51 | JSONType = Union[str, bool, int, float, None, List['JSONType'], Dict[str, 'JSONType']] 52 | JSONDict = Dict[str, JSONType] 53 | SpiderOutput = List[Union[JSONDict, Request]] 54 | 55 | log = logging.getLogger('main.utils') 56 | 57 | 58 | class RobustQueueListener(QueueListener): 59 | def _monitor(self): 60 | try: 61 | super()._monitor() 62 | except EOFError: 63 | log.warning('Log listener has prematurely stopped.') 64 | 65 | 66 | class QueueListenerWrapper: 67 | def __init__(self): 68 | self.queue = None 69 | self.listener = None 70 | 71 | def enable(self): 72 | if self.queue: 73 | return self.queue 74 | self.queue = Queue() 75 | self.listener = RobustQueueListener(self.queue, *logging.getLogger().handlers, respect_handler_level=True) 76 | self.listener.start() 77 | return self.queue 78 | 79 | def disable(self): 80 | if not self.queue: 81 | return 82 | self.listener.stop() 83 | self.queue = None 84 | self.listener = None 85 | 86 | def start(self): 87 | if not self.listener: 88 | return 89 | if not self.listener._thread: 90 | self.listener.start() 91 | return self.queue 92 | 93 | def stop(self): 94 | if not self.listener: 95 | return 96 | if self.listener._thread: 97 | self.listener.stop() 98 | return self.queue 99 | 100 | 101 | LOG_LISTENER = QueueListenerWrapper() 102 | 103 | 104 | def append_stem(path, appendage): 105 | return path.with_name(f'{path.stem}{appendage}').with_suffix(path.suffix) 106 | 107 | 108 | def fmttimedelta(td): 109 | s = int(td.total_seconds()) 110 | h, r = divmod(s, 3600) 111 | m, s = divmod(r, 60) 112 | return f'{h}h {m}m {s}s' 113 | 114 | 115 | def randstr(length, choices='0123456789abcdef'): 116 | return ''.join(random.choices(choices, k=length)) 117 | 118 | 119 | def parse_html(domstring, url='about:blank') -> TextResponse: 120 | return TextResponse(url=url, body=domstring, encoding='utf8') 121 | 122 | 123 | def json_converters(value: Any) -> JSONType: 124 | if isinstance(value, datetime): 125 | return value.isoformat() 126 | raise TypeError(type(value)) 127 | 128 | 129 | def load_jsonlines(file) -> List[JSONDict]: 130 | return [json.loads(line) for line in file.read().split('\n') if line] 131 | 132 | 133 | def datetime_converters(dt: Union[str, int, float, datetime], tz=timezone.utc) -> datetime: 134 | if isinstance(dt, datetime): 135 | return dt 136 | if isinstance(dt, str): 137 | return datetime.fromisoformat(dt) 138 | if isinstance(dt, (int, float)): 139 | try: 140 | return datetime.fromtimestamp(dt, tz=tz) 141 | except (ValueError, OSError): 142 | return datetime.fromtimestamp(dt / 1000, tz=tz) 143 | raise TypeError('dt must be of type str, int, float, or datetime') 144 | 145 | 146 | def sha1sum(s: Union[str, bytes]) -> str: 
147 | if isinstance(s, str): 148 | s = s.encode() 149 | return sha1(s).hexdigest() 150 | 151 | 152 | def ensure_collection(supplier): 153 | def converter(obj): 154 | if obj is None: 155 | return supplier() 156 | return supplier(obj) 157 | return converter 158 | 159 | 160 | def is_rss_xml(response: TextResponse): 161 | ctype = response.headers.get('Content-Type') 162 | if not ctype: 163 | return True 164 | ctype = ctype.decode('utf8').split(';')[0] 165 | return ctype in {'text/xml', 'application/xml', 'application/rss+xml', 166 | 'application/rdf+xml', 'application/atom+xml'} 167 | 168 | 169 | def falsy(v): 170 | return v in {0, None, False, '0', 'None', 'none', 'False', 'false', 'null', 'undefined', 'NaN'} 171 | 172 | 173 | def wait(t): 174 | t0 = time.perf_counter() 175 | while time.perf_counter() - t0 < t: 176 | time.sleep(0.1) 177 | 178 | 179 | @contextmanager 180 | def watch_for_timing(name, limit=0): 181 | start = time.perf_counter() 182 | try: 183 | yield 184 | finally: 185 | duration = time.perf_counter() - start 186 | message = None 187 | level = None 188 | if limit and duration > limit: 189 | message = colored(f'[Performance violation] {name} took {duration * 1000:.0f}ms; ' 190 | f'desired time is {limit * 1000:.0f}ms.', color='yellow') 191 | level = logging.INFO 192 | elif not limit: 193 | message = f'{name} took {duration * 1000:.0f}ms' 194 | level = logging.DEBUG 195 | if message: 196 | logging.getLogger('profiler.timing').log(level, message) 197 | 198 | 199 | @contextmanager 200 | def watch_for_len(name, col, limit=0, comp=gt, desc='fewer than'): 201 | try: 202 | yield 203 | finally: 204 | size = len(col) 205 | message = None 206 | level = None 207 | if limit and comp(size, limit): 208 | message = colored(f'[Length violation] Number of {name} is {size}; ' 209 | f'desired is {desc} {limit:.0f}.', color='yellow') 210 | level = logging.INFO 211 | elif not limit: 212 | message = f'Number of {name} is {size}.' 
213 | level = logging.DEBUG 214 | if message: 215 | logging.getLogger('profiler.containerlen').log(level, message) 216 | 217 | 218 | def guard_json(text: str) -> JSONDict: 219 | try: 220 | return json.loads(text) 221 | except json.JSONDecodeError as e: 222 | log.error(e) 223 | return {} 224 | 225 | 226 | def read_jsonlines(f, *, delimiter='\0\n', on_error='raise', paginate=100000, on_paginate=None): 227 | i = 0 228 | k = 0 229 | p = paginate - 1 230 | 231 | next_line = f.readline() 232 | while next_line: 233 | i += 1 234 | 235 | if next_line == delimiter: 236 | k += 1 237 | next_line = f.readline() 238 | if paginate and k == p: 239 | p += paginate 240 | yield i, k, on_paginate 241 | continue 242 | 243 | try: 244 | yield i, k, json.loads(next_line.rstrip()) 245 | 246 | except json.JSONDecodeError: 247 | if on_error == 'raise': 248 | raise 249 | if on_error == 'continue': 250 | continue 251 | return 252 | 253 | next_line = f.readline() 254 | 255 | 256 | PATH_UNSAFE = ''.join(set(string.punctuation + ' ') - set('-_/.')) 257 | 258 | 259 | def aggressive_replace_chars(s, encoding='latin_1'): 260 | return s.encode(encoding, 'replace').decode(encoding, 'ignore') 261 | 262 | 263 | def replace_unsafe_chars(s, repl='-', chars=PATH_UNSAFE): 264 | for c in chars: 265 | if c in s: 266 | s = s.replace(c, repl) 267 | return s 268 | 269 | 270 | def pathsafe(s): 271 | return replace_unsafe_chars(aggressive_replace_chars(s)) 272 | 273 | 274 | SIMPLEJSON_KWARGS = { 275 | 'ensure_ascii': True, 276 | 'default': json_converters, 277 | 'for_json': True, 278 | 'iterable_as_array': True, 279 | } 280 | 281 | 282 | class RenamingUnpickler(pickle.Unpickler): 283 | def find_class(self, module, name): 284 | if module[:7] == 'feedly.': 285 | module = 'feedme.' + module[7:] 286 | if name == 'ProbeRequest': 287 | name = 'ProbeFeed' 288 | return super().find_class(module, name) 289 | 290 | 291 | class HyperlinkStore(KeywordStore): 292 | TARGET_ATTRS = {'src', 'href', 'data-src', 'data-href'} 293 | 294 | def __init__(self, serialized: JSONDict = None): 295 | super().__init__() 296 | self._index: Dict[int, str] 297 | if serialized: 298 | self._deserialize(serialized) 299 | 300 | def _deserialize(self, dict_: JSONDict): 301 | for k, v in dict_.items(): 302 | hash_ = hash(k) 303 | self._index[hash_] = k 304 | self._taggings[hash_] = {c: set(ls) for c, ls in v.items()} 305 | 306 | def parse_html(self, source, markup, **kwargs): 307 | markup = parse_html(markup) 308 | for attrib in self.TARGET_ATTRS: 309 | elements = markup.css(f'[{attrib}]') 310 | for tag in elements: 311 | url = tag.attrib.get(attrib) 312 | if not is_absolute_http(url): 313 | continue 314 | url = ensure_protocol(url) 315 | 316 | keywords: KeywordCollection = { 317 | 'source': {source}, 318 | 'domain': set(domain_parents(urlsplit(url).netloc)), 319 | 'tag': set(), 320 | } 321 | keywords['tag'].add(tag.xpath('name()').get()) 322 | self.put(url, **keywords, **kwargs) 323 | 324 | 325 | T = TypeVar('T') 326 | 327 | 328 | def findpath(start: T, dest: T, segments: Dict[T, Set[T]], path: List[T]) -> bool: 329 | path.append(start) 330 | 331 | if start not in segments: 332 | path.pop() 333 | return False 334 | 335 | next_routes = segments.get(start, set()) - set(path) 336 | if dest in next_routes: 337 | path.append(dest) 338 | return True 339 | 340 | for r in next_routes: 341 | found = findpath(r, dest, segments, path) 342 | if found: 343 | return found 344 | 345 | path.pop() 346 | return False 347 | 
-------------------------------------------------------------------------------- /presets/_autoload.py: -------------------------------------------------------------------------------- 1 | # This file is used to autoload presets in this folder when a feed URL from 2 | # certain websites is provided but the PRESET option is not set. 3 | # 4 | # Deleting this file disables this feature, and deleting/renaming predefined 5 | # presets in this folder causes auto-load for that website to be disabled. 6 | 7 | _SITES = { 8 | r'.*\.livejournal\.com/?.*': 'livejournal', 9 | r'.*\.tumblr\.com/?.*': 'tumblr', 10 | r'.*\.wordpress\.com/?.*': 'wordpress', 11 | } 12 | -------------------------------------------------------------------------------- /presets/keywords.py: -------------------------------------------------------------------------------- 1 | PRIORITIZED_KEYWORDS = { 2 | 10: ['cats', 'kitties'], 3 | 5: ['dogs', 'puppies'], 4 | -5: ['goldfish'], 5 | -float('inf'): ['rat'], 6 | } 7 | -------------------------------------------------------------------------------- /presets/livejournal.py: -------------------------------------------------------------------------------- 1 | DEPTH_LIMIT = 2 2 | 3 | FOLLOW_DOMAINS = {'livejournal.com'} 4 | 5 | RSS_TEMPLATES = { 6 | r'.*\.livejournal\.com/?.*': { 7 | 'http://%(netloc)s/data/rss': 100, 8 | 'https://%(netloc)s/data/rss': 200, 9 | 'http://%(netloc)s/data/atom': 300, 10 | 'https://%(netloc)s/data/atom': 400, 11 | }, 12 | } 13 | -------------------------------------------------------------------------------- /presets/tumblr.py: -------------------------------------------------------------------------------- 1 | DEPTH_LIMIT = 2 2 | 3 | FOLLOW_DOMAINS = {'tumblr.com'} 4 | SELECT_FEED_STATE = 'dead+' 5 | 6 | 7 | def converter(base, match): 8 | for scheme in ('http', 'https'): 9 | for ending in ('rss', 'rss#_=_'): 10 | yield f'{scheme}://{base.netloc}/{ending}' 11 | 12 | 13 | def deactivated_converter(base, match): 14 | for scheme in ('http', 'https'): 15 | for ending in ('rss', 'rss#_=_'): 16 | yield f'{scheme}://{match.group(1)}.tumblr.com/{ending}' 17 | yield from converter(base, match) 18 | 19 | 20 | RSS_TEMPLATES = { 21 | r'https?://(.*)-deactivated\d*\.tumblr\.com/?.*': deactivated_converter, 22 | r'.*\.tumblr\.com/?.*': converter, 23 | } 24 | 25 | TUMBLR_IGNORE = { 26 | 'www.tumblr.com', 'staff.tumblr.com', 'tumblr.com', 27 | 'engineering.tumblr.com', 'support.tumblr.com', 28 | 'assets.tumblr.com', 29 | } 30 | 31 | CONTRIB_SPIDER_MIDDLEWARES = { 32 | 'feedme.contrib.filters.KeywordPrioritizer': 500, 33 | 'feedme.contrib.tumblr.TumblrFilter': 505, 34 | } 35 | -------------------------------------------------------------------------------- /presets/wordpress.py: -------------------------------------------------------------------------------- 1 | DEPTH_LIMIT = 2 2 | 3 | FOLLOW_DOMAINS = {'wordpress.com'} 4 | 5 | 6 | def template(base, match): 7 | for scheme in ('http', 'https'): 8 | for ending in ('?feed=rss', '?feed=rss2', '?feed=rdf', '?feed=atom', 9 | 'feed/', 'feed/rss/', 'feed/rss2/', 'feed/rdf/', 'feed/atom/'): 10 | yield f'{scheme}://{base.netloc}/{ending}' 11 | 12 | 13 | RSS_TEMPLATES = { 14 | r'.*\.wordpress\.com/?.*': template, 15 | } 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "feedme" 3 | version = "0.10.11" 4 | description = "" 5 | authors = ["tonyzbf "] 6 | license = "MIT" 7 | 8 
| [tool.poetry.dependencies] 9 | python = "^3.9" 10 | click = "^8.0.3" 11 | Scrapy = "^2.5.1" 12 | cryptography = "^36.0.1" 13 | cffi = "^1.15.0" 14 | pycparser = "^2.21" 15 | six = "^1.16.0" 16 | cssselect = "^1.1.0" 17 | itemadapter = "^0.4.0" 18 | itemloaders = "^1.0.4" 19 | jmespath = "^0.10.0" 20 | parsel = "^1.6.0" 21 | lxml = "^4.7.1" 22 | w3lib = "^1.22.0" 23 | Protego = "^0.1.16" 24 | PyDispatcher = "^2.0.5" 25 | pyOpenSSL = "^21.0.0" 26 | queuelib = "^1.6.2" 27 | service-identity = "^21.1.0" 28 | attrs = "^21.4.0" 29 | pyasn1 = "^0.4.8" 30 | pyasn1-modules = "^0.2.8" 31 | Twisted = "^21.7.0" 32 | Automat = "^20.2.0" 33 | constantly = "^15.1.0" 34 | hyperlink = "^21.0.0" 35 | idna = "^3.3" 36 | incremental = "^21.3.0" 37 | PyHamcrest = "^2.0.3" 38 | "zope.interface" = "^5.4.0" 39 | setuptools = "^60.3.0" 40 | simplejson = "^3.17.6" 41 | termcolor = "^1.1.0" 42 | aiofiles = "^0.8.0" 43 | aiohttp = "^3.8.1" 44 | more-itertools = "^8.12.0" 45 | SQLAlchemy = "^1.4.29" 46 | python-igraph = "^0.9.8" 47 | texttable = "^1.6.4" 48 | 49 | [tool.poetry.dev-dependencies] 50 | 51 | [build-system] 52 | requires = ["poetry-core>=1.0.0"] 53 | build-backend = "poetry.core.masonry.api" 54 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = feedme.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = feedme 12 | --------------------------------------------------------------------------------