├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── feedme ├── __init__.py ├── __main__.py ├── cli.py ├── commands │ ├── check-db.py │ ├── cleanup.py │ ├── commands.py │ ├── crawl.py │ ├── export.py │ ├── merge-db.py │ ├── options.py │ ├── resume.py │ ├── upgrade-db.py │ └── utils.py ├── contrib │ ├── __init__.py │ ├── browser.py │ ├── filters.py │ └── tumblr.py ├── datastructures.py ├── docs.py ├── exporters │ ├── __init__.py │ ├── exporters.py │ ├── graph.py │ ├── uncharted.py │ ├── urls.py │ └── utils.py ├── extensions.py ├── feedly.py ├── logger.py ├── middlewares.py ├── pipelines.py ├── requests.py ├── settings.py ├── signals.py ├── spiders │ ├── __init__.py │ ├── base.py │ ├── cluster.py │ ├── feed.py │ └── settings.py ├── sql │ ├── __init__.py │ ├── cli.py │ ├── db.json │ ├── db.py │ ├── describe.py │ ├── factory.py │ ├── functions.py │ ├── migrations │ │ ├── 0.10.3_0.10.5.sql │ │ ├── 0.10.3_0.10.6.sql │ │ ├── 0.10.5_0.10.6.sql │ │ └── 0.10_0.10.3.sql │ ├── schema.py │ ├── stream.py │ └── utils.py ├── urlkit.py └── utils.py ├── poetry.lock ├── presets ├── _autoload.py ├── keywords.py ├── livejournal.py ├── tumblr.py └── wordpress.py ├── pyproject.toml └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # Environments
101 | *.env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | # mkdocs documentation
117 | /site
118 |
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 |
124 | # Pyre type checker
125 | .pyre/
126 |
127 | # VSCode
128 | .vscode/
129 | .env.vscode
130 |
131 | .flake8
132 |
133 | .temp/
134 |
135 | /*.crawl.*
136 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## Changelog
2 |
3 | - **v0.10.11**
4 |   - Accessibility update:
5 |     - New command `scrapy wizard`: an interactive command that can perform simple tasks such as scraping and
6 |       exporting.
7 |     - Batch/shell scripts for setting up the program.
8 |     - New command `scrapy resume`.
9 |     - Module-level commands are now available as `scrapy` commands.
10 |   - `multiprocessing` now uses `spawn` on Windows, `forkserver` on macOS (Darwin), and `fork` on Linux.
11 | - **v0.10.10**
12 |   - Architectural update:
13 |     - Signal-based request persistence and restoration.
14 |     - Accept and log stats from any component.
15 |     - Accept and persist state info from any component.
16 |     - More thread-based I/O for better and more robust performance.
17 |   - NEW exporter _uncharted_: Export a list of websites that are "uncharted" — websites that were not scraped
18 |     as RSS feeds during a crawl, but were recorded in the database because other feeds mentioned them.
19 |   - NEW middleware `KeywordPrioritizer`: Adjust the priority of a request based on the frequency of specified
20 |     keywords in its text content.
21 |   - NEW option `CONTRIB_SPIDER_MIDDLEWARE`: Use additional spider middlewares together with those defined
22 |     in the settings. Suitable for defining custom filtering/prioritizing logic. (The use of custom functions
23 |     in presets as seen in older versions is no longer supported.)
24 |   - NEW option `EXPANSION_THRESHOLD`: An integer. Instead of treating every new website it encounters as a
25 |     potential new feed, the cluster spider will only start crawling a new feed if the number of times a website
26 |     was seen crosses this threshold. Setting this higher than 1 makes feeds in the resulting cluster more
27 |     related to each other, since they mention each other more.
28 |   - NEW command `python -m feedme customizations`: a manual of supported options that can be specified in
29 |     a preset.
30 | - **v0.10.6**
31 |   - Performance update.
32 |   - Fixed memory leak issues with the request persistence module.
33 |   - **![#f06073](https://placehold.it/12/f06073/000000?text=+) This version introduces API-breaking changes.**
34 |     - Package name has changed.
35 |     - Spider options must now be specified using the `-s` command-line option, and not `-a`.
36 | - **v0.10.5**
37 |   - _On-the-fly persistence:_ Instead of writing scraped data to a temporary file and then digesting that file once crawling is finished,
38 |     the program now writes to databases while scraping, using a separate process.
39 |   - _Crawl dead feeds only:_ A new option `FEED_STATE_SELECT` that allows the selection/prioritization of dead/living feeds.
40 |   - ![#e5c07b](https://placehold.it/12/e5c07b/000000?text=+) This version introduces database schema changes. Databases from
41 |     v0.10.1 onwards can be upgraded to this version.
42 | - **v0.10.3**
43 |   - _Optimization:_ Persisting data to the database now requires less memory (with a slight time trade-off).
44 |   - ![#e5c07b](https://placehold.it/12/e5c07b/000000?text=+) This version introduces database schema changes. Databases from
45 |     v0.10.1 onwards can be upgraded to this version.
46 | - **v0.10.2**
47 |   - _Cluster spider algorithm:_ The cluster spider now does breadth-first crawls, meaning it will crawl feeds closer to the starting feed
48 |     to completion before crawling feeds that are further away.
49 |   - _Persistence:_ Now uses pickle to persist requests to achieve more accurate resumption.
50 | - **v0.10.1**
51 |   - **![#f06073](https://placehold.it/12/f06073/000000?text=+) This version introduces API-breaking changes.**
52 |   - _Command change:_ The commands for both crawling and exporting have changed. See the above sections for details.
53 |   - _Output:_
54 |     - All spiders now require the output path to be an available directory.
55 |     - All spiders now persist scraped data using SQLite databases.
56 |     - It is possible to run any of the spiders multiple times on the same output directory; scraped data are automatically
57 |       merged and deduplicated.
58 |   - _Presets:_ You can now use presets to maintain different sets of crawling options. Since presets are Python files, you can
59 |     also specify complex settings, such as custom URL filtering functions, that cannot be specified on the command line.
60 |   - _URL templates:_ The search function introduced in v0.3 is now off by default, because it was discovered that Feedly's Search API
61 |     is a lot more sensitive to high-volume requests. Instead of relying on search, you can specify URL templates that allow the spiders
62 |     to attempt different variations of feed URLs.
63 |   - _New cluster spider:_ A new spider that, instead of crawling a single feed, also attempts to crawl any website mentioned in the feed's
64 |     content that might itself be an RSS feed, resulting in a network of sites being crawled. (It's like a search engine spider, but for RSS feeds.)
65 |   - _Export sorting and format:_ The revamped export module lets you select and sort URLs into different files. You may now export in
66 |     both plain-text lines and CSV format.
67 |   - _Graph export:_ You may now export link data as GraphML graphs, useful for visualization and network analysis. _Requires `python-igraph`._
68 |     _Install with `pip install -r requirements-graph.txt`._
69 | - **v0.3**
70 |   - _Fuzzy search:_ It's no longer necessary to specify the full URL to the RSS feed data. The spider now uses Feedly's Search API to
71 |     determine the correct URL. This means that you can simply specify e.g. the website's domain name, and Feedly will resolve it for you.
72 |     In case there are multiple matches, they will be printed so that you can choose one and try again.
73 | - **v0.1**
74 |   - _URL filtering:_ You can now specify which URLs to include/exclude when running the `collect-urls` command. For example:
75 |     `--include tag=a --exclude domain=secure.bank.com` will print out all URLs found on HTML `<a>` tags, except for those whose
76 |     domains or parent domains contain "secure.bank.com".
77 |   - _Feedly keywords:_ Feedly keyword data are now included in the crawl data, which you can use for filtering when running `collect-urls`,
78 |     using the `feedly_keyword=` filter.
Additionally, there is a new `collect-keywords` command that lists all keywords found in a crawl. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 tonyzbf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # feedly-link-feedme 2 | 3 | A Scrapy project for collecting hyperlinks from RSS feeds using Feedly's [Streams API](https://developer.feedly.com/v3/streams/). 4 | 5 | **Note⚠: This project provides a way to quickly aggregate resources such as images in an RSS feed** 6 | **for purposes such as archival work. If you are only looking to browse a feed and/or download a few things,** 7 | **it's more appropriate (and friendly) to use [Feedly](https://feedly.com) directly.** 8 | 9 | ## Requirement 10 | 11 | Requires Python 3.8+ 12 | 13 | ## Quick usage 14 | 15 | > This section describes basic usage of this program that requires little knowledge of Python or even command lines in general. 16 | > 17 | > See the [next section](#setup) for more advanced usage of this program as a command-line tool. 18 | 19 | ![#96c475](https://via.placeholder.com/12/96c475/000000?text=+) [Download](https://github.com/monotony113/feedly-link-feedme/archive/master.zip) the archive and 20 | extract it somewhere, then: 21 | 22 | - On Windows, run `start.bat`. This will start an interactive program that can perform the most common tasks. 23 | - On macOS/Linux, run `start.sh` (you may need to fix the permission first, and you may need to know how to start it 24 | from the terminal if launching it in your file manager doesn't work). 25 | 26 | ## Contents 27 | 28 | - [Documentation](#documentation) 29 | - [Setup](#setup) 30 | - [Crawling](#crawling) 31 | - [Presets](#presets) 32 | - [Exporting](#exporting) 33 | - [Cluster spider](#cluster-spider) 34 | - [Changelog](./CHANGELOG.md) 35 | - [Notes](#notes) 36 | 37 | ## Documentation 38 | 39 | ### Setup 40 | 41 | Using a virtual environment is highly recommended. 42 | 43 | ```bash 44 | > python3 -m pip install -r requirements.txt 45 | ``` 46 | 47 | > Note that this command will fail on Windows if Visual C++ build tools are not installed. 
The recommended way
48 | > to install dependencies on Windows is to use the `install.bat` script.
49 |
50 | ### Crawling
51 |
52 | ```bash
53 | > scrapy crawl <spider> '<feed url>' -o '<output directory>' [-s additional options...]
54 | ```
55 |
56 | > If this command complains that scrapy cannot be found, your Python packages are not on your PATH.
57 | > You may either add them to your PATH, or begin the command with `python -m scrapy`.
58 |
59 | Currently available spiders are `feed` and `cluster`. `feed` crawls a single feed; [`cluster`](#cluster-spider)
60 | begins with a single feed but attempts to further explore websites that are mentioned in the beginning feed.
61 |
62 | Each spider option is specified using the `-s` option followed by a `key=value` pair.
63 |
64 | Example:
65 |
66 | ```bash
67 | > scrapy crawl feed http://xkcd.com/atom.xml -o xkcd -s download_order=newest
68 | ```
69 |
70 | ### Presets
71 |
72 | In addition to specifying options via the command line, you can also specify a preset.
73 |
74 | ```bash
75 | > scrapy crawl <spider> -s preset='<path to preset>'
76 | ```
77 |
78 | A preset is just a Python script whose top-level variable names and values are used as key-value pairs to populate
79 | the spider config:
80 |
81 | ```python
82 | from datetime import datetime
83 | RSS = 'https://xkcd.com/atom.xml'
84 | OUTPUT = f'instance/xkcd-{datetime.now()}'
85 | ...
86 | ```
87 |
88 | Only variables whose names contain only uppercase letters, numbers and underscores will be used.
89 |
90 | Presets also let you define more complex behaviors, such as URL filtering, since you can define functions and mappings.
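
For instance, a preset for the cluster spider could mix simple values with richer Python structures. The sketch below is only illustrative: the file name and values are made up, and the option names (`DEPTH_LIMIT`, `ENABLE_SEARCH`, `RSS_TEMPLATES`) are the ones documented in the [Cluster spider](#cluster-spider) section; run `scrapy options` for the authoritative list.

```python
# cluster-preset.py (name is arbitrary); only ALL-CAPS variables are picked up as options.
from datetime import datetime

RSS = 'https://xkcd.com/atom.xml'
OUTPUT = f'instance/xkcd-{datetime.now():%Y%m%d-%H%M%S}'

DEPTH_LIMIT = 2        # stop expanding the cluster beyond 2 degrees of separation
ENABLE_SEARCH = False  # keep Feedly search off to avoid rate limiting

# Because a preset is plain Python, values can also be dicts, sets, functions, etc.
RSS_TEMPLATES = {
    r'.*\.wordpress\.com.*': {           # for WordPress sites...
        'http://%(netloc)s/feed/': 100,  # ...try this endpoint first
    },
}
```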
91 |
92 | For a list of supported options, run `scrapy options`. Options that are
93 | simple string/integer values can also be specified on the command line with a case-insensitive key, in which case they take
94 | precedence over the ones defined in a preset.
95 |
96 | ### Exporting
97 |
98 | ```bash
99 | > scrapy export <topic> -i '<input directory>'
100 | ```
101 |
102 | > Previous versions of this program used a different command `python -m feedme` for tasks unrelated to Scrapy,
103 | > which is still supported.
104 | >
105 | > However, the `scrapy` command now supports running those commands as well, and is
106 | > recommended for uniformity.
107 | >
108 | > For a list of all available commands, run `scrapy`.
109 |
110 | Currently `<topic>` can be
111 |
112 | - `urls`: Export URLs as plain-text or CSV files.
113 | - `graph`: Represent URLs and their relations using a graph data structure (exported as GraphML files).
114 |
115 | **![#56b6c2](https://via.placeholder.com/12/56b6c2/000000?text=+) Example: Tumblr GIFs**
116 |
117 | ```bash
118 | scrapy export urls -i data \
119 |     --include tag is img \
120 |     --include source:netloc under tumblr.com \
121 |     --include target:netloc under media.tumblr.com \
122 |     --include target:path endswith .gif \
123 |     --include published:year lt 2017 \
124 |     --output "%(feed:netloc)s/%(published:year)d%(published:month)02d.txt"
125 | ```
126 |
127 | This command will select
128 |
129 | - all image URLs that end with `.gif`
130 | - pointing to domains under `media.tumblr.com` (Tumblr CDN servers)
131 | - from posts before 2017
132 | - found on all crawled subdomains of `tumblr.com` (such as `staff.tumblr.com`),
133 |
134 | export them, and sort them into folders and files based on
135 |
136 | - the source domain name (i.e. the blog website)
137 | - followed by the year and month of the date the post was published,
138 |
139 | resulting in a folder structure that looks like
140 |
141 |     ./data/out/
142 |         staff.tumblr.com/
143 |             201602.txt
144 |             201603.txt
145 |             ...
146 |         .../
147 |
148 | ----
149 |
150 | For the `urls` exporter, the following features are available. Use the `-h`/`--help` option for complete documentation:
151 | `scrapy export urls --help`.
152 |
153 | #### Output template
154 |
155 | Instead of specifying a regular file name for the output file with the `-o` option, you can use a Python %-formatted
156 | template string:
157 |
158 | ```python
159 | -o "%(target:netloc).6s-%(published:year)d.txt"
160 | ```
161 |
162 | This way, you can sort URLs that come from different sources or have different values (such as domain names) into different
163 | files and even folders to your liking (a short illustration of the formatting mechanics follows at the end of this section).
164 |
165 | For example, with scraped data from the feed [`https://xkcd.com/atom.xml`](https://xkcd.com/atom.xml), an export command
166 |
167 | ```bash
168 | > scrapy export urls -i data -o "%(feed:title)s/%(tag)s/%(target:netloc)s.csv"
169 | ```
170 |
171 | could generate the following directory structure:
172 |
173 |     ./data/out/
174 |         xkcd.com/
175 |             img/
176 |                 imgs.xkcd.com.csv
177 |                 xkcd.com.csv
178 |                 ...
179 |             a/
180 |                 itunes.apple.com.csv
181 |                 www.barnesandnoble.com.csv
182 |                 ...
183 |
184 | For a list of available placeholders, see the command help: `scrapy export urls --help`.
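
Under the hood these templates are resolved with ordinary Python %-style formatting against a mapping of placeholder values, so the usual conversion flags (width, precision, zero-padding) behave as expected. A quick, self-contained illustration with made-up values:

```python
# Plain %-formatting against a mapping; conceptually, the exporter fills in
# one such mapping for every URL it exports (the values below are invented).
row = {'feed:netloc': 'staff.tumblr.com', 'published:year': 2016, 'published:month': 3}

print('%(feed:netloc)s/%(published:year)d%(published:month)02d.txt' % row)
# staff.tumblr.com/201603.txt

print('%(feed:netloc).6s-%(published:year)d.txt' % row)
# staff.-2016.txt  (".6s" keeps only the first six characters)
```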
185 |
186 | #### Filtering
187 |
188 | Use the `--include`/`--exclude` options (shorthands `+f`/`-f`) to specify filters:
189 |
190 | ```bash
191 | +f source:netloc is "xkcd.com"
192 | # URLs that are found in markups from xkcd.com
193 | -f target:netloc is "google.com"
194 | # URLs that are NOT pointing to google.com
195 | +f target:path startswith "/wp-content"
196 | # URLs whose path components begin with "/wp-content".
197 | ```
198 |
199 | Filter options can be specified multiple times to enable multiple filters. Only URLs that pass _all_ filters are exported.
200 |
201 | You can filter on URL components, feed and post titles, and dates published. For a list of filterable attributes (they are the
202 | same as the naming template placeholders), see the command help: `scrapy export urls --help`.
203 |
204 | ### Cluster spider
205 |
206 | Version v0.10 introduces a new spider called `cluster`. As the name suggests, this spider crawls not a single feed, but a cluster of feeds.
207 |
208 | How it works:
209 |
210 | 1. The spider begins with a single feed, specified through the `RSS` option.
211 | 2. As it crawls through the beginning feed, it parses the HTML markup snippets provided by Feedly, extracting URLs from them.
212 | 3. For each website it encounters, it checks whether that website exists as a valid RSS feed on Feedly, and if so,
213 |    it will start crawling that website too.
214 | 4. This process continues until either
215 |    - a depth limit is hit (specified with `-s depth_limit=`, or in a preset file as `DEPTH_LIMIT`), in which case it will finish crawling the feeds that are
216 |      `depth + 1` degrees removed from the starting feed, but will not expand beyond them; or
217 |    - the spider is interrupted.
218 |
219 | How many sites the spider can crawl depends on whether it can discover a valid RSS feed URL from just a domain name. There are two ways to make this possible:
220 | - Provide feed templates via a preset file. For example, knowing that WordPress sites provide RSS feeds through
221 |   [fixed endpoints such as `/?rss=rss` and `/feed/`](https://wordpress.org/support/article/wordpress-feeds/#finding-your-feed-url),
222 |   you can define your templates like so:
223 |
224 |   ```python
225 |   RSS_TEMPLATES = {
226 |       r'.*\.wordpress\.com.*': {  # will match *.wordpress.com
227 |           'http://%(netloc)s/?rss=rss': 100,  # number denotes precedence
228 |           'http://%(netloc)s/?rss=rss2': 200,
229 |           'http://%(netloc)s/?rss=atom': 300,
230 |           'http://%(netloc)s/feed/': 400,
231 |           'http://%(netloc)s/feed/rdf/': 500,
232 |           ...
233 |       },
234 |       ...
235 |   }
236 |   ```
237 |
238 |   Then, if a WordPress site mentions another WordPress site, the spider will try each variation until it hits a valid feed on Feedly.
239 |
240 | - Or, you may also enable the search function (`-s enable_search=True`, or in a preset: `ENABLE_SEARCH = True`). This will let the spider search Feedly
241 |   for each domain name it encounters, and crawl all returned feeds.
242 |
243 |   ![#e5c07b](https://via.placeholder.com/12/e5c07b/000000?text=+) **Warning: This is not recommended as the spider can quickly get rate-limited by Feedly.**
244 |
245 | The cluster spider works best for sites that have predefined endpoints for RSS feeds, such as WordPress and Tumblr blogs (for which a
246 | [preset](./presets/tumblr.py) is provided). Of course, if you can provide enough feed templates, it can work with many other sites as well.
247 |
248 | ## Notes
249 |
250 | - `feedly.com` has a `robots.txt` policy that disallows bots. Therefore, this crawler is set to disobey `robots.txt` (even though
251 |   what it is doing isn't crawling so much as it is consuming data from a publicly available API).
252 | - The availability of the scraped data depends on Feedly. If no one has ever subscribed to the RSS feed you are
253 |   trying to crawl on Feedly, then your crawl may not yield any results.
254 | - Similarly, the data you can crawl from Feedly are only as complete as how much of your RSS feed Feedly has scraped.
255 | - Explore the Feedly Cloud API at [developer.feedly.com](https://developer.feedly.com).
256 |
257 | ## Motivation
258 |
259 | I started this project because I found out that Feedly caches a significant amount of data from dead Tumblr blogs :)
260 |
261 | Basically:
262 |
263 | 1. As you may already know, Tumblr did not actually delete most of the media files in the Great Tumblr Purge,
264 |    but rather merely removed the posts containing them, meaning those media files are still available on the internet,
265 |    albeit obscured behind their CDN URLs (the `**.media.tumblr.com` links).
266 | 2. Feedly differs from ordinary RSS readers in that it caches data from RSS feeds, so that people who subscribe to the same
267 |    RSS feed receive data from Feedly first, instead of directly from the RSS provider, when they are using Feedly.
268 | 3. Among the data that Feedly caches are HTML snippets of each page in the RSS feed, which include our Tumblr media links
269 |    –– and _Feedly doesn't seem to delete them even when the original posts are no longer available._
270 |
271 | And so, effectively, Feedly has been acting as a huge Tumblr cache for as long as it has implemented such
272 | a content-delivery strategy and people have been using it to subscribe to Tumblr blogs ;)
273 |
274 | This project is, however, usable for any RSS feed that Feedly has ever scraped (e.g.
[`https://xkcd.com/atom.xml`](https://xkcd.com/atom.xml)), 275 | or even other Feedly APIs (see their Streams API for details). 276 | -------------------------------------------------------------------------------- /feedme/__init__.py: -------------------------------------------------------------------------------- 1 | def walk_package(path=None, name=__name__): 2 | import pkgutil 3 | from importlib.util import module_from_spec 4 | from pathlib import Path 5 | if not path: 6 | path = Path(__file__).parent 7 | for loader, module_name, is_pkg in pkgutil.walk_packages([str(path)]): 8 | pkg_name = f'{name}.{module_name}' 9 | if not is_pkg: 10 | spec = loader.find_spec(pkg_name) 11 | mod = module_from_spec(spec) 12 | try: 13 | spec.loader.exec_module(mod) 14 | except (ImportError, ModuleNotFoundError): 15 | continue 16 | yield mod 17 | else: 18 | yield from walk_package(path / module_name, pkg_name) 19 | 20 | 21 | def _config_logging(config=None, *args, **kwargs): 22 | import logging 23 | import sys 24 | from logging.config import dictConfig 25 | 26 | from .logger import make_logging_config 27 | 28 | if config and not config.getbool('LOG_ENABLED'): 29 | return 30 | 31 | if config: 32 | kwargs = { 33 | 'level': config.get('LOG_LEVEL', logging.INFO), 34 | 'colored': True, 35 | } 36 | overrides = [] 37 | 38 | if config.get('LOG_FILE'): 39 | kwargs['logfile'] = config['LOG_FILE'] 40 | 41 | if config.get('LOG_DATEFORMAT'): 42 | kwargs['datefmt'] = config['LOG_DATEFORMAT'] 43 | 44 | if config.get('LOG_STDOUT'): 45 | from scrapy.utils.log import StreamLogger 46 | sys.stdout = StreamLogger(logging.getLogger('stdout')) 47 | 48 | if config.get('LOG_SHORT_NAMES'): 49 | from scrapy.utils.log import TopLevelFormatter 50 | overrides.append({ 51 | 'filters': { 52 | 'tlfmt': { 53 | '()': TopLevelFormatter, 54 | 'loggers': ['scrapy', 'main', 'worker'], 55 | }}}) 56 | 57 | overrides += config.get('LOGGING_OVERRIDES', []) 58 | # logging.basicConfig(force=True) 59 | dictConfig(make_logging_config('feedly', *overrides, **kwargs)) 60 | return 61 | 62 | dictConfig(make_logging_config('feedly', *args, **kwargs)) 63 | -------------------------------------------------------------------------------- /feedme/__main__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | from .cli import cli 24 | 25 | if __name__ == '__main__': 26 | cli(prog_name='python -m feedme') 27 | -------------------------------------------------------------------------------- /feedme/cli.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | from importlib import import_module 25 | from pathlib import Path 26 | from textwrap import dedent 27 | 28 | import click 29 | from scrapy.crawler import CrawlerProcess 30 | from scrapy.utils.project import get_project_settings 31 | 32 | from . import _config_logging, exporters 33 | from .docs import markdown_inline, numpydoc2click 34 | from .sql.cli import check, leftovers, merge, migrate 35 | 36 | get_help_gen = markdown_inline(lambda ctx: (yield ctx.get_help())) 37 | 38 | 39 | def get_help(ctx): 40 | return next(get_help_gen(ctx)) 41 | 42 | 43 | @click.group() 44 | @click.option('--debug', is_flag=True) 45 | @click.pass_context 46 | def cli(ctx, debug=False): 47 | level = logging.DEBUG if debug else logging.INFO 48 | _config_logging(level=level) 49 | ctx.ensure_object(dict) 50 | ctx.obj['DEBUG'] = debug 51 | 52 | 53 | def export_load_exporter(ctx: click.Context, param, value): 54 | not_found = False 55 | try: 56 | exporter = import_module(f'.{value}', exporters.__name__) 57 | assert exporter.export 58 | except (AttributeError, ModuleNotFoundError, AssertionError): 59 | not_found = True 60 | exporter = export 61 | if ctx.params.get('help') or value == 'help': 62 | ctx.meta['topic_name'] = value 63 | ctx.invoke(help_export, ctx, None, exporter) 64 | elif not_found: 65 | click.secho(str(ValueError(f"No exporter found for topic '{value}'")), fg='red') 66 | ctx.exit(1) 67 | return exporter 68 | 69 | 70 | def help_export(ctx: click.Context, param, exporter): 71 | if not exporter or ctx.resilient_parsing: 72 | return 73 | if exporter is True: 74 | return True 75 | if exporter is export: 76 | click.echo(get_help(ctx)) 77 | ctx.exit() 78 | 79 | @markdown_inline 80 | def help_subcommand(): 81 | yield from [ 82 | click.style('Data Exporter Help\n\n'.upper(), fg='black', bg='white', bold=True), 83 | 'For help on the syntax of the ~export~ command itself, use `export --help`.\n\n', 84 | click.style(ctx.meta['topic_name'], fg='black', bg='magenta', bold=True), 85 | ] 86 | doc = numpydoc2click(exporter.help_text) 
87 |         yield click.style(' - ' + next(doc), fg='black', bg='magenta', bold=True)
88 |         yield from doc
89 |     click.echo_via_pager(help_subcommand())
90 |     ctx.exit()
91 |
92 |
93 | def ensure_index_db(path):
94 |     path = Path(path)
95 |     if path.is_dir():
96 |         return path / 'index.db'
97 |     return path
98 |
99 |
100 | @cli.command()
101 | @click.argument('topic', callback=export_load_exporter, default='help', metavar='topic')
102 | @click.option('-h', '--help', callback=help_export, is_flag=True, is_eager=True,
103 |               help="""
104 |               Show this help and exit.\n
105 |               Use `export <topic> --help` to see more info for a particular exporter.
106 |               """)
107 | @click.option('-i', '--input', 'wd', required=True, type=click.Path(exists=True, file_okay=False),
108 |               help="""
109 |               Path to the directory containing scraped data.
110 |               """)
111 | @click.option('-o', '--output', 'fmt', type=click.Path(writable=True, dir_okay=False),
112 |               help="""
113 |               Path to which exported data is written. Will always be under an `out/` directory inside the input directory.\n
114 |               Some exporters support output path templates, see their help for more info.
115 |               """)
116 | @click.option('+f', '--include', nargs=3, multiple=True, default=None, metavar='EXPR', help='')
117 | @click.option('-f', '--exclude', nargs=3, multiple=True, default=None, metavar='EXPR',
118 |               help="""
119 |               Filter results based on the expression EXPR, specified with 3 values _attr predicate value_,
120 |               such as `source:url is example.org`.\n
121 |               Expressions themselves should not be quoted.\n
122 |               Each exporter supports a different set of filters, and some do not support filtering at all.
123 |               See their help for more info.
124 |               """)
125 | @click.argument('exporter-args', nargs=-1, type=click.UNPROCESSED, metavar='additional-params')
126 | def export(topic, exporter_args, **kwargs):
127 |     """
128 |     Export items from scraped data.
129 |
130 |     `topic` is the kind of information to export. Currently 2 topics are available:
131 |
132 |     \b
133 |     _urls_: Export URLs in formats such as plain-text lines or CSV.
134 |     _graph_: Export scraped data as GraphML graphs.
135 |
136 |     Some exporters accept additional parameters not listed below, which can be specified as
137 |     a list of _key=value_ pairs after other options.
138 |
139 |     For more info on each exporter, specify the topic and the `-h/--help` option,
140 |     such as `export urls --help`.
141 |     """
142 |     options = dict([a.split('=', 1) for a in exporter_args])
143 |     kwargs = {k: v for k, v in kwargs.items() if v is not None}
144 |     topic.export(**kwargs, **options)
145 |
146 |
147 | @cli.command(hidden=True)
148 | @click.option('-s', 'spider')
149 | @click.option('-p', 'preset')
150 | def run_spider(spider, preset, **kwargs):
151 |     settings = get_project_settings()
152 |     settings['PRESET'] = preset
153 |     process = CrawlerProcess(settings, install_root_handler=False)
154 |     process.crawl(spider)
155 |     process.start(stop_after_crawl=True)
156 |
157 |
158 | @cli.command()
159 | @click.option('-i', '--input', 'db_path', required=True, type=click.Path(exists=True),
160 |               help='Path to the database.')
161 | @click.option('-d', '--sql-debug', 'debug', type=click.Path(exists=False, dir_okay=False),
162 |               help='Optional file to write executed SQL statements to.')
163 | @click.pass_context
164 | def check_db(ctx, db_path, debug=False, **kwargs):
165 |     """Check a database for potential problems and inconsistencies."""
166 |
167 |     ctx.exit(check(ensure_index_db(db_path), debug=debug))
168 |
169 |
170 | @cli.command()
171 | @click.option('-i', '--input', 'db_path', required=True, type=click.Path(exists=True),
172 |               help='Path to the database.')
173 | @click.option('-d', '--sql-debug', 'debug', type=click.Path(exists=False, dir_okay=False),
174 |               help='Optional file to write executed SQL statements to.')
175 | @click.pass_context
176 | def upgrade_db(ctx, db_path, debug=False, **kwargs):
177 |     """Upgrade an older database to the latest schema version."""
178 |
179 |     ctx.exit(migrate(ensure_index_db(db_path), debug=debug))
180 |
181 |
182 | @cli.command()
183 | @click.option('-i', '--input', 'db_paths', multiple=True, required=True, type=click.Path(exists=True, dir_okay=False),
184 |               help='Path to the database to be merged. Can be specified multiple times.')
185 | @click.option('-o', '--output', 'output', required=True, type=click.Path(exists=False, dir_okay=False),
186 |               help='Path to the new database that will contain the merged data.')
187 | @click.option('-d', '--sql-debug', 'debug', type=click.Path(exists=False, dir_okay=False),
188 |               help='Optional file to write executed SQL statements to.')
189 | @click.pass_context
190 | def merge_db(ctx, *, db_paths, output, debug=False, **kwargs):
191 |     """Merge multiple databases into a new database."""
192 |
193 |     ctx.exit(merge(output, *db_paths, debug=debug))
194 |
195 |
196 | @cli.command()
197 | @click.option('-i', '--input', 'wd', required=True, type=click.Path(exists=True, file_okay=False),
198 |               help='Path to the directory containing scraped data.')
199 | @click.option('-d', '--sql-debug', 'debug', type=click.Path(exists=False, dir_okay=False),
200 |               help='Optional file to write executed SQL statements to.')
201 | @click.pass_context
202 | def cleanup(ctx, wd, debug=False, **kwargs):
203 |     """Find all temporary databases and attempt to merge them into the main database."""
204 |
205 |     ctx.exit(leftovers(wd, debug=debug))
206 |
207 |
208 | @cli.command()
209 | def options():
210 |     """List available spider options."""
211 |
212 |     from .
import walk_package 213 | for _ in walk_package(): 214 | pass 215 | 216 | from .docs import OptionsContributor 217 | click.echo_via_pager(OptionsContributor.format_docs()) 218 | 219 | 220 | @cli.command(hidden=True) 221 | def gen_commands(): 222 | template = dedent(""" 223 | from scrapy.commands import ScrapyCommand 224 | 225 | from .utils import _ClickCommand 226 | 227 | 228 | class Command(_ClickCommand, ScrapyCommand): 229 | def click_command(self): 230 | return __name__.split('.')[-1].replace('-', '_') 231 | """).lstrip().rstrip(' ') 232 | 233 | path = Path(__file__).with_name('commands') 234 | for p in path.iterdir(): 235 | if not p.is_file(): 236 | continue 237 | content = open(p).read() 238 | if content == template: 239 | p.unlink() 240 | for k, v in cli.commands.items(): 241 | if not v.hidden: 242 | with open(path / f'{k}.py', 'w+') as f: 243 | f.write(template) 244 | 245 | 246 | @cli.command(hidden=True) 247 | @click.option('-i', '--input', 'wd', required=True, type=click.Path(exists=True, file_okay=False)) 248 | @click.option('-p', '--pattern', required=False, default='tumblr.com') 249 | def browser(wd, pattern): 250 | from .contrib.browser import run_app 251 | run_app(wd, pattern) 252 | -------------------------------------------------------------------------------- /feedme/commands/check-db.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/cleanup.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/commands.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | 4 | class Command(ScrapyCommand): 5 | def short_desc(self): 6 | return 'List available commands' 7 | 8 | def add_options(self, parser): 9 | import logging 10 | 11 | logging.getLogger('scrapy.utils.log').disabled = True 12 | super().add_options(parser) 13 | 14 | def run(self, *args, **kwargs): 15 | from ..cli import cli 16 | cli(['--help'], prog_name='scrapy') 17 | -------------------------------------------------------------------------------- /feedme/commands/crawl.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from scrapy.commands import ScrapyCommand 24 | from scrapy.commands.crawl import Command 25 | 26 | from .utils import _LoggingMixin 27 | 28 | 29 | class CrawlCommand(Command, _LoggingMixin): 30 | def add_options(self, parser): 31 | super().add_options(parser) 32 | parser.add_option('-v', '--verbose', action='store_true', 33 | help='Log more information') 34 | parser.remove_option('-a') 35 | parser.remove_option('-t') 36 | 37 | def process_options(self, args, opts): 38 | ScrapyCommand.process_options(self, args, opts) 39 | 40 | opts.spargs = {} 41 | 42 | if len(args) == 2: 43 | self.settings['RSS'] = args.pop() 44 | 45 | self._takeover_logging() 46 | 47 | if opts.output: 48 | self.settings['OUTPUT'] = opts.output[0] 49 | self.settings.pop('FEEDS') 50 | 51 | self.settings['CMDLINE_ARGS'] = {'args': args, 'opts': vars(opts)} 52 | 53 | if opts.verbose: 54 | self.settings['VERBOSE'] = True 55 | self.settings.set('LOG_VIOLATIONS', True, priority='cmdline') 56 | self.settings.set('STATS_DUMP', True, priority='cmdline') 57 | -------------------------------------------------------------------------------- /feedme/commands/export.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/merge-db.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/options.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/resume.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 
12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from pathlib import Path 24 | 25 | import simplejson as json 26 | from scrapy.commands import ScrapyCommand 27 | from scrapy.exceptions import UsageError 28 | 29 | from .crawl import CrawlCommand 30 | 31 | 32 | class ResumeCrawlCommand(CrawlCommand): 33 | def syntax(self): 34 | return '' 35 | 36 | def short_desc(self): 37 | return 'Continue an existing crawl' 38 | 39 | def add_options(self, parser): 40 | ScrapyCommand.add_options(self, parser) 41 | 42 | def process_options(self, args, opts): 43 | if len(args) < 1: 44 | raise UsageError() 45 | 46 | datadir = Path(args[0]) 47 | 48 | if not datadir.exists(): 49 | raise UsageError(f'Directory `{datadir}` does not exist.') 50 | if not datadir.is_dir(): 51 | raise UsageError(f'{datadir} is not a directory.', print_help=False) 52 | 53 | try: 54 | with open(datadir / 'options.json') as f: 55 | options = json.load(f) 56 | except (OSError, json.JSONDecodeError): 57 | raise UsageError(f'{datadir} does not contain a valid "options.json" file.\n' 58 | 'Cannot restore command line arguments used to initiate the program.') 59 | 60 | args.clear() 61 | args.extend(options['args']) 62 | for k, v in options['opts'].items(): 63 | setattr(opts, k, v) 64 | 65 | super().process_options(args, opts) 66 | -------------------------------------------------------------------------------- /feedme/commands/upgrade-db.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | 3 | from .utils import _ClickCommand 4 | 5 | 6 | class Command(_ClickCommand, ScrapyCommand): 7 | def click_command(self): 8 | return __name__.split('.')[-1].replace('-', '_') 9 | -------------------------------------------------------------------------------- /feedme/commands/utils.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from pathlib import Path 24 | 25 | 26 | def dir_validator(path): 27 | path_ = Path(path) 28 | if path_.exists() and not path_.is_dir(): 29 | raise ValueError(f'{path_} exists and is not a directory.') 30 | return path 31 | 32 | 33 | def exists_validator(path): 34 | path_ = Path(path) 35 | if not path_.exists(): 36 | raise ValueError(f'{path_} does not exist.') 37 | return path 38 | 39 | 40 | def _restore_sigint(): 41 | import signal 42 | 43 | signal.signal(signal.SIGINT, signal.default_int_handler) 44 | signal.signal(signal.SIGTERM, signal.SIG_DFL) 45 | 46 | 47 | def _disable_initial_log(): 48 | import logging 49 | 50 | logging.getLogger('scrapy.utils.log').disabled = True 51 | 52 | 53 | class _LoggingMixin: 54 | def _takeover_logging(self, force=False): 55 | from scrapy.utils.log import configure_logging 56 | 57 | from .. import _config_logging 58 | 59 | enabled = (self.settings.getbool('LOG_ENABLED') 60 | and self.settings.getbool('CUSTOM_LOGGING_ENABLED', True)) 61 | if not force and not enabled: 62 | return 63 | 64 | settings = self.settings 65 | configure_logging(install_root_handler=False) 66 | _config_logging(settings) 67 | 68 | settings['CUSTOM_LOGGING_ENABLED'] = True 69 | settings.set('LOG_ENABLED', False, priority=9999) 70 | 71 | if 'LOG_FILE' in settings: 72 | settings['_LOG_FILE'] = settings['LOG_FILE'] 73 | del settings['LOG_FILE'] 74 | 75 | 76 | class _ClickCommand: 77 | def click_command(self): 78 | raise NotImplementedError 79 | 80 | def _get_command(self): 81 | from .. 
import cli 82 | return getattr(cli, self.click_command()) 83 | 84 | def add_options(self, parser): 85 | from click import Option 86 | 87 | _disable_initial_log() 88 | super().add_options(parser) 89 | parser.add_option('-h', '--help', action='store_true') 90 | 91 | command = self._get_command() 92 | for param in command.params: 93 | if isinstance(param, Option): 94 | opts = [o for o in param.opts if o[0] == '-'] 95 | parser.add_option(*opts, action='store_true') 96 | 97 | def short_desc(self): 98 | command = self._get_command() 99 | help_ = command.help or '' 100 | return help_.split('\n')[0] 101 | 102 | def long_desc(self): 103 | command = self._get_command() 104 | return command.help 105 | 106 | def run(self, args, opts): 107 | import sys 108 | 109 | from ..cli import cli 110 | 111 | _restore_sigint() 112 | cli(sys.argv, prog_name='scrapy') 113 | -------------------------------------------------------------------------------- /feedme/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonywu7/feedly-link-aggregator/7c32b45fbfa1794b081e90add033403308a2da85/feedme/contrib/__init__.py -------------------------------------------------------------------------------- /feedme/contrib/browser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import re 5 | import sqlite3 6 | from pathlib import Path 7 | 8 | import aiofiles 9 | import aiohttp 10 | from aiohttp import web 11 | from more_itertools import chunked 12 | 13 | from ..exporters.urls import CTE, SELECT, build_ctes, build_where_clause 14 | from ..sql.db import db 15 | from ..sql.functions import register_all 16 | from ..sql.utils import offset_fetch 17 | 18 | SUFFIX = re.compile(r'_\d+\.(jpg|png|gif)$', re.IGNORECASE) 19 | 20 | 21 | class ResourceIterator: 22 | def __init__(self, conn: sqlite3.Connection, pattern: str): 23 | self.conn = conn 24 | self.log = logging.getLogger('iterator') 25 | self.pattern = pattern 26 | 27 | def get_row_iterator(self): 28 | cte, column_maps = build_ctes(CTE) 29 | keys = ('target:url',) 30 | includes = [('tag', 'is', 'img'), 31 | ('source:netloc', 'contains', self.pattern), 32 | ('target:netloc', 'under', 'media.tumblr.com')] 33 | where, values, _ = build_where_clause(includes, []) 34 | columns = ', '.join([f'{v} AS "{k}"' for k, v in column_maps.items()]) 35 | column_keys = ', '.join([f'"{k}"' for k in keys]) 36 | 37 | select = SELECT % {'columns': columns} 38 | select = f'{cte}{select} WHERE %(offset)s AND {where} GROUP BY {column_keys}' 39 | 40 | fetch = offset_fetch(self.conn, select, 'hyperlink', values=values, log=self.log, size=200000) 41 | return fetch 42 | 43 | def __iter__(self): 44 | while True: 45 | fetch = self.get_row_iterator() 46 | for chunk in chunked(fetch, 10000): 47 | random.shuffle(chunk) 48 | yield from chunk 49 | 50 | 51 | class ResourceIteratorApp(web.Application): 52 | def __init__(self, *args, index: Path, pattern: str = 'tumblr.com', **kwargs): 53 | super().__init__(*args, **kwargs) 54 | 55 | db_path = index / 'index.db' 56 | conn = sqlite3.connect(db_path, isolation_level=None) 57 | conn.row_factory = sqlite3.Row 58 | db.verify_version(conn) 59 | register_all(conn) 60 | self.iterator = iter(ResourceIterator(conn, pattern)) 61 | 62 | self.output = index / 'cache' 63 | os.makedirs(self.output, exist_ok=True) 64 | 65 | self.add_routes([ 66 | web.get('/', self.index), 67 | ]) 68 | 69 | self.client = 
aiohttp.ClientSession(headers={'User-Agent': 'curl/7.64.1'}) 70 | self.on_cleanup.append(self.close) 71 | 72 | async def index(self, req: web.Request): 73 | row = next(self.iterator) 74 | url = row['target:url'] 75 | url = re.sub(SUFFIX, r'_1280.\g<1>', url) 76 | url = url.replace('http://', 'https://') 77 | async with self.client.get(url) as res: 78 | data = await res.read() 79 | output = self.output / f'{row["source:netloc"]}/{row["target:path"]}' 80 | os.makedirs(output.parent, exist_ok=True) 81 | async with aiofiles.open(output, 'wb+') as f: 82 | await f.write(data) 83 | return web.Response(body=data, content_type=res.content_type) 84 | 85 | async def close(self, *args, **kwargs): 86 | await self.client.close() 87 | 88 | 89 | def run_app(index, pattern): 90 | index = Path(index) 91 | app = ResourceIteratorApp(index=index, pattern=pattern) 92 | web.run_app(app, host='0.0.0.0', port=5000) 93 | -------------------------------------------------------------------------------- /feedme/contrib/filters.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import re 25 | from math import inf 26 | from urllib.parse import urlsplit 27 | 28 | from scrapy.exceptions import NotConfigured 29 | 30 | from ..docs import OptionsContributor 31 | from ..requests import ProbeFeed 32 | from ..signals import register_state, start_from_scratch 33 | 34 | 35 | class KeywordPrioritizer(OptionsContributor, _doc_order=-5): 36 | """ 37 | Enable this Spider Middleware to (de)prioritize certain feeds based on keywords. 38 | 39 | When using the cluster spider, changing the priorities of requests will shift 40 | the overall direction the spider is going, by causing some feeds to be crawled sooner 41 | than others. 
42 | """ 43 | 44 | @classmethod 45 | def from_crawler(cls, crawler): 46 | instance = cls(crawler.settings) 47 | crawler.signals.send_catch_log( 48 | register_state, obj=instance, 49 | namespace='kwprioritizer', attrs=['priorities'], 50 | ) 51 | crawler.signals.connect(instance.clear_state_info, start_from_scratch) 52 | return instance 53 | 54 | def __init__(self, settings): 55 | self.log = logging.getLogger('contrib.keywordprioritizer') 56 | weighted_kws = settings.get('PRIORITIZED_KEYWORDS', {}) 57 | if not weighted_kws: 58 | raise NotConfigured() 59 | 60 | self.keywords = {p: re.compile(r'(?:%s)' % '|'.join(kws), re.IGNORECASE) 61 | for p, kws in weighted_kws.items()} 62 | self.keywords_fullword = {p: re.compile(r'\b(?:%s)\b' % '|'.join(kws), re.IGNORECASE) 63 | for p, kws in weighted_kws.items()} 64 | 65 | self.priorities = {} 66 | self.starting_weight = 0 67 | 68 | def clear_state_info(self): 69 | self.priorities.clear() 70 | 71 | def update_priority(self, item, source, target): 72 | prios = self.priorities 73 | starting = self.priorities.setdefault(source, self.starting_weight) 74 | prio = self.priorities.setdefault(target, self.starting_weight + starting) 75 | if prio is None: 76 | return True 77 | delta = 0 78 | 79 | for p, r in self.keywords.items(): 80 | s = r.search(target) 81 | if not s: 82 | continue 83 | delta += p 84 | self.log.debug(f'{source} {target} {s.group(0)} {p}') 85 | if delta == -inf: 86 | break 87 | 88 | if delta == -inf: 89 | prios[target] = -inf 90 | return 91 | 92 | phrases = list(item.keywords) 93 | phrases.extend([item.markup.get('summary', ''), item.title]) 94 | phrases = ' '.join(phrases) 95 | for p, r in self.keywords_fullword.items(): 96 | s = r.search(phrases) 97 | if not s: 98 | continue 99 | delta += p 100 | self.log.debug(f'{source} {target} {s.group(0)} {p}') 101 | if delta == -inf: 102 | break 103 | 104 | prios[target] = prio + delta 105 | 106 | def process_spider_output(self, response, result, spider): 107 | for res in result: 108 | if not isinstance(res, ProbeFeed): 109 | yield res 110 | continue 111 | 112 | item = res.meta.get('source_item') 113 | feed_url = res.meta.get('feed_url') 114 | if not item or not feed_url: 115 | yield res 116 | continue 117 | 118 | source = urlsplit(item.url).netloc 119 | target = urlsplit(feed_url).netloc 120 | self.update_priority(item, source, target) 121 | 122 | prio = self.priorities.get(target, 0) 123 | if prio == -inf: 124 | continue 125 | if not prio: 126 | yield res 127 | continue 128 | 129 | yield res.replace(priority=res.priority + prio) 130 | 131 | @staticmethod 132 | def _help_options(): 133 | return { 134 | 'PRIORITIZED_KEYWORDS': """ 135 | A mapping of weights to a list of keywords. 136 | 137 | Before a new feed is crawled, the crawling request is processed here. This 138 | middleware will then search the text content from which this new feed is 139 | discovered, such as keywords and HTML markups, and adjust the priority of 140 | the request accordingly. 141 | 142 | A ~positive~ weight will increase the priority, causing the feed to be crawled 143 | sooner. A ~negative~ weight will decrease the priority. 144 | 145 | If you use the special `-inf` (negative infinity) value, the new feed will 146 | be dropped. (`inf` can be imported from `math` or specified as `float('inf')`). 
147 | 148 | **Example** 149 | 150 | `PRIORITIZED_KEYWORDS = {` 151 | ` 10: ['cats', 'kitties'],` 152 | ` 5: ['dogs', 'puppies'],` 153 | ` -5: ['goldfish'],` 154 | ` -float('inf'): ['rat'],` 155 | `}` 156 | """, 157 | } 158 | -------------------------------------------------------------------------------- /feedme/contrib/tumblr.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlsplit 2 | 3 | from scrapy import Request 4 | from scrapy.exceptions import NotConfigured 5 | 6 | from ..docs import OptionsContributor 7 | 8 | 9 | class TumblrFilter(OptionsContributor, _doc_order=-5): 10 | @classmethod 11 | def from_crawler(cls, crawler): 12 | return cls(crawler.settings) 13 | 14 | def __init__(self, settings): 15 | self.domains = settings.get('TUMBLR_IGNORE') 16 | if not self.domains: 17 | raise NotConfigured() 18 | 19 | def process_spider_output(self, response, result, spider): 20 | for r in result: 21 | if not isinstance(r, Request): 22 | yield r 23 | continue 24 | 25 | feed_url = r.meta.get('feed_url') 26 | if not feed_url: 27 | yield r 28 | continue 29 | 30 | domain = urlsplit(feed_url).netloc 31 | if domain in self.domains: 32 | continue 33 | if domain[-16:] == 'media.tumblr.com': 34 | continue 35 | yield r 36 | 37 | @staticmethod 38 | def _help_options(): 39 | return { 40 | 'TUMBLR_IGNORE': """ 41 | A list of Tumblr sites to ignore. 42 | 43 | **Example** 44 | 45 | `TUMBLR_IGNORE = {` 46 | ` 'www.tumblr.com', 'staff.tumblr.com', 'tumblr.com',` 47 | ` 'engineering.tumblr.com', 'support.tumblr.com',` 48 | ` 'assets.tumblr.com',` 49 | `}` 50 | """, 51 | } 52 | -------------------------------------------------------------------------------- /feedme/datastructures.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | from __future__ import annotations 24 | 25 | from collections.abc import (Hashable, MutableMapping, MutableSequence, 26 | MutableSet) 27 | from collections.abc import Set as SetCollection 28 | from typing import Dict, Set, Tuple 29 | 30 | Keywords = Set[Hashable] 31 | KeywordCollection = Dict[Hashable, Hashable] 32 | 33 | 34 | def compose_mappings(*mappings): 35 | base = {} 36 | base.update(mappings[0]) 37 | for m in mappings[1:]: 38 | for k, v in m.items(): 39 | if k in base and type(base[k]) is type(v): 40 | if isinstance(v, MutableMapping): 41 | base[k] = compose_mappings(base[k], v) 42 | elif isinstance(v, MutableSet): 43 | base[k] |= v 44 | elif isinstance(v, MutableSequence): 45 | base[k].extend(v) 46 | else: 47 | base[k] = v 48 | else: 49 | base[k] = v 50 | return base 51 | 52 | 53 | class KeywordStore: 54 | def __init__(self): 55 | self._index: Dict[int, Hashable] = {} 56 | self._taggings: Dict[int, KeywordCollection] = {} 57 | 58 | def _get_hashes(self, **kws: Dict[Hashable, Hashable]) -> int: 59 | for hash_, keywords in self._taggings.items(): 60 | match = True 61 | for category, keyword in kws.items(): 62 | if category[0] == '_': 63 | if category[1:] not in keywords: 64 | match = False 65 | break 66 | elif keyword not in keywords.get(category, {}): 67 | match = False 68 | break 69 | if match: 70 | yield hash_ 71 | 72 | def all(self, **kws: Dict[Hashable, Hashable]) -> Hashable: 73 | for hash_ in self._get_hashes(**kws): 74 | yield self._index[hash_] 75 | 76 | def keywords(self, item): 77 | return self._taggings.get(hash(item), {}) 78 | 79 | def items(self, **kws: Dict[Hashable, Hashable]) -> Tuple[Hashable, KeywordCollection]: 80 | for hash_ in self._get_hashes(**kws): 81 | yield self._index[hash_], self._taggings[hash_] 82 | 83 | def put(self, item: Hashable, **kws: KeywordCollection): 84 | hash_ = hash(item) 85 | self._index[hash_] = item 86 | taggings = self._taggings.setdefault(hash_, {}) 87 | for category, kwset in kws.items(): 88 | if not isinstance(kwset, SetCollection): 89 | kwset = {kwset} 90 | if category[0] == '_': 91 | raise ValueError('Keys that begin with _ are reserved') 92 | keywords = taggings.setdefault(category, set()) 93 | keywords |= kwset 94 | 95 | def __len__(self) -> int: 96 | return len(self._index) 97 | 98 | def __and__(self, other: KeywordStore) -> KeywordStore: 99 | if not isinstance(other, KeywordStore): 100 | return NotImplemented 101 | new = KeywordStore() 102 | common_keys = self._index.keys() & other._index.keys() 103 | taggings = {} 104 | for k in common_keys: 105 | this = self._taggings[k] 106 | that = other._taggings[k] 107 | tagging = {t: this[t] & that[t] for t in this.keys() & that.keys()} 108 | tagging = {k: v for k, v in tagging.items() if v} 109 | taggings[k] = tagging 110 | index = {k: self._index[k] for k in taggings} 111 | new._index = index 112 | new._taggings = taggings 113 | return new 114 | 115 | def __or__(self, other: KeywordStore) -> KeywordStore: 116 | if not isinstance(other, KeywordStore): 117 | return NotImplemented 118 | new = KeywordStore() 119 | index = {**self._index, **other._index} 120 | taggings = {} 121 | for k in index: 122 | this = self._taggings.get(k, {}) 123 | that = other._taggings.get(k, {}) 124 | tagging = {t: this.get(t, set()) | that.get(t, set()) for t in this.keys() & that.keys()} 125 | tagging.update({t: this[t] for t in this.keys() - that.keys()}) 126 | tagging.update({t: that[t] for t in that.keys() - this.keys()}) 127 | taggings[k] = tagging 128 | new._index = index 129 | new._taggings = 
taggings 130 | return new 131 | 132 | def __sub__(self, other: KeywordStore) -> KeywordStore: 133 | if not isinstance(other, KeywordStore): 134 | return NotImplemented 135 | new = KeywordStore() 136 | taggings = {} 137 | for k in self._index: 138 | this = self._taggings[k] 139 | that = other._taggings.get(k, {}) 140 | tagging = {t: this[t] - that.get(t, set()) for t in this} 141 | tagging = {k: v for k, v in tagging.items() if v} 142 | taggings[k] = tagging 143 | index = {k: self._index[k] for k in taggings} 144 | new._index = index 145 | new._taggings = taggings 146 | return new 147 | 148 | def __str__(self): 149 | return str(self.for_json()) 150 | 151 | def __repr__(self): 152 | return repr(self.for_json()) 153 | 154 | def for_json(self): 155 | return {item: self._taggings[hash_] for hash_, item in self._index.items()} 156 | 157 | 158 | def labeled_sequence(seq, key=True, start=0, as_str=False): 159 | r = range(start, len(seq) + start) 160 | if key: 161 | z = zip(r, seq) 162 | else: 163 | z = zip(seq, r) 164 | if as_str: 165 | return {str(k): v for k, v in z} 166 | return {k: v for k, v in z} 167 | -------------------------------------------------------------------------------- /feedme/docs.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
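
A minimal usage sketch for the KeywordStore class defined in feedme/datastructures.py above (illustrative only, not part of the source tree; the feed URLs are made up, and the calls mirror the put/all/keywords methods shown in that module):

from feedme.datastructures import KeywordStore

store = KeywordStore()
# Tag two (made-up) feed URLs; put() wraps scalar keywords into sets.
store.put('https://xkcd.com/atom.xml', topic={'comics', 'webcomics'}, language='en')
store.put('https://example.com/feed', topic='news')

list(store.all(topic='comics'))             # -> ['https://xkcd.com/atom.xml']
list(store.all(_language=True))             # items carrying any 'language' tag at all
store.keywords('https://example.com/feed')  # -> {'topic': {'news'}}
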
22 | 23 | import re 24 | from functools import wraps 25 | from textwrap import dedent, indent 26 | 27 | import click 28 | 29 | docs = [] 30 | 31 | 32 | def stylize(pattern, **styles): 33 | def wrapper(func): 34 | @wraps(func) 35 | def wrapped(*args, **kwargs): 36 | for s in func(*args, **kwargs): 37 | yield re.sub(pattern, lambda m: click.style(m.group(1), **styles), s) 38 | return wrapped 39 | return wrapper 40 | 41 | 42 | def markdown_inline(func): 43 | @stylize(re.compile(r'`(.*?)`'), fg='green') 44 | @stylize(re.compile(r'~(.*?)~'), fg='blue', underline=True) 45 | @stylize(re.compile(r'\*\*(.*?)\*\*'), fg='yellow', bold=True) 46 | def f(*args, **kwargs): 47 | yield from func(*args, **kwargs) 48 | return f 49 | 50 | 51 | def numpydoc2click(doc: str): 52 | PARA = re.compile(r'((?:.+\n)+)') 53 | PARA_WITH_HEADER = re.compile(r'(^ *)(.+)\n(?:\s*(?:-+|=+))\n((?:.+\n)+)') 54 | paragraphs = list(PARA.findall(dedent(doc))) 55 | yield paragraphs[0] + '\n' 56 | for i in range(1, len(paragraphs)): 57 | p = paragraphs[i] 58 | match = PARA_WITH_HEADER.match(p) 59 | if match: 60 | indentation, header, p = match.group(1), match.group(2), match.group(3) 61 | if not indentation: 62 | header = header.upper() 63 | yield indent(click.style(header, bold=True), indentation) 64 | yield '\n' 65 | yield indent(p, ' ') 66 | yield '\n' 67 | 68 | 69 | class OptionsContributor: 70 | _subclassed = set() 71 | 72 | @classmethod 73 | def __init_subclass__(cls, _doc_order=0): 74 | for c in cls.mro(): 75 | if c.__qualname__ in cls._subclassed: 76 | return 77 | 78 | cls._subclassed.add(cls.__qualname__) 79 | docs.append((cls, cls._help_options(), _doc_order)) 80 | 81 | @staticmethod 82 | @markdown_inline 83 | def format_docs(): 84 | yield from [ 85 | click.style('feedme Customization Manual\n\n'.upper(), fg='black', bg='white', bold=True), 86 | 'This program supports the use of presets, which lets you define \n' 87 | 'options for different scenarios.\n', 88 | '\n', 89 | 'A preset works like a Scrapy settings file: you simply declare your options as \n' 90 | 'uppercase-only top level variables, such as `FOLLOW_DOMAINS = ["abc.xyz"]`.\n' 91 | '\n', 92 | 'Then, run Scrapy with the command-line option ~-s PRESET=~.\n', 93 | '\n', 94 | 'You may also specify options directly on the command line with the ~-s~ option:\n', 95 | 'such as `-s RSS=http://xkcd.com/atom.xml`, in which case those declared on the\n', 96 | 'command line take precedence over those in a preset.\n', 97 | '\n', 98 | 'Some example presets are located in the `presets/` directory.\n', 99 | '\n', 100 | 'The following is the list of supported options, grouped by the components they\n', 101 | 'belong to.\n' 102 | '\n', 103 | click.style('============*============\n', fg='white', bold=True), 104 | '\n', 105 | ] 106 | 107 | for cls, options, _ in sorted(docs, key=lambda t: t[2], reverse=True): 108 | yield click.style('-------------\n', fg='black', bold=True) 109 | yield f'**{repr(cls)}**\n' 110 | yield '\n' 111 | 112 | docstring = cls.__doc__ 113 | if docstring: 114 | yield from numpydoc2click(dedent(docstring)) 115 | 116 | for opt, doc in options.items(): 117 | yield f' ~{opt}~\n' 118 | yield f'{indent(dedent(doc), " ")}\n' 119 | 120 | yield '\n\n' 121 | -------------------------------------------------------------------------------- /feedme/exporters/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tonywu7/feedly-link-aggregator/7c32b45fbfa1794b081e90add033403308a2da85/feedme/exporters/__init__.py -------------------------------------------------------------------------------- /feedme/exporters/exporters.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import csv 24 | import logging 25 | import os 26 | from abc import ABC, abstractmethod 27 | from pathlib import Path 28 | from typing import Callable, Set 29 | 30 | import simplejson as json 31 | 32 | from ..utils import JSONDict 33 | 34 | 35 | class MappingExporter(ABC): 36 | def __init__(self, output: Path, filename: str, escape: Callable[[str], str] = None): 37 | self.output = output 38 | self.filename = filename 39 | self.ext = ''.join(Path(filename).suffixes) 40 | self.escape = escape or (lambda s: s) 41 | self.files = {} 42 | self.logger = logging.getLogger('exporter') 43 | self.opened = 0 44 | 45 | @abstractmethod 46 | def format(self, item: JSONDict): 47 | return item 48 | 49 | def get_file(self, item: JSONDict): 50 | if self.opened > 200: 51 | for f in self.files.values(): 52 | f.close() 53 | self.opened = 0 54 | 55 | filename = self.escape(self.filename % item) 56 | if filename[-1] == '/': 57 | filename = f'{filename}index{self.ext}' 58 | if filename == '.': 59 | filename = '-.' 60 | if filename == '..': 61 | filename = '-..' 
62 | path = self.output / filename 63 | 64 | f, new = self.open_file(path) 65 | return f, path, new 66 | 67 | def open_file(self, path): 68 | out = self.files.get(path) 69 | is_newfile = out is None 70 | if not out or out.closed: 71 | os.makedirs(path.parent, exist_ok=True) 72 | if is_newfile: 73 | self.logger.info(f'New file {path}') 74 | self.files[path] = out = open(path, 'a+') 75 | self.opened += 1 76 | return out, is_newfile 77 | 78 | def write(self, item: JSONDict): 79 | out, _, _ = self.get_file(item) 80 | out.write(f'{self.format(item)}\n') 81 | 82 | def close(self): 83 | if not self.files: 84 | self.logger.warning('Exported nothing!') 85 | for f in self.files.values(): 86 | f.close() 87 | 88 | def __enter__(self): 89 | return self 90 | 91 | def __exit__(self, typ, val=None, tb=None): 92 | self.close() 93 | if not typ: 94 | return True 95 | if val is None: 96 | if tb is None: 97 | raise typ 98 | val = typ() 99 | if tb is not None: 100 | val = val.with_traceback(tb) 101 | raise val 102 | 103 | 104 | class MappingJSONExporter(MappingExporter): 105 | def __init__(self, key: str, *args, **kwargs): 106 | super().__init__(*args, **kwargs) 107 | self.key = key 108 | self.storage = {} 109 | 110 | def format(self, item: JSONDict): 111 | return super().format(item) 112 | 113 | def write(self, item: JSONDict): 114 | _, fn, _ = self.get_file(item) 115 | s = self.storage.setdefault(fn, {}) 116 | s[item[self.key]] = item 117 | 118 | def close(self): 119 | for k in self.files: 120 | f, _ = self.open_file(k) 121 | json.dump(self.storage[k], f) 122 | return super().close() 123 | 124 | 125 | class MappingLineExporter(MappingExporter): 126 | def __init__(self, key: str, *args, **kwargs): 127 | super().__init__(*args, **kwargs) 128 | self.key = key 129 | 130 | def format(self, item): 131 | return item[self.key] 132 | 133 | 134 | class MappingCSVExporter(MappingExporter): 135 | def __init__(self, fieldnames: Set[str], *args, **kwargs): 136 | super().__init__(*args, **kwargs) 137 | self.writers = {} 138 | self.fieldnames = fieldnames 139 | 140 | def format(self, item: JSONDict): 141 | return super().format(item) 142 | 143 | def get_file(self, item: JSONDict): 144 | if len(self.writers) > 200: 145 | for k in self.writers: 146 | self.writers[k] = False 147 | 148 | f, fn, new = super().get_file(item) 149 | if not self.fieldnames: 150 | self.fieldnames = tuple(item.keys()) 151 | writer = self.writers.get(fn) 152 | if not writer or not new: 153 | writer = self.writers[fn] = csv.DictWriter(f, self.fieldnames, extrasaction='ignore') 154 | if new: 155 | writer.writeheader() 156 | return writer 157 | 158 | def write(self, item: JSONDict): 159 | self.get_file(item).writerow({**item}) 160 | -------------------------------------------------------------------------------- /feedme/exporters/graph.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or 
substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import sqlite3 25 | from collections import defaultdict 26 | from pathlib import Path 27 | 28 | import igraph 29 | 30 | from ..datastructures import labeled_sequence 31 | from ..sql.utils import offset_fetch 32 | from .utils import filter_by_domains, with_db 33 | 34 | log = logging.getLogger('exporter.graph') 35 | 36 | 37 | def filter_vertices(g, vertex_ids, include, exclude): 38 | if include or exclude: 39 | log.info('Filtering graph') 40 | vertex_ids = {f'http://{k}': i for k, i in vertex_ids.items()} 41 | if include: 42 | vertex_ids = {k: i for k, i in vertex_ids.items() 43 | if filter_by_domains(include)(k)} 44 | if exclude: 45 | vertex_ids = {k: i for k, i in vertex_ids.items() 46 | if filter_by_domains(exclude, True)(k)} 47 | g = g.subgraph(vertex_ids.values()) 48 | return g 49 | 50 | 51 | def create_hyperlink_graph(db: sqlite3.Connection, include=None, exclude=None): 52 | SELECT = """ 53 | SELECT 54 | source.url AS "source", 55 | target.url AS "target", 56 | hyperlink.element AS "tag", 57 | item.published AS "timestamp" 58 | FROM 59 | hyperlink 60 | JOIN url AS source ON source.id == hyperlink.source_id 61 | JOIN url AS target ON target.id == hyperlink.target_id 62 | JOIN item ON hyperlink.source_id == item.url 63 | WHERE 64 | %(offset)s 65 | """ 66 | vertices = {} 67 | edges = {} 68 | log.debug(SELECT) 69 | 70 | log.info('Reading database') 71 | for row in offset_fetch(db, SELECT, 'hyperlink', log=log): 72 | src = row['source'] 73 | dst = row['target'] 74 | vertices[src] = True 75 | vertices[dst] = True 76 | edges[(src, dst)] = (row['tag'], row['timestamp']) 77 | log.info('Finished reading database') 78 | 79 | log.info('Creating graph') 80 | g = igraph.Graph(directed=True) 81 | vertex_ids = labeled_sequence(vertices, key=False) 82 | edges = {(vertex_ids[t[0]], vertex_ids[t[1]]): v for t, v in edges.items()} 83 | g.add_vertices(len(vertices)) 84 | g.add_edges(edges) 85 | g.vs['name'] = list(vertices) 86 | g.es['type'], g.es['timestamp'] = tuple(zip(*edges.values())) 87 | log.info(f'|V| = {g.vcount()}; |E| = {g.ecount()}') 88 | g = filter_vertices(g, vertex_ids, include, exclude) 89 | return g 90 | 91 | 92 | def create_domain_graph(db: sqlite3.Connection, include=None, exclude=None): 93 | temp = """ 94 | CREATE TEMP TABLE domains (id INTEGER, domain VARCHAR) 95 | """ 96 | index = """ 97 | CREATE INDEX temp_ix_domains ON domains (id) 98 | """ 99 | insert_domains = """ 100 | INSERT INTO domains 101 | SELECT url.id AS id, urlsplit(url.url, 'netloc') AS domain 102 | FROM url 103 | """ 104 | count_domains = """ 105 | SELECT domains.domain, count(domains.domain) 106 | FROM domains 107 | GROUP BY domains.domain 108 | """ 109 | 110 | select_pairs = """ 111 | SELECT 112 | src.domain AS source, 113 | dst.domain AS target, 114 | hyperlink.element AS tag, 115 | count(hyperlink.element) AS count 116 | FROM 117 | hyperlink 118 | JOIN domains AS src ON hyperlink.source_id == src.id 119 | JOIN domains AS dst ON 
hyperlink.target_id == dst.id 120 | WHERE %(offset)s 121 | GROUP BY 122 | source, 123 | target, 124 | tag 125 | """ 126 | db.execute('BEGIN EXCLUSIVE') 127 | db.execute(temp) 128 | 129 | log.info('Building domain list') 130 | db.execute(insert_domains) 131 | db.execute(index) 132 | 133 | vertices = {} 134 | edges = defaultdict(lambda: defaultdict(int)) 135 | attrs = set() 136 | 137 | log.info('Counting domains') 138 | for domain, count in db.execute(count_domains): 139 | vertices[domain] = count 140 | 141 | log.info('Fetching hyperlinks') 142 | for row in offset_fetch(db, select_pairs, 'hyperlink', size=500000, log=log): 143 | src = row['source'] 144 | dst = row['target'] 145 | tag = row['tag'] 146 | attrs.add(tag) 147 | edges[(src, dst)][tag] += row['count'] 148 | 149 | db.rollback() 150 | 151 | log.info('Creating graph') 152 | g = igraph.Graph(directed=True) 153 | vertex_ids = labeled_sequence(vertices, key=False) 154 | edges = {(vertex_ids[t[0]], vertex_ids[t[1]]): v for t, v in edges.items()} 155 | g.add_vertices(len(vertices)) 156 | g.add_edges(edges) 157 | g.vs['name'] = list(vertices) 158 | g.vs['weight'] = list(vertices.values()) 159 | attrs = {a: tuple(v.get(a, 0) for v in edges.values()) for a in attrs} 160 | for k, t in attrs.items(): 161 | g.es[k] = t 162 | g = filter_vertices(g, vertex_ids, include, exclude) 163 | log.info(f'|V| = {g.vcount()}; |E| = {g.ecount()}') 164 | return g 165 | 166 | 167 | @with_db 168 | def export(conn: sqlite3.Connection, wd: Path, output: Path, 169 | fmt='index.graphml', graphtype='hyperlink', 170 | include=None, exclude=None, **kwargs): 171 | 172 | reader = { 173 | 'hyperlink': create_hyperlink_graph, 174 | 'domain': create_domain_graph, 175 | }[graphtype] 176 | g = reader(conn, include, exclude) 177 | log.info('Writing') 178 | with open(output / fmt, 'w+') as f: 179 | g.save(f, format='graphml') 180 | log.info('Done.') 181 | 182 | 183 | help_text = """ 184 | Export feed data as graph data. 185 | 186 | Synopsis 187 | -------- 188 | export ~graph~ -i -o [name] [**graphtype=**~hyperlink|domain~] 189 | 190 | Description 191 | ----------- 192 | This exporter lets you represent scraped URL data using graph data structure. 193 | 194 | **Requires igraph. You must install ~requirements-optional.txt~.** 195 | 196 | Currently this exports graphs in ~GraphML~ format only. 197 | 198 | This exporter does not support name templates. 199 | 200 | Filters 201 | ------- 202 | 203 | ~domain~ ~under~ ... 204 | 205 | Include/exclude websites/hyperlinks whose domain name is under the specified 206 | domain. 207 | 208 | Options 209 | ------- 210 | ~graphtype=[hyperlink|domain]~ 211 | 212 | **~hyperlink~** 213 | **Directed, self-loop allowed** 214 | **Vertices** 215 | Each ~source~ or ~target~ URL (representing a file on a website); 216 | **Attributes** 217 | ~name~: The URL 218 | **Edges** 219 | Each hyperlink found in ~source~ pointing to ~target~; 220 | **Attributes** 221 | ~type~: The HTML element 222 | ~timestamp~: UTC date and time when ~source~ was published, 223 | in ISO-8601 format 224 | 225 | **~domain~** 226 | **Directed, self-loop allowed** 227 | **Vertices** 228 | Domains of each URL 229 | **Attributes** 230 | ~name~: Domain name 231 | ~weight~: The number of files found under the domain 232 | **Edges** 233 | Each hyperlink found in ~source~ pointing to ~target~ creates an 234 | edge from ~source:domain~ to ~target:domain~; not repeated. 
235 | **Attributes** 236 | ~~: Each hyperlink in ~source~ pointing to 237 | ~target~ that is found on a particular HTML tag increases 238 | the ~~ attribute by 1. 239 | """ 240 | -------------------------------------------------------------------------------- /feedme/exporters/uncharted.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import sqlite3 25 | from collections import defaultdict 26 | from pathlib import Path 27 | 28 | import simplejson as json 29 | 30 | from .utils import filter_by_domains, with_db 31 | 32 | log = logging.getLogger('exporter.uncharted') 33 | 34 | 35 | @with_db 36 | def export(conn: sqlite3.Connection, wd: Path, output: Path, 37 | include=None, exclude=None, 38 | fmt='uncharted.json', **kwargs): 39 | 40 | temp = """ 41 | CREATE TEMP TABLE domains (id INTEGER, domain VARCHAR) 42 | """ 43 | index = """ 44 | CREATE INDEX temp_ix_domains ON domains (id) 45 | """ 46 | insert_domains = """ 47 | INSERT INTO domains 48 | SELECT url.id AS id, 'http://' || urlsplit(url.url, 'netloc') AS domain 49 | FROM url 50 | """ 51 | count_domains = """ 52 | SELECT domains.domain, count(domains.domain) 53 | FROM domains 54 | GROUP BY domains.domain 55 | """ 56 | select_feeds = """ 57 | SELECT domains.domain 58 | FROM feed 59 | JOIN domains ON feed.url_id == domains.id 60 | GROUP BY domains.domain 61 | """ 62 | select_keywords = """ 63 | SELECT domains.domain, keyword.keyword, count(keyword.keyword) 64 | FROM tagging 65 | JOIN domains ON tagging.url_id == domains.id 66 | JOIN keyword ON tagging.keyword_id == keyword.id 67 | GROUP BY domains.domain, keyword.keyword 68 | """ 69 | select_hyperlinks = """ 70 | SELECT src.domain, dst.domain, count(src.domain) 71 | FROM hyperlink 72 | JOIN domains AS src ON hyperlink.source_id == src.id 73 | JOIN domains AS dst ON hyperlink.target_id == dst.id 74 | GROUP BY src.domain, dst.domain 75 | """ 76 | 77 | conn.execute('BEGIN EXCLUSIVE') 78 | conn.execute(temp) 79 | 80 | log.info('Building domain list') 81 | conn.execute(insert_domains) 82 | conn.execute(index) 83 | 84 | domains = defaultdict(lambda: { 85 | 'page_count': 0, 86 | 'keywords': defaultdict(int), 87 | 'referrers': defaultdict(int), 88 | }) 89 | log.info('Counting domains') 90 | for domain, count in conn.execute(count_domains): 91 | 
domains[domain]['page_count'] = count 92 | 93 | log.info('Counting keywords') 94 | for domain, keyword, count in conn.execute(select_keywords): 95 | domains[domain]['keywords'][keyword] += count 96 | 97 | log.info('Counting referrers') 98 | for src, dst, count in conn.execute(select_hyperlinks): 99 | domains[dst]['referrers'][src] += count 100 | 101 | log.info('Filtering') 102 | for feed in conn.execute(select_feeds): 103 | del domains[feed[0]] 104 | 105 | if include: 106 | domains = {k: v for k, v in domains.items() 107 | if filter_by_domains(include)(k)} 108 | 109 | if exclude: 110 | domains = {k: v for k, v in domains.items() 111 | if filter_by_domains(exclude, True)(k)} 112 | 113 | with open(output / fmt, 'w+') as f: 114 | json.dump(domains, f) 115 | 116 | conn.rollback() 117 | log.info('Done.') 118 | 119 | 120 | help_text = """ 121 | Export a list of websites that are "uncharted" — websites that were not scraped 122 | as RSS feeds during a crawl, but were recorded in the database because other 123 | feeds mentioned them. 124 | 125 | That is, if you are using the cluster spider, this exporter will export the list 126 | of websites that are not crawled due to the spider hitting the depth limit 127 | — the outermost nodes. 128 | 129 | Note that websites that are "uncharted" because they were filtered out by the 130 | domain filter (instead of hitting the depth limit) or because they were not 131 | RSS feeds in the first place will also be included here. To only export sites 132 | under a certain domain, use ~+f domain under ...~, to exclude a certain domain, 133 | use ~-f domain under ...~. 134 | """ 135 | -------------------------------------------------------------------------------- /feedme/exporters/utils.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
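
A minimal sketch (not part of the repository) of consuming the uncharted exporter's output described above. It assumes the JSON layout written by export() — a mapping of domain to page_count / keywords / referrers — saved as out/uncharted.json inside the crawl directory; the crawl-directory path and the top-20 cutoff are illustrative assumptions:

import json
from pathlib import Path

# Rank uncharted domains by how many hyperlinks point at them, to pick
# candidates for a follow-up crawl (layout assumed from export() above).
data = json.loads(Path('my.crawl/out/uncharted.json').read_text())
ranked = sorted(data.items(),
                key=lambda kv: sum(kv[1]['referrers'].values()),
                reverse=True)
for domain, info in ranked[:20]:
    print(domain, info['page_count'], sum(info['referrers'].values()))
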
22 | 23 | import logging 24 | import operator 25 | import os 26 | import sqlite3 27 | from functools import wraps 28 | from pathlib import Path 29 | 30 | from scrapy.utils.url import url_is_from_any_domain 31 | 32 | from ..datastructures import labeled_sequence 33 | from ..sql.db import db 34 | from ..sql.functions import register_all 35 | 36 | log = logging.getLogger('exporter.utils') 37 | 38 | 39 | def subdomain(x, y): 40 | return x == y or x[-(len(y) + 1):] == f'.{y}' 41 | 42 | 43 | filter_ops = { 44 | 'is': operator.eq, 45 | 'under': subdomain, 46 | 'startswith': str.startswith, 47 | 'endswith': str.endswith, 48 | 'contains': operator.contains, 49 | 'gt': operator.gt, 50 | 'ge': operator.ge, 51 | 'lt': operator.lt, 52 | 'le': operator.le, 53 | } 54 | sql_ops = { 55 | ('is', 'None'): ('"%(column)s" IS NULL', '%s'), 56 | ('is', 'True'): ('"%(column)s" == 1', '%s'), 57 | ('is', 'False'): ('"%(column)s" == 0', '%s'), 58 | 'is': ('"%(column)s" == :%(id)d', '%s'), 59 | 'under': ('subdomain("%(column)s", :%(id)d)', '%s'), 60 | 'startswith': ('"%(column)s" LIKE :%(id)d', '%s%%'), 61 | 'endswith': ('"%(column)s" LIKE :%(id)d', '%%%s'), 62 | 'contains': ('"%(column)s" LIKE :%(id)d', '%%%s%%'), 63 | 'gt': ('"%(column)s" > :%(id)d', '%s'), 64 | 'ge': ('"%(column)s" >= :%(id)d', '%s'), 65 | 'lt': ('"%(column)s" < :%(id)d', '%s'), 66 | 'le': ('"%(column)s" <= :%(id)d', '%s'), 67 | } 68 | equivalencies = [('==', 'is'), ('in', 'contains'), ('>', 'gt'), ('<', 'lt'), ('>=', 'ge'), ('<=', 'le')] 69 | for k, v in equivalencies: 70 | filter_ops[k] = filter_ops[v] 71 | sql_ops[k] = sql_ops[v] 72 | 73 | 74 | def build_where_clause(includes=None, excludes=None): 75 | if not includes and not excludes: 76 | return '1', (), set() 77 | values = [] 78 | includes = includes or [] 79 | excludes = excludes or [] 80 | clauses = [] 81 | required_columns = set() 82 | for prefix, criteria in (('', includes), ('NOT ', excludes)): 83 | for key, op, val in criteria: 84 | required_columns.add(key) 85 | op = sql_ops.get((op, val), sql_ops[op]) 86 | values.append(op[1] % (val,)) 87 | value_id = len(values) 88 | clauses.append(prefix + op[0] % {'column': key, 'id': value_id}) 89 | clauses = ' AND '.join(clauses) 90 | values = labeled_sequence(values, start=1, as_str=True) 91 | return clauses, values, required_columns 92 | 93 | 94 | class MappingFilter: 95 | def __init__(self): 96 | self.filters = [] 97 | 98 | def includes(self, key, op, val): 99 | self.filters.append(lambda row, x=key, y=val, op=filter_ops[op]: op(row[x], y)) 100 | 101 | def excludes(self, key, op, val): 102 | self.filters.append(lambda row, x=key, y=val, op=filter_ops[op]: not op(row[x], y)) 103 | 104 | def __call__(self, item): 105 | return all(f(item) for f in self.filters) 106 | 107 | 108 | def with_db(exporter): 109 | @wraps(exporter) 110 | def e(wd, *args, **kwargs): 111 | wd = Path(wd) 112 | output = wd / 'out' 113 | os.makedirs(output, exist_ok=True) 114 | 115 | db_path = wd / 'index.db' 116 | if not db_path.exists(): 117 | raise FileNotFoundError(f'index.db not found in {wd}') 118 | 119 | conn = sqlite3.connect(db_path, isolation_level=None) 120 | if db.is_locked(conn): 121 | log.error('Database was left in a partially consistent state.') 122 | log.error('Run `python -m feedme check-db` to fix it first.') 123 | return 1 124 | 125 | conn.row_factory = sqlite3.Row 126 | db.verify_version(conn) 127 | register_all(conn) 128 | 129 | try: 130 | exporter(conn, wd, output, *args, **kwargs) 131 | finally: 132 | conn.close() 133 | return e 134 | 135 | 136 | def 
filter_by_domains(ls, exclude=False): 137 | domains = [] 138 | for key, op, val in ls: 139 | if key != 'domain' or op != 'under': 140 | log.warning(f'Unknown filter {key} {op}') 141 | continue 142 | domains.append(val) 143 | return lambda u: url_is_from_any_domain(u, domains) ^ exclude 144 | -------------------------------------------------------------------------------- /feedme/feedly.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from __future__ import annotations 24 | 25 | from datetime import datetime 26 | from typing import Any, Dict, Optional 27 | from urllib.parse import SplitResult, quote, urlsplit 28 | 29 | import attr 30 | from attr.converters import optional 31 | from attr.validators import instance_of 32 | 33 | from . 
import utils 34 | from .datastructures import Keywords 35 | from .utils import HyperlinkStore, JSONDict 36 | 37 | API_BASE = { 38 | 'scheme': 'https', 39 | 'netloc': 'cloud.feedly.com', 40 | 'fragment': '', 41 | } 42 | API_ENDPOINTS = { 43 | 'streams': '/v3/streams/contents', 44 | 'search': '/v3/search/feeds', 45 | 'feeds': '/v3/feeds/', 46 | } 47 | 48 | 49 | def build_api_url(endpoint, argument='', **params): 50 | if endpoint not in API_ENDPOINTS: 51 | raise ValueError(f'{endpoint} API is not supported') 52 | url = {**API_BASE, 'path': API_ENDPOINTS[endpoint] + quote(argument, safe='')} 53 | url['query'] = '&'.join([f'{quote(k)}={quote(str(v))}' for k, v in params.items()]) 54 | return SplitResult(**url).geturl() 55 | 56 | 57 | def get_feed_uri(s): 58 | return s.split('/', 1)[1] 59 | 60 | 61 | def lowercase_set(iterable=None): 62 | if not iterable: 63 | return set() 64 | return {str(k).lower() for k in iterable} 65 | 66 | 67 | @attr.s(kw_only=True, frozen=True) 68 | class FeedlyEntry: 69 | url: str = attr.ib(validator=instance_of(str)) 70 | source: str = attr.ib(repr=False) 71 | published: datetime = attr.ib(converter=utils.datetime_converters) 72 | updated: datetime = attr.ib(default=None, converter=optional(utils.datetime_converters), repr=False) 73 | 74 | keywords: Keywords = attr.ib(converter=utils.ensure_collection(lowercase_set), factory=lowercase_set, repr=False) 75 | author: Optional[str] = attr.ib(default='', repr=False) 76 | title: Optional[str] = attr.ib(default='', repr=False) 77 | 78 | markup: Dict[str, str] = attr.ib(factory=dict, repr=False) 79 | hyperlinks: HyperlinkStore = attr.ib(factory=HyperlinkStore, repr=False) 80 | 81 | @classmethod 82 | def from_upstream(cls, item: JSONDict) -> FeedlyEntry: 83 | data = {} 84 | for name in attr.fields_dict(cls): 85 | value = item.get(name) 86 | if value: 87 | data[name] = value 88 | data['url'] = cls._get_page_url(item) 89 | data['source'] = cls._get_source_url(item) 90 | entry = cls(**data) 91 | cls._set_markup(entry, item) 92 | return entry 93 | 94 | @staticmethod 95 | def _get_page_url(item): 96 | url = urlsplit(item.get('originId', '')) 97 | if url.netloc: 98 | url = url.geturl() 99 | else: 100 | url = '' 101 | alt = item.get('alternate') 102 | if alt and alt != 'none': 103 | url = alt[0]['href'] 104 | return url 105 | 106 | @staticmethod 107 | def _get_source_url(item): 108 | source = item.get('origin') 109 | if source: 110 | return get_feed_uri(source.get('streamId', '/')) 111 | return '' 112 | 113 | @staticmethod 114 | def _set_markup(entry, item): 115 | content = item.get('content', item.get('summary')) 116 | if content: 117 | content = content.get('content') 118 | if content: 119 | entry.add_markup('summary', content) 120 | 121 | @staticmethod 122 | def _filter_attrib(attrib: attr.Attribute, value: Any) -> bool: 123 | return attrib.name[0] != '_' 124 | 125 | def add_markup(self, name, markup): 126 | self.markup[name] = markup 127 | self.hyperlinks.parse_html(self.url, markup) 128 | 129 | def for_json(self) -> JSONDict: 130 | dict_ = attr.asdict(self, filter=self._filter_attrib) 131 | return dict_ 132 | -------------------------------------------------------------------------------- /feedme/logger.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the 
Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import sys 25 | from typing import Dict, Union 26 | 27 | try: 28 | import termcolor 29 | _ = termcolor.colored 30 | except ImportError: 31 | _ = None 32 | 33 | 34 | def compose_mappings(*mappings): 35 | base = {} 36 | base.update(mappings[0]) 37 | for m in mappings[1:]: 38 | for k, v in m.items(): 39 | if k in base and type(base[k]) is type(v): 40 | if isinstance(v, dict): 41 | base[k] = compose_mappings(base[k], v) 42 | elif isinstance(v, set): 43 | base[k] |= v 44 | elif isinstance(v, list): 45 | base[k].extend(v) 46 | else: 47 | base[k] = v 48 | else: 49 | base[k] = v 50 | return base 51 | 52 | 53 | class _LogContainer: 54 | pass 55 | 56 | 57 | class _ColoredFormatter(logging.Formatter): 58 | def __init__(self, fmt=None, datefmt=None, style='%', *, color='white'): 59 | super().__init__(fmt, datefmt, style) 60 | self.termcolor_args = lambda self, record: () 61 | if isinstance(color, str): 62 | self.termcolor_args = lambda self, record: (color,) 63 | elif isinstance(color, tuple): 64 | self.termcolor_args = lambda self, record: color 65 | elif callable(color): 66 | self.termcolor_args = color 67 | 68 | def format(self, record): 69 | color_args = self.termcolor_args(self, record) 70 | return _(super().format(record), *color_args) 71 | 72 | 73 | class _CascadingFormatter(logging.Formatter): 74 | def __init__( 75 | self, sections: str, 76 | stylesheet: Dict[str, Union[str, logging.Formatter]], 77 | style='%', stacktrace=None, datefmt=None, 78 | ): 79 | self.stylesheet = {} 80 | for section, fmt in stylesheet.items(): 81 | formatter = logging.Formatter(fmt) if isinstance(fmt, str) else fmt 82 | if section != stacktrace: 83 | formatter.formatException = lambda info: '' 84 | formatter.formatStack = lambda info: '' 85 | self.stylesheet[section] = formatter 86 | super().__init__(sections, datefmt, style) 87 | 88 | def format(self, record): 89 | parent = _LogContainer() 90 | for child, fmt in self.stylesheet.items(): 91 | setattr(parent, child, fmt.format(record)) 92 | return super().formatMessage(parent) 93 | 94 | @classmethod 95 | def from_config(cls, *, sections, stylesheet, **kwargs): 96 | stylesheet_ = {} 97 | for k, fmt in stylesheet.items(): 98 | if isinstance(fmt, str): 99 | stylesheet_[k] = fmt 100 | continue 101 | f_kwargs = {} 102 | f_kwargs.update(fmt) 103 | factory = f_kwargs.pop('()', logging.Formatter) 104 | stylesheet_[k] = factory(**f_kwargs) 105 | return cls(sections, stylesheet_, **kwargs) 106 | 107 | 108 | LOG_LEVEL_PREFIX_COLORS = { 109 | 'DEBUG': ('magenta', None, ['bold']), 110 | 'INFO': ('white', None, 
['bold']), 111 | 'WARNING': ('yellow', None, ['bold']), 112 | 'ERROR': ('red', None, ['bold']), 113 | 'CRITICAL': ('grey', 'on_red', ['bold']), 114 | } 115 | LOG_LEVEL_PREFIX_COLORS_DEBUG = { 116 | **LOG_LEVEL_PREFIX_COLORS, 117 | 'INFO': ('blue', None, ['bold']), 118 | } 119 | 120 | 121 | def _color_stacktrace(self, record: logging.LogRecord): 122 | return ('red',) if record.exc_info else ('white',) 123 | 124 | 125 | def _conditional_color(field, rules, default=('white',)): 126 | def fn(self, record): 127 | return rules.get(getattr(record, field), default) 128 | return fn 129 | 130 | 131 | FMT_PREFIX = '%(asctime)s %(levelname)8s' 132 | FMT_LOGGER = '[%(processName)s:%(name)s]' 133 | FMT_SOURCE = '(%(module)s.%(funcName)s:%(lineno)d)' 134 | 135 | formatter_styles = { 136 | 'standard': { 137 | 'normal': { 138 | 'format': f'{FMT_PREFIX} {FMT_LOGGER} %(message)s', 139 | }, 140 | 'colored': { 141 | '()': _CascadingFormatter.from_config, 142 | 'sections': '%(prefix)s %(name)s %(message)s', 143 | 'stylesheet': { 144 | 'prefix': { 145 | '()': _ColoredFormatter, 146 | 'fmt': FMT_PREFIX, 147 | 'color': _conditional_color('levelname', LOG_LEVEL_PREFIX_COLORS), 148 | }, 149 | 'name': { 150 | '()': _ColoredFormatter, 151 | 'fmt': FMT_LOGGER, 152 | 'color': 'blue', 153 | }, 154 | 'message': { 155 | '()': _ColoredFormatter, 156 | 'fmt': '%(message)s', 157 | 'color': _color_stacktrace, 158 | }, 159 | }, 160 | 'stacktrace': 'message', 161 | }, 162 | }, 163 | 'debug': { 164 | 'normal': { 165 | 'format': f'{FMT_PREFIX} {FMT_LOGGER}{FMT_SOURCE} %(message)s', 166 | }, 167 | 'colored': { 168 | '()': _CascadingFormatter.from_config, 169 | 'sections': '%(prefix)s %(name)s%(source)s %(message)s', 170 | 'stylesheet': { 171 | 'prefix': { 172 | '()': _ColoredFormatter, 173 | 'fmt': FMT_PREFIX, 174 | 'color': _conditional_color('levelname', LOG_LEVEL_PREFIX_COLORS_DEBUG), 175 | }, 176 | 'name': { 177 | '()': _ColoredFormatter, 178 | 'fmt': FMT_LOGGER, 179 | 'color': 'blue', 180 | }, 181 | 'source': { 182 | '()': _ColoredFormatter, 183 | 'fmt': FMT_SOURCE, 184 | 'color': 'cyan', 185 | }, 186 | 'message': { 187 | '()': _ColoredFormatter, 188 | 'fmt': '%(message)s', 189 | 'color': _color_stacktrace, 190 | }, 191 | }, 192 | 'stacktrace': 'message', 193 | }, 194 | }, 195 | } 196 | 197 | logging_config_template = { 198 | 'disable_existing_loggers': False, 199 | 'version': 1, 200 | 'handlers': { 201 | 'console': { 202 | 'class': 'logging.StreamHandler', 203 | 'stream': sys.stderr, 204 | }, 205 | }, 206 | 'loggers': { 207 | 'main': { 208 | 'level': logging.NOTSET, 209 | }, 210 | 'scrapy.core': { 211 | 'level': logging.NOTSET, 212 | }, 213 | 'scrapy.core.engine': { 214 | 'level': logging.NOTSET, 215 | }, 216 | 'scrapy.crawler': { 217 | 'level': logging.WARNING, 218 | }, 219 | 'scrapy.middleware': { 220 | 'level': logging.WARNING, 221 | }, 222 | 'twisted': { 223 | 'level': logging.ERROR, 224 | }, 225 | }, 226 | 'root': { 227 | 'handlers': ['console'], 228 | }, 229 | } 230 | 231 | 232 | def make_logging_config( 233 | app_name, *overrides, level=logging.INFO, 234 | style='standard', colored=True, datefmt=None, 235 | logfile=None, **kwargs, 236 | ): 237 | color_mode = 'colored' if colored and _ else 'normal' 238 | if style in formatter_styles: 239 | formatter = formatter_styles[style][color_mode] 240 | else: 241 | formatter = style 242 | 243 | app_logging_config = { 244 | 'formatters': { 245 | 'default_fmt': formatter, 246 | }, 247 | 'handlers': { 248 | 'console': { 249 | 'formatter': 'default_fmt', 250 | 'level': level, 251 | 
}, 252 | }, 253 | 'loggers': { 254 | f'{app_name}': { 255 | 'level': logging.NOTSET, 256 | }, 257 | }, 258 | 'root': { 259 | 'level': level, 260 | }, 261 | } 262 | 263 | file_handler_config = {} 264 | if logfile: 265 | file_handler_config = { 266 | 'formatters': { 267 | 'no_color': (formatter_styles[style]['normal'] 268 | if style in formatter_styles else style), 269 | }, 270 | 'handlers': { 271 | 'file': { 272 | 'class': 'logging.FileHandler', 273 | 'filename': logfile, 274 | 'formatter': 'no_color', 275 | }, 276 | }, 277 | 'root': { 278 | 'handlers': ['file'], 279 | }, 280 | } 281 | 282 | datefmt_config = {} 283 | if datefmt: 284 | datefmt_config = { 285 | 'formatters': { 286 | 'default_fmt': { 287 | 'datefmt': datefmt, 288 | }, 289 | 'no_color': { 290 | 'datefmt': datefmt, 291 | }, 292 | }, 293 | } 294 | 295 | log_config = compose_mappings( 296 | logging_config_template, 297 | app_logging_config, 298 | file_handler_config, 299 | datefmt_config, 300 | *overrides, 301 | ) 302 | return log_config 303 | 304 | 305 | class _LoggingParticipant: 306 | def __init__(self, *args, _logger=None, **kwargs): 307 | if _logger: 308 | self.log: logging.Logger = _logger 309 | elif isinstance(getattr(self, '_logger_name', None), str): 310 | self.log: logging.Logger = logging.getLogger(self._logger_name) 311 | self.log.disabled = True 312 | else: 313 | raise NotImplementedError('_logger_name is not defined') 314 | -------------------------------------------------------------------------------- /feedme/requests.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
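
A minimal sketch (not part of the repository) of applying the configuration built by make_logging_config() in feedme/logger.py above. The returned mapping follows the standard library's dictConfig schema; the app name, level, style, and logfile values here are illustrative assumptions:

import logging
import logging.config

from feedme.logger import make_logging_config

# Build a config for the 'feedme' logger hierarchy at DEBUG verbosity,
# using the 'debug' formatter style plus a plain-text logfile, then hand
# it to the stdlib dictConfig machinery.
config = make_logging_config('feedme', level=logging.DEBUG,
                             style='debug', logfile='crawl.log')
logging.config.dictConfig(config)
logging.getLogger('feedme.demo').info('logging configured')
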
22 | 23 | from scrapy import Request 24 | 25 | 26 | class ProbeFeed(Request): 27 | def __init__(self, *, url, callback, source=None, **kwargs): 28 | meta = kwargs.pop('meta', {}) 29 | meta['feed_url'] = url 30 | meta['is_probe'] = True 31 | meta['pkey'] = (url, 'search') 32 | super().__init__(url=url, callback=callback, meta=meta, **kwargs) 33 | self.priority = source.priority + 5 if source else self.priority + 5 34 | 35 | 36 | def reconstruct_request(cls, instance, **kwargs): 37 | callback = kwargs.pop('callback') 38 | callback = getattr(instance, callback) 39 | return cls(callback=callback, **kwargs) 40 | -------------------------------------------------------------------------------- /feedme/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for feedly project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | __version__ = '0.10.11' 11 | 12 | BOT_NAME = 'feedly' 13 | 14 | SPIDER_MODULES = ['feedme.spiders'] 15 | NEWSPIDER_MODULE = 'feedme.spiders' 16 | 17 | LOG_ENABLED = True 18 | LOG_LEVEL = 20 19 | 20 | LOGSTATS_INTERVAL = 60.0 21 | METRICS_CALC_INTERVAL = 20.0 22 | 23 | LOG_VIOLATIONS = False 24 | STATS_DUMP = False 25 | 26 | COMMANDS_MODULE = 'feedme.commands' 27 | 28 | # This program uses a custom logging config (see __init__.py) 29 | # To give control of logging back to Scrapy, set this to False 30 | CUSTOM_LOGGING_ENABLED = True 31 | 32 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 33 | USER_AGENT = f'Mozilla/5.0 (compatible; hyperlinkfeedme/{__version__}; +https://github.com/monotony113/feedly-link-feedme)' 34 | 35 | # Obey robots.txt rules 36 | ROBOTSTXT_OBEY = True 37 | 38 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 39 | # CONCURRENT_REQUESTS = 32 40 | 41 | # Configure a delay for requests for the same website (default: 0) 42 | # See https://docs.scrapy.org/en/latest/topics/settings.html# download-delay 43 | # See also autothrottle settings and docs 44 | # DOWNLOAD_DELAY = 3 45 | # The download delay setting will honor only one of: 46 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 47 | # CONCURRENT_REQUESTS_PER_IP = 16 48 | 49 | # Disable cookies (enabled by default) 50 | COOKIES_ENABLED = False 51 | 52 | # Disable Telnet Console (enabled by default) 53 | # TELNETCONSOLE_ENABLED = False 54 | 55 | # Override the default request headers: 56 | DEFAULT_REQUEST_HEADERS = { 57 | 'Accept': 'application/json;q=0.9;text/html,application/xhtml+xml,application/xml;q=0.8,*/*;q=0.7', 58 | } 59 | 60 | # Enable or disable spider middlewares 61 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 62 | SPIDER_MIDDLEWARES = { 63 | 'scrapy.spidermiddlewares.depth.DepthMiddleware': None, 64 | 'feedme.middlewares.RequestDefrosterSpiderMiddleware': 100, 65 | 'feedme.middlewares.DerefItemSpiderMiddleware': 101, 66 | 'feedme.middlewares.OffsiteFeedSpiderMiddleware': 500, 67 | 'feedme.middlewares.ConditionalDepthSpiderMiddleware': 550, 68 | 'feedme.middlewares.FetchSourceSpiderMiddleware': 600, 69 | 'feedme.middlewares.CrawledItemSpiderMiddleware': 800, 70 | 'feedme.spiders.cluster.ExplorationSpiderMiddleware': 900, 71 | } 72 | 73 
| HTTPERROR_ALLOWED_CODES = [403, 404] 74 | 75 | # Enable or disable downloader middlewares 76 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 77 | DOWNLOADER_MIDDLEWARES = { 78 | 'feedme.middlewares.RequestPersistenceDownloaderMiddleware': 150, 79 | 'feedme.middlewares.FeedProbingDownloaderMiddleware': 200, 80 | 'feedme.middlewares.HTTPErrorDownloaderMiddleware': 500, 81 | 'feedme.middlewares.AuthorizationDownloaderMiddleware': 600, 82 | } 83 | 84 | # Enable or disable extensions 85 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 86 | EXTENSIONS = { 87 | 'scrapy.extensions.logstats.LogStats': None, 88 | 'feedme.extensions._LoggingHelper': 99, 89 | 'feedme.extensions.PresetLoader': 100, 90 | 'feedme.extensions.SettingsLoader': 101, 91 | 'feedme.extensions.LogStatsExtended': 102, 92 | 'feedme.extensions.RequestMetrics': 102, 93 | 'feedme.extensions.ContribMiddleware': 200, 94 | 'feedme.extensions.GlobalPersistence': 999, 95 | } 96 | 97 | # Configure item pipelines 98 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 99 | ITEM_PIPELINES = { 100 | # 'feedme.pipelines.CompressedStreamExportPipeline': 900, 101 | # 'feedme.pipelines.SQLiteExportPipeline': 900, 102 | 'feedme.pipelines.SQLiteExportProcessPipeline': 900, 103 | } 104 | 105 | AUTO_LOAD_PREDEFINED_PRESETS = True 106 | 107 | # Enable and configure the AutoThrottle extension (disabled by default) 108 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 109 | AUTOTHROTTLE_ENABLED = True 110 | # The initial download delay 111 | AUTOTHROTTLE_START_DELAY = 0 112 | # The maximum download delay to be set in case of high latencies 113 | AUTOTHROTTLE_MAX_DELAY = 60 114 | # The average number of requests Scrapy should be sending in parallel to 115 | # each remote server 116 | AUTOTHROTTLE_TARGET_CONCURRENCY = 1.5 117 | # Enable showing throttling stats for every response received: 118 | AUTOTHROTTLE_DEBUG = False 119 | 120 | # Enable and configure HTTP caching (disabled by default) 121 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html# httpcache-middleware-settings 122 | # HTTPCACHE_ENABLED = True 123 | # HTTPCACHE_EXPIRATION_SECS = 0 124 | # HTTPCACHE_DIR = 'httpcache' 125 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 126 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 127 | 128 | DEPTH_LIMIT = 1 129 | -------------------------------------------------------------------------------- /feedme/signals.py: -------------------------------------------------------------------------------- 1 | register_state = object() 2 | show_stats = object() 3 | 4 | start_from_scratch = object() 5 | resume_requests = object() 6 | request_finished = object() 7 | -------------------------------------------------------------------------------- /feedme/spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonywu7/feedly-link-aggregator/7c32b45fbfa1794b081e90add033403308a2da85/feedme/spiders/__init__.py -------------------------------------------------------------------------------- /feedme/spiders/base.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including 
without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from __future__ import annotations 24 | 25 | import os 26 | import time 27 | from abc import ABC, abstractmethod 28 | from datetime import datetime 29 | from pprint import pformat 30 | from typing import Optional, Union 31 | 32 | from scrapy import Spider 33 | from scrapy.exceptions import CloseSpider 34 | from scrapy.http import Request, TextResponse 35 | from scrapy.signals import spider_opened 36 | 37 | from ..feedly import FeedlyEntry, build_api_url, get_feed_uri 38 | from ..requests import ProbeFeed 39 | from ..signals import (request_finished, resume_requests, show_stats, 40 | start_from_scratch) 41 | from ..urlkit import build_urls, select_templates 42 | from ..utils import LOG_LISTENER, JSONDict 43 | from ..utils import colored as _ 44 | from ..utils import guard_json 45 | 46 | 47 | class FeedlyRSSSpider(Spider, ABC): 48 | custom_settings = { 49 | 'ROBOTSTXT_OBEY': False, 50 | } 51 | 52 | class SpiderConfig: 53 | OUTPUT = f'./crawl.{datetime.now().strftime("%Y%m%d%H%M%S")}' 54 | 55 | RSS = 'https://xkcd.com/atom.xml' 56 | RSS_TEMPLATES = {} 57 | 58 | DOWNLOAD_ORDER = 'oldest' 59 | DOWNLOAD_PER_BATCH = 1000 60 | 61 | ENABLE_SEARCH = False 62 | ACCESS_TOKEN = None 63 | 64 | STREAM_ID_PREFIX = 'feed/' 65 | 66 | DATABASE_CACHE_SIZE = 100000 67 | 68 | SELECTION_STRATS = { 69 | 'dead': {None: 1, True: 1, False: 0}, 70 | 'alive': {None: 1, True: 0, False: 1}, 71 | 'dead+': {None: 1, True: 1, False: -128}, 72 | 'alive+': {None: 1, True: -128, False: 1}, 73 | 'all': {None: 1, True: 1, False: 1}, 74 | } 75 | 76 | @classmethod 77 | def from_crawler(cls, crawler, *args, **kwargs): 78 | spider: FeedlyRSSSpider = super().from_crawler(crawler, *args, config=crawler.settings, **kwargs) 79 | spider.stats = crawler.stats 80 | spider.signals = crawler.signals 81 | crawler.signals.connect(spider.open_spider, spider_opened) 82 | crawler.signals.connect(spider.resume_crawl, resume_requests) 83 | crawler.signals.send_catch_log(show_stats, names=['rss/page_count']) 84 | return spider 85 | 86 | def __init__(self, *, name=None, config, **kwargs): 87 | super().__init__(name=name, **kwargs) 88 | self.config = config 89 | self.item_limit = config.getint('DOWNLOAD_LIMIT', 0) 90 | 91 | output_dir = config['OUTPUT'] 92 | os.makedirs(output_dir, exist_ok=True) 93 | 94 | self.api_base_params = { 95 | 'count': int(config['DOWNLOAD_PER_BATCH']), 96 | 'ranked': config['DOWNLOAD_ORDER'], 97 | 'similar': 'true', 98 | 'unreadOnly': 'false', 99 | } 100 | 101 | self.freezer = None 102 | self.resume_iter = None 103 | 104 | def open_spider(self, spider): 105 | conf = self.config['SPIDER_CONFIG'] 106 | 
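        # SPIDER_CONFIG holds the merged spider options (the SpiderConfig
        # defaults above plus preset and -s overrides; an inference from the
        # surrounding code rather than something documented here), and
        # copy_to_dict() turns it into a plain dict for the log line below.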
self.logger.info(f'Spider parameters:\n{pformat(conf.copy_to_dict())}') 107 | 108 | @abstractmethod 109 | def start_requests(self): 110 | self.signals.send_catch_log(resume_requests, spider=self) 111 | yield from self.resume_crawl() 112 | 113 | def resume_crawl(self): 114 | freezer = self.freezer 115 | may_resume = False 116 | if freezer is not None: 117 | requests = freezer.defrost(self) 118 | try: 119 | req = next(requests) 120 | except StopIteration: 121 | pass 122 | else: 123 | may_resume = self.ask_if_resume(freezer) 124 | 125 | if not may_resume: 126 | feed = self.config['RSS'] 127 | freezer.dump_info({'crawling': feed}) 128 | self.signals.send_catch_log(start_from_scratch) 129 | yield self.probe_feed(feed, meta={'reason': 'user_specified', 'depth': 1}) 130 | return 131 | 132 | self.logger.info(_('Resuming crawl.', color='cyan')) 133 | self.resume_iter = requests 134 | yield req 135 | 136 | def ask_if_resume(self, freezer): 137 | feed = self.config['RSS'] 138 | resume_feed = freezer.load_info().get('crawling') 139 | if resume_feed != feed: 140 | self.logger.info(_('Found unfinished crawl job:', color='cyan')) 141 | self.logger.info(_(f"Continue crawling '{resume_feed}'?", color='cyan')) 142 | self.logger.info(_(f"Start new crawl with '{feed}'?", color='cyan')) 143 | self.logger.info(_('Or exit?', color='cyan')) 144 | action = 'x' 145 | else: 146 | action = 'c' 147 | 148 | LOG_LISTENER.stop() 149 | while action not in 'cse': 150 | action = input('(continue/start/exit) [c]: ')[:1] 151 | LOG_LISTENER.start() 152 | 153 | if action == 'e': 154 | raise CloseSpider() 155 | if action == 's': 156 | freezer.clear() 157 | freezer.dump_info({'crawling': feed}) 158 | return False 159 | return True 160 | 161 | def get_streams_url(self, feed_id: str, **params) -> str: 162 | params = {**self.api_base_params, **params} 163 | return build_api_url('streams', streamId=feed_id, **params) 164 | 165 | def probe_feed(self, query: str, derive: bool = True, source: Optional[Request] = None, **kwargs): 166 | templates = self.config['RSS_TEMPLATES'] 167 | if derive and templates: 168 | try: 169 | urls = build_urls(query, *select_templates(query, templates)) 170 | except ValueError: 171 | self.logger.debug(f'No template for {query}') 172 | urls = [query] 173 | else: 174 | urls = [query] 175 | 176 | prefix = self.config['STREAM_ID_PREFIX'] 177 | meta = kwargs.pop('meta', {}) 178 | meta['try_feeds'] = {f'{prefix}{u}': None for u in urls} 179 | return ProbeFeed(url=query, callback=self.start_feeds, meta=meta, source=source, **kwargs) 180 | 181 | def start_feeds(self, response: TextResponse): 182 | meta = response.meta 183 | self.signals.send_catch_log(request_finished, request=response.request.copy()) 184 | 185 | del meta['is_probe'] 186 | feeds = meta.get('valid_feeds') 187 | if feeds is None: 188 | feeds = meta.get('try_feeds', {}) 189 | if not feeds and meta['reason'] == 'user_specified': 190 | self.logger.info(f'No valid RSS feed can be found using `{meta["feed_url"]}` and available feed templates.') 191 | self.logger.critical('No feed to crawl!') 192 | 193 | yield from self.filter_feeds(feeds, meta) 194 | yield from self.get_feed_info(feeds, meta) 195 | 196 | def filter_feeds(self, feeds, meta): 197 | for feed in feeds: 198 | yield self.next_page({'id': feed}, meta=meta, initial=True) 199 | 200 | def get_feed_info(self, feeds, meta): 201 | feed_info = meta.get('feed_info', {}) 202 | for feed, info in feed_info.items(): 203 | yield {'source': info, 'dead': feeds.get(feed)} 204 | 205 | def next_page(self, data: 
JSONDict, response: Optional[TextResponse] = None, initial: bool = False, **kwargs) -> Union[JSONDict, Request]: 206 | feed = data['id'] 207 | 208 | if response: 209 | meta = {**response.meta} 210 | else: 211 | meta = {} 212 | meta.update(kwargs.pop('meta', {})) 213 | if not initial: 214 | meta['no_filter'] = True 215 | meta.pop('inc_depth', None) 216 | 217 | feed_url = get_feed_uri(feed) 218 | meta['feed_url'] = feed_url 219 | 220 | meta['pkey'] = (feed_url, 'main') 221 | 222 | params = {} 223 | cont = data.get('continuation') 224 | if cont: 225 | params['continuation'] = cont 226 | meta['reason'] = 'continuation' 227 | elif not initial: 228 | self.logger.info(f'Exhausted: {feed_url}') 229 | self.signals.send_catch_log(request_finished, request=response.request.copy()) 230 | return 231 | 232 | depth = meta.get('depth') 233 | reason = meta.get('reason') 234 | self.logger.debug(f'initial={initial} depth={depth} reason={reason} {feed}') 235 | 236 | url = self.get_streams_url(feed, **params) 237 | if response: 238 | return response.request.replace(url=url, meta=meta, **kwargs) 239 | return Request(url, callback=self.parse_feed, meta=meta, **kwargs) 240 | 241 | def parse_feed(self, response: TextResponse): 242 | if not response: 243 | return 244 | 245 | data = guard_json(response.text) 246 | items = data.get('items') 247 | source = response.meta['feed_url'] 248 | if items: 249 | response.meta['valid_feed'] = True 250 | if response.meta.get('reason') != 'continuation': 251 | self.logger.info(_(f'Got new feed: {source}', color='green')) 252 | 253 | count = response.meta.get('item_scraped', 0) 254 | for item in items: 255 | entry = FeedlyEntry.from_upstream(item) 256 | if not entry.source: 257 | entry.source = {'feed': source} 258 | if not entry: 259 | continue 260 | 261 | self.stats.inc_value('rss/page_count') 262 | 263 | depth = response.meta.get('depth', 0) 264 | yield { 265 | 'item': entry, 266 | 'depth': depth, 267 | 'time_crawled': time.time(), 268 | } 269 | count += 1 270 | response.meta['item_scraped'] = count 271 | 272 | if self.item_limit and count >= self.item_limit: 273 | return 274 | 275 | next_page = self.next_page(data, response=response) 276 | if next_page: 277 | yield next_page 278 | -------------------------------------------------------------------------------- /feedme/spiders/cluster.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | from collections import defaultdict 25 | from urllib.parse import urlsplit 26 | 27 | from scrapy.crawler import Crawler 28 | from scrapy.exceptions import NotConfigured 29 | from scrapy.http import Request, TextResponse 30 | 31 | from ..datastructures import compose_mappings 32 | from ..docs import OptionsContributor 33 | from ..feedly import FeedlyEntry 34 | from ..signals import (register_state, request_finished, show_stats, 35 | start_from_scratch) 36 | from ..utils import SpiderOutput 37 | from ..utils import colored as _ 38 | from .base import FeedlyRSSSpider 39 | 40 | 41 | class ExplorationSpiderMiddleware: 42 | @classmethod 43 | def from_crawler(cls, crawler): 44 | if crawler.spidercls is not FeedClusterSpider: 45 | raise NotConfigured() 46 | 47 | crawler.signals.send_catch_log(show_stats, names=[ 48 | 'rss/hyperlink_count', 49 | 'cluster/1_discovered_nodes', 50 | 'cluster/2_scheduled_nodes', 51 | 'cluster/3_finished_nodes', 52 | 'cluster/4_explored', 53 | ]) 54 | return cls(crawler) 55 | 56 | def __init__(self, crawler: Crawler): 57 | self.stats = crawler.stats 58 | self.logger = logging.getLogger('explore') 59 | self._depth_limit = crawler.settings.getint('DEPTH_LIMIT', 1) 60 | self._threshold = crawler.settings.getint('EXPANSION_THRESHOLD', 0) 61 | self._discovered = defaultdict(int) 62 | self._scheduled = set() 63 | self._finished = set() 64 | 65 | crawler.signals.connect(self.clear_state_info, start_from_scratch) 66 | crawler.signals.connect(self.update_finished, request_finished) 67 | crawler.signals.send_catch_log( 68 | register_state, obj=self, namespace='explore', 69 | attrs=['_discovered', '_scheduled', '_finished'], 70 | ) 71 | 72 | def process_spider_output(self, response: TextResponse, result: SpiderOutput, spider): 73 | depth = response.meta.get('depth', 0) 74 | for data in result: 75 | if isinstance(data, Request): 76 | yield data 77 | continue 78 | if 'item' in data: 79 | item = data['item'] 80 | self.stats.inc_value('rss/page_count') 81 | yield from self.process_item(response, item, depth, spider) 82 | yield data 83 | 84 | def process_item( 85 | self, response: TextResponse, 86 | item: FeedlyEntry, depth: int, 87 | spider, 88 | ): 89 | dest = {urlsplit(k): v for k, v in item.hyperlinks.items()} 90 | dest = {k: v for k, v in dest.items() if k.netloc} 91 | self.stats.inc_value('rss/hyperlink_count', len(dest)) 92 | 93 | for u in dest: 94 | self._discovered[f'{u.scheme}://{u.netloc}'] += 1 95 | 96 | if not self._depth_limit or depth < self._depth_limit: 97 | yield from self.schedule_new_nodes(item, depth, response.request, spider) 98 | 99 | self.update_ratio() 100 | 101 | def schedule_new_nodes(self, item, depth, request, spider): 102 | sites = ({u for u, v in self._discovered.items() if v > self._threshold} 103 | - self._scheduled) 104 | self._scheduled |= sites 105 | self.logger.debug(f'depth={depth}; +{len(sites)}') 106 | 107 | for url in sites: 108 | self.logger.debug(f'{url} (depth={depth})') 109 | yield spider.probe_feed( 110 | url, source=request, 111 | meta={ 112 | 'inc_depth': 1, 113 | 'depth': depth, 114 | 'reason': 'newly_discovered', 115 | 'source_item': item, 116 | }) 117 | 118 | def update_finished(self, request: Request): 119 | if 'is_probe' 
in request.meta: 120 | return 121 | feed_url = request.meta.get('feed_url') 122 | if not feed_url: 123 | return 124 | self._finished.add(urlsplit(feed_url).netloc) 125 | self.stats.set_value('cluster/3_finished_nodes', len(self._finished)) 126 | self.update_ratio() 127 | 128 | def update_ratio(self): 129 | scheduled = len(self._scheduled) 130 | self.stats.set_value('cluster/1_discovered_nodes', len(self._discovered)) 131 | self.stats.set_value('cluster/2_scheduled_nodes', scheduled) 132 | finished = self.stats.get_value('cluster/3_finished_nodes', 0) 133 | if not scheduled: 134 | return 135 | ratio = finished / scheduled 136 | self.stats.set_value('cluster/4_explored', f'{ratio * 100:.2f}%') 137 | 138 | def clear_state_info(self): 139 | self._discovered.clear() 140 | self._scheduled.clear() 141 | self._finished.clear() 142 | 143 | 144 | class FeedClusterSpider(FeedlyRSSSpider, OptionsContributor, _doc_order=9): 145 | """ 146 | Spider to crawl a group of feeds. 147 | 148 | It works by recursively trying to crawl websites found in the contents of a feed, 149 | until it hits the depth limit, or until no more crawlable website can be found. 150 | 151 | Usage 152 | ----- 153 | `scrapy crawl cluster -s OPTIONS=... ...` 154 | 155 | This spider supports all options supported by the single feed spider. 156 | """ 157 | 158 | name = 'cluster' 159 | 160 | custom_settings = compose_mappings(FeedlyRSSSpider.custom_settings, { 161 | 'DEPTH_PRIORITY': 1, 162 | 'SCHEDULER_DISK_QUEUE': 'scrapy.squeues.PickleFifoDiskQueue', 163 | 'SCHEDULER_MEMORY_QUEUE': 'scrapy.squeues.FifoMemoryQueue', 164 | }) 165 | 166 | class SpiderConfig(FeedlyRSSSpider.SpiderConfig): 167 | FOLLOW_DOMAINS = None 168 | DEPTH_LIMIT = 1 169 | 170 | def start_requests(self): 171 | return super().start_requests() 172 | 173 | def filter_feeds(self, feeds, meta): 174 | if meta['reason'] == 'user_specified': 175 | for feed in feeds: 176 | yield self.next_page({'id': feed}, meta=meta, initial=True) 177 | return 178 | 179 | select = self.config.get('SELECT_FEED_STATE', 'all') 180 | for feed, dead in feeds.items(): 181 | prio = self.SELECTION_STRATS[select][dead] 182 | if not prio: 183 | self.logger.info(_(f'Dropped {"dead" if dead else "living"} feed {feed[5:]}', color='grey')) 184 | else: 185 | yield self.next_page({'id': feed}, meta=meta, initial=True, priority=prio) 186 | 187 | @staticmethod 188 | def _help_options(): 189 | return { 190 | 'EXPANSION_THRESHOLD': """ 191 | Number of times a website must be mentioned by a feed before it will be scheduled. 192 | 193 | Set to a number > 1 to filter out sites that are only mentioned a few times. 194 | """, 195 | 'FOLLOW_DOMAINS': """ 196 | Only nodes whose domains or parent domains are included here will be expanded upon. 197 | 198 | Value should be a collection of domains. (Other nodes are still recorded, 199 | but are not used to find new feeds). 200 | 201 | If set to None, spider will not filter nodes based on domains. 202 | 203 | **Example** 204 | 205 | `FOLLOW_DOMAINS = ['tumblr.com', 'wordpress.com']` 206 | """, 207 | 'DEPTH_LIMIT': """ 208 | How much the spider will expand the cluster. Value should be an integer. 209 | 210 | (This is the same settings as the one used by the built-in ~DepthMiddleware~.) 211 | 212 | Nodes that are more than `depth + 1` degree removed from the starting feed 213 | will not be expanded upon. 214 | 215 | If set to ~1~, only the starting feed will be crawled. 216 | If set to ~0~ or ~None~, spider will keep crawling until manually stopped. 
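
            **Example** (illustrative, not from the original help text)

            `DEPTH_LIMIT = 2` should let the spider expand into feeds discovered
            in the starting feed, but stop before feeds that are only found in
            those.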
217 | """, 218 | 'SELECT_FEED_STATE': """ 219 | Only crawl feeds that are of a certain `state`. 220 | 221 | A feed can be in one of two states: 222 | `dead` - The feed URL is unreachable (e.g. timed out); or a HEAD request 223 | returns a status code other than `200 OK`, `206 Partial`, or 224 | `405 Method Not Allowed`; 225 | or the responded MIME type is anything other than that of a 226 | valid RSS feed `(text/xml, application/xml, application/rss+xml,` 227 | `application/rdf+xml, application/atom+xml)`. 228 | `alive` - All other feeds are considered alive. 229 | 230 | This option accepts the following values: 231 | ~all~ - Do not filter feeds based on their state 232 | ~dead~ - Only crawl dead feeds 233 | ~alive~ - Only crawl living feeds 234 | ~dead+~ - Crawl all feeds, but dead feeds receive a higher priority 235 | ~alive+~ - Crawl all feeds, but living feeds receive a higher priority 236 | 237 | Note that values other than `all` cause the spider to send a HEAD request to 238 | each feed URL about to be crawled, which will add a slight overhead to the running time. 239 | """, 240 | } 241 | -------------------------------------------------------------------------------- /feedme/spiders/feed.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from __future__ import annotations 24 | 25 | from ..docs import OptionsContributor 26 | from .base import FeedlyRSSSpider 27 | 28 | 29 | class FeedSpider(FeedlyRSSSpider, OptionsContributor, _doc_order=10): 30 | """ 31 | Spider to crawl a single feed. 32 | 33 | Usage 34 | ----- 35 | `scrapy crawl feed -s OPTIONS=... ...` 36 | """ 37 | 38 | name = 'feed' 39 | 40 | def start_requests(self): 41 | return super().start_requests() 42 | 43 | @staticmethod 44 | def _help_options(): 45 | return { 46 | 'OUTPUT': """ 47 | Path where scraped data will be saved; will be a directory 48 | If an existing directory with scraped data is specified, newly gathered 49 | data will be merged with existing one. 50 | """, 51 | 'RSS': """ 52 | URL to the RSS feed you would like to scrape. 53 | Must contain the protocol part of the URL, e.g. `http://`. 54 | """, 55 | 'DOWNLOAD_ORDER': """ 56 | The part of the feed to download first: either `oldest` or `newest` 57 | """, 58 | 'DOWNLOAD_PER_BATCH': """ 59 | Number of entries to download per API request. 
The minimum is 1 and the maximum is 1000. 60 | """, 61 | 'DOWNLOAD_LIMIT': """ 62 | Number of items to scrape from one RSS feed source before stopping. 63 | 64 | Note that one website may have multiple sources e.g. if using feed templates. This number 65 | applies to each source. 66 | """, 67 | 'RSS_TEMPLATES': """ 68 | Templates to generate different versions of RSS URLs based on the value of the RSS setting. 69 | 70 | Because Feedly sometimes store an RSS feed's source URL with slight variations (e.g. using 71 | HTTP instead of HTTPS), the URL that you provide above may yield incomplete results 72 | (sometimes no result at all). 73 | 74 | If you know how the URLs could vary, this option allows you to define URL templates, 75 | so that Scrapy can try different versions of URLs to increase the chance of finding the 76 | correct feed ID on Feedly. 77 | 78 | This option should be a mapping (a dict), where the key should be a valid regular 79 | expression that matches the URLs you wish to apply the corresponding the templates, and the 80 | value should be either another mapping, where the key is a %-format string with named placeholders, 81 | which will be formatted into the final URL and the value is a number that denotes the priority of the 82 | template: templates with a lower number are tried first (similar to how Scrapy middlewares are ordered). 83 | 84 | Or it could also be a callable, in which case it is passed the matched URL as a `urlsplit` tuple, 85 | and the regex match object, and it should return an iterable. 86 | 87 | Note that only the templates under the first matching pattern are used. Since dicts are ordered 88 | you should place more specific patterns at the top of the mapping. 89 | 90 | Available placeholders are: 91 | **The components of a urllib.parse.urlsplit named tuple:** 92 | ~%(scheme)s~ - Network protocol (usually `http` or `https`) 93 | ~%(netloc)s~ - Domain name 94 | ~%(path)s~ - Path of the URL, with leading / and without the query string 95 | ~%(query)s~ - Query string, without the question mark (`key1=value1&key2=value2...`) 96 | **Plus some convenient values:** 97 | ~%(original)s~ - The original string, unchanged 98 | ~%(network_path)s~ - URL minus the protocol part, equivalent to `//%(netloc)s/%(path)s?%(query)s` 99 | ~%(path_query)s~ - URL minus protocol and domain name, equivalent to `/%(path)s?%(query)s` 100 | **If you define capture groups in your pattern:** 101 | ~%(key)s ...~ - Named groups 102 | ~%(1)s, %(2)s~ - Numbered groups 103 | 104 | **Example**: 105 | 106 | `RSS_TEMPLATES = {` 107 | ` r'.*': { # This regular expression will match any strings` 108 | ` 'http:%(network_path)s': 997,` 109 | ` 'https:%(network_path)s': 998,` 110 | ` '%(original)s': 999,` 111 | ` },` 112 | `}` 113 | """, 114 | 'ENABLE_SEARCH': """ 115 | Whether or not to enable the search function 116 | If enabled, when the feed URL you provided above does not yield any result from Feedly, 117 | Scrapy will use Feedly's Search API to try to find the correct URL. 118 | 119 | It is recommended that you disable search when using the cluster spider, because it could generate 120 | a large number of search requests, and Feedly's Search API is a lot more sensitive to 121 | high volume requests than its Streams API, meaning you may quickly run into rate-limiting issues. 122 | """, 123 | 'ACCESS_TOKEN': """ 124 | If you have a developer access token, you can provide it here. 
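
            **Example** (the token shown is a made-up placeholder)

            `ACCESS_TOKEN = 'A0T3xampleT0ken'`

            When set, the token is presumably attached to Feedly API requests by
            the `AuthorizationDownloaderMiddleware` enabled in the settings.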
125 | """, 126 | } 127 | -------------------------------------------------------------------------------- /feedme/spiders/settings.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import re 24 | from pathlib import Path 25 | from typing import List 26 | from urllib.parse import unquote 27 | 28 | from scrapy.utils.url import add_http_if_no_scheme 29 | 30 | 31 | def single_item(f): 32 | def wrapped(*args, **kwargs): 33 | return {f.__name__.upper(): f(*args, **kwargs)} 34 | return wrapped 35 | 36 | 37 | class SettingsAdapter: 38 | @staticmethod 39 | def output(v): 40 | p = Path(v) 41 | return {'OUTPUT': p, 'JOBDIR': p / 'scheduled/jobs'} 42 | 43 | @staticmethod 44 | @single_item 45 | def rss(v): 46 | return add_http_if_no_scheme(unquote(v)) 47 | 48 | @staticmethod 49 | @single_item 50 | def rss_templates(conf): 51 | return {re.compile(k): v for k, v in conf.items()} 52 | 53 | @staticmethod 54 | @single_item 55 | def follow_domains(domains): 56 | if isinstance(domains, str): 57 | domains = set(domains.split(' ')) 58 | elif isinstance(domains, List): 59 | domains = set(domains) 60 | return domains 61 | -------------------------------------------------------------------------------- /feedme/sql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonywu7/feedly-link-aggregator/7c32b45fbfa1794b081e90add033403308a2da85/feedme/sql/__init__.py -------------------------------------------------------------------------------- /feedme/sql/cli.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import logging 24 | import os 25 | import re 26 | import shutil 27 | import sqlite3 28 | from functools import reduce 29 | from pathlib import Path 30 | 31 | from setuptools.version import pkg_resources 32 | 33 | from ..utils import colored as _ 34 | from ..utils import findpath, randstr 35 | from .db import db 36 | from .factory import DatabaseVersionError 37 | from .stream import DatabaseWriter 38 | 39 | MIGRATIONS = Path(Path(__file__).with_name('migrations')).resolve(True) 40 | Version = pkg_resources.parse_version 41 | 42 | 43 | def check(db_path, debug=False): 44 | log = logging.getLogger('db.check') 45 | try: 46 | writer = DatabaseWriter(db_path, db, debug=debug, cache_path=':memory:') 47 | writer._verify(writer._main) 48 | writer.close() 49 | log.info(_('Database is OK.', color='green')) 50 | except DatabaseVersionError as exc: 51 | log.critical(exc) 52 | log.error(_('Run `python -m feedme upgrade-db` to upgrade it to the current version.', color='cyan')) 53 | return 1 54 | except Exception as exc: 55 | log.critical(exc, exc_info=True) 56 | log.error(_('Database has irrecoverable inconsistencies.', color='red')) 57 | return 1 58 | else: 59 | return 0 60 | 61 | 62 | def merge(output, *db_paths, debug=False): 63 | log = logging.getLogger('db.merge') 64 | for path in db_paths: 65 | log.info(_(f'Checking database at {path}', color='cyan')) 66 | exc = check(path, debug) 67 | if exc: 68 | return exc 69 | output = Path(output) 70 | db_paths = [Path(p) for p in db_paths] 71 | initial = db_paths[0] 72 | log.info(_(f'Copying initial database {initial}', color='cyan')) 73 | shutil.copyfile(initial, output) 74 | out = DatabaseWriter(output, db, debug=debug, cache_path=':memory:') 75 | for path in db_paths[1:]: 76 | log.info(_(f'Copying database {path}', color='cyan')) 77 | cp = output.with_name(randstr(8) + '.db') 78 | shutil.copyfile(path, cp) 79 | log.info(_(f'Merging {path}', color='cyan')) 80 | out._merge_other(other=cp) 81 | cp.unlink() 82 | cp.with_suffix('.db-shm').unlink() 83 | cp.with_suffix('.db-wal').unlink() 84 | out.report() 85 | out.close() 86 | return 0 87 | 88 | 89 | def migrate(db_path, debug=False, version=db.version): 90 | conn = sqlite3.Connection(db_path, isolation_level=None) 91 | log = logging.getLogger('db.migrate') 92 | if debug: 93 | conn.set_trace_callback(log.debug) 94 | 95 | if db.is_locked(conn): 96 | log.error('Database was left in a partially consistent state.') 97 | log.error('Run `python -m feedme check-db` to fix it first.') 98 | return 1 99 | 100 | outdated = False 101 | try: 102 | db.verify_version(conn) 103 | except DatabaseVersionError as e: 104 | outdated = e.db 105 | 106 | if not outdated: 107 | log.info(_('Database version is already up-to-date.', color='green')) 108 | return 0 109 | 110 | source_ver = Version(outdated) 111 | target_ver = Version(version) 112 | versions = {} 113 | for cmd in os.listdir(MIGRATIONS): 114 | from_, to_ = cmd[:-4].split('_') 115 | from_ = Version(from_) 116 | to_ = Version(to_) 117 | to_versions = 
versions.setdefault(from_, set()) 118 | to_versions.add(to_) 119 | 120 | path = [] 121 | scripts = [] 122 | if findpath(source_ver, target_ver, versions, path): 123 | reduce(lambda x, y: scripts.append((x, y, f'{x}_{y}.sql')) or y, path) 124 | else: 125 | log.error(f'This version of the program no longer supports migrating from {source_ver} to {target_ver}') 126 | return 1 127 | 128 | for old, new, cmd in scripts: 129 | log.info(f'Upgrading database schema from v{old} to v{new}. This may take a long time.') 130 | with open(MIGRATIONS / cmd) as f: 131 | try: 132 | conn.executescript(f.read()) 133 | except sqlite3.OperationalError as e: 134 | log.error(e, exc_info=True) 135 | log.error('Failed to upgrade database. Undoing.') 136 | conn.rollback() 137 | conn.close() 138 | return 1 139 | else: 140 | conn.commit() 141 | 142 | log.info(_('Compacting database... This may take a long time.', color='cyan')) 143 | conn.execute('VACUUM;') 144 | log.info(_('Done.', color='green')) 145 | return 0 146 | 147 | 148 | def leftovers(wd, debug=False): 149 | log = logging.getLogger('db.leftovers') 150 | main = Path(wd) / 'index.db' 151 | tmp_pattern = re.compile(r'.*~tmp-[0-9a-f]{8}\.db$') 152 | for temp in os.listdir(wd): 153 | if tmp_pattern.match(temp): 154 | temp = main.with_name(temp) 155 | log.info(f'Found unmerged temp database {temp}') 156 | writer = DatabaseWriter(main, db, debug=debug, cache_path=temp) 157 | writer.merge() 158 | writer.close() 159 | writer.cleanup() 160 | log.info(_('All temporary databases have been merged.', color='green')) 161 | -------------------------------------------------------------------------------- /feedme/sql/db.json: -------------------------------------------------------------------------------- 1 | {"order": ["url", "keyword", "item", "hyperlink", "feed", "tagging", "summary", "webpage"], "version": "0.10.6", "versioning": {"create": "CREATE TABLE IF NOT EXISTS __version__ (\n\tversion VARCHAR NOT NULL, \n\tCONSTRAINT pk___version__ PRIMARY KEY (version)\n)", "insert": "INSERT OR REPLACE INTO __version__ (version) VALUES (?)"}, "init": ["PRAGMA foreign_keys = ON", "PRAGMA journal_mode = WAL"], "tables": {"url": "CREATE TABLE IF NOT EXISTS url (\n\tid INTEGER NOT NULL, \n\turl VARCHAR NOT NULL, \n\tCONSTRAINT pk_url PRIMARY KEY (id)\n)", "keyword": "CREATE TABLE IF NOT EXISTS keyword (\n\tid INTEGER NOT NULL, \n\tkeyword VARCHAR NOT NULL, \n\tCONSTRAINT pk_keyword PRIMARY KEY (id)\n)", "item": "CREATE TABLE IF NOT EXISTS item (\n\tid INTEGER NOT NULL, \n\turl INTEGER NOT NULL, \n\tsource INTEGER NOT NULL, \n\ttitle VARCHAR, \n\tauthor VARCHAR, \n\tpublished DATETIME, \n\tupdated DATETIME, \n\tcrawled FLOAT, \n\tCONSTRAINT pk_item PRIMARY KEY (id), \n\tCONSTRAINT fk_item_url_url FOREIGN KEY(url) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, \n\tCONSTRAINT fk_item_source_url FOREIGN KEY(source) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)", "hyperlink": "CREATE TABLE IF NOT EXISTS hyperlink (\n\tid INTEGER NOT NULL, \n\tsource_id INTEGER NOT NULL, \n\ttarget_id INTEGER NOT NULL, \n\telement VARCHAR NOT NULL, \n\tCONSTRAINT pk_hyperlink PRIMARY KEY (id), \n\tCONSTRAINT fk_hyperlink_source_id_url FOREIGN KEY(source_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, \n\tCONSTRAINT fk_hyperlink_target_id_url FOREIGN KEY(target_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)", "feed": "CREATE TABLE IF NOT EXISTS feed (\n\tid INTEGER NOT NULL, \n\turl_id INTEGER NOT NULL, \n\ttitle TEXT NOT NULL, \n\tdead BOOLEAN, 
\n\tCONSTRAINT pk_feed PRIMARY KEY (id), \n\tCONSTRAINT fk_feed_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, \n\tCONSTRAINT ck_feed_dead CHECK (dead IN (0, 1))\n)", "tagging": "CREATE TABLE IF NOT EXISTS tagging (\n\tid INTEGER NOT NULL, \n\turl_id INTEGER NOT NULL, \n\tkeyword_id INTEGER NOT NULL, \n\tCONSTRAINT pk_tagging PRIMARY KEY (id), \n\tCONSTRAINT fk_tagging_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, \n\tCONSTRAINT fk_tagging_keyword_id_keyword FOREIGN KEY(keyword_id) REFERENCES keyword (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)", "summary": "CREATE TABLE IF NOT EXISTS summary (\n\tid INTEGER NOT NULL, \n\turl_id INTEGER NOT NULL, \n\tmarkup TEXT NOT NULL, \n\tCONSTRAINT pk_summary PRIMARY KEY (id), \n\tCONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)", "webpage": "CREATE TABLE IF NOT EXISTS webpage (\n\tid INTEGER NOT NULL, \n\turl_id INTEGER NOT NULL, \n\tmarkup TEXT NOT NULL, \n\tCONSTRAINT pk_webpage PRIMARY KEY (id), \n\tCONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE\n)"}, "indices": {"ix_url_url": "CREATE UNIQUE INDEX IF NOT EXISTS ix_url_url ON url (url)", "ix_keyword_keyword": "CREATE UNIQUE INDEX IF NOT EXISTS ix_keyword_keyword ON keyword (keyword)", "ix_item_url": "CREATE UNIQUE INDEX IF NOT EXISTS ix_item_url ON item (url)", "ix_hyperlink_source_id_target_id_element": "CREATE UNIQUE INDEX IF NOT EXISTS ix_hyperlink_source_id_target_id_element ON hyperlink (source_id, target_id, element)", "ix_feed_url_id": "CREATE UNIQUE INDEX IF NOT EXISTS ix_feed_url_id ON feed (url_id)", "ix_tagging_url_id_keyword_id": "CREATE UNIQUE INDEX IF NOT EXISTS ix_tagging_url_id_keyword_id ON tagging (url_id, keyword_id)", "ix_summary_url_id": "CREATE UNIQUE INDEX IF NOT EXISTS ix_summary_url_id ON summary (url_id)", "ix_webpage_url_id": "CREATE UNIQUE INDEX IF NOT EXISTS ix_webpage_url_id ON webpage (url_id)"}, "models": {"url": {"columns": {"id": {}, "url": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url"]], "foreign_keys": []}, "keyword": {"columns": {"id": {}, "keyword": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["keyword"]], "foreign_keys": []}, "item": {"columns": {"id": {}, "url": {}, "source": {}, "title": {}, "author": {}, "published": {}, "updated": {}, "crawled": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url"]], "foreign_keys": [["url", "url", "id"], ["source", "url", "id"]]}, "hyperlink": {"columns": {"id": {}, "source_id": {}, "target_id": {}, "element": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["element", "source_id", "target_id"]], "foreign_keys": [["target_id", "url", "id"], ["source_id", "url", "id"]]}, "feed": {"columns": {"id": {}, "url_id": {}, "title": {}, "dead": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url_id"]], "foreign_keys": [["url_id", "url", "id"]]}, "tagging": {"columns": {"id": {}, "url_id": {}, "keyword_id": {}}, "info": {}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["keyword_id", "url_id"]], "foreign_keys": [["url_id", "url", "id"], ["keyword_id", "keyword", "id"]]}, "summary": {"columns": {"id": {}, "url_id": {}, "markup": {}}, "info": {"dedup": "max", "onconflict": "REPLACE"}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url_id"]], 
"foreign_keys": [["url_id", "url", "id"]]}, "webpage": {"columns": {"id": {}, "url_id": {}, "markup": {}}, "info": {"dedup": "max", "onconflict": "REPLACE"}, "autoincrement": ["id"], "primary_key": ["id"], "unique": [["url_id"]], "foreign_keys": [["url_id", "url", "id"]]}}} -------------------------------------------------------------------------------- /feedme/sql/db.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import simplejson as json 4 | 5 | from .factory import Database 6 | 7 | with open(Path(__file__).with_name('db.json')) as f: 8 | db = Database(json.load(f)) 9 | -------------------------------------------------------------------------------- /feedme/sql/describe.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | import sys 24 | from importlib.util import module_from_spec, spec_from_file_location 25 | from typing import Dict, List 26 | 27 | import simplejson as json 28 | from sqlalchemy import Table 29 | from sqlalchemy.dialects import sqlite 30 | from sqlalchemy.schema import (CreateIndex, CreateTable, Index, 31 | PrimaryKeyConstraint, UniqueConstraint) 32 | 33 | 34 | def describe_model(table): 35 | config = { 36 | 'columns': {}, 37 | 'info': table.info, 38 | 'autoincrement': [None], 39 | 'primary_key': set(), 40 | 'unique': set(), 41 | 'foreign_keys': set(), 42 | } 43 | for name, column in table.columns.items(): 44 | config['columns'][name] = column.info 45 | if column.autoincrement is True: 46 | config['autoincrement'] = (name,) 47 | for constraint in table.constraints: 48 | cols = tuple(sorted(c.name for c in constraint.columns)) 49 | if isinstance(constraint, PrimaryKeyConstraint): 50 | config['primary_key'] = cols 51 | if isinstance(constraint, UniqueConstraint): 52 | s = config.setdefault('unique', set()) 53 | s.add(cols) 54 | for index in table.indexes: 55 | if index.unique: 56 | cols = tuple(sorted(c.name for c in index.columns)) 57 | s = config.setdefault('unique', set()) 58 | s.add(cols) 59 | fks = [] 60 | for constraint in table.foreign_key_constraints: 61 | column = constraint.column_keys[0] 62 | foreign_column = list(constraint.columns[column].foreign_keys)[0].column 63 | fks.append((column, foreign_column.table.name, foreign_column.name)) 64 | config['foreign_keys'] = fks 65 | return config 66 | 67 | 68 | def create_table(table: Table): 69 | stmt = CreateTable(table).compile(dialect=sqlite.dialect()) 70 | stmt = str(stmt).replace('TABLE', 'TABLE IF NOT EXISTS').strip() 71 | return stmt 72 | 73 | 74 | def create_index(table: Table): 75 | creates = {} 76 | for index in table.indexes: 77 | index: Index 78 | stmt = CreateIndex(index).compile(dialect=sqlite.dialect()) 79 | stmt = str(stmt).replace('INDEX', 'INDEX IF NOT EXISTS').strip() 80 | creates[index.name] = stmt 81 | return creates 82 | 83 | 84 | def describe_database(path, out): 85 | spec = spec_from_file_location('schema', path) 86 | schema = module_from_spec(spec) 87 | spec.loader.exec_module(schema) 88 | 89 | tables: List[Table] = schema.tables 90 | tablemap: Dict[str, Table] = {t.name: t for t in tables} 91 | meta = {} 92 | meta['order'] = [t.name for t in tables] 93 | 94 | version = schema.version 95 | meta['version'] = version 96 | vers = meta['versioning'] = {} 97 | vers['create'] = create_table(schema.__Version__.__table__) 98 | vers['insert'] = 'INSERT OR REPLACE INTO __version__ (version) VALUES (?)' 99 | 100 | init: List[str] = schema.init 101 | tables_create = {} 102 | indices_create = {} 103 | for t in tables: 104 | tables_create[t.name] = create_table(t) 105 | indices_create.update(create_index(t)) 106 | meta['init'] = init 107 | meta['tables'] = tables_create 108 | meta['indices'] = indices_create 109 | 110 | models = meta['models'] = {} 111 | for name, table in tablemap.items(): 112 | models[name] = describe_model(table) 113 | 114 | with open(out, 'w+') as f: 115 | json.dump(meta, f, iterable_as_array=True) 116 | 117 | 118 | if __name__ == '__main__': 119 | describe_database(sys.argv[1], sys.argv[2]) 120 | -------------------------------------------------------------------------------- /feedme/sql/functions.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to 
any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from urllib.parse import urlsplit 24 | 25 | functions = {} 26 | 27 | 28 | def sqlitefunc(name, num_params): 29 | def decorate(func): 30 | functions[func] = (name, num_params, func) 31 | return func 32 | return decorate 33 | 34 | 35 | @sqlitefunc('urlsplit', 2) 36 | def urlsplitf(url, key): 37 | return getattr(urlsplit(url), key, None) 38 | 39 | 40 | @sqlitefunc('subdomain', 2) 41 | def subdomain(x, y): 42 | return x == y or x[-(len(y) + 1):] == f'.{y}' 43 | 44 | 45 | def register_all(conn): 46 | for f in functions.values(): 47 | conn.create_function(*f) 48 | -------------------------------------------------------------------------------- /feedme/sql/migrations/0.10.3_0.10.5.sql: -------------------------------------------------------------------------------- 1 | BEGIN EXCLUSIVE; 2 | 3 | DROP INDEX IF EXISTS ix_url_id_url; 4 | 5 | DROP INDEX IF EXISTS ix_item_id_hash; 6 | 7 | ALTER TABLE 8 | url RENAME TO tmp; 9 | 10 | CREATE TABLE url ( 11 | id INTEGER NOT NULL, 12 | url VARCHAR NOT NULL, 13 | CONSTRAINT pk_url PRIMARY KEY (id) 14 | ); 15 | 16 | INSERT INTO 17 | url (id, url) 18 | SELECT 19 | * 20 | FROM 21 | tmp; 22 | 23 | DROP TABLE tmp; 24 | 25 | ALTER TABLE 26 | keyword RENAME TO tmp; 27 | 28 | CREATE TABLE keyword ( 29 | id INTEGER NOT NULL, 30 | keyword VARCHAR NOT NULL, 31 | CONSTRAINT pk_keyword PRIMARY KEY (id) 32 | ); 33 | 34 | INSERT INTO 35 | keyword (id, keyword) 36 | SELECT 37 | * 38 | FROM 39 | tmp; 40 | 41 | DROP TABLE tmp; 42 | 43 | CREATE TABLE tmp_tagging (url_id INTEGER, keyword_id INTEGER); 44 | 45 | INSERT INTO 46 | tmp_tagging (url_id, keyword_id) 47 | SELECT 48 | item.url AS url_id, 49 | tagging.keyword_id AS keyword_id 50 | FROM 51 | tagging 52 | JOIN item ON tagging.item_id == item.id; 53 | 54 | DELETE FROM 55 | tmp_tagging 56 | WHERE 57 | rowid NOT IN ( 58 | SELECT 59 | min(rowid) 60 | FROM 61 | tmp_tagging 62 | GROUP BY 63 | url_id, 64 | keyword_id 65 | ); 66 | 67 | DROP TABLE tagging; 68 | 69 | CREATE TABLE tagging ( 70 | id INTEGER NOT NULL, 71 | url_id INTEGER NOT NULL, 72 | keyword_id INTEGER NOT NULL, 73 | CONSTRAINT pk_tagging PRIMARY KEY (id), 74 | CONSTRAINT fk_tagging_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT, 75 | CONSTRAINT fk_tagging_keyword_id_keyword FOREIGN KEY(keyword_id) REFERENCES keyword (id) ON DELETE RESTRICT ON UPDATE RESTRICT 76 | ); 77 | 78 | INSERT INTO 79 | tagging (url_id, keyword_id) 80 | SELECT 81 | * 82 | FROM 83 | 
tmp_tagging; 84 | 85 | DROP TABLE tmp_tagging; 86 | 87 | ALTER TABLE 88 | item RENAME TO tmp; 89 | 90 | DELETE FROM 91 | tmp 92 | WHERE 93 | rowid NOT IN ( 94 | SELECT 95 | min(rowid) 96 | FROM 97 | tmp 98 | GROUP BY 99 | url 100 | ); 101 | 102 | CREATE TABLE item ( 103 | id INTEGER NOT NULL, 104 | url INTEGER NOT NULL, 105 | source INTEGER NOT NULL, 106 | title VARCHAR, 107 | author VARCHAR, 108 | published DATETIME, 109 | updated DATETIME, 110 | crawled FLOAT, 111 | CONSTRAINT pk_item PRIMARY KEY (id), 112 | CONSTRAINT fk_item_url_url FOREIGN KEY(url) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT, 113 | CONSTRAINT fk_item_source_url FOREIGN KEY(source) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT 114 | ); 115 | 116 | INSERT INTO 117 | item ( 118 | id, 119 | url, 120 | source, 121 | title, 122 | author, 123 | published, 124 | updated, 125 | crawled 126 | ) 127 | SELECT 128 | id, 129 | url, 130 | source, 131 | title, 132 | author, 133 | published, 134 | updated, 135 | crawled 136 | FROM 137 | tmp; 138 | 139 | DROP TABLE tmp; 140 | 141 | ALTER TABLE 142 | hyperlink RENAME TO tmp; 143 | 144 | CREATE TABLE hyperlink ( 145 | id INTEGER NOT NULL, 146 | source_id INTEGER NOT NULL, 147 | target_id INTEGER NOT NULL, 148 | element VARCHAR NOT NULL, 149 | CONSTRAINT pk_hyperlink PRIMARY KEY (id), 150 | CONSTRAINT fk_hyperlink_source_id_url FOREIGN KEY(source_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT, 151 | CONSTRAINT fk_hyperlink_target_id_url FOREIGN KEY(target_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT 152 | ); 153 | 154 | INSERT INTO 155 | hyperlink (source_id, target_id, element) 156 | SELECT 157 | * 158 | FROM 159 | tmp; 160 | 161 | DROP TABLE tmp; 162 | 163 | ALTER TABLE 164 | feed RENAME TO tmp; 165 | 166 | CREATE TABLE feed ( 167 | id INTEGER NOT NULL, 168 | url_id INTEGER NOT NULL, 169 | title TEXT NOT NULL, 170 | dead BOOLEAN, 171 | CONSTRAINT pk_feed PRIMARY KEY (id), 172 | CONSTRAINT fk_feed_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT, 173 | CONSTRAINT ck_feed_dead CHECK (dead IN (0, 1)) 174 | ); 175 | 176 | INSERT INTO 177 | feed (url_id, title) 178 | SELECT 179 | * 180 | FROM 181 | tmp; 182 | 183 | DROP TABLE tmp; 184 | 185 | ALTER TABLE 186 | summary RENAME TO tmp; 187 | 188 | CREATE TABLE summary ( 189 | id INTEGER NOT NULL, 190 | url_id INTEGER NOT NULL, 191 | markup TEXT NOT NULL, 192 | CONSTRAINT pk_summary PRIMARY KEY (id), 193 | CONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT 194 | ); 195 | 196 | INSERT INTO 197 | summary (url_id, markup) 198 | SELECT 199 | * 200 | FROM 201 | tmp; 202 | 203 | DROP TABLE tmp; 204 | 205 | ALTER TABLE 206 | webpage RENAME TO tmp; 207 | 208 | CREATE TABLE webpage ( 209 | id INTEGER NOT NULL, 210 | url_id INTEGER NOT NULL, 211 | markup TEXT NOT NULL, 212 | CONSTRAINT pk_webpage PRIMARY KEY (id), 213 | CONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE RESTRICT 214 | ); 215 | 216 | INSERT INTO 217 | webpage (url_id, markup) 218 | SELECT 219 | * 220 | FROM 221 | tmp; 222 | 223 | DROP TABLE tmp; 224 | 225 | CREATE UNIQUE INDEX IF NOT EXISTS ix_url_url ON url (url); 226 | 227 | CREATE UNIQUE INDEX IF NOT EXISTS ix_keyword_keyword ON keyword (keyword); 228 | 229 | CREATE UNIQUE INDEX IF NOT EXISTS ix_item_url ON item (url); 230 | 231 | CREATE UNIQUE INDEX IF NOT EXISTS ix_hyperlink_source_id_target_id_element ON hyperlink (source_id, 
target_id, element); 232 | 233 | CREATE UNIQUE INDEX IF NOT EXISTS ix_feed_url_id ON feed (url_id); 234 | 235 | CREATE UNIQUE INDEX IF NOT EXISTS ix_tagging_url_id_keyword_id ON tagging (url_id, keyword_id); 236 | 237 | CREATE UNIQUE INDEX IF NOT EXISTS ix_summary_url_id ON summary (url_id); 238 | 239 | CREATE UNIQUE INDEX IF NOT EXISTS ix_webpage_url_id ON webpage (url_id); 240 | 241 | UPDATE 242 | __version__ 243 | SET 244 | version = '0.10.5'; 245 | 246 | COMMIT; -------------------------------------------------------------------------------- /feedme/sql/migrations/0.10.3_0.10.6.sql: -------------------------------------------------------------------------------- 1 | BEGIN EXCLUSIVE; 2 | 3 | DROP INDEX IF EXISTS ix_url_id_url; 4 | 5 | DROP INDEX IF EXISTS ix_item_id_hash; 6 | 7 | ALTER TABLE 8 | url RENAME TO tmp; 9 | 10 | CREATE TABLE url ( 11 | id INTEGER NOT NULL, 12 | url VARCHAR NOT NULL, 13 | CONSTRAINT pk_url PRIMARY KEY (id) 14 | ); 15 | 16 | INSERT INTO 17 | url (id, url) 18 | SELECT 19 | * 20 | FROM 21 | tmp; 22 | 23 | DROP TABLE tmp; 24 | 25 | ALTER TABLE 26 | keyword RENAME TO tmp; 27 | 28 | CREATE TABLE keyword ( 29 | id INTEGER NOT NULL, 30 | keyword VARCHAR NOT NULL, 31 | CONSTRAINT pk_keyword PRIMARY KEY (id) 32 | ); 33 | 34 | INSERT INTO 35 | keyword (id, keyword) 36 | SELECT 37 | * 38 | FROM 39 | tmp; 40 | 41 | DROP TABLE tmp; 42 | 43 | CREATE TABLE tmp_tagging (url_id INTEGER, keyword_id INTEGER); 44 | 45 | INSERT INTO 46 | tmp_tagging (url_id, keyword_id) 47 | SELECT 48 | item.url AS url_id, 49 | tagging.keyword_id AS keyword_id 50 | FROM 51 | tagging 52 | JOIN item ON tagging.item_id == item.id; 53 | 54 | DELETE FROM 55 | tmp_tagging 56 | WHERE 57 | rowid NOT IN ( 58 | SELECT 59 | min(rowid) 60 | FROM 61 | tmp_tagging 62 | GROUP BY 63 | url_id, 64 | keyword_id 65 | ); 66 | 67 | DROP TABLE tagging; 68 | 69 | CREATE TABLE tagging ( 70 | id INTEGER NOT NULL, 71 | url_id INTEGER NOT NULL, 72 | keyword_id INTEGER NOT NULL, 73 | CONSTRAINT pk_tagging PRIMARY KEY (id), 74 | CONSTRAINT fk_tagging_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 75 | CONSTRAINT fk_tagging_keyword_id_keyword FOREIGN KEY(keyword_id) REFERENCES keyword (id) ON DELETE RESTRICT ON UPDATE CASCADE 76 | ); 77 | 78 | INSERT INTO 79 | tagging (url_id, keyword_id) 80 | SELECT 81 | * 82 | FROM 83 | tmp_tagging; 84 | 85 | DROP TABLE tmp_tagging; 86 | 87 | ALTER TABLE 88 | item RENAME TO tmp; 89 | 90 | DELETE FROM 91 | tmp 92 | WHERE 93 | rowid NOT IN ( 94 | SELECT 95 | min(rowid) 96 | FROM 97 | tmp 98 | GROUP BY 99 | url 100 | ); 101 | 102 | CREATE TABLE item ( 103 | id INTEGER NOT NULL, 104 | url INTEGER NOT NULL, 105 | source INTEGER NOT NULL, 106 | title VARCHAR, 107 | author VARCHAR, 108 | published DATETIME, 109 | updated DATETIME, 110 | crawled FLOAT, 111 | CONSTRAINT pk_item PRIMARY KEY (id), 112 | CONSTRAINT fk_item_url_url FOREIGN KEY(url) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 113 | CONSTRAINT fk_item_source_url FOREIGN KEY(source) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 114 | ); 115 | 116 | INSERT INTO 117 | item ( 118 | id, 119 | url, 120 | source, 121 | title, 122 | author, 123 | published, 124 | updated, 125 | crawled 126 | ) 127 | SELECT 128 | id, 129 | url, 130 | source, 131 | title, 132 | author, 133 | published, 134 | updated, 135 | crawled 136 | FROM 137 | tmp; 138 | 139 | DROP TABLE tmp; 140 | 141 | ALTER TABLE 142 | hyperlink RENAME TO tmp; 143 | 144 | CREATE TABLE hyperlink ( 145 | id INTEGER NOT NULL, 146 | 
source_id INTEGER NOT NULL, 147 | target_id INTEGER NOT NULL, 148 | element VARCHAR NOT NULL, 149 | CONSTRAINT pk_hyperlink PRIMARY KEY (id), 150 | CONSTRAINT fk_hyperlink_source_id_url FOREIGN KEY(source_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 151 | CONSTRAINT fk_hyperlink_target_id_url FOREIGN KEY(target_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 152 | ); 153 | 154 | INSERT INTO 155 | hyperlink (source_id, target_id, element) 156 | SELECT 157 | * 158 | FROM 159 | tmp; 160 | 161 | DROP TABLE tmp; 162 | 163 | ALTER TABLE 164 | feed RENAME TO tmp; 165 | 166 | CREATE TABLE feed ( 167 | id INTEGER NOT NULL, 168 | url_id INTEGER NOT NULL, 169 | title TEXT NOT NULL, 170 | dead BOOLEAN, 171 | CONSTRAINT pk_feed PRIMARY KEY (id), 172 | CONSTRAINT fk_feed_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 173 | CONSTRAINT ck_feed_dead CHECK (dead IN (0, 1)) 174 | ); 175 | 176 | INSERT INTO 177 | feed (url_id, title) 178 | SELECT 179 | * 180 | FROM 181 | tmp; 182 | 183 | DROP TABLE tmp; 184 | 185 | ALTER TABLE 186 | summary RENAME TO tmp; 187 | 188 | CREATE TABLE summary ( 189 | id INTEGER NOT NULL, 190 | url_id INTEGER NOT NULL, 191 | markup TEXT NOT NULL, 192 | CONSTRAINT pk_summary PRIMARY KEY (id), 193 | CONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 194 | ); 195 | 196 | INSERT INTO 197 | summary (url_id, markup) 198 | SELECT 199 | * 200 | FROM 201 | tmp; 202 | 203 | DROP TABLE tmp; 204 | 205 | ALTER TABLE 206 | webpage RENAME TO tmp; 207 | 208 | CREATE TABLE webpage ( 209 | id INTEGER NOT NULL, 210 | url_id INTEGER NOT NULL, 211 | markup TEXT NOT NULL, 212 | CONSTRAINT pk_webpage PRIMARY KEY (id), 213 | CONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 214 | ); 215 | 216 | INSERT INTO 217 | webpage (url_id, markup) 218 | SELECT 219 | * 220 | FROM 221 | tmp; 222 | 223 | DROP TABLE tmp; 224 | 225 | CREATE UNIQUE INDEX IF NOT EXISTS ix_url_url ON url (url); 226 | 227 | CREATE UNIQUE INDEX IF NOT EXISTS ix_keyword_keyword ON keyword (keyword); 228 | 229 | CREATE UNIQUE INDEX IF NOT EXISTS ix_item_url ON item (url); 230 | 231 | CREATE UNIQUE INDEX IF NOT EXISTS ix_hyperlink_source_id_target_id_element ON hyperlink (source_id, target_id, element); 232 | 233 | CREATE UNIQUE INDEX IF NOT EXISTS ix_feed_url_id ON feed (url_id); 234 | 235 | CREATE UNIQUE INDEX IF NOT EXISTS ix_tagging_url_id_keyword_id ON tagging (url_id, keyword_id); 236 | 237 | CREATE UNIQUE INDEX IF NOT EXISTS ix_summary_url_id ON summary (url_id); 238 | 239 | CREATE UNIQUE INDEX IF NOT EXISTS ix_webpage_url_id ON webpage (url_id); 240 | 241 | UPDATE 242 | __version__ 243 | SET 244 | version = '0.10.6'; 245 | 246 | COMMIT; -------------------------------------------------------------------------------- /feedme/sql/migrations/0.10.5_0.10.6.sql: -------------------------------------------------------------------------------- 1 | PRAGMA foreign_keys = OFF; 2 | 3 | BEGIN EXCLUSIVE; 4 | 5 | ALTER TABLE 6 | item RENAME TO tmp; 7 | 8 | CREATE TABLE item ( 9 | id INTEGER NOT NULL, 10 | url INTEGER NOT NULL, 11 | source INTEGER NOT NULL, 12 | title VARCHAR, 13 | author VARCHAR, 14 | published DATETIME, 15 | updated DATETIME, 16 | crawled FLOAT, 17 | CONSTRAINT pk_item PRIMARY KEY (id), 18 | CONSTRAINT fk_item_url_url FOREIGN KEY(url) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 19 | CONSTRAINT fk_item_source_url FOREIGN KEY(source) 
REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 20 | ); 21 | 22 | INSERT INTO 23 | item 24 | SELECT 25 | * 26 | FROM 27 | tmp; 28 | 29 | DROP TABLE tmp; 30 | 31 | ALTER TABLE 32 | hyperlink RENAME TO tmp; 33 | 34 | CREATE TABLE hyperlink ( 35 | id INTEGER NOT NULL, 36 | source_id INTEGER NOT NULL, 37 | target_id INTEGER NOT NULL, 38 | element VARCHAR NOT NULL, 39 | CONSTRAINT pk_hyperlink PRIMARY KEY (id), 40 | CONSTRAINT fk_hyperlink_source_id_url FOREIGN KEY(source_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 41 | CONSTRAINT fk_hyperlink_target_id_url FOREIGN KEY(target_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 42 | ); 43 | 44 | INSERT INTO 45 | hyperlink 46 | SELECT 47 | * 48 | FROM 49 | tmp; 50 | 51 | DROP TABLE tmp; 52 | 53 | ALTER TABLE 54 | feed RENAME TO tmp; 55 | 56 | CREATE TABLE feed ( 57 | id INTEGER NOT NULL, 58 | url_id INTEGER NOT NULL, 59 | title TEXT NOT NULL, 60 | dead BOOLEAN, 61 | CONSTRAINT pk_feed PRIMARY KEY (id), 62 | CONSTRAINT fk_feed_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 63 | CONSTRAINT ck_feed_dead CHECK (dead IN (0, 1)) 64 | ); 65 | 66 | INSERT INTO 67 | feed 68 | SELECT 69 | * 70 | FROM 71 | tmp; 72 | 73 | DROP TABLE tmp; 74 | 75 | ALTER TABLE 76 | tagging RENAME TO tmp; 77 | 78 | CREATE TABLE tagging ( 79 | id INTEGER NOT NULL, 80 | url_id INTEGER NOT NULL, 81 | keyword_id INTEGER NOT NULL, 82 | CONSTRAINT pk_tagging PRIMARY KEY (id), 83 | CONSTRAINT fk_tagging_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE, 84 | CONSTRAINT fk_tagging_keyword_id_keyword FOREIGN KEY(keyword_id) REFERENCES keyword (id) ON DELETE RESTRICT ON UPDATE CASCADE 85 | ); 86 | 87 | INSERT INTO 88 | tagging 89 | SELECT 90 | * 91 | FROM 92 | tmp; 93 | 94 | DROP TABLE tmp; 95 | 96 | ALTER TABLE 97 | summary RENAME TO tmp; 98 | 99 | CREATE TABLE summary ( 100 | id INTEGER NOT NULL, 101 | url_id INTEGER NOT NULL, 102 | markup TEXT NOT NULL, 103 | CONSTRAINT pk_summary PRIMARY KEY (id), 104 | CONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 105 | ); 106 | 107 | INSERT INTO 108 | summary 109 | SELECT 110 | * 111 | FROM 112 | tmp; 113 | 114 | DROP TABLE tmp; 115 | 116 | ALTER TABLE 117 | webpage RENAME TO tmp; 118 | 119 | CREATE TABLE webpage ( 120 | id INTEGER NOT NULL, 121 | url_id INTEGER NOT NULL, 122 | markup TEXT NOT NULL, 123 | CONSTRAINT pk_webpage PRIMARY KEY (id), 124 | CONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) ON DELETE RESTRICT ON UPDATE CASCADE 125 | ); 126 | 127 | INSERT INTO 128 | webpage 129 | SELECT 130 | * 131 | FROM 132 | tmp; 133 | 134 | DROP TABLE tmp; 135 | 136 | CREATE UNIQUE INDEX IF NOT EXISTS ix_url_url ON url (url); 137 | 138 | CREATE UNIQUE INDEX IF NOT EXISTS ix_keyword_keyword ON keyword (keyword); 139 | 140 | CREATE UNIQUE INDEX IF NOT EXISTS ix_item_url ON item (url); 141 | 142 | CREATE UNIQUE INDEX IF NOT EXISTS ix_hyperlink_source_id_target_id_element ON hyperlink (source_id, target_id, element); 143 | 144 | CREATE UNIQUE INDEX IF NOT EXISTS ix_feed_url_id ON feed (url_id); 145 | 146 | CREATE UNIQUE INDEX IF NOT EXISTS ix_tagging_url_id_keyword_id ON tagging (url_id, keyword_id); 147 | 148 | CREATE UNIQUE INDEX IF NOT EXISTS ix_summary_url_id ON summary (url_id); 149 | 150 | CREATE UNIQUE INDEX IF NOT EXISTS ix_webpage_url_id ON webpage (url_id); 151 | 152 | UPDATE 153 | __version__ 154 | SET 155 | version = '0.10.6'; 156 | 157 | 
COMMIT; 158 | 159 | PRAGMA foreign_keys = ON; 160 | -------------------------------------------------------------------------------- /feedme/sql/migrations/0.10_0.10.3.sql: -------------------------------------------------------------------------------- 1 | BEGIN EXCLUSIVE; 2 | 3 | ALTER TABLE 4 | markup RENAME TO tmp; 5 | 6 | CREATE TABLE IF NOT EXISTS summary ( 7 | url_id INTEGER NOT NULL, 8 | markup TEXT NOT NULL, 9 | CONSTRAINT pk_summary PRIMARY KEY (url_id), 10 | CONSTRAINT fk_summary_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) 11 | ); 12 | 13 | CREATE TABLE IF NOT EXISTS webpage ( 14 | url_id INTEGER NOT NULL, 15 | markup TEXT NOT NULL, 16 | CONSTRAINT pk_webpage PRIMARY KEY (url_id), 17 | CONSTRAINT fk_webpage_url_id_url FOREIGN KEY(url_id) REFERENCES url (id) 18 | ); 19 | 20 | INSERT INTO 21 | summary (url_id, markup) 22 | SELECT 23 | url.id AS url_id, 24 | tmp.markup AS markup 25 | FROM 26 | tmp 27 | JOIN item ON tmp.item_id == item.id 28 | JOIN url ON item.url == url.id 29 | WHERE 30 | tmp.type == 'summary' 31 | GROUP BY 32 | url_id; 33 | 34 | INSERT 35 | OR REPLACE INTO summary (url_id, markup) 36 | SELECT 37 | url.id AS url_id, 38 | tmp.markup AS markup 39 | FROM 40 | tmp 41 | JOIN item ON tmp.item_id == item.id 42 | JOIN url ON item.url == url.id 43 | WHERE 44 | tmp.type == 'content' 45 | GROUP BY 46 | url_id; 47 | 48 | DROP TABLE tmp; 49 | 50 | UPDATE 51 | __version__ 52 | SET 53 | version = '0.10.3'; 54 | 55 | COMMIT; -------------------------------------------------------------------------------- /feedme/sql/schema.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | from sqlalchemy import MetaData, types 24 | from sqlalchemy.ext.declarative import declarative_base, declared_attr 25 | from sqlalchemy.schema import Column, ForeignKey, Index 26 | 27 | SCHEMA_VERSION = '0.10.6' 28 | 29 | metadata = MetaData( 30 | naming_convention={ 31 | 'ix': 'ix_%(table_name)s_%(column_0_N_name)s', 32 | 'uq': 'uq_%(table_name)s_%(column_0_N_name)s', 33 | 'ck': 'ck_%(table_name)s_%(column_0_N_name)s', 34 | 'fk': 'fk_%(table_name)s_%(column_0_N_name)s_%(referred_table_name)s', 35 | 'pk': 'pk_%(table_name)s', 36 | }, 37 | ) 38 | RESTRICT = 'RESTRICT' 39 | CASCADE = 'CASCADE' 40 | 41 | 42 | class BaseDefaults: 43 | @declared_attr 44 | def __tablename__(self): 45 | return self.__name__.lower() 46 | 47 | 48 | Base = declarative_base(metadata=metadata, cls=BaseDefaults) 49 | 50 | 51 | class __Version__(Base): 52 | version = Column(types.String(), primary_key=True) 53 | 54 | 55 | class URL(Base): 56 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 57 | url = Column(types.String(), nullable=False) 58 | 59 | @declared_attr 60 | def __table_args__(self): 61 | return (Index(None, 'url', unique=True),) 62 | 63 | 64 | class Keyword(Base): 65 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 66 | keyword = Column(types.String(), nullable=False) 67 | 68 | @declared_attr 69 | def __table_args__(self): 70 | return (Index(None, 'keyword', unique=True),) 71 | 72 | 73 | class Item(Base): 74 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 75 | 76 | url = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 77 | source = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 78 | 79 | title = Column(types.String()) 80 | author = Column(types.String()) 81 | published = Column(types.DateTime()) 82 | updated = Column(types.DateTime()) 83 | crawled = Column(types.Float()) 84 | 85 | @declared_attr 86 | def __table_args__(self): 87 | return (Index(None, 'url', unique=True),) 88 | 89 | 90 | class Hyperlink(Base): 91 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 92 | source_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 93 | target_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 94 | element = Column(types.String(), nullable=False) 95 | 96 | @declared_attr 97 | def __table_args__(self): 98 | return (Index(None, 'source_id', 'target_id', 'element', unique=True),) 99 | 100 | 101 | class Feed(Base): 102 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 103 | url_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 104 | title = Column(types.Text(), nullable=False) 105 | dead = Column(types.Boolean()) 106 | 107 | @declared_attr 108 | def __table_args__(self): 109 | return (Index(None, 'url_id', unique=True),) 110 | 111 | 112 | class Tagging(Base): 113 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 114 | url_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 115 | keyword_id = Column(ForeignKey('keyword.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 116 | 117 | @declared_attr 118 | def __table_args__(self): 119 | return (Index(None, 'url_id', 'keyword_id', unique=True),) 120 | 121 | 122 | class Summary(Base): 123 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 124 | url_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), 
nullable=False) 125 | markup = Column(types.Text(), nullable=False) 126 | 127 | @declared_attr 128 | def __table_args__(self): 129 | return ( 130 | Index(None, 'url_id', unique=True), 131 | {'info': {'dedup': 'max', 'onconflict': 'REPLACE'}}, 132 | ) 133 | 134 | 135 | class Webpage(Base): 136 | id = Column(types.Integer(), primary_key=True, autoincrement=True) 137 | url_id = Column(ForeignKey('url.id', ondelete=RESTRICT, onupdate=CASCADE), nullable=False) 138 | markup = Column(types.Text(), nullable=False) 139 | 140 | @declared_attr 141 | def __table_args__(self): 142 | return ( 143 | Index(None, 'url_id', unique=True), 144 | {'info': {'dedup': 'max', 'onconflict': 'REPLACE'}}, 145 | ) 146 | 147 | 148 | models = [URL, Keyword, Item, Hyperlink, Feed, Tagging, Summary, Webpage] 149 | tables = [m.__table__ for m in models] 150 | 151 | version = SCHEMA_VERSION 152 | init = ['PRAGMA foreign_keys = ON', 'PRAGMA journal_mode = WAL'] 153 | -------------------------------------------------------------------------------- /feedme/sql/stream.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | import logging 24 | import sqlite3 25 | from collections import deque 26 | from contextlib import suppress 27 | from pathlib import Path 28 | from threading import Lock 29 | from typing import Union 30 | 31 | from ..utils import append_stem, randstr, watch_for_timing 32 | from .factory import Database 33 | 34 | _PathLike = Union[str, Path] 35 | 36 | 37 | class DatabaseWriter: 38 | def __init__(self, path: _PathLike, database: Database, 39 | debug=False, cache_path=None, silent=False): 40 | self.log = logging.getLogger('db.writer') 41 | if silent: 42 | self.log.setLevel(logging.WARNING) 43 | 44 | main_db = Path(path) 45 | cache_db = Path(cache_path) if cache_path else append_stem(path, f'~tmp-{randstr(8)}') 46 | 47 | self.db = database 48 | self._queues = {t: deque() for t in database.tablemap} 49 | self._flush_lock = Lock() 50 | 51 | self._main = self._connect(main_db, 'main', debug) 52 | self._cache = self._connect(cache_db, 'temp', debug) 53 | self._paths = {self._main: main_db, self._cache: cache_db} 54 | 55 | self._corked = True 56 | self._closed = False 57 | self._rowcounts = {conn: {t: None for t in database.tablemap} 58 | for conn in (self._main, self._cache)} 59 | 60 | self._bind_tables() 61 | self.report() 62 | self.uncork() 63 | self.flush() 64 | 65 | def _connect(self, path: _PathLike, name=None, debug=False): 66 | conn = sqlite3.connect(path, isolation_level=None, timeout=30, 67 | check_same_thread=False) 68 | conn.row_factory = sqlite3.Row 69 | if debug: 70 | self._setup_debug(conn, name, debug) 71 | 72 | self.db.verify_version(conn) 73 | self.db.set_version(conn) 74 | self.db.create_all(conn) 75 | return conn 76 | 77 | @property 78 | def record_count(self): 79 | return sum(len(q) for q in self._queues.values()) 80 | 81 | def _lock_db(self, conn: sqlite3.Connection): 82 | self.log.debug(f'Locking database {self._paths[conn]}') 83 | if self.db.is_locked(conn): 84 | self.log.warning('Database lock table exists') 85 | self.log.warning('Previous crawler did not exit properly') 86 | self.db.mark_as_locked(conn) 87 | 88 | def _unlock_db(self, conn: sqlite3.Connection): 89 | self.log.debug(f'Unlocking database {self._paths[conn]}') 90 | self.db.mark_as_unlocked(conn) 91 | 92 | def _setup_debug(self, conn: sqlite3.Connection, name, debug_out): 93 | sql_log = logging.getLogger(f'db.sql.{name}') 94 | sql_log.setLevel(logging.DEBUG) 95 | conn.set_trace_callback(sql_log.debug) 96 | if not isinstance(debug_out, bool): 97 | sql_log.propagate = False 98 | path = append_stem(Path(debug_out), f'-{name}') 99 | file = open(path, 'w+') 100 | handler = logging.StreamHandler(file) 101 | sql_log.addHandler(handler) 102 | 103 | def _bind_tables(self): 104 | for table in self.db.tables: 105 | table.bind_foreign_key(self._cache) 106 | table.bind_offset(self._main) 107 | 108 | def _foreign_key_off(self, conn: sqlite3.Connection): 109 | conn.execute('PRAGMA foreign_keys = OFF') 110 | self.log.debug(f'Foreign key is OFF for {self._paths[conn]}') 111 | 112 | def _foreign_key_on(self, conn: sqlite3.Connection): 113 | conn.execute('PRAGMA foreign_keys = ON') 114 | self.log.debug(f'Foreign key is ON for {self._paths[conn]}') 115 | 116 | def _rebuild_index(self, conn: sqlite3.Connection): 117 | self.log.info('Rebuilding index') 118 | self.db.create_indices(conn) 119 | 120 | def _begin(self, conn: sqlite3.Connection): 121 | try: 122 | conn.execute('BEGIN') 123 | self.log.debug(f'Began new transaction on {self._paths[conn]}') 124 | except sqlite3.OperationalError: 125 | pass 126 | 127 | def 
_begin_exclusive(self, conn: sqlite3.Connection): 128 | while True: 129 | try: 130 | conn.execute('BEGIN EXCLUSIVE') 131 | self.log.debug('Began exclusive transaction' 132 | f' on {self._paths[conn]}') 133 | except sqlite3.OperationalError: 134 | self.log.warning('Cannot acquire exclusive write access') 135 | self.log.warning('Another program is writing to the database') 136 | self.log.warning('Retrying...') 137 | else: 138 | return 139 | 140 | def _apply_changes(self): 141 | queues = self._queues 142 | self._queues = {t: deque() for t in self.db.tablemap} 143 | cache = self._cache 144 | for name, table in self.db.tablemap.items(): 145 | q = queues[name] 146 | if not q: 147 | continue 148 | 149 | try: 150 | table.insert(cache, q) 151 | except sqlite3.IntegrityError: 152 | cache.rollback() 153 | for k, v in queues.items(): 154 | self._queues[k].appendleft(v) 155 | raise 156 | else: 157 | cache.commit() 158 | del queues[name] 159 | 160 | def _verify(self, conn: sqlite3.Connection): 161 | self._foreign_key_off(conn) 162 | for table in self.db.tables: 163 | table.drop_proxy(conn) 164 | table.restore_original(conn) 165 | conn.commit() 166 | self.reconcile(conn) 167 | self.deduplicate(conn) 168 | for table in self.db.tables: 169 | table.drop_temp_index(conn) 170 | self._rebuild_index(conn) 171 | self._foreign_key_on(conn) 172 | self._optimize(conn) 173 | 174 | def _optimize(self, conn: sqlite3.Connection): 175 | self.log.debug(f'Optimizing {self._paths[conn]}') 176 | conn.execute('PRAGMA optimize') 177 | 178 | def _merge_other(self, other=None, discard=False): 179 | main = self._main 180 | if not other: 181 | other_db = self._cache 182 | other = str(self._paths[other_db]) 183 | else: 184 | other = str(other) 185 | other_db = sqlite3.connect(other, isolation_level=None) 186 | max_rowids = self.db.get_max_rowids(main) 187 | self._foreign_key_off(main) 188 | self._begin_exclusive(main) 189 | self._lock_db(main) 190 | self.db.attach(main, other) 191 | self.log.debug(f'Attached {other} to {self._paths[main]}') 192 | 193 | try: 194 | self.log.debug('Matching existing records') 195 | with watch_for_timing('Matching'): 196 | for table in self.db.tables: 197 | self.log.debug(f'Matching {table}') 198 | table.match_primary_keys(main) 199 | table.match_foreign_keys(main) 200 | 201 | self.log.debug('Dropping indices') 202 | self.db.drop_indices(main) 203 | 204 | self.log.debug('Merging into main database') 205 | with watch_for_timing('Merging'): 206 | for table in self.db.tables: 207 | self.log.debug(f'Merging {table}') 208 | table.dedup_primary_keys(main) 209 | table.merge_attached(main) 210 | 211 | self.log.debug('Deduplicating records') 212 | with watch_for_timing('Deduplicating'): 213 | for table in self.db.tables: 214 | self.log.debug(f'Deduplicating {table}') 215 | table.dedup(main, max_rowids[table.name]) 216 | 217 | except sqlite3.IntegrityError: 218 | main.rollback() 219 | raise 220 | 221 | else: 222 | self.log.debug('Committing changes') 223 | main.commit() 224 | self.db.detach(main) 225 | self._foreign_key_on(main) 226 | self._optimize(main) 227 | self.log.debug('Finalizing merge') 228 | 229 | finally: 230 | if not discard: 231 | self.log.debug('Removing transcient data') 232 | with watch_for_timing('Restoring'): 233 | for table in self.db.tables: 234 | table.restore_original(other_db) 235 | self._rebuild_index(main) 236 | self._unlock_db(main) 237 | 238 | def uncork(self): 239 | if not self._corked: 240 | return 241 | conn = self._cache 242 | 243 | self._lock_db(conn) 244 | 
self.db.drop_indices(conn) 245 | self._foreign_key_off(conn) 246 | for table in self.db.tables: 247 | table.create_proxy(conn) 248 | self._corked = False 249 | 250 | def cork(self): 251 | if self._corked: 252 | return 253 | self.flush() 254 | conn = self._cache 255 | self._verify(conn) 256 | self._unlock_db(conn) 257 | self._corked = True 258 | 259 | def write(self, table, item): 260 | self._queues[table].append(item) 261 | 262 | def flush(self): 263 | with self._flush_lock: 264 | if self._corked: 265 | return 266 | 267 | count = self.record_count 268 | if count: 269 | self.log.info(f'Saving {count} records') 270 | with watch_for_timing('Flushing'): 271 | self._apply_changes() 272 | 273 | self._cache.commit() 274 | self._begin(self._cache) 275 | 276 | def deduplicate(self, conn=None): 277 | self.log.info('Deduplicating database records') 278 | conn = conn or self._cache 279 | conn.commit() 280 | self._begin_exclusive(conn) 281 | try: 282 | with watch_for_timing('Deduplicating'): 283 | for table in self.db.tables: 284 | table.fast_dedup(conn) 285 | except sqlite3.IntegrityError: 286 | conn.rollback() 287 | raise 288 | finally: 289 | conn.commit() 290 | 291 | def reconcile(self, conn=None): 292 | self.log.info('Enforcing internal references') 293 | conn = conn or self._cache 294 | conn.commit() 295 | self._begin_exclusive(conn) 296 | try: 297 | with watch_for_timing('Fixing foreign keys'): 298 | mismatches = conn.execute('PRAGMA foreign_key_check') 299 | for table, rowid, parent, fkid in mismatches: 300 | self.db.tablemap[table].update_fk(conn, fkid, rowid) 301 | except sqlite3.IntegrityError: 302 | conn.rollback() 303 | raise 304 | else: 305 | conn.commit() 306 | 307 | def merge(self): 308 | self.cork() 309 | self.log.info('Merging new data into main database') 310 | self._merge_other(discard=True) 311 | self.report() 312 | 313 | def close(self): 314 | self._main.close() 315 | self._cache.close() 316 | self._corked = True 317 | self._closed = True 318 | 319 | def interrupt(self): 320 | self._main.interrupt() 321 | self._cache.interrupt() 322 | 323 | def cleanup(self): 324 | cache = self._paths[self._cache] 325 | shm = cache.with_suffix('.db-shm') 326 | wal = cache.with_suffix('.db-wal') 327 | with suppress(FileNotFoundError): 328 | cache.unlink() 329 | shm.unlink() 330 | wal.unlink() 331 | 332 | def finish(self, merge=True): 333 | if not merge: 334 | self.cork() 335 | self.close() 336 | return 337 | self.merge() 338 | self.close() 339 | self.cleanup() 340 | 341 | def _tally(self, conn): 342 | count = self.db.count_rows(conn) 343 | diff = {t: v is not None and count[t] - v for t, v in self._rowcounts[conn].items()} 344 | msg = ['Database stats:'] 345 | for table in self.db.tablemap: 346 | if diff[table] is not False: 347 | msg.append(f' {table}: {count[table]} ({diff[table]:+})') 348 | else: 349 | msg.append(f' {table}: {count[table]}') 350 | self._rowcounts[conn].update(count) 351 | return msg 352 | 353 | def report(self): 354 | for line in self._tally(self._main): 355 | self.log.info(line) 356 | 357 | def __enter__(self): 358 | return self 359 | 360 | def __exit__(self, typ, val=None, tb=None): 361 | self.close() 362 | if not typ: 363 | return True 364 | if val is None: 365 | if tb is None: 366 | raise typ 367 | val = typ() 368 | if tb is not None: 369 | val = val.with_traceback(tb) 370 | raise val 371 | -------------------------------------------------------------------------------- /feedme/sql/utils.py: -------------------------------------------------------------------------------- 1 | 
# MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | def bulk_fetch(cur, size=100000, log=None): 25 | i = 0 26 | rows = cur.fetchmany(size) 27 | while rows: 28 | for row in rows: 29 | i += 1 30 | yield row 31 | if log: 32 | log.info(f'Fetched {i} rows.') 33 | rows = cur.fetchmany(size) 34 | 35 | 36 | def offset_fetch(conn, stmt, table, *, values=(), size=100000, log=None): 37 | i = 0 38 | offset = 0 39 | max_id = conn.execute(f'SELECT max(rowid) FROM {table}').fetchone()[0] 40 | if not max_id: 41 | return 42 | while offset <= max_id: 43 | limited = stmt % {'offset': ( 44 | f'{table}.rowid IN ' 45 | f'(SELECT rowid FROM {table} ' 46 | f'ORDER BY rowid LIMIT {size} OFFSET {offset})' 47 | )} 48 | rows = conn.execute(limited, values) 49 | for row in rows: 50 | i += 1 51 | yield row 52 | if log and i: 53 | log.info(f'Fetched {i} rows.') 54 | offset += size 55 | -------------------------------------------------------------------------------- /feedme/urlkit.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE.
22 | 23 | from typing import Tuple 24 | from urllib.parse import SplitResult, urlsplit 25 | 26 | from .datastructures import labeled_sequence 27 | 28 | 29 | def is_http(u): 30 | return isinstance(u, str) and urlsplit(u).scheme in {'http', 'https'} 31 | 32 | 33 | def is_absolute_http(u): 34 | if not isinstance(u, str): 35 | return False 36 | s = urlsplit(u) 37 | return s.scheme in {'http', 'https'} or s.scheme == '' and s.netloc 38 | 39 | 40 | def ensure_protocol(u, protocol='http'): 41 | s = urlsplit(u) 42 | return u if s.scheme else f'{protocol}:{u}' 43 | 44 | 45 | def domain_parents(domain: str) -> Tuple[str]: 46 | parts = domain.split('.') 47 | return tuple('.'.join(parts[-i:]) for i in range(len(parts), 1, -1)) 48 | 49 | 50 | def no_scheme(url: SplitResult) -> str: 51 | return url.geturl()[len(f'{url.scheme}:'):] 52 | 53 | 54 | def path_only(url: SplitResult) -> str: 55 | return url.geturl()[len(f'{url.scheme}://{url.netloc}'):] 56 | 57 | 58 | def select_templates(query, template_tree): 59 | matches = {r: r.match(query) for r in template_tree} 60 | matches = [(r, m) for r, m in matches.items() if m] 61 | if not matches: 62 | raise ValueError('No template provider') 63 | pattern, match = matches[0] 64 | templates = template_tree[pattern] 65 | if not callable(templates): 66 | templates = [t[0] for t in templates.items()] 67 | return match, templates 68 | 69 | 70 | def build_urls(base, match, templates): 71 | parsed = urlsplit(base) 72 | if callable(templates): 73 | return templates(parsed, match) 74 | specifiers = { 75 | **parsed._asdict(), 76 | 'network_path': no_scheme(parsed), 77 | 'path_query': path_only(parsed), 78 | 'original': parsed.geturl(), 79 | **match.groupdict(), 80 | **labeled_sequence(match.groups(), start=1, as_str=True), 81 | } 82 | return [t % specifiers for t in templates] 83 | -------------------------------------------------------------------------------- /feedme/utils.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2020 Tony Wu 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | 23 | from __future__ import annotations 24 | 25 | import logging 26 | import pickle 27 | import random 28 | import string 29 | import time 30 | from contextlib import contextmanager 31 | from datetime import datetime, timezone 32 | from hashlib import sha1 33 | from logging.handlers import QueueListener 34 | from multiprocessing import Queue 35 | from operator import gt 36 | from typing import Any, Dict, List, Set, TypeVar, Union 37 | from urllib.parse import urlsplit 38 | 39 | import simplejson as json 40 | from scrapy.http import Request, TextResponse 41 | 42 | from .datastructures import KeywordCollection, KeywordStore 43 | from .urlkit import domain_parents, ensure_protocol, is_absolute_http 44 | 45 | try: 46 | from termcolor import colored 47 | except ImportError: 48 | def colored(t, *args, **kwargs): 49 | return t 50 | 51 | JSONType = Union[str, bool, int, float, None, List['JSONType'], Dict[str, 'JSONType']] 52 | JSONDict = Dict[str, JSONType] 53 | SpiderOutput = List[Union[JSONDict, Request]] 54 | 55 | log = logging.getLogger('main.utils') 56 | 57 | 58 | class RobustQueueListener(QueueListener): 59 | def _monitor(self): 60 | try: 61 | super()._monitor() 62 | except EOFError: 63 | log.warning('Log listener has prematurely stopped.') 64 | 65 | 66 | class QueueListenerWrapper: 67 | def __init__(self): 68 | self.queue = None 69 | self.listener = None 70 | 71 | def enable(self): 72 | if self.queue: 73 | return self.queue 74 | self.queue = Queue() 75 | self.listener = RobustQueueListener(self.queue, *logging.getLogger().handlers, respect_handler_level=True) 76 | self.listener.start() 77 | return self.queue 78 | 79 | def disable(self): 80 | if not self.queue: 81 | return 82 | self.listener.stop() 83 | self.queue = None 84 | self.listener = None 85 | 86 | def start(self): 87 | if not self.listener: 88 | return 89 | if not self.listener._thread: 90 | self.listener.start() 91 | return self.queue 92 | 93 | def stop(self): 94 | if not self.listener: 95 | return 96 | if self.listener._thread: 97 | self.listener.stop() 98 | return self.queue 99 | 100 | 101 | LOG_LISTENER = QueueListenerWrapper() 102 | 103 | 104 | def append_stem(path, appendage): 105 | return path.with_name(f'{path.stem}{appendage}').with_suffix(path.suffix) 106 | 107 | 108 | def fmttimedelta(td): 109 | s = int(td.total_seconds()) 110 | h, r = divmod(s, 3600) 111 | m, s = divmod(r, 60) 112 | return f'{h}h {m}m {s}s' 113 | 114 | 115 | def randstr(length, choices='0123456789abcdef'): 116 | return ''.join(random.choices(choices, k=length)) 117 | 118 | 119 | def parse_html(domstring, url='about:blank') -> TextResponse: 120 | return TextResponse(url=url, body=domstring, encoding='utf8') 121 | 122 | 123 | def json_converters(value: Any) -> JSONType: 124 | if isinstance(value, datetime): 125 | return value.isoformat() 126 | raise TypeError(type(value)) 127 | 128 | 129 | def load_jsonlines(file) -> List[JSONDict]: 130 | return [json.loads(line) for line in file.read().split('\n') if line] 131 | 132 | 133 | def datetime_converters(dt: Union[str, int, float, datetime], tz=timezone.utc) -> datetime: 134 | if isinstance(dt, datetime): 135 | return dt 136 | if isinstance(dt, str): 137 | return datetime.fromisoformat(dt) 138 | if isinstance(dt, (int, float)): 139 | try: 140 | return datetime.fromtimestamp(dt, tz=tz) 141 | except (ValueError, OSError): 142 | return datetime.fromtimestamp(dt / 1000, tz=tz) 143 | raise TypeError('dt must be of type str, int, float, or datetime') 144 | 145 | 146 | def sha1sum(s: Union[str, bytes]) -> str: 
147 | if isinstance(s, str): 148 | s = s.encode() 149 | return sha1(s).hexdigest() 150 | 151 | 152 | def ensure_collection(supplier): 153 | def converter(obj): 154 | if obj is None: 155 | return supplier() 156 | return supplier(obj) 157 | return converter 158 | 159 | 160 | def is_rss_xml(response: TextResponse): 161 | ctype = response.headers.get('Content-Type') 162 | if not ctype: 163 | return True 164 | ctype = ctype.decode('utf8').split(';')[0] 165 | return ctype in {'text/xml', 'application/xml', 'application/rss+xml', 166 | 'application/rdf+xml', 'application/atom+xml'} 167 | 168 | 169 | def falsy(v): 170 | return v in {0, None, False, '0', 'None', 'none', 'False', 'false', 'null', 'undefined', 'NaN'} 171 | 172 | 173 | def wait(t): 174 | t0 = time.perf_counter() 175 | while time.perf_counter() - t0 < t: 176 | time.sleep(0.1) 177 | 178 | 179 | @contextmanager 180 | def watch_for_timing(name, limit=0): 181 | start = time.perf_counter() 182 | try: 183 | yield 184 | finally: 185 | duration = time.perf_counter() - start 186 | message = None 187 | level = None 188 | if limit and duration > limit: 189 | message = colored(f'[Performance violation] {name} took {duration * 1000:.0f}ms; ' 190 | f'desired time is {limit * 1000:.0f}ms.', color='yellow') 191 | level = logging.INFO 192 | elif not limit: 193 | message = f'{name} took {duration * 1000:.0f}ms' 194 | level = logging.DEBUG 195 | if message: 196 | logging.getLogger('profiler.timing').log(level, message) 197 | 198 | 199 | @contextmanager 200 | def watch_for_len(name, col, limit=0, comp=gt, desc='fewer than'): 201 | try: 202 | yield 203 | finally: 204 | size = len(col) 205 | message = None 206 | level = None 207 | if limit and comp(size, limit): 208 | message = colored(f'[Length violation] Number of {name} is {size}; ' 209 | f'desired is {desc} {limit:.0f}.', color='yellow') 210 | level = logging.INFO 211 | elif not limit: 212 | message = f'Number of {name} is {size}.' 
213 | level = logging.DEBUG 214 | if message: 215 | logging.getLogger('profiler.containerlen').log(level, message) 216 | 217 | 218 | def guard_json(text: str) -> JSONDict: 219 | try: 220 | return json.loads(text) 221 | except json.JSONDecodeError as e: 222 | log.error(e) 223 | return {} 224 | 225 | 226 | def read_jsonlines(f, *, delimiter='\0\n', on_error='raise', paginate=100000, on_paginate=None): 227 | i = 0 228 | k = 0 229 | p = paginate - 1 230 | 231 | next_line = f.readline() 232 | while next_line: 233 | i += 1 234 | 235 | if next_line == delimiter: 236 | k += 1 237 | next_line = f.readline() 238 | if paginate and k == p: 239 | p += paginate 240 | yield i, k, on_paginate 241 | continue 242 | 243 | try: 244 | yield i, k, json.loads(next_line.rstrip()) 245 | 246 | except json.JSONDecodeError: 247 | if on_error == 'raise': 248 | raise 249 | if on_error == 'continue': 250 | continue 251 | return 252 | 253 | next_line = f.readline() 254 | 255 | 256 | PATH_UNSAFE = ''.join(set(string.punctuation + ' ') - set('-_/.')) 257 | 258 | 259 | def aggressive_replace_chars(s, encoding='latin_1'): 260 | return s.encode(encoding, 'replace').decode(encoding, 'ignore') 261 | 262 | 263 | def replace_unsafe_chars(s, repl='-', chars=PATH_UNSAFE): 264 | for c in chars: 265 | if c in s: 266 | s = s.replace(c, repl) 267 | return s 268 | 269 | 270 | def pathsafe(s): 271 | return replace_unsafe_chars(aggressive_replace_chars(s)) 272 | 273 | 274 | SIMPLEJSON_KWARGS = { 275 | 'ensure_ascii': True, 276 | 'default': json_converters, 277 | 'for_json': True, 278 | 'iterable_as_array': True, 279 | } 280 | 281 | 282 | class RenamingUnpickler(pickle.Unpickler): 283 | def find_class(self, module, name): 284 | if module[:7] == 'feedly.': 285 | module = 'feedme.' + module[7:] 286 | if name == 'ProbeRequest': 287 | name = 'ProbeFeed' 288 | return super().find_class(module, name) 289 | 290 | 291 | class HyperlinkStore(KeywordStore): 292 | TARGET_ATTRS = {'src', 'href', 'data-src', 'data-href'} 293 | 294 | def __init__(self, serialized: JSONDict = None): 295 | super().__init__() 296 | self._index: Dict[int, str] 297 | if serialized: 298 | self._deserialize(serialized) 299 | 300 | def _deserialize(self, dict_: JSONDict): 301 | for k, v in dict_.items(): 302 | hash_ = hash(k) 303 | self._index[hash_] = k 304 | self._taggings[hash_] = {c: set(ls) for c, ls in v.items()} 305 | 306 | def parse_html(self, source, markup, **kwargs): 307 | markup = parse_html(markup) 308 | for attrib in self.TARGET_ATTRS: 309 | elements = markup.css(f'[{attrib}]') 310 | for tag in elements: 311 | url = tag.attrib.get(attrib) 312 | if not is_absolute_http(url): 313 | continue 314 | url = ensure_protocol(url) 315 | 316 | keywords: KeywordCollection = { 317 | 'source': {source}, 318 | 'domain': set(domain_parents(urlsplit(url).netloc)), 319 | 'tag': set(), 320 | } 321 | keywords['tag'].add(tag.xpath('name()').get()) 322 | self.put(url, **keywords, **kwargs) 323 | 324 | 325 | T = TypeVar('T') 326 | 327 | 328 | def findpath(start: T, dest: T, segments: Dict[T, Set[T]], path: List[T]) -> bool: 329 | path.append(start) 330 | 331 | if start not in segments: 332 | path.pop() 333 | return False 334 | 335 | next_routes = segments.get(start, set()) - set(path) 336 | if dest in next_routes: 337 | path.append(dest) 338 | return True 339 | 340 | for r in next_routes: 341 | found = findpath(r, dest, segments, path) 342 | if found: 343 | return found 344 | 345 | path.pop() 346 | return False 347 | 
-------------------------------------------------------------------------------- /presets/_autoload.py: -------------------------------------------------------------------------------- 1 | # This file is used to autoload presets in this folder when a feed URL from 2 | # certain websites is provided but the PRESET option is not set. 3 | # 4 | # Deleting this file disables this feature, and deleting/renaming predefined 5 | # presets in this folder causes auto-load for that website to be disabled. 6 | 7 | _SITES = { 8 | r'.*\.livejournal\.com/?.*': 'livejournal', 9 | r'.*\.tumblr\.com/?.*': 'tumblr', 10 | r'.*\.wordpress\.com/?.*': 'wordpress', 11 | } 12 | -------------------------------------------------------------------------------- /presets/keywords.py: -------------------------------------------------------------------------------- 1 | PRIORITIZED_KEYWORDS = { 2 | 10: ['cats', 'kitties'], 3 | 5: ['dogs', 'puppies'], 4 | -5: ['goldfish'], 5 | -float('inf'): ['rat'], 6 | } 7 | -------------------------------------------------------------------------------- /presets/livejournal.py: -------------------------------------------------------------------------------- 1 | DEPTH_LIMIT = 2 2 | 3 | FOLLOW_DOMAINS = {'livejournal.com'} 4 | 5 | RSS_TEMPLATES = { 6 | r'.*\.livejournal\.com/?.*': { 7 | 'http://%(netloc)s/data/rss': 100, 8 | 'https://%(netloc)s/data/rss': 200, 9 | 'http://%(netloc)s/data/atom': 300, 10 | 'https://%(netloc)s/data/atom': 400, 11 | }, 12 | } 13 | -------------------------------------------------------------------------------- /presets/tumblr.py: -------------------------------------------------------------------------------- 1 | DEPTH_LIMIT = 2 2 | 3 | FOLLOW_DOMAINS = {'tumblr.com'} 4 | SELECT_FEED_STATE = 'dead+' 5 | 6 | 7 | def converter(base, match): 8 | for scheme in ('http', 'https'): 9 | for ending in ('rss', 'rss#_=_'): 10 | yield f'{scheme}://{base.netloc}/{ending}' 11 | 12 | 13 | def deactivated_converter(base, match): 14 | for scheme in ('http', 'https'): 15 | for ending in ('rss', 'rss#_=_'): 16 | yield f'{scheme}://{match.group(1)}.tumblr.com/{ending}' 17 | yield from converter(base, match) 18 | 19 | 20 | RSS_TEMPLATES = { 21 | r'https?://(.*)-deactivated\d*\.tumblr\.com/?.*': deactivated_converter, 22 | r'.*\.tumblr\.com/?.*': converter, 23 | } 24 | 25 | TUMBLR_IGNORE = { 26 | 'www.tumblr.com', 'staff.tumblr.com', 'tumblr.com', 27 | 'engineering.tumblr.com', 'support.tumblr.com', 28 | 'assets.tumblr.com', 29 | } 30 | 31 | CONTRIB_SPIDER_MIDDLEWARES = { 32 | 'feedme.contrib.filters.KeywordPrioritizer': 500, 33 | 'feedme.contrib.tumblr.TumblrFilter': 505, 34 | } 35 | -------------------------------------------------------------------------------- /presets/wordpress.py: -------------------------------------------------------------------------------- 1 | DEPTH_LIMIT = 2 2 | 3 | FOLLOW_DOMAINS = {'wordpress.com'} 4 | 5 | 6 | def template(base, match): 7 | for scheme in ('http', 'https'): 8 | for ending in ('?feed=rss', '?feed=rss2', '?feed=rdf', '?feed=atom', 9 | 'feed/', 'feed/rss/', 'feed/rss2/', 'feed/rdf/', 'feed/atom/'): 10 | yield f'{scheme}://{base.netloc}/{ending}' 11 | 12 | 13 | RSS_TEMPLATES = { 14 | r'.*\.wordpress\.com/?.*': template, 15 | } 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "feedme" 3 | version = "0.10.11" 4 | description = "" 5 | authors = ["tonyzbf "] 6 | license = "MIT" 7 | 8 
| [tool.poetry.dependencies] 9 | python = "^3.9" 10 | click = "^8.0.3" 11 | Scrapy = "^2.5.1" 12 | cryptography = "^36.0.1" 13 | cffi = "^1.15.0" 14 | pycparser = "^2.21" 15 | six = "^1.16.0" 16 | cssselect = "^1.1.0" 17 | itemadapter = "^0.4.0" 18 | itemloaders = "^1.0.4" 19 | jmespath = "^0.10.0" 20 | parsel = "^1.6.0" 21 | lxml = "^4.7.1" 22 | w3lib = "^1.22.0" 23 | Protego = "^0.1.16" 24 | PyDispatcher = "^2.0.5" 25 | pyOpenSSL = "^21.0.0" 26 | queuelib = "^1.6.2" 27 | service-identity = "^21.1.0" 28 | attrs = "^21.4.0" 29 | pyasn1 = "^0.4.8" 30 | pyasn1-modules = "^0.2.8" 31 | Twisted = "^21.7.0" 32 | Automat = "^20.2.0" 33 | constantly = "^15.1.0" 34 | hyperlink = "^21.0.0" 35 | idna = "^3.3" 36 | incremental = "^21.3.0" 37 | PyHamcrest = "^2.0.3" 38 | "zope.interface" = "^5.4.0" 39 | setuptools = "^60.3.0" 40 | simplejson = "^3.17.6" 41 | termcolor = "^1.1.0" 42 | aiofiles = "^0.8.0" 43 | aiohttp = "^3.8.1" 44 | more-itertools = "^8.12.0" 45 | SQLAlchemy = "^1.4.29" 46 | python-igraph = "^0.9.8" 47 | texttable = "^1.6.4" 48 | 49 | [tool.poetry.dev-dependencies] 50 | 51 | [build-system] 52 | requires = ["poetry-core>=1.0.0"] 53 | build-backend = "poetry.core.masonry.api" 54 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = feedme.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = feedme 12 | --------------------------------------------------------------------------------