634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published
637 | by the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
646 | along with this program. If not, see <https://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pwc-feeds
2 | **pwc-feeds** uses Python 3.9 to serve RSS feeds for Papers With Code.
3 | As a disclaimer, it has no affiliation with Papers With Code.
4 |
5 | ## Links
6 | * [Project repo](https://github.com/ml-feeds/pwc-feeds)
7 | * Original HTML listing: [Trending](https://paperswithcode.com/) | [Latest](https://paperswithcode.com/latest) | [Greatest](https://paperswithcode.com/greatest)
8 | * **Unofficial RSS feeds: [Trending](https://us-east1-ml-feeds.cloudfunctions.net/pwc/trending) | [Latest](https://us-east1-ml-feeds.cloudfunctions.net/pwc/latest) | [Greatest](https://us-east1-ml-feeds.cloudfunctions.net/pwc/greatest)**
9 |
10 | ## Deployment
11 | Serverless deployment to [Google Cloud Functions](https://console.cloud.google.com/functions/) is configured.
12 | It requires the following files:
13 | * requirements.txt
14 | * main.py (having callable `serve(request: flask.Request) -> Tuple[Union[bytes, str], int, Dict[str, str]]`)
15 |
16 | Deployment version updates are not automated.
17 | They can be performed manually by editing and saving the function configuration.
18 |
19 | These deployment links require access:
20 | * [Dashboard](https://console.cloud.google.com/functions/details/us-east1/pwc?project=ml-feeds)
21 | * [Logs](https://console.cloud.google.com/logs?service=cloudfunctions.googleapis.com&key1=pwc&key2=us-east1&project=ml-feeds)
22 | * [Repo](https://source.cloud.google.com/ml-feeds/github_ml-feeds_pwc-feeds)
23 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Dict, Tuple, Union
3 |
4 | import flask
5 |
6 | from pwc import config
7 | from pwc.feed import Feed
8 |
log = logging.getLogger(__name__)

# One Feed per supported type, built once at import so that each feed's instance-level cache is shared by every
# request handled by this process.
FEEDS = {feed_type: Feed(feed_type) for feed_type in config.FEED_TYPES}
VALID_PATHS = {f'/{feed_type}' for feed_type in config.FEED_TYPES}
# Sorted so that the error message lists paths in a stable order; iteration order of a set of strings can differ
# from one process to the next.
VALID_PATHS_STR = ', '.join(sorted(VALID_PATHS))
14 |
15 |
def serve(request: flask.Request) -> Tuple[Union[bytes, str], int, Dict[str, str]]:
    """Serve the RSS feed selected by the request path.

    :param request: Incoming request; its path must be one of ``VALID_PATHS``.
    :return: ``(body, status, headers)``. The body is the feed XML with status 200, or a plain-text error
        message with status 400 when the path is not one of the valid paths.
    """
    path = request.path
    header = request.headers.get
    client_ip = header('X-Appengine-User-Ip')
    log.info('Received request for "%s" from %s from %s, %s, %s.', path, client_ip,
             header('X-Appengine-City'), header('X-Appengine-Region'), header('X-Appengine-Country'))

    if path in VALID_PATHS:
        feed_type = path[1:]  # Drop the leading "/" to obtain the FEEDS key.
        return FEEDS[feed_type].feed(), 200, {'Content-Type': 'text/xml; charset=utf-8'}

    shown_path = path if path is not None else ''
    msg = f'The requested path "{shown_path}" is invalid. Valid paths are: {VALID_PATHS_STR}'
    log.error('Error handling request from %s: %s', client_ip, msg)
    return f'ERROR: {msg}', 400, {'Content-Type': 'text/plain; charset=utf-8'}
30 |
--------------------------------------------------------------------------------
/pwc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ml-feeds/pwc-feeds/25b40c6f5d021ec8fdb3877eb50e207d22e29151/pwc/__init__.py
--------------------------------------------------------------------------------
/pwc/config.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging.config
3 | import os
4 | from pathlib import Path
5 |
6 |
def configure_logging() -> None:
    """Apply the module-level ``LOGGING`` dict config and emit a debug message confirming it."""
    logging.config.dictConfig(LOGGING)
    logging.getLogger(__name__).debug('Logging is configured.')
11 |
12 |
# Time-to-live, in seconds, passed to the per-feed TTL cache.
CACHE_TTL = datetime.timedelta(minutes=20).total_seconds()
FEED_DESCRIPTION = 'As a disclaimer, this is an unofficial feed and has no affiliation with Papers with Code.'
FEED_TITLE_TEMPLATE = 'Papers with Code: {feed_type} (unofficial)'
FEED_TYPES = {'trending', 'latest', 'greatest'}
# Hext extraction template applied to each fetched HTML page.
# NOTE(review): the template body is blank here; presumably its markup was lost. Confirm against the repository.
HTML_HEXT = """




"""
HTML_URL_BASE = 'https://paperswithcode.com/'
# Number of listing pages fetched per feed type; keys are expected to match FEED_TYPES.
NUM_PAGES_READ = {'trending': 3, 'latest': 4, 'greatest': 2}
# GCLOUD_PROJECT is not set automatically by Cloud Functions runtimes newer than Python 3.7, whereas K_SERVICE is,
# so either variable is accepted as the marker of a serverless environment.
ON_SERVERLESS = bool(os.getenv('GCLOUD_PROJECT') or os.getenv('K_SERVICE'))
PACKAGE_NAME = Path(__file__).parent.stem  # Name of the directory containing this module.
REPO_URL = 'https://github.com/ml-feeds/pwc-feeds'
REQUEST_TIMEOUT = 5  # A higher value risks excessive serverless billing.
USER_AGENT = 'Mozilla/5.0'

LOGGING = {  # Ref: https://docs.python.org/3/howto/logging.html#configuring-logging
    'version': 1,
    'formatters': {
        # Prefixes each record with the milliseconds elapsed since the logging module was loaded.
        'detailed': {
            'format': '[%(relativeCreated)i] %(name)s:%(lineno)d:%(funcName)s:%(levelname)s: %(message)s',
        },
        # Prefixes each record with the thread id in hex instead of a relative timestamp.
        'serverless': {
            'format': '%(thread)x:%(name)s:%(lineno)d:%(funcName)s:%(levelname)s: %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'DEBUG',
            'formatter': 'serverless' if ON_SERVERLESS else 'detailed',
            'stream': 'ext://sys.stdout',
        },
    },
    'loggers': {
        # This package's own logger; it does not propagate, so its records are not also emitted via the root logger.
        PACKAGE_NAME: {
            'level': 'INFO' if ON_SERVERLESS else 'DEBUG',
            'handlers': ['console'],
            'propagate': False,
        },
        # Root logger, used by everything outside this package.
        '': {
            'level': 'DEBUG',
            'handlers': ['console'],
        },
    },
}
61 |
--------------------------------------------------------------------------------
/pwc/feed.py:
--------------------------------------------------------------------------------
1 | from functools import lru_cache
2 | import json
3 | import logging
4 | import socket
5 | from typing import cast, Tuple
6 | from urllib.error import URLError
7 | from urllib.request import Request, urlopen
8 |
9 | from cachetools.func import ttl_cache
10 | from feedgen.feed import FeedGenerator
11 | from hext import Html, Rule
12 | from humanize import naturalsize
13 | from lxml.etree import CDATA
14 | from more_itertools import unique_everseen
15 |
16 | from pwc import config
17 |
18 | config.configure_logging()
19 |
20 | log = logging.getLogger(__name__)
21 |
def _fetch_url(request: Request) -> bytes:
    """Return the response body for *request*.

    The request is made with a timeout of ``config.REQUEST_TIMEOUT`` seconds.

    :param request: Prepared request to open.
    :return: The raw response body.
    :raises URLError: If the request fails; a warning is logged before re-raising.
    :raises socket.timeout: If the request times out; a warning is logged before re-raising.
    """
    try:
        # Open the response as a context manager so that it is closed even when read() raises.
        with urlopen(request, timeout=config.REQUEST_TIMEOUT) as response:
            return cast(bytes, response.read())
    except (URLError, socket.timeout) as e:
        log.warning("Timeout or error fetching %s: %s", request.full_url, e)
        raise
28 |
29 |
class Feed:
    """Build the unofficial RSS feed for one Papers with Code listing.

    An instance fetches the configured number of HTML listing pages for its feed type, extracts items from them
    with the hext rule built from ``config.HTML_HEXT``, and renders the items as RSS. Results of ``feed()`` are
    cached per instance for ``config.CACHE_TTL`` seconds.
    """

    # URL prefixes for which the first path segment of a code link is used as the code author.
    _CODE_HOSTS = ('https://github.com/', 'https://gitlab.com/')

    def __init__(self, feed_type: str):
        """Prepare the page requests and the per-instance caches.

        :param feed_type: A key of ``config.NUM_PAGES_READ``, e.g. ``"trending"``.
        """
        # The "trending" listing is read from the base URL itself, so it takes no path suffix.
        html_url_suffix = '' if feed_type == 'trending' else feed_type
        num_pages = config.NUM_PAGES_READ[feed_type]
        self._html_requests = [
            Request(f'{config.HTML_URL_BASE}{html_url_suffix}?page={page_num}',
                    headers={'User-Agent': config.USER_AGENT})
            for page_num in range(1, num_pages + 1)
        ]

        self._feed_type = feed_type
        self._feed_type_desc = f'for "{self._feed_type}"'
        self._feed_title = config.FEED_TITLE_TEMPLATE.format(feed_type=self._feed_type.title())

        self._hext_rule_extract = Rule(config.HTML_HEXT).extract
        self._is_debug_logged = log.isEnabledFor(logging.DEBUG)

        # Wrapping the bound methods here (rather than decorating them in the class body) gives every instance
        # its own cache.
        self._output = lru_cache(maxsize=1)(self._output)  # type: ignore  # Instance level cache
        self.feed = ttl_cache(maxsize=1, ttl=config.CACHE_TTL)(self.feed)  # type: ignore  # Instance level cache

    def _init_feed(self) -> FeedGenerator:
        """Return a new feed generator carrying this feed's title, link and description."""
        feed = FeedGenerator()
        feed.title(self._feed_title)
        feed.link(href=config.REPO_URL, rel='self')
        feed.description(config.FEED_DESCRIPTION)
        return feed

    def _normalize_item(self, item: dict) -> None:
        """Prefix the title with the code author, when known, and coerce ``categories`` to a list, in place."""
        code_link = item['code_link']
        for website in self._CODE_HOSTS:
            if code_link.startswith(website):
                item['code_author'] = code_link.removeprefix(website).split('/')[0]
                item['title'] = '/' + item['code_author'] + '/ ' + item['title']
                break
        # "categories" may be absent, a single string, or already a list.
        categories = item.get('categories', [])
        item['categories'] = [categories] if isinstance(categories, str) else categories

    @staticmethod
    def _add_entry(feed: FeedGenerator, item: dict) -> None:
        """Append one RSS entry built from an already normalized *item* to *feed*."""
        code_link = item['code_link']
        entry = feed.add_entry(order='append')
        entry.title(item['title'])
        entry.link(href=item['link'])
        entry.guid(item['link'], permalink=True)
        # NOTE(review): the markup of this string is a reconstruction of a literal that was unterminated;
        # confirm the exact HTML against the repository.
        description = f'{item["description"]} <p>Code: <a href="{code_link}">{code_link}</a></p>'
        entry.description(CDATA(description))
        entry.comments(code_link)
        for category in item['categories']:
            if category.startswith('+') and category[1:].isdigit():  # Ex: +1, +2
                continue
            # Soften all-uppercase categories; any other casing is left untouched.
            category = category.capitalize() if category.isupper() else category
            entry.category(term=category)

    def _output(self, texts: Tuple[bytes, ...]) -> bytes:
        """Convert the fetched HTML pages into RSS XML.

        :param texts: Raw bodies of the listing pages, in page order.
        :return: The RSS document as bytes.
        """
        feed_type_desc = self._feed_type_desc
        items = [item for text in texts for item in self._hext_rule_extract(Html(text.decode()))]
        # Drop duplicate items, compared by their JSON serialization, keeping the first occurrence of each.
        items = list(unique_everseen(items, json.dumps))
        log.info('HTML inputs %s have %s items in all.', feed_type_desc, len(items))

        feed = self._init_feed()
        is_debug_logged = self._is_debug_logged
        for item in items:
            self._normalize_item(item)
            self._add_entry(feed, item)
            if is_debug_logged:
                log.debug('Added: %s', item['title'])

        text_: bytes = feed.rss_str(pretty=True)
        log.info('XML output %s has %s items.', feed_type_desc, text_.count(b'<item>'))
        return text_

    def feed(self) -> bytes:
        """Fetch every listing page and return the resulting RSS XML.

        :return: The RSS document as bytes.
        :raises URLError: Propagated from fetching a page.
        :raises socket.timeout: Propagated from fetching a page.
        """
        feed_type_desc = self._feed_type_desc
        log.debug('Reading %s HTML pages %s.', len(self._html_requests), feed_type_desc)
        texts = tuple(_fetch_url(req) for req in self._html_requests)
        log.info('HTML inputs %s have sizes: %s', feed_type_desc, ', '.join(humanize_len(text) for text in texts))
        text = self._output(texts)
        log.info('XML output %s has size %s.', feed_type_desc, humanize_len(text))
        return text
102 |
103 |
def humanize_len(text: bytes) -> str:
    """Return the length of *text* in bytes as a short human-readable size (GNU style, no decimals)."""
    num_bytes = len(text)
    return naturalsize(num_bytes, gnu=True, format='%.0f')
106 |
--------------------------------------------------------------------------------
/requirements-dev.in:
--------------------------------------------------------------------------------
1 | mypy
2 | pip
3 | pip-tools
4 |
--------------------------------------------------------------------------------
/requirements.in:
--------------------------------------------------------------------------------
1 | cachetools
2 | feedgen
3 | flask
4 | hext
5 | humanize
6 | more-itertools
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with python 3.9
3 | # To update, run:
4 | #
5 | # pip-compile
6 | #
7 | cachetools==5.0.0
8 | # via -r requirements.in
9 | click==8.0.4
10 | # via flask
11 | feedgen==0.9.0
12 | # via -r requirements.in
13 | flask==2.0.3
14 | # via -r requirements.in
15 | hext==1.0.2
16 | # via -r requirements.in
17 | humanize==4.0.0
18 | # via -r requirements.in
19 | itsdangerous==2.1.0
20 | # via flask
21 | jinja2==3.0.3
22 | # via flask
23 | lxml==4.8.0
24 | # via feedgen
25 | markupsafe==2.1.0
26 | # via jinja2
27 | more-itertools==8.12.0
28 | # via -r requirements.in
29 | python-dateutil==2.8.2
30 | # via feedgen
31 | six==1.16.0
32 | # via python-dateutil
33 | werkzeug==2.0.3
34 | # via flask
35 |
--------------------------------------------------------------------------------
/scripts/feed.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from pwc import config
4 | from pwc.feed import Feed
5 |
6 | log = logging.getLogger(__name__)
7 |
if __name__ == '__main__':
    # Render every feed twice and require both renderings to be identical before printing each one.
    feeds = [Feed(feed_type) for feed_type in config.FEED_TYPES]
    first_pass = [feed.feed().decode() for feed in feeds]
    second_pass = [feed.feed().decode() for feed in feeds]
    for first, second in zip(first_pass, second_pass):
        assert first == second
        print(first)
15 |
--------------------------------------------------------------------------------
/scripts/install_requirements.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Upgrade the dev tooling, re-pin requirements.txt with upgrades, and install the pinned requirements.
set -euxo pipefail

# Work from the parent of this script's directory, regardless of the caller's working directory.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
cd "${DIR}/.."

# Dev tools first, since pip-compile comes from pip-tools.
pip install -U -r ./requirements-dev.in
pip-compile -U
pip install -U -r ./requirements.txt
9 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [mypy]
2 | check_untyped_defs = True
3 | disallow_incomplete_defs = True
4 | disallow_untyped_calls = True
5 | ignore_missing_imports = True
6 | incremental = False
7 | warn_unused_ignores = True
8 |
--------------------------------------------------------------------------------