├── .gitignore
├── LICENSE
├── README.md
├── TODO.md
├── feedbuffer
│   ├── __init__.py
│   ├── core.py
│   ├── database.py
│   ├── log.py
│   ├── server.py
│   └── settings.py
└── main.py

/.gitignore:
--------------------------------------------------------------------------------
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Created by .ignore support plugin (hsz.mobi)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Chris Braun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Feedbuffer
Feedbuffer buffers RSS and Atom syndication feeds: it caches new feed entries until the news aggregator requests them, and then generates the syndication feed with all cached entries.

This will be mostly useful to people who read feeds with a very high throughput, only use their news aggregator very rarely, or simply want to make very sure that they aren't missing any entries.


## Usage
Install the requirements: `pip install peewee feedparser requests cachecontrol lxml beautifulsoup4 cherrypy` and run `main.py`. By default an HTTP server will respond on http://0.0.0.0:8083/ (check `settings.py` for more configuration options). Instead of requesting the target feed directly in your news aggregator, prefix the URL like this: `http://0.0.0.0:8083/?url=` where `url` is a URL-quoted version of the original feed URL:

```
>>> from urllib.parse import quote_plus
>>> quote_plus('https://www.reddit.com/.rss')
'https%3A%2F%2Fwww.reddit.com%2F.rss'
>>>
```

Additionally, `&update_interval=` can be used to adjust the interval at which Feedbuffer will check for updates to the feed (the default is 180 seconds), as sketched below.
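
Putting the two together, the full buffered-feed URL can be assembled like this (a minimal sketch; the interval of 600 seconds is only an example value):

```
>>> from urllib.parse import quote_plus
>>> feed_url = 'https://www.reddit.com/.rss'
>>> 'http://0.0.0.0:8083/?url={}&update_interval=600'.format(quote_plus(feed_url))
'http://0.0.0.0:8083/?url=https%3A%2F%2Fwww.reddit.com%2F.rss&update_interval=600'
>>>
```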

## Details
Feedbuffer will attempt to fix invalid feed entries that are missing a unique identifier field by generating the SHA-1 sum of the entry's content and inserting it. The log file will be generated as `feedbuffer.log` in the current working directory.

--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
- Possibly save fetched feed item ids to reduce outgoing traffic by not sending the same feed item twice
- Remove the feedparser dependency and figure out all ways to get access to the feed item id

--------------------------------------------------------------------------------
/feedbuffer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cryzed/Feedbuffer/0f258bbc7fbcdb39e19fd7d26192a45098b1fc68/feedbuffer/__init__.py

--------------------------------------------------------------------------------
/feedbuffer/core.py:
--------------------------------------------------------------------------------
import concurrent.futures
import hashlib
import sched

import bs4
import cachecontrol
import feedparser
import requests
import requests.exceptions

from feedbuffer import settings, database, log

_logger = log.get_logger(__name__)
_session = cachecontrol.CacheControl(requests.Session())
_session.headers['User-Agent'] = settings.USER_AGENT
executor = concurrent.futures.ThreadPoolExecutor(max_workers=settings.MAXIMUM_UPDATE_WORKERS)
scheduled = {}
scheduler = sched.scheduler()

# XML-processing instructions have to end with "?>". BeautifulSoup's default implementation erroneously ends them
# with ">", which leads to errors in almost all parsers, including BeautifulSoup with the lxml treebuilder itself --
# so we fix this at runtime.
bs4.element.ProcessingInstruction.SUFFIX = '?>'


def extract_feed_entries(soup):
    return [item.extract() for item in soup(['item', 'entry'])]


def update_feed(url):
    try:
        response = _session.get(url, timeout=settings.REQUEST_TIMEOUT)
    except requests.exceptions.Timeout:
        return

    # Don't let requests do the content decoding, instead just supply the encoding detected by requests and let
    # BeautifulSoup and the treebuilder do their thing. For example: BeautifulSoup4 with the lxml treebuilder only
    # correctly parses content with tags when it can decode the bytes by itself.
    try:
        soup = bs4.BeautifulSoup(response.content, 'xml', from_encoding=response.encoding)
    except UnicodeDecodeError:
        soup = bs4.BeautifulSoup(response.content, 'xml', from_encoding=response.apparent_encoding)

    entries = extract_feed_entries(soup)
    # TODO: Remove the feedparser dependency and figure out all ways to get access to the feed item id
    parsed_feed = feedparser.parse(response.text)
    is_rss = parsed_feed.version.startswith('rss')

    entry_ids = []
    for index, parsed_entry in enumerate(parsed_feed.entries):
        id_ = parsed_entry.get('id', None)

        # The id might be non-existent or simply empty, make sure to handle both cases correctly
        if not id_:
            id_ = hashlib.sha1(entries[index].encode(settings.ENCODING)).hexdigest()
            _logger.info('No identifier found for entry %d of %s. Inserting SHA-1 id: %s...', index, url, id_)
            id_tag = soup.new_tag('guid' if is_rss else 'id')
            id_tag.string = id_
            entries[index].append(id_tag)
        entry_ids.append(id_)

    # Fix missing RSS channel element
    if is_rss and not soup.find('channel'):
        _logger.info('No RSS channel element found for %s. Inserting channel element...', url)
        rss = soup.find('rss')
        # new_tag() only exists on the BeautifulSoup object, not on individual tags
        rss.append(soup.new_tag('channel'))

    database.update_feed(url, str(soup), zip(entry_ids, (str(entry) for entry in entries)))


def update_and_reschedule_feed(url):
    executor.submit(update_feed, url)
    schedule_feed_update(url)


def schedule_feed_update(url):
    if url in scheduled:
        if scheduled[url] in scheduler.queue:
            try:
                scheduler.cancel(scheduled[url])
            except ValueError:
                pass
        del scheduled[url]

    if not database.feed_exists(url):
        return

    feed = database.get_feed(url)
    event = scheduler.enter(feed.update_interval, 1, update_and_reschedule_feed, (url,))
    scheduled[url] = event


def generate_feed(feed_data, entries):
    feed = bs4.BeautifulSoup(feed_data, 'xml')

    # Find the RSS channel or Atom feed element and append the cached entries to it
    root = feed.find(['channel', 'feed'])
    for entry in entries:
        entry = bs4.BeautifulSoup(entry, 'xml')
        entry = entry.find(['item', 'entry'])
        root.insert(len(root.contents), entry)

    return str(feed).encode(settings.ENCODING)
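
As an aside, the buffering round trip that `extract_feed_entries` and `generate_feed` implement can be sketched in isolation (a minimal illustration that only needs `beautifulsoup4` and `lxml` from the requirements; the tiny feed string is made up): entries are cut out of the fetched feed, the remaining skeleton is stored, and the cached entries are later re-inserted into it.

```
import bs4

FEED = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>Example</title>
<item><guid>1</guid><title>First entry</title></item>
</channel></rss>"""

soup = bs4.BeautifulSoup(FEED, 'xml')
# Cut every <item>/<entry> element out of the feed, leaving an empty skeleton behind.
entries = [item.extract() for item in soup(['item', 'entry'])]
skeleton = str(soup)

# Later, the stored skeleton and the cached entries are combined into one feed again.
feed = bs4.BeautifulSoup(skeleton, 'xml')
root = feed.find(['channel', 'feed'])
for entry in entries:
    root.insert(len(root.contents), bs4.BeautifulSoup(str(entry), 'xml').find(['item', 'entry']))

print(feed.prettify())
```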
--------------------------------------------------------------------------------
/feedbuffer/database.py:
--------------------------------------------------------------------------------
import concurrent.futures
import functools

import peewee

from feedbuffer import settings, log

_database = peewee.SqliteDatabase(settings.DATABASE_PATH)
_logger = log.get_logger(__name__)

# Easy way to queue function calls and execute them in a single thread, without having to manually write
# producer-consumer logic.
_database_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)


class Model(peewee.Model):
    class Meta:
        database = _database


class Feed(Model):
    url = peewee.TextField(unique=True)
    update_interval = peewee.IntegerField(default=settings.DEFAULT_UPDATE_INTERVAL)
    data = peewee.TextField()


class FeedItem(Model):
    id_ = peewee.TextField(unique=True)
    data = peewee.TextField()
    feed = peewee.ForeignKeyField(Feed, related_name='entries')


_database.create_tables([Feed, FeedItem], safe=True)


def _execute_in(executor):
    def decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            future = executor.submit(function, *args, **kwargs)
            return future.result()

        return wrapper

    return decorator


def _get_feed_query(url):
    return Feed.select().where(Feed.url == url)


def _feed_item_exists(feed, id_):
    # Peewee expressions have to be combined with "&", not with Python's "and" operator
    return FeedItem.select().where((FeedItem.feed == feed) & (FeedItem.id_ == id_)).exists()


def _feed_exists(url):
    return _get_feed_query(url).exists()


def _get_feed(url):
    return _get_feed_query(url).get()


@_execute_in(_database_executor)
def feed_exists(url):
    return _get_feed_query(url).exists()


@_execute_in(_database_executor)
def get_feed(url):
    return _get_feed(url)


@_execute_in(_database_executor)
def update_feed(url, feed_data, entries):
    if _feed_exists(url):
        feed = _get_feed(url)
    else:
        feed = Feed(url=url, data=feed_data)
        feed.save()

    data_source = [
        {'id_': id_, 'data': entry, 'feed': feed} for (id_, entry) in entries if not _feed_item_exists(feed, id_)
    ]

    _logger.info('Updating feed: %s with %d new entries...', url, len(data_source))

    with _database.atomic():
        FeedItem.insert_many(data_source).execute()
        feed.data = feed_data
        feed.save()


@_execute_in(_database_executor)
def flush_feed(feed):
    query = FeedItem.delete().where(FeedItem.feed == feed)
    query.execute()


# Generic way to update data in a model instance using the write executor
@_execute_in(_database_executor)
def update_model_data(model, **kwargs):
    for key, value in kwargs.items():
        setattr(model, key, value)

    model.save()
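
The `_execute_in` decorator above funnels every call through the single-worker `_database_executor`, so writes coming from CherryPy request threads and from the update workers never run concurrently against SQLite. A stripped-down sketch of that pattern, independent of peewee and purely illustrative:

```
import concurrent.futures
import functools
import threading

_writer = concurrent.futures.ThreadPoolExecutor(max_workers=1)


def execute_in(executor):
    def decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            # Submit the call to the executor and block until it finishes, so callers
            # keep a plain synchronous interface while the work itself is serialized.
            return executor.submit(function, *args, **kwargs).result()
        return wrapper
    return decorator


@execute_in(_writer)
def write(value):
    # Always runs on the executor's single worker thread, whatever thread called it.
    return threading.current_thread().name


if __name__ == '__main__':
    print({write(n) for n in range(5)})  # prints a set containing one worker thread name
```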
--------------------------------------------------------------------------------
/feedbuffer/log.py:
--------------------------------------------------------------------------------
import logging

import cherrypy

from feedbuffer import settings

_PACKAGE_PREFIX = __package__ + '.'

_logger = logging.getLogger(__package__)
_logger.setLevel(settings.LOGGING_LEVEL)
_formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(name)s: %(message)s')

cherrypy.log.screen = False
cherrypy.log.error_log.propagate = False
cherrypy.log.access_log.propagate = False

for handler in settings.LOGGING_HANDLERS:
    handler.setFormatter(_formatter)
    _logger.addHandler(handler)


def get_logger(name):
    if name.startswith(_PACKAGE_PREFIX):
        name = name[len(_PACKAGE_PREFIX):]

    return _logger.getChild(name)

--------------------------------------------------------------------------------
/feedbuffer/server.py:
--------------------------------------------------------------------------------
import cherrypy

from feedbuffer import core, database, log, settings
from feedbuffer.settings import DEFAULT_UPDATE_INTERVAL

_logger = log.get_logger(__name__)


class Server:
    @cherrypy.expose
    def index(self, url, update_interval=DEFAULT_UPDATE_INTERVAL):
        if not database.feed_exists(url):
            _logger.info('Adding feed: %s', url)
            try:
                core.update_feed(url)
            except Exception:
                _logger.exception('Exception occurred during initial feed update: %s', url)
                return None

            core.schedule_feed_update(url)
        elif url not in core.scheduled:
            _logger.info('Updating feed: %s', url)
            core.executor.submit(core.update_feed, url)
            core.schedule_feed_update(url)

        feed = database.get_feed(url)
        update_interval = int(update_interval)
        if feed.update_interval != update_interval:
            _logger.info('Changing update interval from %d to %d seconds for feed: %s',
                         feed.update_interval, update_interval, url)
            database.update_model_data(feed, update_interval=update_interval)
            core.schedule_feed_update(url)

        _logger.info('Generating feed: %s with %d entries...', url, len(feed.entries))
        response = core.generate_feed(feed.data, [entry.data for entry in feed.entries])
        database.flush_feed(feed)
        cherrypy.response.headers['Content-Type'] = ''
        return response
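
CherryPy maps query-string parameters onto the arguments of `Server.index`, so `url` and `update_interval` arrive exactly as described in the README. A client-side sketch (it assumes the server from `main.py` is already running locally on the default port; `requests` handles the URL-quoting when given `params`):

```
import feedparser
import requests

# Any feed URL works here; reddit's front-page feed is just an example.
feed_url = 'https://www.reddit.com/.rss'

# "url" and "update_interval" are passed straight into Server.index() by CherryPy.
response = requests.get('http://127.0.0.1:8083/', params={'url': feed_url, 'update_interval': 600})
parsed = feedparser.parse(response.content)
print(len(parsed.entries), 'buffered entries')
```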
--------------------------------------------------------------------------------
/feedbuffer/settings.py:
--------------------------------------------------------------------------------
import logging
import os

DATABASE_PATH = __package__ + '.db'
DEFAULT_UPDATE_INTERVAL = 180
ENCODING = 'UTF-8'
LOGGING_HANDLERS = [logging.FileHandler(__package__ + '.log')]
LOGGING_LEVEL = logging.WARNING
MAXIMUM_UPDATE_WORKERS = (os.cpu_count() or 1) * 5
PORT = 8083
REQUEST_TIMEOUT = 30
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import threading
import time
import traceback

import cherrypy

from feedbuffer import core, log
from feedbuffer.settings import PORT
from feedbuffer.server import Server

_logger = log.get_logger(__name__)


def main():
    cherrypy.config.update({
        'server.socket_port': PORT,
        'server.socket_host': '0.0.0.0',
        'checker.check_skipped_app_config': False
    })
    threading.Thread(target=lambda: cherrypy.quickstart(Server())).start()
    while True:
        try:
            core.scheduler.run()
        except Exception:
            _logger.error(traceback.format_exc())
        time.sleep(1)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
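
One detail worth noting in `main()`: `sched.scheduler.run()` returns as soon as its event queue is empty, which is why it is wrapped in a `while True` loop with a short sleep, so that feed updates scheduled later (by `core.schedule_feed_update`) are still picked up. A minimal, self-contained sketch of that behaviour (the demo callback and delays are made up):

```
import sched
import time

scheduler = sched.scheduler()


def tick(n):
    print('update %d would run here' % n)


for n in range(3):
    scheduler.enter(1, 1, tick, (n,))
    scheduler.run()   # blocks until the queued event has fired, then returns
    time.sleep(1)     # in main.py this pause keeps the outer loop from busy-spinning
```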