├── .gitignore
├── LICENSE
├── README.md
├── TODO.md
├── feedbuffer
│   ├── __init__.py
│   ├── core.py
│   ├── database.py
│   ├── log.py
│   ├── server.py
│   └── settings.py
└── main.py

/.gitignore:
--------------------------------------------------------------------------------
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Created by .ignore support plugin (hsz.mobi)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 Chris Braun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Feedbuffer
Feedbuffer buffers RSS and Atom syndication feeds: it caches new feed entries until the news aggregator requests them, and then generates the syndication feed with all cached entries.

This will be mostly useful to people who read feeds with a very high throughput, only use their news aggregator very rarely, or simply want to make very sure that they aren't missing any entries.


## Usage
Install the requirements: `pip install peewee feedparser requests cachecontrol lxml beautifulsoup4 cherrypy` and run `main.py`. By default an HTTP server will respond on http://0.0.0.0:8083/ (check `settings.py` for more configuration options). Instead of requesting the target feed directly in your news aggregator, prefix the URL like this: `http://0.0.0.0:8083/?url=` where `url` is a URL-quoted version of the original feed URL:

```
>>> from urllib.parse import quote_plus
>>> quote_plus('https://www.reddit.com/.rss')
'https%3A%2F%2Fwww.reddit.com%2F.rss'
>>>
```

Additionally, `&update_interval=` can be used to adjust the interval at which Feedbuffer will check for updates to the feed (the default is 180 seconds), as sketched below.
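
Putting the two together, the full buffered-feed URL can be assembled like this (a minimal sketch; the interval of 600 seconds is only an example value):

```
>>> from urllib.parse import quote_plus
>>> feed_url = 'https://www.reddit.com/.rss'
>>> 'http://0.0.0.0:8083/?url={}&update_interval=600'.format(quote_plus(feed_url))
'http://0.0.0.0:8083/?url=https%3A%2F%2Fwww.reddit.com%2F.rss&update_interval=600'
>>>
```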

## Details
Feedbuffer will attempt to fix invalid feed entries that are missing a unique identifier field by generating the SHA-1 sum of the entry's content and inserting it. The log file will be generated as `feedbuffer.log` in the current working directory.

--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
- Possibly save fetched feed item ids to reduce outgoing traffic by not sending the same feed item twice
- Remove the feedparser dependency and figure out all ways to get access to the feed item id

--------------------------------------------------------------------------------
/feedbuffer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cryzed/Feedbuffer/0f258bbc7fbcdb39e19fd7d26192a45098b1fc68/feedbuffer/__init__.py

--------------------------------------------------------------------------------
/feedbuffer/core.py:
--------------------------------------------------------------------------------
import concurrent.futures
import hashlib
import sched

import bs4
import cachecontrol
import feedparser
import requests
import requests.exceptions

from feedbuffer import settings, database, log

_logger = log.get_logger(__name__)
_session = cachecontrol.CacheControl(requests.Session())
_session.headers['User-Agent'] = settings.USER_AGENT
executor = concurrent.futures.ThreadPoolExecutor(max_workers=settings.MAXIMUM_UPDATE_WORKERS)
scheduled = {}
scheduler = sched.scheduler()

# XML-processing instructions have to end with "?>". BeautifulSoup's default implementation erroneously ends them
# with ">", which leads to errors in almost all parsers, including BeautifulSoup with the lxml treebuilder itself --
# so we fix this at runtime.
bs4.element.ProcessingInstruction.SUFFIX = '?>'


def extract_feed_entries(soup):
    return [item.extract() for item in soup(['item', 'entry'])]


def update_feed(url):
    try:
        response = _session.get(url, timeout=settings.REQUEST_TIMEOUT)
    except requests.exceptions.Timeout:
        return

    # Don't let requests do the content decoding, instead just supply the encoding detected by requests and let
    # BeautifulSoup and the treebuilder do their thing. For example: BeautifulSoup4 with the lxml treebuilder only
    # correctly parses content with tags when it can decode the bytes by itself.
    try:
        soup = bs4.BeautifulSoup(response.content, 'xml', from_encoding=response.encoding)
    except UnicodeDecodeError:
        soup = bs4.BeautifulSoup(response.content, 'xml', from_encoding=response.apparent_encoding)

    entries = extract_feed_entries(soup)
    # TODO: Remove the feedparser dependency and figure out all ways to get access to the feed item id
    parsed_feed = feedparser.parse(response.text)
    is_rss = parsed_feed.version.startswith('rss')

    entry_ids = []
    for index, parsed_entry in enumerate(parsed_feed.entries):
        id_ = parsed_entry.get('id', None)

        # The id might be non-existent or simply empty, make sure to handle both cases correctly
        if not id_:
            id_ = hashlib.sha1(entries[index].encode(settings.ENCODING)).hexdigest()
            _logger.info('No identifier found for entry %d of %s. Inserting SHA-1 id: %s...', index, url, id_)
            id_tag = soup.new_tag('guid' if is_rss else 'id')
            id_tag.string = id_
            entries[index].append(id_tag)
        entry_ids.append(id_)

    # Fix missing RSS channel element
    if is_rss and not soup.find('channel'):
        _logger.info('No RSS channel element found for %s. Inserting channel element...', url)
        rss = soup.find('rss')
        # new_tag() only exists on the BeautifulSoup object, not on individual tags
        rss.append(soup.new_tag('channel'))

    database.update_feed(url, str(soup), zip(entry_ids, (str(entry) for entry in entries)))


def update_and_reschedule_feed(url):
    executor.submit(update_feed, url)
    schedule_feed_update(url)


def schedule_feed_update(url):
    if url in scheduled:
        if scheduled[url] in scheduler.queue:
            try:
                scheduler.cancel(scheduled[url])
            except ValueError:
                pass
        del scheduled[url]

    if not database.feed_exists(url):
        return

    feed = database.get_feed(url)
    event = scheduler.enter(feed.update_interval, 1, update_and_reschedule_feed, (url,))
    scheduled[url] = event


def generate_feed(feed_data, entries):
    feed = bs4.BeautifulSoup(feed_data, 'xml')

    # Find the RSS channel or Atom feed element and append the cached entries to it
    root = feed.find(['channel', 'feed'])
    for entry in entries:
        entry = bs4.BeautifulSoup(entry, 'xml')
        entry = entry.find(['item', 'entry'])
        root.insert(len(root.contents), entry)

    return str(feed).encode(settings.ENCODING)
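
As an aside, the buffering round trip that `extract_feed_entries` and `generate_feed` implement can be sketched in isolation (a minimal illustration that only needs `beautifulsoup4` and `lxml` from the requirements; the tiny feed string is made up): entries are cut out of the fetched feed, the remaining skeleton is stored, and the cached entries are later re-inserted into it.

```
import bs4

FEED = """<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>Example</title>
<item><guid>1</guid><title>First entry</title></item>
</channel></rss>"""

soup = bs4.BeautifulSoup(FEED, 'xml')
# Cut every <item>/<entry> element out of the feed, leaving an empty skeleton behind.
entries = [item.extract() for item in soup(['item', 'entry'])]
skeleton = str(soup)

# Later, the stored skeleton and the cached entries are combined into one feed again.
feed = bs4.BeautifulSoup(skeleton, 'xml')
root = feed.find(['channel', 'feed'])
for entry in entries:
    root.insert(len(root.contents), bs4.BeautifulSoup(str(entry), 'xml').find(['item', 'entry']))

print(feed.prettify())
```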
--------------------------------------------------------------------------------
/feedbuffer/database.py:
--------------------------------------------------------------------------------
import concurrent.futures
import functools

import peewee

from feedbuffer import settings, log

_database = peewee.SqliteDatabase(settings.DATABASE_PATH)
_logger = log.get_logger(__name__)

# Easy way to queue function calls and execute them in a single thread, without having to manually write
# producer-consumer logic.
_database_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)


class Model(peewee.Model):
    class Meta:
        database = _database


class Feed(Model):
    url = peewee.TextField(unique=True)
    update_interval = peewee.IntegerField(default=settings.DEFAULT_UPDATE_INTERVAL)
    data = peewee.TextField()


class FeedItem(Model):
    id_ = peewee.TextField(unique=True)
    data = peewee.TextField()
    feed = peewee.ForeignKeyField(Feed, related_name='entries')


_database.create_tables([Feed, FeedItem], safe=True)


def _execute_in(executor):
    def decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            future = executor.submit(function, *args, **kwargs)
            return future.result()

        return wrapper

    return decorator


def _get_feed_query(url):
    return Feed.select().where(Feed.url == url)


def _feed_item_exists(feed, id_):
    # Peewee expressions have to be combined with "&", not with Python's "and" operator
    return FeedItem.select().where((FeedItem.feed == feed) & (FeedItem.id_ == id_)).exists()


def _feed_exists(url):
    return _get_feed_query(url).exists()


def _get_feed(url):
    return _get_feed_query(url).get()


@_execute_in(_database_executor)
def feed_exists(url):
    return _get_feed_query(url).exists()


@_execute_in(_database_executor)
def get_feed(url):
    return _get_feed(url)


@_execute_in(_database_executor)
def update_feed(url, feed_data, entries):
    if _feed_exists(url):
        feed = _get_feed(url)
    else:
        feed = Feed(url=url, data=feed_data)
        feed.save()

    data_source = [
        {'id_': id_, 'data': entry, 'feed': feed} for (id_, entry) in entries if not _feed_item_exists(feed, id_)
    ]

    _logger.info('Updating feed: %s with %d new entries...', url, len(data_source))

    with _database.atomic():
        FeedItem.insert_many(data_source).execute()
        feed.data = feed_data
        feed.save()


@_execute_in(_database_executor)
def flush_feed(feed):
    query = FeedItem.delete().where(FeedItem.feed == feed)
    query.execute()


# Generic way to update data in a model instance using the write executor
@_execute_in(_database_executor)
def update_model_data(model, **kwargs):
    for key, value in kwargs.items():
        setattr(model, key, value)

    model.save()
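
The `_execute_in` decorator above funnels every call through the single-worker `_database_executor`, so writes coming from CherryPy request threads and from the update workers never run concurrently against SQLite. A stripped-down sketch of that pattern, independent of peewee and purely illustrative:

```
import concurrent.futures
import functools
import threading

_writer = concurrent.futures.ThreadPoolExecutor(max_workers=1)


def execute_in(executor):
    def decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            # Submit the call to the executor and block until it finishes, so callers
            # keep a plain synchronous interface while the work itself is serialized.
            return executor.submit(function, *args, **kwargs).result()
        return wrapper
    return decorator


@execute_in(_writer)
def write(value):
    # Always runs on the executor's single worker thread, whatever thread called it.
    return threading.current_thread().name


if __name__ == '__main__':
    print({write(n) for n in range(5)})  # prints a set containing one worker thread name
```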
--------------------------------------------------------------------------------
/feedbuffer/log.py:
--------------------------------------------------------------------------------
import logging

import cherrypy

from feedbuffer import settings

_PACKAGE_PREFIX = __package__ + '.'

_logger = logging.getLogger(__package__)
_logger.setLevel(settings.LOGGING_LEVEL)
_formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(name)s: %(message)s')

cherrypy.log.screen = False
cherrypy.log.error_log.propagate = False
cherrypy.log.access_log.propagate = False

for handler in settings.LOGGING_HANDLERS:
    handler.setFormatter(_formatter)
    _logger.addHandler(handler)


def get_logger(name):
    if name.startswith(_PACKAGE_PREFIX):
        name = name[len(_PACKAGE_PREFIX):]

    return _logger.getChild(name)

--------------------------------------------------------------------------------
/feedbuffer/server.py:
--------------------------------------------------------------------------------
import cherrypy

from feedbuffer import core, database, log, settings
from feedbuffer.settings import DEFAULT_UPDATE_INTERVAL

_logger = log.get_logger(__name__)


class Server:
    @cherrypy.expose
    def index(self, url, update_interval=DEFAULT_UPDATE_INTERVAL):
        if not database.feed_exists(url):
            _logger.info('Adding feed: %s', url)
            try:
                core.update_feed(url)
            except Exception:
                _logger.exception('Exception occurred during initial feed update: %s', url)
                return None

            core.schedule_feed_update(url)
        elif url not in core.scheduled:
            _logger.info('Updating feed: %s', url)
            core.executor.submit(core.update_feed, url)
            core.schedule_feed_update(url)

        feed = database.get_feed(url)
        update_interval = int(update_interval)
        if feed.update_interval != update_interval:
            _logger.info('Changing update interval from %d to %d seconds for feed: %s',
                         feed.update_interval, update_interval, url)
            database.update_model_data(feed, update_interval=update_interval)
            core.schedule_feed_update(url)

        _logger.info('Generating feed: %s with %d entries...', url, len(feed.entries))
        response = core.generate_feed(feed.data, [entry.data for entry in feed.entries])
        database.flush_feed(feed)
        cherrypy.response.headers['Content-Type'] = ''
        return response
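
CherryPy maps query-string parameters onto the arguments of `Server.index`, so `url` and `update_interval` arrive exactly as described in the README. A client-side sketch (it assumes the server from `main.py` is already running locally on the default port; `requests` handles the URL-quoting when given `params`):

```
import feedparser
import requests

# Any feed URL works here; reddit's front-page feed is just an example.
feed_url = 'https://www.reddit.com/.rss'

# "url" and "update_interval" are passed straight into Server.index() by CherryPy.
response = requests.get('http://127.0.0.1:8083/', params={'url': feed_url, 'update_interval': 600})
parsed = feedparser.parse(response.content)
print(len(parsed.entries), 'buffered entries')
```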
--------------------------------------------------------------------------------
/feedbuffer/settings.py:
--------------------------------------------------------------------------------
import logging
import os

DATABASE_PATH = __package__ + '.db'
DEFAULT_UPDATE_INTERVAL = 180
ENCODING = 'UTF-8'
LOGGING_HANDLERS = [logging.FileHandler(__package__ + '.log')]
LOGGING_LEVEL = logging.WARNING
MAXIMUM_UPDATE_WORKERS = (os.cpu_count() or 1) * 5
PORT = 8083
REQUEST_TIMEOUT = 30
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0'

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import threading
import time
import traceback

import cherrypy

from feedbuffer import core, log
from feedbuffer.settings import PORT
from feedbuffer.server import Server

_logger = log.get_logger(__name__)


def main():
    cherrypy.config.update({
        'server.socket_port': PORT,
        'server.socket_host': '0.0.0.0',
        'checker.check_skipped_app_config': False
    })
    threading.Thread(target=lambda: cherrypy.quickstart(Server())).start()
    while True:
        try:
            core.scheduler.run()
        except Exception:
            _logger.error(traceback.format_exc())
        time.sleep(1)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
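
One detail worth noting in `main()`: `sched.scheduler.run()` returns as soon as its event queue is empty, which is why it is wrapped in a `while True` loop with a short sleep, so that feed updates scheduled later (by `core.schedule_feed_update`) are still picked up. A minimal, self-contained sketch of that behaviour (the demo callback and delays are made up):

```
import sched
import time

scheduler = sched.scheduler()


def tick(n):
    print('update %d would run here' % n)


for n in range(3):
    scheduler.enter(1, 1, tick, (n,))
    scheduler.run()   # blocks until the queued event has fired, then returns
    time.sleep(1)     # in main.py this pause keeps the outer loop from busy-spinning
```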