├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── entries
│   ├── __init__.py
│   └── get_entries.py
├── main.py
├── requirements.txt
└── templates
    └── index.html

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.env
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.6-alpine

LABEL name="entries-by-votes"

EXPOSE 8888

RUN mkdir -p /usr/src/app
WORKDIR /usr/src/app

COPY requirements.txt /usr/src/app/
RUN pip install -r requirements.txt

COPY . /usr/src/app

CMD python main.py

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 JuanPablo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Engineering-blogs entries ordered by Hacker News and Reddit votes

1. Get the feeds of [Engineering-blogs](https://github.com/kilimchoi/engineering-blogs).
2. Get the entries of each feed.
3. Get the votes for each entry from Hacker News and Reddit.
4. Sort the entries with this ranking method, where `t` is the entry's age in hours:

        (votes - 1) / (t + 2)^1.8

[Reference - How Hacker News ranking algorithm works](https://medium.com/hacking-and-gonzo/how-hacker-news-ranking-algorithm-works-1d9b0cf2c08d)
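In plain Python the ranking boils down to the sketch below (the function name
and arguments are illustrative; `total_votes` already has 1 subtracted per
submission, matching `Entry.get_total_votes` in `entries/__init__.py`):

    from math import pow

    def rank(total_votes, hours_age):
        # Older entries decay: the denominator grows as the entry ages.
        return total_votes / pow(hours_age + 2., 1.8)

    # Two submissions with 10 and 5 votes, 10 hours after publication:
    # total_votes = (10 - 1) + (5 - 1) = 13
    print(rank(13, 10))  # ~0.148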
## Deploy

### Environment variables

    PRAW_CLIENT_ID
    PRAW_CLIENT_SECRET
    PRAW_USER_AGENT
    MONGO_ENTRIES
    TORNADO_PORT

### Deploy to zeit.co/now

Add the environment variables to an `entries.env` file, then deploy with `now`:

    now --dotenv entries.env --public

--------------------------------------------------------------------------------
/entries/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import listparser
import feedparser
from urllib.parse import urlencode
import requests
import praw
import os
from math import pow
from time import mktime
from datetime import datetime
import binascii


def datetime_from_struct_time(struct_time):
    try:
        return datetime.fromtimestamp(mktime(struct_time))
    except (OverflowError, ValueError) as error:
        print('Error with struct_time:', error, struct_time)
        return datetime(1970, 1, 1)


def string_from_struct_time(struct_time):
    return datetime_from_struct_time(struct_time).strftime("%Y-%m-%d %H:%M:%S")


class Opml(object):

    def __init__(self):
        self.opml_url = (
            'https://raw.githubusercontent.com/kilimchoi/'
            'engineering-blogs/master/engineering_blogs.opml'
        )
        self.opml = None
        self.feeds = []

    def request_opml(self):
        self.opml = listparser.parse(self.opml_url)

    def generate_feeds(self):
        feeds = []

        for feed in self.opml.feeds:
            feeds.append({'url': feed.url, 'title': feed.title})

        self.feeds = feeds

    def get_feeds(self):

        if self.feeds == []:
            self.request_opml()
            self.generate_feeds()

        return self.feeds


class Entry(object):

    def __init__(self, raw_entry):
        self.raw_entry = raw_entry
        self.published = self.search_published_date(raw_entry)
        self.links = self.get_html_links(raw_entry.get('links', []))
        self.title = raw_entry.get('title', '')
        self.votes = []

    def age(self):
        return (datetime.now() - self.published)

    def hours_age(self):
        return self.age().total_seconds() / 3600.

    def days_age(self):
        return self.hours_age() / 24.
    def get_html_links(self, links):
        return [link.get('href', '') for link in links
                if link.get('type', None) == 'text/html']

    def search_published_date(self, entry):

        if entry.get('published_parsed', False):
            return datetime_from_struct_time(entry["published_parsed"])
        if entry.get('updated_parsed', False):
            return datetime_from_struct_time(entry["updated_parsed"])

        return datetime(1970, 1, 1)

    def set_votes(self, votes):
        self.votes = sorted(votes, key=lambda k: k['votes'], reverse=True)

    def get_total_votes(self):
        total = 0.0
        for vote in self.votes:
            total += vote['votes'] - 1

        return total

    def get_rank(self):
        try:
            return self.get_total_votes() / pow(self.hours_age() + 2., 1.8)
        except ValueError:
            return 0.0

    def __str__(self):
        return '<Entry: {}>'.format(self.title)


class Feed(object):

    def __init__(self, url, title):
        self.url = url
        self.title = title
        self.entries = []
        self.content = None

    def request_entries(self):
        try:
            self.content = feedparser.parse(self.url)
        except (UnicodeEncodeError, binascii.Error) as error:
            print('Error:', error, self.url)

    def parse_entries(self):

        if self.content is None:
            return

        raw_entries = self.content.entries
        entries = []

        if len(raw_entries) > 0:

            for entry in raw_entries:
                entries.append(Entry(entry))

        self.entries = entries

    def get_entries(self):

        if self.entries == []:
            self.request_entries()
            self.parse_entries()

        return self.entries

    def __str__(self):
        return '<Feed: {} {}>'.format(self.title, self.url)


class HackerNews(object):
    """Votes and comments from the Algolia Hacker News Search API."""

    def __init__(self):
        self.api_url = 'https://hn.algolia.com/api/v1/search?'

    def search_url(self, url):
        query_url = urlencode({'query': url})
        request = self.api_url + query_url

        response = requests.get(request).json()

        # Keep only hits whose URL matches exactly, not mere text matches.
        return [hit for hit in response.get('hits', [])
                if hit.get('url') == url]

    def votes_and_comments(self, url):
        hits = self.search_url(url)

        return [{
            'source': 'hacker_news',
            'votes': hit.get('points', 0),
            'comments': hit.get('num_comments', 0),
            'id': hit.get('objectID', 0)
        } for hit in hits]


class Reddit(object):

    def __init__(self):
        self.client = praw.Reddit(
            client_id=os.environ['PRAW_CLIENT_ID'],
            client_secret=os.environ['PRAW_CLIENT_SECRET'],
            user_agent=os.environ['PRAW_USER_AGENT']
        )

    def votes_and_comments(self, url):
        votes = []

        try:
            for sub in self.client.info(url=url):
                votes.append({
                    'source': 'reddit', 'id': sub.id,
                    'subreddit': sub.subreddit.display_name,
                    'votes': sub.ups, 'comments': sub.num_comments
                })
        except TypeError as err:
            print(url, err)

        return votes
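
# Illustrative wiring of the classes above (a sketch, not part of the app;
# Reddit() additionally needs the PRAW_* environment variables):
#
#     feeds = Opml().get_feeds()
#     feed = Feed(feeds[0]['url'], feeds[0]['title'])
#     hn = HackerNews()
#     for entry in feed.get_entries():
#         votes = [v for link in entry.links
#                  for v in hn.votes_and_comments(link)]
#         entry.set_votes(votes)
#         print(entry.published, entry.title, entry.get_rank())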
--------------------------------------------------------------------------------
/entries/get_entries.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
from . import Opml, Feed, HackerNews, Reddit
from tornado import gen, queues
from tornado.ioloop import IOLoop
from concurrent.futures import ThreadPoolExecutor
from motor.motor_tornado import MotorClient

# Blocking feed/API calls run in this pool so the IOLoop stays responsive.
thread_pool = ThreadPoolExecutor(2)

feeds = queues.Queue()
entries = queues.Queue()
client = MotorClient(os.environ['MONGO_ENTRIES'])
db = client['entries-by-votes']

# Only entries younger than this many days are fetched and ranked.
ENTRIES_AGE = float(os.environ.get('ENTRIES_AGE', 14))


@gen.coroutine
def do_insert_entry(entry):
    yield db.entries.update_one(
        {'link': entry['link']}, {'$set': entry}, upsert=True
    )


@gen.coroutine
def get_feeds():
    return (yield thread_pool.submit(Opml().get_feeds))


@gen.coroutine
def get_entries(url, title):
    return (yield thread_pool.submit(Feed(url, title).get_entries))


@gen.coroutine
def votes_from_hacker_news(url):
    return (yield thread_pool.submit(HackerNews().votes_and_comments, url))


@gen.coroutine
def votes_from_reddit(url):
    return (yield thread_pool.submit(Reddit().votes_and_comments, url))


@gen.coroutine
def votes_from_entry(link):
    votes = []

    for vote in (yield votes_from_hacker_news(link)):
        votes.append(vote)
    for vote in (yield votes_from_reddit(link)):
        votes.append(vote)

    return votes


@gen.coroutine
def entries_consumer():
    while True:
        current_entry = yield entries.get()
        print(
            'Fetching entry', entries.qsize(),
            current_entry.published, current_entry
        )
        try:
            for link in current_entry.links:
                votes = yield votes_from_entry(link)

                if votes != []:
                    current_entry.set_votes(votes)
                    print(
                        current_entry.published, current_entry.title,
                        link, current_entry.get_rank()
                    )
                    yield do_insert_entry({
                        'title': current_entry.title,
                        'link': link,
                        'published': current_entry.published,
                        'votes': current_entry.votes,
                        'total_votes': current_entry.get_total_votes(),
                        'rank': current_entry.get_rank()
                    })
            yield gen.sleep(1)
        finally:
            entries.task_done()


@gen.coroutine
def get_new_entries_from_feed():
    current_feed = yield feeds.get()
    print('Fetching feed', feeds.qsize(), current_feed['url'])
    try:
        url, title = current_feed['url'], current_feed['title']
        for entry in (yield get_entries(url, title)):
            if entry.days_age() < ENTRIES_AGE:
                yield entries.put(entry)
    finally:
        feeds.task_done()


@gen.coroutine
def feeds_consumer():
    while True:
        yield get_new_entries_from_feed()


@gen.coroutine
def feeds_producer():
    for feed in (yield get_feeds()):
        yield feeds.put(feed)


@gen.coroutine
def entries_update():

    while True:
        print('Starting entries update')
        yield feeds_producer()
        yield feeds.join()
        yield entries.join()
        print('Entries update done')
        yield gen.sleep(3600)


if __name__ == "__main__":
    # Run as a module so the relative imports resolve:
    #     python -m entries.get_entries
    IOLoop.current().spawn_callback(feeds_consumer)
    IOLoop.current().spawn_callback(entries_consumer)
    IOLoop.current().run_sync(entries_update)
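
# Shape of the documents written by do_insert_entry (illustrative values):
#
#     {'title': 'Some post',
#      'link': 'https://example.com/post',
#      'published': datetime(2017, 11, 1, 12, 0),
#      'votes': [{'source': 'hacker_news', 'votes': 12,
#                 'comments': 3, 'id': '15000000'}],
#      'total_votes': 11.0,  # sum of (votes - 1) over all submissions
#      'rank': 0.12}         # total_votes / (hours_age + 2)^1.8 at insert time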
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
from tornado import ioloop, gen
from tornado import web
from motor.motor_tornado import MotorClient
from entries.get_entries import entries_update
from entries.get_entries import entries_consumer, feeds_consumer
from datetime import datetime

client = MotorClient(os.environ['MONGO_ENTRIES'])
db = client['entries-by-votes']


class MainHandler(web.RequestHandler):

    @gen.coroutine
    def get(self):
        db = self.settings['db']

        # Recompute the rank at query time: $subtract on two dates yields
        # milliseconds, so dividing by 1000*60*60 gives the age in hours,
        # matching total_votes / (hours + 2)^1.8.
        cursor = db.entries.aggregate([
            {'$project': {
                '_id': 0, 'link': 1, 'title': 1, 'published': 1,
                'total_votes': 1, 'votes': 1,
                'rank': {
                    '$divide': ["$total_votes", {
                        '$pow': [{
                            '$add': [
                                {'$divide': [
                                    {'$subtract': [datetime.now(), "$published"]},
                                    1000 * 60 * 60
                                ]},
                                2
                            ]
                        }, 1.8]
                    }]
                }
            }},
            {'$sort': {'rank': -1}},
            {'$limit': 100},
        ])

        entries = yield cursor.to_list(length=100)

        self.render('templates/index.html', entries=entries)


def make_app():
    return web.Application([
        (r'/', MainHandler),
    ], db=db)


if __name__ == "__main__":
    port = int(os.environ.get('TORNADO_PORT', 8888))
    print('Starting app on port {}'.format(port))
    app = make_app()
    app.listen(port)
    io_loop = ioloop.IOLoop.current()
    io_loop.spawn_callback(feeds_consumer)
    io_loop.spawn_callback(entries_consumer)
    io_loop.spawn_callback(entries_update)
    io_loop.start()

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
feedparser==5.2.1
flake8==3.4.1
listparser==0.18
motor==1.1
praw==5.1.0
requests==2.18.4
tornado==4.5.2

--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
<!-- Original markup not recovered; only the page title "Entries by votes"
     and the heading "Engineering-blogs entries ordered by Hacker News and
     Reddit votes" survive. -->
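<!-- A minimal sketch of a plausible template, reconstructed from
     MainHandler's render call (which passes `entries`); the layout and the
     fields shown are assumptions, not the original markup. -->
<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>Entries by votes</title>
  </head>
  <body>
    <h1>Engineering-blogs entries ordered by Hacker News and Reddit votes</h1>
    <ul>
      {% for entry in entries %}
        <li>
          <a href="{{ entry['link'] }}">{{ entry['title'] }}</a>
          ({{ entry['total_votes'] }} votes, rank {{ entry['rank'] }})
        </li>
      {% end %}
    </ul>
  </body>
</html>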
--------------------------------------------------------------------------------