├── examples ├── __init__.py ├── basic_example.py └── advance_example.py ├── requirements.txt ├── tests ├── __init__.py └── test_asyncio_hn.py ├── MANIFEST.in ├── asyncio_hn ├── __init__.py └── hn.py ├── test_requirements.txt ├── .editorconfig ├── LICENSE ├── .gitignore ├── setup.py ├── README.md └── README /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | aiohttp==1.3.5 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include test_requirements.txt -------------------------------------------------------------------------------- /asyncio_hn/__init__.py: -------------------------------------------------------------------------------- 1 | from asyncio_hn.hn import ClientHN 2 | # __all__ = ["ClientHN"] 3 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | pip==8.1.2 2 | bumpversion==0.5.3 3 | wheel==0.29.0 4 | watchdog==0.8.3 5 | flake8==2.6.0 6 | tox==2.3.1 7 | coverage==4.1 8 | Sphinx==1.4.8 9 | cryptography==1.7 10 | PyYAML==3.11 11 | pytest==2.9.2 12 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = 
space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /examples/basic_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from asyncio_hn import ClientHN 4 | 5 | 6 | async def main(loop): 7 | # We init the client - extension of aiohttp.ClientSession 8 | async with ClientHN(loop=loop) as hn: 9 | # Up to 500 top and top stories (only ids) 10 | hn_new_stories = await hn.top_stories() 11 | # Download top 3 story data 12 | top_posts = await hn.items(hn_new_stories[:2]) 13 | # Download the user data for each story 14 | users = await hn.users([post.get("by") for post in top_posts]) 15 | 16 | 17 | if __name__ == '__main__': 18 | loop = asyncio.get_event_loop() 19 | loop.run_until_complete(main(loop)) 20 | -------------------------------------------------------------------------------- /examples/advance_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | 4 | import aiohttp 5 | from asyncio_hn.hn_old import ClientHN 6 | 7 | N = 1_000_000 8 | 9 | 10 | async def advance_run(loop): 11 | # We init the client - extension of aiohttp.ClientSession 12 | conn = aiohttp.TCPConnector(limit=1000, loop=loop) 13 | async with ClientHN(loop=loop, queue_size=1000, connector=conn, progress_bar=True, debug=True) as hn: 14 | # Download the last 1,000,000 stories 15 | hn_new_stories = await hn.last_n_items(n=N) 16 | with open("1_million_posts.json", "w") as f: 17 | json.dump(hn_new_stories, f) 18 | 19 | 20 | if __name__ == '__main__': 21 | loop = asyncio.get_event_loop() 22 | loop.run_until_complete(advance_run(loop)) 23 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2017, Itiel Shwartz 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
11 | 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #Me 2 | .github 3 | .idea 4 | venv 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | # pyenv python configuration file 66 | .python-version 67 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | 5 | from setuptools import setup 6 | 7 | with open("README") as f: 8 | readme = f.read() 9 | 10 | with open(os.path.join(os.path.dirname(__file__), 'requirements.txt')) as f: 11 | required = f.read().splitlines() 12 | 13 | with open(os.path.join(os.path.dirname(__file__), 'test_requirements.txt')) as f: 14 | test_required = f.read().splitlines() + required 15 | 16 | setup( 17 | name='asyncio_hn', 18 | version='0.4.0', 19 | description=" Simple asyncio 
wrapper to download hackernews", 20 | long_description=readme + '\n', 21 | author="Itiel Shwartz", 22 | author_email='itiel@etlsh.com', 23 | url='https://github.com/itielshwartz/asyncio-hn', 24 | packages=[ 25 | 'asyncio_hn', 26 | ], 27 | include_package_data=True, 28 | install_requires=required, 29 | license="MIT license", 30 | zip_safe=False, 31 | keywords=['asyncio', 'aiohttp', 'hackernews'], 32 | classifiers=[ 33 | 'Intended Audience :: Developers', 34 | 'License :: OSI Approved :: MIT License', 35 | 'Natural Language :: English', 36 | 'Programming Language :: Python :: 3.6', 37 | ], 38 | test_suite='tests', 39 | tests_require=test_required, 40 | 41 | ) 42 | -------------------------------------------------------------------------------- /tests/test_asyncio_hn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | # -*- coding: utf-8 -*- 3 | 4 | import pytest 5 | 6 | from asyncio_hn import ClientHN 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_last_n_posts(): 11 | async with ClientHN() as hn: 12 | posts = await hn.last_n_items(2) 13 | assert len(posts) == 2 14 | 15 | 16 | @pytest.mark.asyncio 17 | async def test_download_posts(): 18 | async with ClientHN() as hn: 19 | posts = await hn.items((42, 4242, 424242)) 20 | for post in posts: 21 | validate_post(post, post_id=424242, post_creator="1gor") 22 | validate_post(post, post_id=4242, post_creator="PindaxDotCom") 23 | validate_post(post, post_id=42, post_creator="sergei") 24 | 25 | 26 | def validate_post(post, post_id, post_creator): 27 | if post.get("id") == post_id: 28 | assert post_creator == post.get("by") 29 | 30 | 31 | @pytest.mark.asyncio 32 | async def test_best_and_latest(): 33 | async with ClientHN() as hn: 34 | stories = await hn.best_stories() 35 | assert len(stories) == 200 36 | latest = await hn.new_stories() 37 | assert len(latest) == 500 38 | 39 | 40 | @pytest.mark.asyncio 41 | async def test_download_users(): 42 | async with 
ClientHN() as hn: 43 | users = await hn.users(["maximabramchuk", "anthonybsd"]) 44 | for user in users: 45 | if user["id"] == "maximabramchuk": 46 | assert user["created"] == 1441729807 47 | if user["id"] == "'anthonybsd'": 48 | assert user["created"] == 1436886156 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # asyncio-hn 2 | ![python-3.6](https://img.shields.io/badge/python-3.6-blue.svg) 3 | 4 | A simple asyncio wrapper to download 5 | [hacker-news](https://news.ycombinator.com/) 6 | with speed and ease. 7 | 8 | The package supports all endpoints of the official API: [hacker-news API](https://github.com/HackerNews/API) 9 | 10 | Development process: [Using asyncio to download hackernews](http://etlsh.com/2017/01/21/using-asyncio-to-download-hacker-news/) 11 | 12 | 13 | ## Installation 14 | 15 | ```shell 16 | pip install asyncio-hn 17 | ``` 18 | 19 | ## Usage 20 | 21 | ```python 22 | import asyncio 23 | from asyncio_hn import ClientHN 24 | 25 | async def main(loop): 26 | # We init the client - extension of aiohttp.ClientSession 27 | async with ClientHN(loop=loop) as hn: 28 | # Up to 500 top stories (only ids) 29 | hn_new_stories = await hn.top_stories() 30 | # Download top 10 story data 31 | top_posts = await hn.items(hn_new_stories[:10]) 32 | # Download the user data for each story 33 | users = await hn.users([post.get("by") for post in top_posts]) 34 | 35 | 36 | if __name__ == '__main__': 37 | loop = asyncio.get_event_loop() 38 | loop.run_until_complete(main(loop)) 39 | ``` 40 | 41 | ### Advance usage 42 | Using this config you can reach 1000+ requests/sec. 
43 | 44 | ```python 45 | import aiohttp 46 | N = 1_000_000 47 | 48 | async def advance_run(loop): 49 | # We init the client - extension of aiohttp.ClientSession 50 | conn = aiohttp.TCPConnector(limit=1000, loop=loop) 51 | async with ClientHN(loop=loop, queue_size=1000, connector=conn, progress_bar=True, debug=True) as hn: 52 | # Download the last 1,000,000 stories 53 | hn_new_stories = await hn.last_n_items(n=N) 54 | ``` 55 | 56 | ## Output example: 57 | Item: 58 | ``` python 59 | item = {'by': 'amzans', 'descendants': 25, 'id': 13566716, 60 | 'kids': [13567061, 13567631, 13567027, 13567055, 13566798, 13567473], 'score': 171, 'time': 1486210548, 61 | 'title': 'Network programming with Go (2012)', 'type': 'story', 62 | 'url': 'https://jannewmarch.gitbooks.io/network-programming-with-go-golang-/content/'}, 63 | {'by': 'r3bl', 'descendants': 1, 'id': 13567940, 'kids': [13568249], 'score': 24, 'time': 1486230224, 64 | 'title': 'YouTube removes hundreds of the best climate science videos from the Internet', 65 | 'type': 'story', 66 | 'url': 'http://climatestate.com/2017/02/03/youtube-removes-hundreds-of-the-best-climate-science-videos-from-the-internet/'} 67 | ``` 68 | User: 69 | ```python 70 | user = {'created': 1470758993, 'id': 'amzans', 'karma': 174, 71 | 'submitted': [13567884, 13566716, 13566699, 13558456, 13539270, 13539151, 13514498, 13418469, 13417725, 72 | 13416562, 13416097, 13416034, 13415954, 13415894, 13395310, 13394996, 13392554, 12418804, 73 | 12418361, 12413958, 12411992, 12411732, 12411546, 12262383, 12255593]} 74 | 75 | ``` 76 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | asyncio-hn 2 | ========== 3 | 4 | A simple asyncio wrapper to download 5 | `hacker-news <https://news.ycombinator.com/>`__ with speed and ease. 6 | 7 | The package supports all endpoints of the official API: `hacker-news 8 | API <https://github.com/HackerNews/API>`__ 9 | 10 | Installation 11 | ------------ 12 | 13 | .. 
code:: shell 14 | 15 | pip install asyncio-hn 16 | 17 | Usage 18 | ----- 19 | 20 | .. code:: python 21 | 22 | import asyncio 23 | from asyncio_hn import ClientHN 24 | 25 | async def main(loop): 26 | # We init the client - extension of aiohttp.ClientSession 27 | async with ClientHN(loop=loop) as hn: 28 | # Up to 500 top stories (only ids) 29 | hn_new_stories = await hn.top_stories() 30 | # Download top 10 story data 31 | top_posts = await hn.items(hn_new_stories[:10]) 32 | # Download the user data for each story 33 | users = await hn.users([post.get("by") for post in top_posts]) 34 | 35 | 36 | if __name__ == '__main__': 37 | loop = asyncio.get_event_loop() 38 | loop.run_until_complete(main(loop)) 39 | 40 | Advance usage 41 | ~~~~~~~~~~~~~ 42 | 43 | Using this config you can reach 1000+ requests/sec. 44 | 45 | .. code:: python 46 | import aiohttp 47 | from asyncio_hn import ClientHN 48 | 49 | N = 1_000_000 50 | 51 | async def advance_run(loop): 52 | # We init the client - extension of aiohttp.ClientSession 53 | conn = aiohttp.TCPConnector(limit=1000, loop=loop) 54 | async with ClientHN(loop=loop, queue_size=1000, connector=conn, progress_bar=True, debug=True) as hn: 55 | # Download the last 1,000,000 stories 56 | hn_new_stories = await hn.last_n_items(n=N) 57 | 58 | Output example: 59 | --------------- 60 | 61 | Item: 62 | 63 | .. 
code:: python 64 | 65 | item = {'by': 'amzans', 'descendants': 25, 'id': 13566716, 66 | 'kids': [13567061, 13567631, 13567027, 13567055, 13566798, 13567473], 'score': 171, 'time': 1486210548, 67 | 'title': 'Network programming with Go (2012)', 'type': 'story', 68 | 'url': 'https://jannewmarch.gitbooks.io/network-programming-with-go-golang-/content/'}, 69 | {'by': 'r3bl', 'descendants': 1, 'id': 13567940, 'kids': [13568249], 'score': 24, 'time': 1486230224, 70 | 'title': 'YouTube removes hundreds of the best climate science videos from the Internet', 71 | 'type': 'story', 72 | 'url': 'http://climatestate.com/2017/02/03/youtube-removes-hundreds-of-the-best-climate-science-videos-from-the-internet/'} 73 | 74 | User: 75 | 76 | .. code:: python 77 | 78 | user = {'created': 1470758993, 'id': 'amzans', 'karma': 174, 79 | 'submitted': [13567884, 13566716, 13566699, 13558456, 13539270, 13539151, 13514498, 13418469, 13417725, 80 | 13416562, 13416097, 13416034, 13415954, 13415894, 13395310, 13394996, 13392554, 12418804, 81 | 12418361, 12413958, 12411992, 12411732, 12411546, 12262383, 12255593]} 82 | 83 | -------------------------------------------------------------------------------- /asyncio_hn/hn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | import asyncio 3 | import logging 4 | 5 | import aiohttp 6 | import tqdm 7 | from aiohttp import HttpProcessingError 8 | 9 | URL_GET_POST = "https://hacker-news.firebaseio.com/v0/item/{}.json" 10 | URL_GET_USER = "https://hacker-news.firebaseio.com/v0/user/{}.json" 11 | URL_MAX_ITEM = 'https://hacker-news.firebaseio.com/v0/maxitem.json' 12 | URL_TOP_STORIES = "https://hacker-news.firebaseio.com/v0/topstories.json" 13 | URL_NEW_STORIES = "https://hacker-news.firebaseio.com/v0/newstories.json" 14 | URL_BEST_STORIES = "https://hacker-news.firebaseio.com/v0/beststories.json" 15 | URL_UPDATES = "https://hacker-news.firebaseio.com/v0/updates.json" 16 | URL_ASK_STORIES = 
"https://hacker-news.firebaseio.com/v0/askstories.json" 17 | URL_SHOW_STORIES = "https://hacker-news.firebaseio.com/v0/showstories.json" 18 | URL_JOB_STORIES = "https://hacker-news.firebaseio.com/v0/jobstories.json" 19 | # The max tcp connection we open 20 | MAX_CONNECTION = 1000 21 | 22 | # setting up logger 23 | logger = logging.getLogger(__name__) 24 | console = logging.StreamHandler() 25 | logger.addHandler(console) 26 | 27 | 28 | class ClientHN(aiohttp.ClientSession): 29 | def __init__(self, queue_size=10, progress_bar=False, debug=False, num_dlq_consumers=10, **kwargs): 30 | super(ClientHN, self).__init__(**kwargs) 31 | self.queue_size = queue_size 32 | self.connector_limit = self.connector.limit 33 | self._responses = [] 34 | self.progress_bar = progress_bar 35 | self.num_dlq_consumers = num_dlq_consumers 36 | if debug: 37 | logger.setLevel(logging.DEBUG) 38 | 39 | async def single_download(self, url): 40 | async with self.get(url) as resp: 41 | return await resp.json() 42 | 43 | async def multi_download(self, itr, url, num_of_consumers=None, desc=""): 44 | queue, dlq, responses = asyncio.Queue( 45 | maxsize=self.queue_size), asyncio.Queue(), [] 46 | num_of_consumers = num_of_consumers or min(self.connector_limit, self.try_get_itr_len(itr)) 47 | consumers = [asyncio.ensure_future( 48 | self._consumer(main_queue=queue, dlq=dlq, responses=responses)) for _ in 49 | range(num_of_consumers or self.connector_limit)] 50 | dlq_consumers = [asyncio.ensure_future( 51 | self._consumer(dlq, dlq, responses)) for _ in range(self.num_dlq_consumers)] 52 | produce = await self._produce(itr, url, queue, desc=desc) 53 | await queue.join() 54 | await dlq.join() 55 | for consumer in consumers + dlq_consumers: 56 | consumer.cancel() 57 | return responses 58 | 59 | def try_get_itr_len(self, itr): 60 | try: 61 | return len(itr) 62 | except TypeError: 63 | return 1000000 64 | 65 | async def _produce(self, items, base_url, queue, desc=""): 66 | for item in tqdm.tqdm(items, desc=desc 
+ " (Estimation)", disable=not self.progress_bar): 67 | await queue.put(base_url.format(item)) 68 | 69 | async def _consumer(self, main_queue, dlq, responses): 70 | while True: 71 | try: 72 | url = await main_queue.get() 73 | async with self.get(url, timeout=10) as response: 74 | resp = response 75 | resp.raise_for_status() 76 | responses.append(await resp.json()) 77 | # Notify the queue that the item has been processed 78 | main_queue.task_done() 79 | 80 | except (HttpProcessingError, asyncio.TimeoutError) as e: 81 | logger.debug("Problem with %s, Moving to DLQ" % url) 82 | await dlq.put(url) 83 | main_queue.task_done() 84 | 85 | async def top_stories(self): 86 | return await self.single_download(URL_TOP_STORIES) 87 | 88 | async def best_stories(self): 89 | return await self.single_download(URL_BEST_STORIES) 90 | 91 | async def new_stories(self): 92 | return await self.single_download(URL_NEW_STORIES) 93 | 94 | async def ask_stories(self): 95 | return await self.single_download(URL_ASK_STORIES) 96 | 97 | async def updates(self): 98 | return await self.single_download(URL_UPDATES) 99 | 100 | async def job_stories(self): 101 | return await self.single_download(URL_JOB_STORIES) 102 | 103 | async def max_item(self): 104 | return await self.single_download(URL_MAX_ITEM) 105 | 106 | async def users(self, itr_users, num_of_futures=None): 107 | return await self.multi_download(itr_users, URL_GET_USER, num_of_futures, "Download users") 108 | 109 | async def items(self, posts_itr, num_of_futures=None): 110 | return await self.multi_download(posts_itr, URL_GET_POST, num_of_futures, "Download items") 111 | 112 | async def last_n_items(self, n, num_of_futures=None): 113 | max_item = await self.max_item() 114 | return await self.multi_download(range(max_item, max_item - n, -1), URL_GET_POST, num_of_futures, 115 | "Download last N posts") 116 | --------------------------------------------------------------------------------