├── examples
├── __init__.py
├── basic_example.py
└── advance_example.py
├── requirements.txt
├── tests
├── __init__.py
└── test_asyncio_hn.py
├── MANIFEST.in
├── asyncio_hn
├── __init__.py
└── hn.py
├── test_requirements.txt
├── .editorconfig
├── LICENSE
├── .gitignore
├── setup.py
├── README.md
└── README
/examples/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | aiohttp==1.3.5
3 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include test_requirements.txt
--------------------------------------------------------------------------------
/asyncio_hn/__init__.py:
--------------------------------------------------------------------------------
1 | from asyncio_hn.hn import ClientHN
2 | # __all__ = ["ClientHN"]
3 |
--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
1 | pip==8.1.2
2 | bumpversion==0.5.3
3 | wheel==0.29.0
4 | watchdog==0.8.3
5 | flake8==2.6.0
6 | tox==2.3.1
7 | coverage==4.1
8 | Sphinx==1.4.8
9 | cryptography==1.7
10 | PyYAML==3.11
11 | pytest==2.9.2
12 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 |
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 |
17 | [LICENSE]
18 | insert_final_newline = false
19 |
20 | [Makefile]
21 | indent_style = tab
22 |
--------------------------------------------------------------------------------
/examples/basic_example.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | from asyncio_hn import ClientHN
4 |
5 |
async def main(loop):
    """Download the top-story ids, the first two stories and their authors.

    Args:
        loop: Event loop passed on to ``ClientHN``.
    """
    # We init the client - extension of aiohttp.ClientSession
    async with ClientHN(loop=loop) as hn:
        # Up to 500 top stories (only ids)
        top_story_ids = await hn.top_stories()
        # Download the data of the top 2 stories
        top_posts = await hn.items(top_story_ids[:2])
        # Download the user data for each story. Items without a "by" field
        # are skipped so that no request is made for a user named "None".
        authors = [post.get("by") for post in top_posts if post and post.get("by")]
        users = await hn.users(authors)
15 |
16 |
# Script entry point: obtain the event loop from asyncio and block until the
# example coroutine has finished.
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
20 |
--------------------------------------------------------------------------------
/examples/advance_example.py:
--------------------------------------------------------------------------------
import asyncio
import json

import aiohttp
# ClientHN is exported by the package itself; the package ships no ``hn_old`` module.
from asyncio_hn import ClientHN

# Number of most recent items to download.
N = 1_000_000


async def advance_run(loop):
    """Download the last ``N`` items and write them to ``1_million_posts.json``.

    Args:
        loop: Event loop shared by the TCP connector and the ``ClientHN`` session.
    """
    # We init the client - extension of aiohttp.ClientSession
    conn = aiohttp.TCPConnector(limit=1000, loop=loop)
    async with ClientHN(loop=loop, queue_size=1000, connector=conn, progress_bar=True, debug=True) as hn:
        # Download the last 1,000,000 items
        hn_new_stories = await hn.last_n_items(n=N)
        with open("1_million_posts.json", "w") as f:
            json.dump(hn_new_stories, f)


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(advance_run(loop))
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | MIT License
3 |
4 | Copyright (c) 2017, Itiel Shwartz
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
7 |
8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
9 |
10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11 |
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #Me
2 | .github
3 | .idea
4 | venv
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *,cover
50 | .hypothesis/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 |
59 | # Sphinx documentation
60 | docs/_build/
61 |
62 | # PyBuilder
63 | target/
64 |
65 | # pyenv python configuration file
66 | .python-version
67 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 |
5 | from setuptools import setup
6 |
def _read_lines(filename):
    """Return the non-empty, non-comment lines of a file located next to setup.py."""
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
    with open(path) as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith('#')]


# README is resolved next to setup.py, exactly like the requirement files, so
# the build does not depend on the current working directory.
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'README')) as f:
    readme = f.read()

required = _read_lines('requirements.txt')

# The test requirements are installed on top of the runtime requirements.
test_required = _read_lines('test_requirements.txt') + required

setup(
    name='asyncio_hn',
    version='0.4.0',
    description=" Simple asyncio wrapper to download hackernews",
    long_description=readme + '\n',
    author="Itiel Shwartz",
    author_email='itiel@etlsh.com',
    url='https://github.com/itielshwartz/asyncio-hn',
    packages=[
        'asyncio_hn',
    ],
    include_package_data=True,
    install_requires=required,
    license="MIT license",
    zip_safe=False,
    keywords=['asyncio', 'aiohttp', 'hackernews'],
    classifiers=[
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Natural Language :: English',
        'Programming Language :: Python :: 3.6',
    ],
    test_suite='tests',
    tests_require=test_required,

)
42 |
--------------------------------------------------------------------------------
/tests/test_asyncio_hn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.6
2 | # -*- coding: utf-8 -*-
3 |
4 | import pytest
5 |
6 | from asyncio_hn import ClientHN
7 |
8 |
@pytest.mark.asyncio
async def test_last_n_posts():
    """Asking for the last two items returns exactly two results."""
    async with ClientHN() as client:
        latest_items = await client.last_n_items(2)
        item_count = len(latest_items)
        assert item_count == 2
14 |
15 |
@pytest.mark.asyncio
async def test_download_posts():
    """Every downloaded item with a known id carries the expected author."""
    # (item id, expected author) pairs, checked in this order for each item.
    known_authors = (
        (424242, "1gor"),
        (4242, "PindaxDotCom"),
        (42, "sergei"),
    )
    async with ClientHN() as client:
        downloaded = await client.items((42, 4242, 424242))
        for item in downloaded:
            for item_id, author in known_authors:
                validate_post(item, post_id=item_id, post_creator=author)
24 |
25 |
def validate_post(post, post_id, post_creator):
    """Assert that ``post`` was written by ``post_creator`` when its id is ``post_id``.

    Posts with any other id are ignored.
    """
    if post.get("id") != post_id:
        return
    actual_creator = post.get("by")
    assert post_creator == actual_creator
29 |
30 |
@pytest.mark.asyncio
async def test_best_and_latest():
    """The best-stories list has 200 entries and the new-stories list has 500."""
    async with ClientHN() as client:
        best = await client.best_stories()
        best_count = len(best)
        assert best_count == 200
        newest = await client.new_stories()
        newest_count = len(newest)
        assert newest_count == 500
38 |
39 |
@pytest.mark.asyncio
async def test_download_users():
    """Downloaded user records carry the expected account-creation timestamps."""
    async with ClientHN() as hn:
        users = await hn.users(["maximabramchuk", "anthonybsd"])
        for user in users:
            if user["id"] == "maximabramchuk":
                assert user["created"] == 1441729807
            # The id is compared without stray inner quotes; with them this
            # branch could never match and the assertion was silently skipped.
            if user["id"] == "anthonybsd":
                assert user["created"] == 1436886156
49 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # asyncio-hn
2 | 
3 |
4 | A simple asyncio wrapper to download
5 | [hacker-news](https://news.ycombinator.com/)
6 | with speed and ease.
7 |
8 | The package supports all endpoints of the official API : [hacker-news API](https://github.com/HackerNews/API)
9 |
10 | Development process: [Using asyncio to download hackernews](http://etlsh.com/2017/01/21/using-asyncio-to-download-hacker-news/)
11 |
12 |
13 | ## Installation
14 |
15 | ```shell
16 | pip install asyncio-hn
17 | ```
18 |
19 | ## Usage
20 |
21 | ```python
22 | import asyncio
23 | from asyncio_hn import ClientHN
24 |
25 | async def main(loop):
26 | # We init the client - extension of aiohttp.ClientSession
27 | async with ClientHN(loop=loop) as hn:
28 | # Up to 500 top stories (only ids)
29 | hn_new_stories = await hn.top_stories()
30 | # Download top 10 story data
31 | top_posts = await hn.items(hn_new_stories[:10])
32 | # Download the user data for each story
33 | users = await hn.users([post.get("by") for post in top_posts])
34 |
35 |
36 | if __name__ == '__main__':
37 | loop = asyncio.get_event_loop()
38 | loop.run_until_complete(main(loop))
39 | ```
40 |
41 | ### Advanced usage
42 | Using this config you can reach 1000+ requests/sec.
43 |
44 | ```python
45 | import aiohttp
46 | N = 1_000_000
47 |
48 | async def advance_run(loop):
49 | # We init the client - extension of aiohttp.ClientSession
50 | conn = aiohttp.TCPConnector(limit=1000, loop=loop)
51 | async with ClientHN(loop=loop, queue_size=1000, connector=conn, progress_bar=True, debug=True) as hn:
52 | # Download the last 1,000,000 stories
53 | hn_new_stories = await hn.last_n_items(n=N)
54 | ```
55 |
56 | ## Output example:
57 | Item:
58 | ``` python
59 | item = {'by': 'amzans', 'descendants': 25, 'id': 13566716,
60 | 'kids': [13567061, 13567631, 13567027, 13567055, 13566798, 13567473], 'score': 171, 'time': 1486210548,
61 | 'title': 'Network programming with Go (2012)', 'type': 'story',
62 | 'url': 'https://jannewmarch.gitbooks.io/network-programming-with-go-golang-/content/'},
63 | {'by': 'r3bl', 'descendants': 1, 'id': 13567940, 'kids': [13568249], 'score': 24, 'time': 1486230224,
64 | 'title': 'YouTube removes hundreds of the best climate science videos from the Internet',
65 | 'type': 'story',
66 | 'url': 'http://climatestate.com/2017/02/03/youtube-removes-hundreds-of-the-best-climate-science-videos-from-the-internet/'}
67 | ```
68 | User:
69 | ```python
70 | user = {'created': 1470758993, 'id': 'amzans', 'karma': 174,
71 | 'submitted': [13567884, 13566716, 13566699, 13558456, 13539270, 13539151, 13514498, 13418469, 13417725,
72 | 13416562, 13416097, 13416034, 13415954, 13415894, 13395310, 13394996, 13392554, 12418804,
73 | 12418361, 12413958, 12411992, 12411732, 12411546, 12262383, 12255593]}
74 |
75 | ```
76 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | asyncio-hn
2 | ==========
3 |
4 | A simple asyncio wrapper to download
5 | `hacker-news <https://news.ycombinator.com/>`__ with speed and ease.
6 |
7 | The package supports all endpoints of the official API: `hacker-news
8 | API <https://github.com/HackerNews/API>`__
9 |
10 | Installation
11 | ------------
12 |
13 | .. code:: shell
14 |
15 | pip install asyncio-hn
16 |
17 | Usage
18 | -----
19 |
20 | .. code:: python
21 |
22 | import asyncio
23 | from asyncio_hn import ClientHN
24 |
25 | async def main(loop):
26 | # We init the client - extension of aiohttp.ClientSession
27 | async with ClientHN(loop=loop) as hn:
28 | # Up to 500 top stories (only ids)
29 | hn_new_stories = await hn.top_stories()
30 | # Download top 10 story data
31 | top_posts = await hn.items(hn_new_stories[:10])
32 | # Download the user data for each story
33 | users = await hn.users([post.get("by") for post in top_posts])
34 |
35 |
36 | if __name__ == '__main__':
37 | loop = asyncio.get_event_loop()
38 | loop.run_until_complete(main(loop))
39 |
40 | Advanced usage
41 | ~~~~~~~~~~~~~~
42 |
43 | Using this config you can reach 1000+ requests/sec.
44 |
45 | .. code:: python
46 | import aiohttp
47 | from asyncio_hn import ClientHN
48 |
49 | N = 1_000_000
50 |
51 | async def advance_run(loop):
52 | # We init the client - extension of aiohttp.ClientSession
53 | conn = aiohttp.TCPConnector(limit=1000, loop=loop)
54 | async with ClientHN(loop=loop, queue_size=1000, connector=conn, progress_bar=True, debug=True) as hn:
55 | # Download the last 1,000,000 stories
56 | hn_new_stories = await hn.last_n_items(n=N)
57 |
58 | Output example:
59 | ---------------
60 |
61 | Item:
62 |
63 | .. code:: python
64 |
65 | item = {'by': 'amzans', 'descendants': 25, 'id': 13566716,
66 | 'kids': [13567061, 13567631, 13567027, 13567055, 13566798, 13567473], 'score': 171, 'time': 1486210548,
67 | 'title': 'Network programming with Go (2012)', 'type': 'story',
68 | 'url': 'https://jannewmarch.gitbooks.io/network-programming-with-go-golang-/content/'},
69 | {'by': 'r3bl', 'descendants': 1, 'id': 13567940, 'kids': [13568249], 'score': 24, 'time': 1486230224,
70 | 'title': 'YouTube removes hundreds of the best climate science videos from the Internet',
71 | 'type': 'story',
72 | 'url': 'http://climatestate.com/2017/02/03/youtube-removes-hundreds-of-the-best-climate-science-videos-from-the-internet/'}
73 |
74 | User:
75 |
76 | .. code:: python
77 |
78 | user = {'created': 1470758993, 'id': 'amzans', 'karma': 174,
79 | 'submitted': [13567884, 13566716, 13566699, 13558456, 13539270, 13539151, 13514498, 13418469, 13417725,
80 | 13416562, 13416097, 13416034, 13415954, 13415894, 13395310, 13394996, 13392554, 12418804,
81 | 12418361, 12413958, 12411992, 12411732, 12411546, 12262383, 12255593]}
82 |
83 |
--------------------------------------------------------------------------------
/asyncio_hn/hn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.6
2 | import asyncio
3 | import logging
4 |
5 | import aiohttp
6 | import tqdm
7 | from aiohttp import HttpProcessingError
8 |
# Endpoints of the Hacker News Firebase API (v0). The item and user URLs
# contain a "{}" placeholder that is filled in with str.format; the remaining
# URLs are requested as they are.
URL_GET_POST = "https://hacker-news.firebaseio.com/v0/item/{}.json"
URL_GET_USER = "https://hacker-news.firebaseio.com/v0/user/{}.json"
URL_MAX_ITEM = 'https://hacker-news.firebaseio.com/v0/maxitem.json'
URL_TOP_STORIES = "https://hacker-news.firebaseio.com/v0/topstories.json"
URL_NEW_STORIES = "https://hacker-news.firebaseio.com/v0/newstories.json"
URL_BEST_STORIES = "https://hacker-news.firebaseio.com/v0/beststories.json"
URL_UPDATES = "https://hacker-news.firebaseio.com/v0/updates.json"
URL_ASK_STORIES = "https://hacker-news.firebaseio.com/v0/askstories.json"
URL_SHOW_STORIES = "https://hacker-news.firebaseio.com/v0/showstories.json"
URL_JOB_STORIES = "https://hacker-news.firebaseio.com/v0/jobstories.json"
# The maximum number of TCP connections we open
MAX_CONNECTION = 1000

# Setting up the module logger; a console handler is attached at import time.
# NOTE(review): libraries usually leave handler configuration to the
# application (see the logging HOWTO) - confirm this handler is intended.
logger = logging.getLogger(__name__)
console = logging.StreamHandler()
logger.addHandler(console)
26 |
27 |
class ClientHN(aiohttp.ClientSession):
    """``aiohttp.ClientSession`` extension for downloading Hacker News data.

    Story lists, ``maxitem`` and ``updates`` are fetched with a single
    request. Bulk downloads of items or users are spread over a pool of
    consumer coroutines fed from a bounded queue; URLs whose request fails
    are moved to a dead letter queue (DLQ) and retried by a second pool.
    """

    def __init__(self, queue_size=10, progress_bar=False, debug=False, num_dlq_consumers=10, **kwargs):
        """Create the session.

        Args:
            queue_size: Maximum number of URLs buffered between the producer
                and the consumers.
            progress_bar: Show a tqdm progress bar while URLs are queued.
            debug: Set this module's logger to DEBUG level.
            num_dlq_consumers: Number of consumers retrying failed URLs.
            **kwargs: Passed unchanged to ``aiohttp.ClientSession``.
        """
        super(ClientHN, self).__init__(**kwargs)
        self.queue_size = queue_size
        self.connector_limit = self.connector.limit
        self._responses = []
        self.progress_bar = progress_bar
        self.num_dlq_consumers = num_dlq_consumers
        if debug:
            logger.setLevel(logging.DEBUG)

    async def single_download(self, url):
        """Request ``url`` once and return the decoded JSON body."""
        async with self.get(url) as resp:
            return await resp.json()

    async def multi_download(self, itr, url, num_of_consumers=None, desc=""):
        """Download ``url`` formatted with every element of ``itr``.

        Args:
            itr: Iterable of values substituted into ``url``.
            url: URL template containing one ``{}`` placeholder.
            num_of_consumers: Number of concurrent consumers; when omitted it
                is the smaller of the connector limit and ``len(itr)``.
            desc: Label for the progress bar.

        Returns:
            List of decoded JSON responses. Results are appended as requests
            complete, so the order need not match the order of ``itr``.
        """
        queue = asyncio.Queue(maxsize=self.queue_size)
        dlq = asyncio.Queue()
        responses = []
        if not num_of_consumers:
            # The connector limit may be falsy when no limit is configured
            # (None or 0, depending on the aiohttp version); fall back to
            # MAX_CONNECTION in that case.
            limit = self.connector_limit or MAX_CONNECTION
            # Always keep at least one consumer, even for an empty iterable.
            num_of_consumers = max(1, min(limit, self.try_get_itr_len(itr)))
        workers = [asyncio.ensure_future(self._consumer(main_queue=queue, dlq=dlq, responses=responses))
                   for _ in range(num_of_consumers)]
        # The DLQ consumers read from and re-queue into the same queue.
        workers += [asyncio.ensure_future(self._consumer(dlq, dlq, responses))
                    for _ in range(self.num_dlq_consumers)]
        try:
            await self._produce(itr, url, queue, desc=desc)
            await queue.join()
            await dlq.join()
        finally:
            # Stop the consumers even when producing failed, and wait until
            # the cancellations are processed so no task is left pending.
            for worker in workers:
                worker.cancel()
            await asyncio.gather(*workers, return_exceptions=True)
        return responses

    def try_get_itr_len(self, itr):
        """Return ``len(itr)``, or 1000000 when ``itr`` has no length."""
        try:
            return len(itr)
        except TypeError:
            return 1000000

    async def _produce(self, items, base_url, queue, desc=""):
        """Format ``base_url`` with every item and put the result on ``queue``."""
        for item in tqdm.tqdm(items, desc=desc + " (Estimation)", disable=not self.progress_bar):
            await queue.put(base_url.format(item))

    async def _consumer(self, main_queue, dlq, responses):
        """Forever take URLs from ``main_queue`` and download them.

        Decoded JSON bodies are appended to ``responses``. A URL whose
        request fails with an HTTP, connection or timeout error is put on
        ``dlq``. Every URL taken from ``main_queue`` is marked as done, so
        ``main_queue.join()`` cannot wait forever on a failed item; any other
        exception still ends this consumer.
        """
        while True:
            # Waiting for work happens outside the try block: a cancellation
            # raised here must not mark an item as done.
            url = await main_queue.get()
            try:
                async with self.get(url, timeout=10) as response:
                    response.raise_for_status()
                    responses.append(await response.json())
            except (HttpProcessingError, aiohttp.ClientError, asyncio.TimeoutError):
                logger.debug("Problem with %s, Moving to DLQ", url)
                # Re-queue before task_done() below so that a join() on the
                # DLQ never sees it momentarily empty.
                await dlq.put(url)
            finally:
                # Notify the queue that the item has been processed
                main_queue.task_done()

    async def top_stories(self):
        """Return the response of the top-stories endpoint."""
        return await self.single_download(URL_TOP_STORIES)

    async def best_stories(self):
        """Return the response of the best-stories endpoint."""
        return await self.single_download(URL_BEST_STORIES)

    async def new_stories(self):
        """Return the response of the new-stories endpoint."""
        return await self.single_download(URL_NEW_STORIES)

    async def ask_stories(self):
        """Return the response of the ask-stories endpoint."""
        return await self.single_download(URL_ASK_STORIES)

    async def show_stories(self):
        """Return the response of the show-stories endpoint."""
        return await self.single_download(URL_SHOW_STORIES)

    async def updates(self):
        """Return the response of the updates endpoint."""
        return await self.single_download(URL_UPDATES)

    async def job_stories(self):
        """Return the response of the job-stories endpoint."""
        return await self.single_download(URL_JOB_STORIES)

    async def max_item(self):
        """Return the response of the max-item endpoint."""
        return await self.single_download(URL_MAX_ITEM)

    async def users(self, itr_users, num_of_futures=None):
        """Download the user record of every id in ``itr_users``."""
        return await self.multi_download(itr_users, URL_GET_USER, num_of_futures, "Download users")

    async def items(self, posts_itr, num_of_futures=None):
        """Download the item record of every id in ``posts_itr``."""
        return await self.multi_download(posts_itr, URL_GET_POST, num_of_futures, "Download items")

    async def last_n_items(self, n, num_of_futures=None):
        """Download the ``n`` items with the highest ids, newest first."""
        max_item = await self.max_item()
        # Clamp the lower bound so that no ids of zero or below are requested
        # when n exceeds the current maximum id.
        oldest_excluded = max(max_item - n, 0)
        return await self.multi_download(range(max_item, oldest_excluded, -1), URL_GET_POST, num_of_futures,
                                         "Download last N posts")
116 |
--------------------------------------------------------------------------------