├── requirements.txt
├── README.md
└── async_crawler.py

/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==1.2.0
async-timeout==1.1.0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Tangrowth [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/mehmetkose/python3.5-async-crawler/edit/master/README.md)

A Python 3.6 async crawler example with aiohttp and asyncio.

![Image of Tangrowth](https://assets.pokemon.com/assets/cms2/img/pokedex/detail/465.png)

## Installation

### Install Python 3.6

```bash
sudo add-apt-repository ppa:fkrull/deadsnakes
sudo apt-get update
sudo apt-get install python3.6
```

### With virtualenvwrapper

```bash
mkvirtualenv async_crawler --python=/usr/bin/python3.6
pip install -r requirements.txt
python async_crawler.py
```

### Or: replace python3 with python3.6

```bash
sudo mv /usr/bin/python3 /usr/bin/python3-backup
sudo ln -s /usr/bin/python3.6 /usr/bin/python3

sudo apt-get install python3-pip
sudo pip3 install -r requirements.txt
python3 async_crawler.py
```
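### Alternatively: with the standard venv module

If virtualenvwrapper is not available, the standard library's `venv` module gives an equivalent setup. This is a minimal sketch; the environment name `crawler-env` is arbitrary and `python3.6` is assumed to be on the PATH.

```bash
python3.6 -m venv crawler-env
source crawler-env/bin/activate
pip install -r requirements.txt
python async_crawler.py
```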
--------------------------------------------------------------------------------
/async_crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Python 3.6 async web crawler.
# https://github.com/mehmetkose/python3.5-async-crawler

# Licensed under the MIT license:
# http://www.opensource.org/licenses/mit-license
# Copyright (c) 2016 Mehmet Kose mehmet@linux.com


import aiohttp
import asyncio
import async_timeout
from urllib.parse import urljoin, urldefrag


root_url = "http://python.org/"
# Seed URLs: the home page plus its sitemap and robots.txt.
crawled_urls = []
url_hub = [root_url, urljoin(root_url, "sitemap.xml"), urljoin(root_url, "robots.txt")]
headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}


async def get_body(url):
    """Fetch a URL and return a dict holding either the HTML body or the error."""
    async with aiohttp.ClientSession() as session:
        try:
            with async_timeout.timeout(10):
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        html = await response.text()
                        return {'error': '', 'html': html}
                    else:
                        return {'error': response.status, 'html': ''}
        except Exception as err:
            return {'error': err, 'html': ''}


async def handle_task(task_id, work_queue):
    """Worker coroutine: fetch queued URLs and enqueue any same-site links found."""
    while not work_queue.empty():
        queue_url = await work_queue.get()
        if queue_url not in crawled_urls:
            crawled_urls.append(queue_url)
            body = await get_body(queue_url)
            if not body['error']:
                for new_url in get_urls(body['html']):
                    if root_url in new_url and new_url not in crawled_urls:
                        work_queue.put_nowait(new_url)
            else:
                print(f"Error: {body['error']} - {queue_url}")


def remove_fragment(url):
    """Strip the #fragment part of a URL."""
    pure_url, _ = urldefrag(url)
    return pure_url


def get_urls(html):
    """Extract href targets from the page and resolve them against root_url."""
    new_urls = [url.split('"')[0] for url in str(html).replace("'", '"').split('href="')[1:]]
    return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]


if __name__ == "__main__":
    # Seed the queue, run three workers to completion, then report what was crawled.
    q = asyncio.Queue()
    for url in url_hub:
        q.put_nowait(url)
    loop = asyncio.get_event_loop()
    tasks = [handle_task(task_id, q) for task_id in range(3)]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    for u in crawled_urls:
        print(u)
    print('-' * 30)
    print(len(crawled_urls))
--------------------------------------------------------------------------------
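The `get_urls` helper above extracts links by splitting the page text on `href="`, which works for well-formed pages but can miss or mangle links with unusual quoting. As a point of comparison, here is a sketch of the same extraction done with the standard library's `html.parser`. It is not part of the repository, and the names `LinkExtractor` and `extract_links` are illustrative.

```python
# Sketch of an alternative to get_urls() using the standard library's HTML
# parser instead of string splitting. Illustrative only, not part of the repo.
from html.parser import HTMLParser
from urllib.parse import urljoin, urldefrag


class LinkExtractor(HTMLParser):
    """Collect absolute, fragment-free URLs from href attributes."""

    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag being opened.
        for name, value in attrs:
            if name == "href" and value:
                absolute, _fragment = urldefrag(urljoin(self.base_url, value))
                self.links.add(absolute)


def extract_links(html, base_url):
    parser = LinkExtractor(base_url)
    parser.feed(str(html))
    return parser.links
```

Inside `handle_task`, a drop-in use would be `extract_links(body['html'], root_url)` in place of `get_urls(body['html'])`; returning a set also deduplicates links within a single page before they reach the queue.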