├── requirements.txt
├── README.md
└── async_crawler.py

/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==1.2.0
async-timeout==1.1.0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Tangrowth [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/mehmetkose/python3.5-async-crawler/edit/master/README.md)

A Python 3.6 async crawler example with aiohttp and asyncio.

![Image of Tangrowth](https://assets.pokemon.com/assets/cms2/img/pokedex/detail/465.png)

## Installation

### Install Python 3.6

```bash
sudo add-apt-repository ppa:fkrull/deadsnakes
sudo apt-get update
sudo apt-get install python3.6
```

### With virtualenvwrapper

```bash
mkvirtualenv async_crawler --python=/usr/bin/python3.6
pip install -r requirements.txt
python async_crawler.py
```

### Or: replace python3 with python3.6

```bash
sudo mv /usr/bin/python3 /usr/bin/python3-backup
sudo ln -s /usr/bin/python3.6 /usr/bin/python3

sudo apt-get install python3-pip
sudo pip3 install -r requirements.txt
python3 async_crawler.py
```
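### Alternatively: with the standard venv module

If virtualenvwrapper is not available, the standard library's `venv` module gives an equivalent setup. This is a minimal sketch; the environment name `crawler-env` is arbitrary and `python3.6` is assumed to be on the PATH.

```bash
python3.6 -m venv crawler-env
source crawler-env/bin/activate
pip install -r requirements.txt
python async_crawler.py
```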
--------------------------------------------------------------------------------
/async_crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Python 3.6 async web crawler.
# https://github.com/mehmetkose/python3.5-async-crawler

# Licensed under the MIT license:
# http://www.opensource.org/licenses/mit-license
# Copyright (c) 2016 Mehmet Kose mehmet@linux.com


import aiohttp
import asyncio
import async_timeout
from urllib.parse import urljoin, urldefrag


root_url = "http://python.org/"
# Seed URLs: the home page plus its sitemap and robots.txt.
crawled_urls = []
url_hub = [root_url, urljoin(root_url, "sitemap.xml"), urljoin(root_url, "robots.txt")]
headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}


async def get_body(url):
    """Fetch a URL and return a dict holding either the HTML body or the error."""
    async with aiohttp.ClientSession() as session:
        try:
            with async_timeout.timeout(10):
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        html = await response.text()
                        return {'error': '', 'html': html}
                    else:
                        return {'error': response.status, 'html': ''}
        except Exception as err:
            return {'error': err, 'html': ''}


async def handle_task(task_id, work_queue):
    """Worker coroutine: fetch queued URLs and enqueue any same-site links found."""
    while not work_queue.empty():
        queue_url = await work_queue.get()
        if queue_url not in crawled_urls:
            crawled_urls.append(queue_url)
            body = await get_body(queue_url)
            if not body['error']:
                for new_url in get_urls(body['html']):
                    if root_url in new_url and new_url not in crawled_urls:
                        work_queue.put_nowait(new_url)
            else:
                print(f"Error: {body['error']} - {queue_url}")


def remove_fragment(url):
    """Strip the #fragment part of a URL."""
    pure_url, _ = urldefrag(url)
    return pure_url


def get_urls(html):
    """Extract href targets from the page and resolve them against root_url."""
    new_urls = [url.split('"')[0] for url in str(html).replace("'", '"').split('href="')[1:]]
    return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]


if __name__ == "__main__":
    # Seed the queue, run three workers to completion, then report what was crawled.
    q = asyncio.Queue()
    for url in url_hub:
        q.put_nowait(url)
    loop = asyncio.get_event_loop()
    tasks = [handle_task(task_id, q) for task_id in range(3)]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    for u in crawled_urls:
        print(u)
    print('-' * 30)
    print(len(crawled_urls))
--------------------------------------------------------------------------------
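The `get_urls` helper above extracts links by splitting the page text on `href="`, which works for well-formed pages but can miss or mangle links with unusual quoting. As a point of comparison, here is a sketch of the same extraction done with the standard library's `html.parser`. It is not part of the repository, and the names `LinkExtractor` and `extract_links` are illustrative.

```python
# Sketch of an alternative to get_urls() using the standard library's HTML
# parser instead of string splitting. Illustrative only, not part of the repo.
from html.parser import HTMLParser
from urllib.parse import urljoin, urldefrag


class LinkExtractor(HTMLParser):
    """Collect absolute, fragment-free URLs from href attributes."""

    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag being opened.
        for name, value in attrs:
            if name == "href" and value:
                absolute, _fragment = urldefrag(urljoin(self.base_url, value))
                self.links.add(absolute)


def extract_links(html, base_url):
    parser = LinkExtractor(base_url)
    parser.feed(str(html))
    return parser.links
```

Inside `handle_task`, a drop-in use would be `extract_links(body['html'], root_url)` in place of `get_urls(body['html'])`; returning a set also deduplicates links within a single page before they reach the queue.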