├── .gitignore ├── LICENSE ├── README.md ├── aiocrawler ├── __init__.py ├── cmdline.py ├── constants.py ├── crawler.py ├── exceptions.py ├── logger.py ├── queues │ ├── __init__.py │ └── redis_queue.py ├── request.py └── responses │ ├── __init__.py │ ├── responses.py │ └── wrap.py ├── examples ├── lianjia_crawl.py └── qisuu_crawl.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | .idea/ 4 | # build-related files 5 | aiocrawler.egg-info/ 6 | build/ 7 | dist/ 8 | download/ 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 CodingCrush 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AioCrawler 2 | Async crawler framework based on asyncio 3 | -------------------------------------------------------------------------------- /aiocrawler/__init__.py: -------------------------------------------------------------------------------- 1 | from aiocrawler.crawler import AioCrawler 2 | from aiocrawler import responses 3 | 4 | 5 | __version__ = '0.0.2-dev' 6 | 7 | __all__ = [AioCrawler] + responses.__all__ 8 | -------------------------------------------------------------------------------- /aiocrawler/cmdline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodingCrush/AioCrawler/54d18e0bb48f698d71420fee75a19c8cc904dad3/aiocrawler/cmdline.py -------------------------------------------------------------------------------- /aiocrawler/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | # aio download buffer chunk size: 500kb 5 | DOWNLOAD_CHUNK_SIZE = 512000 6 | 7 | DEFAULT_TIMEOUT = 20 8 | DEFAULT_CONCURRENCY = 20 9 | DEFAULT_MAX_TRIES = 3 10 | 11 | WORKING_DIR = os.getcwd() 12 | 13 | 14 | METHOD_HEAD = 'HEAD' 15 | METHOD_GET = 'GET' 16 | METHOD_DELETE = 'DELETE' 17 | METHOD_OPTIONS = 'OPTIONS' 18 | METHOD_PATCH = 'PATCH' 19 | METHOD_POST = 'POST' 20 | METHOD_PUT = 'PUT' 21 | 22 | # aiohttp autogenerates headers like User-Agent or Content-Type 23 | # if these headers are not explicitly passed. 
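# AioCrawler._update_kwargs_headers passes skip_auto_headers=AIOHTTP_AUTO_HEADERS whenever a headers factory is used, so aiohttp does not add its own defaults for any header the factory leaves out.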
24 | AIOHTTP_AUTO_HEADERS = ("User-Agent", "Content-Type") 25 | 26 | # if response.status not in the set, drop the response. 27 | NORMAL_STATUS_CODES = (200, 201) 28 | 29 | 30 | # sleep for 0.3s when aioredis queue raise QueueFull. 31 | QUEUE_BLOCK_SLEEP_INTERVAL = 0.3 32 | 33 | # Connection pool size. 34 | AIOREDIS_POOL_MIN_SIZE = 1 35 | AIOREDIS_POOL_MAX_SIZE = 5 36 | -------------------------------------------------------------------------------- /aiocrawler/crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import async_timeout 3 | import aiohttp 4 | import aiofiles 5 | import inspect 6 | import os 7 | from datetime import datetime 8 | from urllib import parse as urlparse 9 | from pathlib import Path 10 | from aiocrawler.responses import wrap_response 11 | from aiocrawler.logger import create_logger 12 | from aiocrawler.constants import DOWNLOAD_CHUNK_SIZE, WORKING_DIR, \ 13 | METHOD_DELETE, METHOD_GET, METHOD_HEAD, METHOD_OPTIONS, METHOD_PATCH, \ 14 | METHOD_POST, METHOD_PUT, DEFAULT_TIMEOUT, DEFAULT_CONCURRENCY, \ 15 | DEFAULT_MAX_TRIES, AIOHTTP_AUTO_HEADERS, NORMAL_STATUS_CODES 16 | 17 | try: 18 | import uvloop as async_loop 19 | except ImportError: 20 | async_loop = asyncio 21 | 22 | 23 | class AioCrawler(object): 24 | name = None 25 | concurrency = DEFAULT_CONCURRENCY 26 | timeout = DEFAULT_TIMEOUT 27 | max_tries = DEFAULT_MAX_TRIES 28 | headers = None 29 | loop = None 30 | logger = None 31 | debug = False 32 | 33 | _failed_urls = set() 34 | _seen_urls = set() 35 | 36 | def __init__(self, **kwargs): 37 | self.name = getattr(self, 'name') or self.__class__.__name__ 38 | 39 | self.loop = getattr(self, 'loop') or async_loop.new_event_loop() 40 | asyncio.set_event_loop(self.loop) 41 | 42 | self.ac_session = aiohttp.ClientSession(loop=self.loop) 43 | 44 | # Lifo Queue for Stashing all tasks to be done. 
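        # Each queued item is an un-awaited _request() coroutine; worker tasks pop and await them, and LIFO order means the most recently scheduled requests are crawled first (roughly depth-first).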
45 | self._tasks_que = asyncio.LifoQueue(loop=self.loop) 46 | 47 | self.logger = getattr(self, 'logger') or create_logger(self) 48 | 49 | self.__dict__.update(kwargs) 50 | 51 | def on_start(self): 52 | raise NotImplementedError() 53 | 54 | @staticmethod 55 | def get_request_url(url, params): 56 | if params is None: 57 | params = {} 58 | url_parts = list(urlparse.urlparse(url)) 59 | query = dict(urlparse.parse_qsl(url_parts[4])) 60 | query.update(params) 61 | url_parts[4] = urlparse.urlencode(query) 62 | return urlparse.urlunparse(url_parts) 63 | 64 | def _update_kwargs_headers(self, **kwargs): 65 | headers = kwargs.get("headers") or self.headers 66 | if callable(headers): 67 | # avoid aiohttp autogenerates headers 68 | kwargs["skip_auto_headers"] = AIOHTTP_AUTO_HEADERS 69 | kwargs["headers"] = headers() 70 | if isinstance(headers, dict): 71 | kwargs["headers"] = headers 72 | return kwargs 73 | 74 | async def _request(self, url, parser=None, sleep=None, 75 | method=None, file=None, **kwargs): 76 | # If url is list, set, or tuple.Then Split the tasks 77 | if not isinstance(url, str): 78 | if hasattr(url, "__iter__"): 79 | for _url in url: 80 | self._tasks_que.put_nowait(self._request( 81 | _url, sleep=sleep, method=method, file=file, 82 | parser=parser, **kwargs 83 | )) 84 | return 85 | else: 86 | url = str(url) 87 | 88 | http_method_request = getattr(self.ac_session, method.lower()) 89 | this_request_url = self.get_request_url(url, kwargs.get('params')) 90 | 91 | kwargs = self._update_kwargs_headers(**kwargs) 92 | 93 | # try max_tries if fail 94 | for try_count in range(1, self.max_tries+1): 95 | try: 96 | with async_timeout.timeout(self.timeout): 97 | response = await http_method_request(url, **kwargs) 98 | self._seen_urls.add(this_request_url) 99 | break 100 | except aiohttp.ClientError: 101 | self.logger.debug("[{}] {} [ClientError][Try:{}]".format( 102 | method, this_request_url, try_count) 103 | ) 104 | except asyncio.TimeoutError: 105 | self.logger.debug("[{}] {} [TimeoutError][Try:{}]".format( 106 | method, this_request_url, try_count) 107 | ) 108 | await asyncio.sleep(sleep or 0) 109 | else: # still fail 110 | self._failed_urls.add(this_request_url) 111 | return self.logger.error("[{}] {} [Failure][Try:{}]".format( 112 | method, this_request_url, self.max_tries) 113 | ) 114 | 115 | self.logger.info("[{}] {} [{} {}]".format( 116 | method, this_request_url, response.status, response.reason) 117 | ) 118 | 119 | if response.status not in NORMAL_STATUS_CODES: 120 | return 121 | 122 | if sleep is not None: 123 | await asyncio.sleep(sleep) 124 | 125 | if parser is None: 126 | return 127 | if parser is self._download: 128 | await parser(response, file) 129 | return self.logger.info("[DOWNLOAD]:{}".format(file)) 130 | 131 | response = await wrap_response(response) 132 | 133 | if inspect.iscoroutinefunction(parser) or \ 134 | inspect.isawaitable(parser): 135 | return await parser(response) 136 | else: 137 | return parser(response) 138 | 139 | # real download method 140 | @staticmethod 141 | async def _download(response, file): 142 | async with aiofiles.open(file, 'wb') as fd: 143 | while True: 144 | chunk = await response.content.read(DOWNLOAD_CHUNK_SIZE) 145 | if not chunk: 146 | break 147 | await fd.write(chunk) 148 | await fd.flush() 149 | 150 | def download(self, url, save_dir=WORKING_DIR, headers=None, filename=None, 151 | params=None, sleep=None, allow_redirects=True, **kwargs): 152 | # recursively mkdir, ignore error when directory exists 153 | if not os.path.exists(save_dir): 154 | 
path = Path(save_dir) 155 | path.mkdir(parents=True, exist_ok=True) 156 | file = os.path.join(save_dir, filename) 157 | 158 | self._tasks_que.put_nowait(self._request( 159 | url, sleep=sleep, params=params, method=METHOD_GET, 160 | headers=headers, file=file, parser=self._download, 161 | allow_redirects=allow_redirects, **kwargs 162 | )) 163 | 164 | def get(self, urls, params=None, parser=None, headers=None, 165 | sleep=None, allow_redirects=True, **kwargs): 166 | self._tasks_que.put_nowait(self._request( 167 | urls, parser=parser, sleep=sleep, method=METHOD_GET, 168 | params=params, headers=headers, allow_redirects=allow_redirects, 169 | **kwargs 170 | )) 171 | 172 | def post(self, urls, data=None, json=None, parser=None, headers=None, 173 | sleep=None, allow_redirects=True, **kwargs): 174 | self._tasks_que.put_nowait(self._request( 175 | urls, parser=parser, sleep=sleep, method=METHOD_POST, 176 | headers=headers, data=data, json=json, 177 | allow_redirects=allow_redirects, **kwargs 178 | )) 179 | 180 | def patch(self, urls, data=None, json=None, parser=None, headers=None, 181 | sleep=None, allow_redirects=True, **kwargs): 182 | self._tasks_que.put_nowait(self._request( 183 | urls, parser=parser, sleep=sleep, method=METHOD_PATCH, 184 | headers=headers, data=data, json=json, 185 | allow_redirects=allow_redirects, **kwargs 186 | )) 187 | 188 | def put(self, urls, data=None, json=None, parser=None, 189 | sleep=None, allow_redirects=True, **kwargs): 190 | self._tasks_que.put_nowait(self._request( 191 | urls, parser=parser, sleep=sleep, method=METHOD_PUT, data=data, 192 | json=json, allow_redirects=allow_redirects, **kwargs 193 | )) 194 | 195 | def head(self, urls, parser=None, sleep=None, headers=None, 196 | allow_redirects=True, **kwargs): 197 | self._tasks_que.put_nowait(self._request( 198 | urls, parser=parser, sleep=sleep, method=METHOD_HEAD, 199 | headers=headers, allow_redirects=allow_redirects, **kwargs 200 | )) 201 | 202 | def delete(self, urls, parser=None, sleep=None, headers=None, 203 | allow_redirects=True, **kwargs): 204 | self._tasks_que.put_nowait(self._request( 205 | urls, parser=parser, sleep=sleep, method=METHOD_DELETE, 206 | headers=headers, allow_redirects=allow_redirects, **kwargs 207 | )) 208 | 209 | def options(self, urls, parser=None, sleep=None, headers=None, 210 | allow_redirects=True, **kwargs): 211 | self._tasks_que.put_nowait(self._request( 212 | urls, parser=parser, sleep=sleep, method=METHOD_OPTIONS, 213 | headers=headers, allow_redirects=allow_redirects, **kwargs 214 | )) 215 | 216 | async def workers(self): 217 | while True: 218 | task = await self._tasks_que.get() 219 | await task 220 | self._tasks_que.task_done() 221 | 222 | async def work(self): 223 | if inspect.iscoroutinefunction(self.on_start): 224 | await self.on_start() 225 | else: 226 | self.on_start() 227 | 228 | workers = [ 229 | asyncio.Task(self.workers(), loop=self.loop) 230 | for _ in range(self.concurrency) 231 | ] 232 | 233 | await self._tasks_que.join() 234 | for worker in workers: 235 | worker.cancel() 236 | 237 | def run(self): 238 | start_at = datetime.now() 239 | self.logger.info('{} Started, Concurrency:{}'.format( 240 | self.name, self.concurrency)) 241 | try: 242 | self.loop.run_until_complete(self.work()) 243 | except KeyboardInterrupt: 244 | for task in asyncio.Task.all_tasks(): 245 | task.cancel() 246 | except asyncio.CancelledError: 247 | pass # All tasks has been cancelled 248 | finally: 249 | end_at = datetime.now() 250 | self.logger.info( 251 | '{} Finished in {} seconds.' 
252 | ' Success:{}, Failure:{}'.format( 253 | self.name, (end_at-start_at).total_seconds(), 254 | len(self._seen_urls), len(self._failed_urls)) 255 | ) 256 | self.ac_session.close() 257 | self.loop.close() 258 | 259 | def __call__(self, *args, **kwargs): 260 | self.run() 261 | -------------------------------------------------------------------------------- /aiocrawler/exceptions.py: -------------------------------------------------------------------------------- 1 | class JsonDecodeError(Exception): 2 | pass 3 | 4 | 5 | class QueueEmpty(Exception): 6 | """Exception raised when Queue.get_nowait() is called on a Queue object 7 | which is empty. 8 | """ 9 | pass 10 | 11 | 12 | class QueueFull(Exception): 13 | """Exception raised when the Queue.put_nowait() method is called on a Queue 14 | object which is full. 15 | """ 16 | pass 17 | -------------------------------------------------------------------------------- /aiocrawler/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def has_level_handler(logger): 6 | """Check if there is a handler in the logging chain that will handle the 7 | given logger's :meth:`effective level <~logging.Logger.getEffectiveLevel>`. 8 | """ 9 | level = logger.getEffectiveLevel() 10 | current = logger 11 | 12 | while current: 13 | if any(handler.level <= level for handler in current.handlers): 14 | return True 15 | if not current.propagate: 16 | break 17 | current = current.parent 18 | 19 | return False 20 | 21 | 22 | default_handler = logging.StreamHandler(sys.stdout) 23 | default_handler.setFormatter(logging.Formatter( 24 | '[%(asctime)s] %(levelname)s in %(module)s: %(message)s' 25 | )) 26 | 27 | 28 | def create_logger(instance): 29 | """Get the AioCrawler instance's logger and configure it. 30 | When instance.debug is enabled, set the logger level to 31 | logging.DEBUG if it is not already set.
32 | """ 33 | logger = logging.getLogger(instance.__class__.__name__) 34 | 35 | if instance.debug and logger.level == logging.NOTSET: 36 | logger.setLevel(logging.DEBUG) 37 | 38 | if not has_level_handler(logger): 39 | logger.addHandler(default_handler) 40 | 41 | return logger 42 | -------------------------------------------------------------------------------- /aiocrawler/queues/__init__.py: -------------------------------------------------------------------------------- 1 | from .redis_queue import AioRedisQueue, AioRedisLifoQueue 2 | 3 | 4 | __all__ = ["AioRedisQueue", "AioRedisLifoQueue"] 5 | -------------------------------------------------------------------------------- /aiocrawler/queues/redis_queue.py: -------------------------------------------------------------------------------- 1 | import aioredis 2 | import asyncio 3 | import time 4 | import umsgpack 5 | from aiocrawler.exceptions import QueueEmpty, QueueFull 6 | from aiocrawler.crawler import async_loop 7 | from aiocrawler.constants import QUEUE_BLOCK_SLEEP_INTERVAL, \ 8 | AIOREDIS_POOL_MIN_SIZE, AIOREDIS_POOL_MAX_SIZE 9 | 10 | 11 | class AioRedisQueue(object): 12 | 13 | def __init__(self, name, host="localhost", port=6379, db=0, max_size=0, 14 | password=None, loop=None, timeout=None, ssl=None, 15 | encoding=None): 16 | self._name = name 17 | self._host = host 18 | self._port = port 19 | self._db = db 20 | self._password = password 21 | self._timeout = timeout 22 | self._ssl = ssl 23 | self._encoding = encoding 24 | if loop is None: 25 | self._loop = async_loop.new_event_loop() 26 | else: 27 | self._loop = loop 28 | self._max_size = max_size 29 | self._pool = None 30 | 31 | async def connect(self): 32 | async with asyncio.Lock(): 33 | self._pool = await aioredis.create_pool( 34 | (self._host, self._port), db=self._db, password=self._password, 35 | ssl=self._ssl, encoding=self._encoding, loop=self._loop, 36 | minsize=AIOREDIS_POOL_MIN_SIZE, maxsize=AIOREDIS_POOL_MAX_SIZE 37 | ) 38 | 39 | async def close(self): 40 | if self._pool: 41 | self._pool.close() 42 | await self._pool.wait_closed() 43 | 44 | async def qsize(self): 45 | if not self._pool: 46 | await self.connect() 47 | return await self._pool.execute("LLEN", self._name) 48 | 49 | async def empty(self): 50 | return await self.qsize() == 0 51 | 52 | async def full(self): 53 | if not self._max_size: 54 | return False 55 | return await self.qsize() >= self._max_size 56 | 57 | # does not wait when the queue is full (raises QueueFull instead), but is still a coroutine 58 | async def put_nowait(self, item): 59 | if not self._pool: 60 | await self.connect() 61 | if await self.full(): 62 | raise QueueFull 63 | await self._pool.execute("RPUSH", self._name, umsgpack.packb(item)) 64 | return True 65 | 66 | async def put(self, item, block=True, timeout=None): 67 | if not block: 68 | return await self.put_nowait(item) 69 | timeout = timeout or self._timeout 70 | start_time = time.time() 71 | 72 | while True: 73 | try: 74 | return await self.put_nowait(item) 75 | except QueueFull: 76 | if not timeout: 77 | await asyncio.sleep(QUEUE_BLOCK_SLEEP_INTERVAL) 78 | continue 79 | lasted = time.time() - start_time 80 | if lasted < timeout: 81 | await asyncio.sleep(min(QUEUE_BLOCK_SLEEP_INTERVAL, timeout-lasted)) 82 | else: # timeout 83 | raise 84 | 85 | async def _get(self): 86 | if not self._pool: 87 | await self.connect() 88 | return await self._pool.execute("LPOP", self._name) 89 | 90 | async def get_nowait(self): 91 | if not self._pool: 92 | await self.connect() 93 | serialized_item = await self._get() 94 | if serialized_item is None: 95 | raise QueueEmpty 96 |
return umsgpack.unpackb(serialized_item) 97 | 98 | async def get(self, block=True, timeout=None): 99 | if not block: 100 | return await self.get_nowait() 101 | timeout = timeout or self._timeout 102 | start_time = time.time() 103 | 104 | while True: 105 | try: 106 | return await self.get_nowait() 107 | except QueueEmpty: 108 | if not timeout: 109 | await asyncio.sleep(QUEUE_BLOCK_SLEEP_INTERVAL) 110 | continue 111 | lasted = time.time() - start_time 112 | if lasted < timeout: 113 | await asyncio.sleep(min(QUEUE_BLOCK_SLEEP_INTERVAL, timeout-lasted)) 114 | else: # timeout 115 | raise 116 | 117 | 118 | class AioRedisLifoQueue(AioRedisQueue): 119 | 120 | async def _get(self): 121 | return await self._pool.execute("RPOP", self._name) 122 | -------------------------------------------------------------------------------- /aiocrawler/request.py: -------------------------------------------------------------------------------- 1 | from user_agent import generate_user_agent, generate_navigator, \ 2 | generate_navigator_js 3 | 4 | 5 | # Header factories: pass one of these callables as headers= so every request gets freshly generated values. 6 | def random_navigator_headers(*args): 7 | # e.g. "user_agent" -> "User-Agent" 8 | raw_headers = generate_navigator() 9 | return { 10 | header.title().replace("_", "-"): value or "" 11 | for header, value in raw_headers.items() 12 | } 13 | 14 | 15 | def random_navigator_js_headers(*args): 16 | raw_headers = generate_navigator_js() 17 | return { 18 | header: value or "" 19 | for header, value in raw_headers.items() 20 | } 21 | 22 | 23 | def random_user_agent(*args): 24 | return { 25 | 'User-Agent': generate_user_agent() 26 | } 27 | 28 | -------------------------------------------------------------------------------- /aiocrawler/responses/__init__.py: -------------------------------------------------------------------------------- 1 | from .responses import JsonResponse, HtmlResponse, XmlResponse 2 | from .wrap import ResponseTypes 3 | 4 | 5 | __all__ = ["JsonResponse", "HtmlResponse", "XmlResponse"] 6 | 7 | 8 | async def wrap_response(response): 9 | wrapped_response = await ResponseTypes.construct(response) 10 | return wrapped_response 11 | -------------------------------------------------------------------------------- /aiocrawler/responses/responses.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | from pyquery import PyQuery 3 | from aiocrawler.exceptions import JsonDecodeError 4 | 5 | try: 6 | from ujson import loads as json_loads 7 | except ImportError: 8 | import sys 9 | import json 10 | # On Python 3.5, json.loads() only accepts str, not bytes.
11 | if sys.version_info[:2] == (3, 5): 12 | def json_loads(data): 13 | return json.loads(data.decode()) 14 | else: 15 | json_loads = json.loads 16 | 17 | 18 | class _BaseResponse(object): 19 | 20 | def __init__(self, response): 21 | self.__dict__.update( 22 | content_type=response.content_type, 23 | charset=response.charset, 24 | method=response.method, 25 | content=response.content, 26 | request_info=response.__dict__["_request_info"], 27 | url=response.__dict__["_url"], 28 | status=response.__dict__["status"], 29 | cookies=response.__dict__["cookies"], 30 | headers=response.__dict__["headers"], 31 | raw_headers=response.__dict__["raw_headers"], 32 | reason=response.reason 33 | ) 34 | 35 | 36 | class JsonResponse(_BaseResponse): 37 | 38 | def __init__(self, response=None): 39 | super().__init__(response=response) 40 | self.text = response.text 41 | self.type = "json" 42 | self._json = None 43 | 44 | @property 45 | def json(self): 46 | if self._json is None: 47 | try: 48 | self._json = json_loads(self.text) 49 | # ujson and the stdlib json both raise ValueError on bad input 50 | except (ValueError, SyntaxError) as e: 51 | raise JsonDecodeError(e) 52 | # (json.JSONDecodeError is a ValueError subclass, so the stdlib 53 | # backend is covered by the handler above as well.) 54 | 55 | return self._json 56 | 57 | 58 | class HtmlResponse(_BaseResponse): 59 | 60 | def __init__(self, response=None): 61 | super().__init__(response=response) 62 | self.text = response.text 63 | self._etree = None 64 | self._py_query_doc = None 65 | self.type = "html" 66 | 67 | @property 68 | def etree(self): 69 | if self._etree is None: 70 | self._etree = etree.HTML(self.text) 71 | return self._etree 72 | 73 | @property 74 | def doc(self): 75 | if self._py_query_doc is None: 76 | self._py_query_doc = PyQuery(self.text) 77 | return self._py_query_doc 78 | 79 | def xpath(self, path): 80 | return self.etree.xpath(path) 81 | 82 | def selector(self, rule): 83 | return self.doc(rule) 84 | 85 | 86 | class XmlResponse(HtmlResponse): 87 | pass 88 | -------------------------------------------------------------------------------- /aiocrawler/responses/wrap.py: -------------------------------------------------------------------------------- 1 | from aiohttp.client import ClientResponse as ParResponse 2 | from .responses import HtmlResponse, JsonResponse, XmlResponse 3 | from mimetypes import guess_type 4 | 5 | 6 | # TODO: response encoding 7 | 8 | 9 | class ResponseTypes(object): 10 | 11 | CONTENT_TYPES = { 12 | 'text/html': HtmlResponse, 13 | 'application/atom+xml': XmlResponse, 14 | 'application/rdf+xml': XmlResponse, 15 | 'application/rss+xml': XmlResponse, 16 | 'application/xhtml+xml': HtmlResponse, 17 | 'application/vnd.wap.xhtml+xml': HtmlResponse, 18 | 'application/xml': XmlResponse, 19 | 'application/json': JsonResponse, 20 | 'application/x-json': JsonResponse, 21 | 'application/json-amazonui-streaming': JsonResponse, 22 | 'application/javascript': JsonResponse, 23 | 'application/x-javascript': JsonResponse, 24 | 'text/xml': XmlResponse, 25 | 'text/*': HtmlResponse, 26 | } 27 | 28 | @classmethod 29 | def lookup(cls, raw_response): 30 | assert isinstance(raw_response, ParResponse) 31 | try: 32 | return cls._lookup_mime_type(raw_response) 33 | except KeyError: 34 | pass 35 | try: 36 | return cls._lookup_header_content_type(raw_response) 37 | except KeyError: 38 | pass 39 | try: 40 | return cls._lookup_content_type(raw_response) 41 | except KeyError: 42 | pass 43 | return HtmlResponse 44 | 45 | @classmethod 46 | def _lookup_mime_type(cls, raw_response):
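        # Guess from the URL's file extension, e.g. a URL ending in ".json" maps to JsonResponse through the table above.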
47 | guessed_type, _ = guess_type(str(raw_response.url)) 48 | return cls.CONTENT_TYPES[guessed_type] 49 | 50 | @classmethod 51 | def _lookup_header_content_type(cls, raw_response): 52 | found_content_type = None 53 | # .getall() raises KeyError when the Content-Type header is missing 54 | for content_type in raw_response.headers.getall("Content-Type"): 55 | if content_type in cls.CONTENT_TYPES: 56 | found_content_type = content_type 57 | break 58 | return cls.CONTENT_TYPES[found_content_type] 59 | 60 | @classmethod 61 | def _lookup_content_type(cls, raw_response): 62 | content_type = raw_response.content_type 63 | return cls.CONTENT_TYPES[content_type] 64 | 65 | @classmethod 66 | async def construct(cls, raw_response): 67 | factory_cls = cls.lookup(raw_response) 68 | response = factory_cls(raw_response) 69 | response.text = await response.text() 70 | return response 71 | -------------------------------------------------------------------------------- /examples/lianjia_crawl.py: -------------------------------------------------------------------------------- 1 | from aiocrawler import AioCrawler 2 | from aiocrawler.request import random_navigator_headers 3 | 4 | 5 | class LianjiaCrawler(AioCrawler): 6 | concurrency = 50 7 | urls = ( 8 | "http://sh.lianjia.com/zufang/d{}".format(count) 9 | for count in range(1, 100) 10 | ) 11 | timeout = 30 12 | headers = random_navigator_headers 13 | 14 | def on_start(self): 15 | self.get(self.urls, parser=self.parse, sleep=0.2) 16 | 17 | @staticmethod 18 | def parse(response): 19 | """ 20 | json: 21 | 'content_type', 'charset', 'method', 'request_info', 'url', 22 | 'status', 'cookies', 'headers', 'raw_headers', 'text', 'json', 'type' 23 | html: 24 | 'content_type', 'charset', 'method', 'request_info', 'url', 'status', 25 | 'cookies', 'headers', 'raw_headers', 'text', 'etree', 'doc', 'type', 26 | xpath(), selector() 27 | """ 28 | print(response.request_info) 29 | houses = response.xpath('//*[@id="house-lst"]/li') 30 | for index, house in enumerate(houses): 31 | title = house.xpath( 32 | '//li[{}]/div[2]/h2/a/text()'.format(index+1))[0] 33 | print("URL:{}, Title:{}".format(response.url, title)) 34 | 35 | 36 | if __name__ == "__main__": 37 | demo = LianjiaCrawler() 38 | demo.run() 39 | -------------------------------------------------------------------------------- /examples/qisuu_crawl.py: -------------------------------------------------------------------------------- 1 | from aiocrawler import AioCrawler 2 | import os 3 | 4 | 5 | class QisuuCrawler(AioCrawler): 6 | concurrency = 20 7 | timeout = 500 8 | debug = True 9 | 10 | def on_start(self): 11 | for count in range(1, 31000): 12 | self.download( 13 | "http://dzs.qisuu.com/txt/{}.txt".format(count), 14 | filename=str(count)+".txt", sleep=1) 15 | 16 | def parse_book(self, response): 17 | 18 | try: 19 | book_select = response.xpath("/html/body/div[4]/div[2]")[0] 20 | except IndexError: 21 | return print("parse error") 22 | 23 | name = book_select.xpath("div[1]/div/div[2]/div/h1/text()")[0] 24 | 25 | try: 26 | author = book_select.xpath("div[1]/div/div[2]/div/ul/li[7]/a/text()")[0] 27 | except IndexError: 28 | author = book_select.xpath("div[1]/div/div[2]/div/ul/li[7]/text()")[0] 29 | 30 | txt_url = book_select.xpath("div[3]/div[2]/ul/li[2]/a/@href")[0] 31 | 32 | save_dir = os.path.join( 33 | "download", "/".join( 34 | element.text for element in 35 | response.xpath("/html/body/div[3]/span/a")[1:-1]) 36 | ) 37 | 38 | self.download(txt_url, save_dir=save_dir, filename=" ".join((name, author)) + ".txt") 39 | 40 | 41 | if __name__ ==
"__main__": 42 | crawler = QisuuCrawler() 43 | crawler.run() 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import find_packages, setup 3 | from codecs import open 4 | import sys 5 | 6 | 7 | # Get the long description from the README file 8 | with open(os.path.join(os.path.dirname( 9 | os.path.realpath(__file__)), 'README.md'), encoding='utf-8') as f: 10 | long_description = f.read() 11 | 12 | install_requires = [ 13 | 'aiohttp', 14 | 'aiofiles', 15 | 'async_timeout', 16 | 'httptools', 17 | "lxml", 18 | "pyquery", 19 | 'uvloop', 20 | 'click', 21 | 'ujson', 22 | 'user_agent', 23 | 'aioredis==1.0.0b2', 24 | 'umsgpack' 25 | ] 26 | 27 | if sys.platform.startswith("win"): 28 | install_requires.remove("uvloop") 29 | 30 | 31 | setup( 32 | name="aiocrawler", 33 | version="0.0.2", 34 | 35 | description="Async crawler framework based on asyncio", 36 | long_description=long_description, 37 | 38 | author="CodingCrush", 39 | author_email="codingcrush@163.com", 40 | 41 | url='https://github.com/CodingCrush/aiocrawler', 42 | classifiers=[ 43 | 'Development Status :: 4 - Beta', 44 | 'Intended Audience :: Developers', 45 | 46 | 'License :: OSI Approved :: MIT License', 47 | 48 | 'Programming Language :: Python :: 3.5', 49 | 'Programming Language :: Python :: 3.6', 50 | ], 51 | keywords='scrapy crawler asyncio uvloop', 52 | install_requires=install_requires, 53 | license='MIT', 54 | packages=find_packages(exclude=['docs', 'examples', 'tests']), 55 | include_package_data=True, 56 | ) 57 | --------------------------------------------------------------------------------