├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── aioscpy ├── VERSION ├── __init__.py ├── __main__.py ├── cmdline.py ├── commands │ ├── __init__.py │ ├── crawl.py │ ├── genspider.py │ ├── onespider.py │ ├── runspider.py │ ├── startproject.py │ └── version.py ├── core │ ├── __init__.py │ ├── downloader │ │ ├── __init__.py │ │ └── handlers │ │ │ ├── __init__.py │ │ │ ├── aiohttp.py │ │ │ ├── curl_cffi.py │ │ │ ├── httpx.py │ │ │ ├── pyhttpx.py │ │ │ └── requests.py │ ├── engine.py │ ├── scheduler │ │ ├── __init__.py │ │ ├── memory.py │ │ └── redis.py │ └── scraper.py ├── crawler.py ├── exceptions.py ├── http │ ├── __init__.py │ ├── request │ │ ├── __init__.py │ │ ├── form.py │ │ └── json.py │ └── response │ │ ├── __init__.py │ │ └── text.py ├── inject.py ├── libs │ ├── __init__.py │ ├── downloadermiddlewares │ │ ├── __init__.py │ │ └── stats.py │ ├── extensions │ │ ├── __init__.py │ │ ├── corestats.py │ │ └── logstats.py │ └── statscollectors.py ├── logformatter.py ├── middleware │ ├── __init__.py │ ├── adaptive_concurrency.py │ ├── downloader.py │ ├── extension.py │ ├── itempipeline.py │ └── manager.py ├── queue │ ├── __init__.py │ ├── compat.py │ ├── convert.py │ ├── memory │ │ ├── __init__.py │ │ └── _queue.py │ ├── rabbitmq │ │ ├── __init__.py │ │ └── _queue.py │ └── redis │ │ ├── __init__.py │ │ ├── _queue.py │ │ └── _queue_async.py ├── settings │ ├── __init__.py │ └── default_settings.py ├── signalmanager.py ├── signals.py ├── spider.py ├── templates │ ├── project │ │ ├── __init__.py │ │ ├── aioscpy.cfg │ │ ├── middlewares.py.tmpl │ │ ├── pipelines.py.tmpl │ │ ├── settings.py.tmpl │ │ ├── spiders │ │ │ └── __init__.py │ │ └── start.py.tmpl │ └── spiders │ │ ├── basic.tmpl │ │ └── crawl.tmpl └── utils │ ├── __init__.py │ ├── common.py │ ├── curl.py │ ├── log.py │ ├── ossignal.py │ ├── othtypes.py │ ├── signal.py │ ├── template.py │ └── tools.py ├── cegex ├── __init__.py ├── baidu.py ├── httpbin.py ├── httpbin_post.py └── ja3.py ├── doc ├── README_ZH.md └── images │ ├── aioscpy.png │ ├── run.png │ └── tree.png ├── example ├── project_quotes │ ├── __init__.py │ ├── aioscpy.cfg │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ └── quotes.py │ └── start.py └── single_quotes.py ├── requirements.txt ├── setup.py ├── start.py └── tests ├── README.md ├── run_tests.py ├── test_adaptive_concurrency.py ├── test_engine_memory_management.py ├── test_engine_task_beat.py └── test_httpx_handler.py /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | .vscode/ 3 | .idea/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 ihandmine 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include aioscpy VERSION 2 | recursive-include aioscpy/templates/project *.tmpl *.cfg *.py 3 | recursive-include aioscpy/templates/spiders *.tmpl -------------------------------------------------------------------------------- /aioscpy/VERSION: -------------------------------------------------------------------------------- 1 | 0.3.13 2 | -------------------------------------------------------------------------------- /aioscpy/__init__.py: -------------------------------------------------------------------------------- 1 | import pkgutil 2 | 3 | from aioscpy.inject import call_grace_instance 4 | 5 | __version__ = (pkgutil.get_data(__package__, "VERSION") or b"").decode("ascii").strip() 6 | 7 | __all__ = [ 8 | '__version__', 9 | 'call_grace_instance' 10 | ] 11 | -------------------------------------------------------------------------------- /aioscpy/__main__.py: -------------------------------------------------------------------------------- 1 | from aioscpy.cmdline import execute 2 | 3 | 4 | if __name__ == '__main__': 5 | execute() 6 | -------------------------------------------------------------------------------- /aioscpy/cmdline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import cProfile 5 | import inspect 6 | import pkg_resources 7 | 8 | import aioscpy 9 | from aioscpy.inject import walk_modules 10 | from aioscpy.commands import ASCommand, ASHelpFormatter 11 | from aioscpy.exceptions import UsageError 12 | from aioscpy.utils.tools import get_project_settings 13 | from aioscpy.utils.common import inside_project 14 | from aioscpy import call_grace_instance 15 | 16 | 17 | def _iter_command_classes(module_name): 18 | # TODO: add `name` attribute to commands and and merge this function with 19 | for module in walk_modules(module_name): 20 | for obj in vars(module).values(): 21 | if ( 22 | inspect.isclass(obj) 23 | and issubclass(obj, ASCommand) 24 | and obj.__module__ == module.__name__ 25 | and not obj == ASCommand 26 | ): 27 | yield obj 28 | 29 | 30 | def _get_commands_from_module(module, inproject): 31 | d = {} 32 | for cmd in _iter_command_classes(module): 33 | if inproject or not cmd.requires_project: 34 | cmdname = cmd.__module__.split('.')[-1] 35 | d[cmdname] = cmd() 36 | return d 37 | 38 | 39 | def _get_commands_from_entry_points(inproject, group='aioscpy.commands'): 40 | cmds = {} 41 | for entry_point in pkg_resources.iter_entry_points(group): 42 | obj = entry_point.load() 43 | if inspect.isclass(obj): 44 | cmds[entry_point.name] = obj() 45 | else: 46 | raise Exception(f"Invalid entry point {entry_point.name}") 47 | return cmds 48 | 49 | 50 | def _get_commands_dict(settings, inproject): 51 | cmds = _get_commands_from_module('aioscpy.commands', inproject) 52 | cmds.update(_get_commands_from_entry_points(inproject)) 53 | cmds_module = settings['COMMANDS_MODULE'] 54 | if cmds_module: 55 | cmds.update(_get_commands_from_module(cmds_module, inproject)) 56 | return cmds 57 | 58 | 59 | def _pop_command_name(argv): 60 | i = 0 61 | for arg in argv[1:]: 62 | if not arg.startswith('-'): 63 | del argv[i] 64 | return arg 65 | i += 1 66 | 67 | 68 | def _print_header(settings, inproject): 69 | version = aioscpy.__version__ 70 | if inproject: 71 | print(f"aioscpy {version} - project: {settings['BOT_NAME']}\n") 72 
| else: 73 | print(f"aioscpy {version} - no active project\n") 74 | 75 | 76 | def _print_commands(settings, inproject): 77 | _print_header(settings, inproject) 78 | print("Usage:") 79 | print(" aioscpy [options] [args]\n") 80 | print("Available commands:") 81 | cmds = _get_commands_dict(settings, inproject) 82 | for cmdname, cmdclass in sorted(cmds.items()): 83 | print(f" {cmdname:<13} {cmdclass.short_desc()}") 84 | if not inproject: 85 | print() 86 | print(" [ more ] More commands available when run from project directory") 87 | print() 88 | print('Use "aioscpy -h" to see more info about a command') 89 | 90 | 91 | def _print_unknown_command(settings, cmdname, inproject): 92 | _print_header(settings, inproject) 93 | print(f"Unknown command: {cmdname}\n") 94 | print('Use "aioscpy" to see available commands') 95 | 96 | 97 | def _run_print_help(parser, func, *a, **kw): 98 | try: 99 | func(*a, **kw) 100 | except UsageError as e: 101 | if str(e): 102 | parser.error(str(e)) 103 | if e.print_help: 104 | parser.print_help() 105 | sys.exit(2) 106 | 107 | 108 | def execute(argv=None, settings=None): 109 | if argv is None: 110 | argv = sys.argv 111 | 112 | if settings is None: 113 | settings = get_project_settings() 114 | # set EDITOR from environment if available 115 | try: 116 | editor = os.environ['EDITOR'] 117 | except KeyError: 118 | pass 119 | else: 120 | settings['EDITOR'] = editor 121 | 122 | inproject = inside_project() 123 | cmds = _get_commands_dict(settings, inproject) 124 | cmdname = _pop_command_name(argv) 125 | if not cmdname: 126 | _print_commands(settings, inproject) 127 | sys.exit(0) 128 | elif cmdname not in cmds: 129 | _print_unknown_command(settings, cmdname, inproject) 130 | sys.exit(2) 131 | 132 | cmd = cmds[cmdname] 133 | parser = argparse.ArgumentParser(formatter_class=ASHelpFormatter, 134 | usage=f"aioscpy {cmdname} {cmd.syntax()}", 135 | conflict_handler='resolve', 136 | description=cmd.long_desc()) 137 | settings.setdict(cmd.default_settings, priority='command') 138 | cmd.settings = settings 139 | cmd.add_options(parser) 140 | opts, args = parser.parse_known_args(args=argv[1:]) 141 | _run_print_help(parser, cmd.process_options, args, opts) 142 | 143 | if getattr(cmd, "requires_process"): 144 | # cmd.crawler_process = CrawlerProcess(settings) 145 | cmd.crawler_process = call_grace_instance("crawler_process", settings) 146 | _run_print_help(parser, _run_command, cmd, args, opts) 147 | sys.exit(cmd.exitcode) 148 | 149 | 150 | def _run_command(cmd, args, opts): 151 | if opts.profile: 152 | _run_command_profiled(cmd, args, opts) 153 | else: 154 | cmd.run(args, opts) 155 | 156 | 157 | def _run_command_profiled(cmd, args, opts): 158 | if opts.profile: 159 | sys.stderr.write(f"aioscpy: writing cProfile stats to {opts.profile!r}\n") 160 | loc = locals() 161 | p = cProfile.Profile() 162 | p.runctx('cmd.run(args, opts)', globals(), loc) 163 | if opts.profile: 164 | p.dump_stats(opts.profile) 165 | 166 | 167 | if __name__ == '__main__': 168 | execute() 169 | 170 | -------------------------------------------------------------------------------- /aioscpy/commands/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | 6 | from aioscpy.utils.common import arglist_to_dict 7 | from aioscpy.exceptions import UsageError 8 | 9 | 10 | class ASCommand: 11 | 12 | requires_project = False 13 | crawler_process = None 14 | requires_process = True 15 | 16 | # default settings to be used 
for this command instead of global defaults 17 | default_settings: Dict[str, Any] = {} 18 | 19 | exitcode = 0 20 | 21 | def __init__(self): 22 | self.settings = None # set in aioscpy.cmdline 23 | 24 | def set_crawler(self, crawler): 25 | if hasattr(self, '_crawler'): 26 | raise RuntimeError("crawler already set") 27 | self._crawler = crawler 28 | 29 | def syntax(self): 30 | """ 31 | Command syntax (preferably one-line). Do not include command name. 32 | """ 33 | return "" 34 | 35 | def short_desc(self): 36 | """ 37 | A short description of the command 38 | """ 39 | return "" 40 | 41 | def long_desc(self): 42 | """A long description of the command. Return short description when not 43 | available. It cannot contain newlines since contents will be formatted 44 | by optparser which removes newlines and wraps text. 45 | """ 46 | return self.short_desc() 47 | 48 | def help(self): 49 | """An extensive help for the command. It will be shown when using the 50 | "help" command. It can contain newlines since no post-formatting will 51 | be applied to its contents. 52 | """ 53 | return self.long_desc() 54 | 55 | def add_options(self, parser): 56 | """ 57 | Populate option parse with options available for this command 58 | """ 59 | group = parser.add_argument_group(title='Global Options') 60 | group.add_argument("--logfile", metavar="FILE", 61 | help="log file. if omitted stderr will be used") 62 | group.add_argument("-L", "--loglevel", metavar="LEVEL", default=None, 63 | help=f"log level (default: {self.settings['LOG_LEVEL']})") 64 | group.add_argument("--nolog", action="store_true", 65 | help="disable logging completely") 66 | group.add_argument("--profile", metavar="FILE", default=None, 67 | help="write python cProfile stats to FILE") 68 | group.add_argument("--pidfile", metavar="FILE", 69 | help="write process ID to FILE") 70 | group.add_argument("-s", "--set", action="append", default=[], metavar="NAME=VALUE", 71 | help="set/override setting (may be repeated)") 72 | 73 | def process_options(self, args, opts): 74 | try: 75 | self.settings.setdict(arglist_to_dict(opts.set), 76 | priority='cmdline') 77 | except ValueError: 78 | raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False) 79 | 80 | if opts.logfile: 81 | self.settings.set('LOG_ENABLED', True, priority='cmdline') 82 | self.settings.set('LOG_FILE', opts.logfile, priority='cmdline') 83 | 84 | if opts.loglevel: 85 | self.settings.set('LOG_ENABLED', True, priority='cmdline') 86 | self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline') 87 | 88 | if opts.nolog: 89 | self.settings.set('LOG_ENABLED', False, priority='cmdline') 90 | 91 | if opts.pidfile: 92 | with open(opts.pidfile, "w") as f: 93 | f.write(str(os.getpid()) + os.linesep) 94 | 95 | def run(self, args, opts): 96 | """ 97 | Entry point for running commands 98 | """ 99 | raise NotImplementedError 100 | 101 | 102 | class BaseRunSpiderCommand(ASCommand): 103 | """ 104 | Common class used to share functionality between the crawl, parse and runspider commands 105 | """ 106 | def add_options(self, parser): 107 | ASCommand.add_options(self, parser) 108 | parser.add_argument("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 109 | help="set spider argument (may be repeated)") 110 | parser.add_argument("-o", "--output", metavar="FILE", action="append", 111 | help="append scraped items to the end of FILE (use - for stdout)") 112 | parser.add_argument("-O", "--overwrite-output", metavar="FILE", action="append", 113 | help="dump scraped items into 
FILE, overwriting any existing file") 114 | parser.add_argument("-t", "--output-format", metavar="FORMAT", 115 | help="format to use for dumping items") 116 | 117 | def process_options(self, args, opts): 118 | ASCommand.process_options(self, args, opts) 119 | try: 120 | opts.spargs = arglist_to_dict(opts.spargs) 121 | except ValueError: 122 | raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 123 | 124 | 125 | class ASHelpFormatter(argparse.HelpFormatter): 126 | """ 127 | Help Formatter for aioscpy command line help messages. 128 | """ 129 | def __init__(self, prog, indent_increment=2, max_help_position=24, width=None): 130 | super().__init__(prog, indent_increment=indent_increment, 131 | max_help_position=max_help_position, width=width) 132 | 133 | def _join_parts(self, part_strings): 134 | parts = self.format_part_strings(part_strings) 135 | return super()._join_parts(parts) 136 | 137 | def format_part_strings(self, part_strings): 138 | """ 139 | Underline and title case command line help message headers. 140 | """ 141 | if part_strings and part_strings[0].startswith("usage: "): 142 | part_strings[0] = "Usage\n=====\n " + part_strings[0][len('usage: '):] 143 | headings = [i for i in range(len(part_strings)) if part_strings[i].endswith(':\n')] 144 | for index in headings[::-1]: 145 | char = '-' if "Global Options" in part_strings[index] else '=' 146 | part_strings[index] = part_strings[index][:-2].title() 147 | underline = ''.join(["\n", (char * len(part_strings[index])), "\n"]) 148 | part_strings.insert(index + 1, underline) 149 | return part_strings 150 | -------------------------------------------------------------------------------- /aioscpy/commands/crawl.py: -------------------------------------------------------------------------------- 1 | from aioscpy.commands import BaseRunSpiderCommand 2 | from aioscpy.exceptions import UsageError 3 | 4 | 5 | class Command(BaseRunSpiderCommand): 6 | 7 | requires_project = True 8 | 9 | def syntax(self): 10 | return "[options] " 11 | 12 | def short_desc(self): 13 | return "Run a spider" 14 | 15 | def run(self, args, opts): 16 | if len(args) < 1: 17 | raise UsageError() 18 | elif len(args) > 1: 19 | raise UsageError("running 'aioscpy crawl' with more than one spider is not supported") 20 | spname = args[0] 21 | 22 | crawl_defer = self.crawler_process.crawl(spname, **opts.spargs) 23 | 24 | if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception): 25 | self.exitcode = 1 26 | else: 27 | self.crawler_process.start() 28 | 29 | if ( 30 | self.crawler_process.bootstrap_failed 31 | or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception 32 | ): 33 | self.exitcode = 1 34 | -------------------------------------------------------------------------------- /aioscpy/commands/genspider.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import string 4 | 5 | from importlib import import_module 6 | from os.path import join, dirname, abspath, exists, splitext 7 | 8 | import aioscpy 9 | from aioscpy.commands import ASCommand 10 | from aioscpy.utils.template import render_templatefile, string_camelcase 11 | from aioscpy.exceptions import UsageError 12 | 13 | 14 | def sanitize_module_name(module_name): 15 | """Sanitize the given module name, by replacing dashes and points 16 | with underscores and prefixing it with a letter if it doesn't start 17 | with one 18 | """ 19 | module_name = 
module_name.replace('-', '_').replace('.', '_') 20 | if module_name[0] not in string.ascii_letters: 21 | module_name = "a" + module_name 22 | return module_name 23 | 24 | 25 | class Command(ASCommand): 26 | 27 | requires_project = False 28 | default_settings = {'LOG_ENABLED': False} 29 | requires_process = False 30 | 31 | def syntax(self): 32 | return "[options] " 33 | 34 | def short_desc(self): 35 | return "Generate new spider in project using pre-defined templates" 36 | 37 | def add_options(self, parser): 38 | ASCommand.add_options(self, parser) 39 | parser.add_argument("-l", "--list", dest="list", action="store_true", 40 | help="List available templates") 41 | parser.add_argument("-d", "--dump", dest="dump", metavar="TEMPLATE", 42 | help="Dump template to standard output") 43 | parser.add_argument("-t", "--template", dest="template", default="basic", 44 | help="Uses a custom template.") 45 | parser.add_argument("--force", dest="force", action="store_true", 46 | help="If the spider already exists, overwrite it with the template") 47 | 48 | def run(self, args, opts): 49 | if opts.list: 50 | self._list_templates() 51 | return 52 | if opts.dump: 53 | template_file = self._find_template(opts.dump) 54 | if template_file: 55 | with open(template_file, "r") as f: 56 | print(f.read()) 57 | return 58 | if not args: 59 | raise UsageError() 60 | 61 | name = args[0] 62 | module = sanitize_module_name(name) 63 | 64 | if self.settings.get('BOT_NAME') == module: 65 | print("Cannot create a spider with the same name as your project") 66 | return 67 | 68 | if not opts.force and self._spider_exists(name): 69 | return 70 | 71 | template_file = self._find_template(opts.template) 72 | if template_file: 73 | self._genspider(module, name, opts.template, template_file) 74 | 75 | def _genspider(self, module, name, template_name, template_file): 76 | """Generate the spider module, based on the given template""" 77 | capitalized_module = ''.join(s.capitalize() for s in module.split('_')) 78 | tvars = { 79 | 'project_name': self.settings.get('BOT_NAME'), 80 | 'ProjectName': string_camelcase(self.settings.get('BOT_NAME')), 81 | 'module': module, 82 | 'name': name, 83 | 'classname': f'{capitalized_module}Spider' 84 | } 85 | if self.settings.get('NEWSPIDER_MODULE'): 86 | spiders_module = import_module(self.settings['NEWSPIDER_MODULE']) 87 | spiders_dir = abspath(dirname(spiders_module.__file__)) 88 | else: 89 | spiders_module = None 90 | spiders_dir = "." 
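        # Copy the chosen .tmpl file into the resolved spiders directory (or the current
        # directory when NEWSPIDER_MODULE is unset) and substitute the template variables
        # in place via render_templatefile.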
91 | spider_file = f"{join(spiders_dir, module)}.py" 92 | shutil.copyfile(template_file, spider_file) 93 | render_templatefile(spider_file, **tvars) 94 | print(f"Created spider {name!r} using template {template_name!r} ", 95 | end=('' if spiders_module else '\n')) 96 | if spiders_module: 97 | print(f"in module:\n {spiders_module.__name__}.{module}") 98 | 99 | def _find_template(self, template): 100 | template_file = join(self.templates_dir, f'{template}.tmpl') 101 | if exists(template_file): 102 | return template_file 103 | print(f"Unable to find template: {template}\n") 104 | print('Use "aioscpy genspider --list" to see all available templates.') 105 | 106 | def _list_templates(self): 107 | print("Available templates:") 108 | for filename in sorted(os.listdir(self.templates_dir)): 109 | if filename.endswith('.tmpl'): 110 | print(f" {splitext(filename)[0]}") 111 | 112 | def _spider_exists(self, name): 113 | if not self.settings.get('NEWSPIDER_MODULE'): 114 | # if run as a standalone command and file with same filename already exists 115 | if exists(name + ".py"): 116 | print(f"{abspath(name + '.py')} already exists") 117 | return True 118 | return False 119 | 120 | # a file with the same name exists in the target directory 121 | spiders_module = import_module(self.settings['NEWSPIDER_MODULE']) 122 | spiders_dir = dirname(spiders_module.__file__) 123 | spiders_dir_abs = abspath(spiders_dir) 124 | if exists(join(spiders_dir_abs, name + ".py")): 125 | print(f"{join(spiders_dir_abs, (name + '.py'))} already exists") 126 | return True 127 | 128 | return False 129 | 130 | @property 131 | def templates_dir(self): 132 | return join( 133 | self.settings['TEMPLATES_DIR'] or join(aioscpy.__path__[0], 'templates'), 134 | 'spiders' 135 | ) 136 | -------------------------------------------------------------------------------- /aioscpy/commands/onespider.py: -------------------------------------------------------------------------------- 1 | from aioscpy.commands import ASCommand 2 | from aioscpy.commands.genspider import Command 3 | 4 | 5 | class OCommand(Command): 6 | 7 | def short_desc(self): 8 | return "Generate new spider in xxx.py using pre-defined templates" 9 | 10 | def add_options(self, parser): 11 | ASCommand.add_options(self, parser) 12 | parser.add_argument("-l", "--list", dest="list", action="store_true", 13 | help="List available templates") 14 | parser.add_argument("-d", "--dump", dest="dump", metavar="TEMPLATE", 15 | help="Dump template to standard output") 16 | parser.add_argument("-t", "--template", dest="template", default="crawl", 17 | help="Uses a custom template.") 18 | parser.add_argument("--force", dest="force", action="store_true", 19 | help="If the spider already exists, overwrite it with the template") -------------------------------------------------------------------------------- /aioscpy/commands/runspider.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import inspect 4 | 5 | from importlib import import_module 6 | 7 | from aioscpy.exceptions import UsageError 8 | from aioscpy.commands import BaseRunSpiderCommand 9 | 10 | 11 | def iter_spider_classes(module): 12 | from aioscpy.spider import Spider 13 | 14 | for obj in vars(module).values(): 15 | if ( 16 | inspect.isclass(obj) 17 | and issubclass(obj, Spider) 18 | and obj.__module__ == module.__name__ 19 | and getattr(obj, 'name', None) 20 | ): 21 | yield obj 22 | 23 | 24 | def _import_file(filepath): 25 | abspath = os.path.abspath(filepath) 26 | dirname, 
file = os.path.split(abspath) 27 | fname, fext = os.path.splitext(file) 28 | if fext not in ('.py', '.pyw'): 29 | raise ValueError(f"Not a Python source file: {abspath}") 30 | if dirname: 31 | sys.path = [dirname] + sys.path 32 | try: 33 | module = import_module(fname) 34 | finally: 35 | if dirname: 36 | sys.path.pop(0) 37 | return module 38 | 39 | 40 | class Command(BaseRunSpiderCommand): 41 | 42 | requires_project = False 43 | default_settings = {'SPIDER_LOADER_WARN_ONLY': True} 44 | 45 | def syntax(self): 46 | return "[options] " 47 | 48 | def short_desc(self): 49 | return "Run a self-contained spider (without creating a project)" 50 | 51 | def long_desc(self): 52 | return "Run the spider defined in the given file" 53 | 54 | def run(self, args, opts): 55 | if len(args) != 1: 56 | raise UsageError() 57 | filename = args[0] 58 | if not os.path.exists(filename): 59 | raise UsageError(f"File not found: {filename}\n") 60 | try: 61 | module = _import_file(filename) 62 | except (ImportError, ValueError) as e: 63 | raise UsageError(f"Unable to load {filename!r}: {e}\n") 64 | spclasses = list(iter_spider_classes(module)) 65 | if not spclasses: 66 | raise UsageError(f"No spider found in file: {filename}\n") 67 | spidercls = spclasses.pop() 68 | 69 | self.crawler_process.crawl(spidercls, **opts.spargs) 70 | self.crawler_process.start() 71 | 72 | if self.crawler_process.bootstrap_failed: 73 | self.exitcode = 1 74 | -------------------------------------------------------------------------------- /aioscpy/commands/startproject.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import string 4 | from importlib.util import find_spec 5 | from os.path import join, exists, abspath 6 | from shutil import ignore_patterns, move, copy2, copystat 7 | from stat import S_IWUSR as OWNER_WRITE_PERMISSION 8 | 9 | import aioscpy 10 | from aioscpy.commands import ASCommand 11 | from aioscpy.utils.template import render_templatefile, string_camelcase 12 | from aioscpy.exceptions import UsageError 13 | 14 | 15 | TEMPLATES_TO_RENDER = ( 16 | ('aioscpy.cfg',), 17 | ('settings.py.tmpl',), 18 | ('pipelines.py.tmpl',), 19 | ('middlewares.py.tmpl',), 20 | ('start.py.tmpl',), 21 | ) 22 | 23 | IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn') 24 | 25 | 26 | def _make_writable(path): 27 | current_permissions = os.stat(path).st_mode 28 | os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION) 29 | 30 | 31 | class Command(ASCommand): 32 | 33 | requires_project = False 34 | default_settings = {'LOG_ENABLED': False, 35 | 'SPIDER_LOADER_WARN_ONLY': True} 36 | requires_process = False 37 | 38 | def syntax(self): 39 | return " [project_dir]" 40 | 41 | def short_desc(self): 42 | return "Create new project" 43 | 44 | def _is_valid_name(self, project_name): 45 | def _module_exists(module_name): 46 | spec = find_spec(module_name) 47 | return spec is not None and spec.loader is not None 48 | 49 | if not re.search(r'^[_a-zA-Z]\w*$', project_name): 50 | print('Error: Project names must begin with a letter and contain' 51 | ' only\nletters, numbers and underscores') 52 | elif _module_exists(project_name): 53 | print(f'Error: Module {project_name!r} already exists') 54 | else: 55 | return True 56 | return False 57 | 58 | def _copytree(self, src, dst): 59 | """ 60 | Since the original function always creates the directory, to resolve 61 | the issue a new function had to be created. It's a simple copy and 62 | was reduced for this case. 
63 | 64 | More info at: 65 | https://github.com/aioscpy/aioscpy/pull/2005 66 | """ 67 | ignore = IGNORE 68 | names = os.listdir(src) 69 | ignored_names = ignore(src, names) 70 | 71 | if not os.path.exists(dst): 72 | os.makedirs(dst) 73 | 74 | for name in names: 75 | if name in ignored_names: 76 | continue 77 | 78 | srcname = os.path.join(src, name) 79 | dstname = os.path.join(dst, name) 80 | if os.path.isdir(srcname): 81 | self._copytree(srcname, dstname) 82 | else: 83 | copy2(srcname, dstname) 84 | _make_writable(dstname) 85 | 86 | copystat(src, dst) 87 | _make_writable(dst) 88 | 89 | def run(self, args, opts): 90 | if len(args) not in (1, 2): 91 | raise UsageError() 92 | 93 | project_name = args[0] 94 | project_dir = args[0] 95 | 96 | if len(args) == 2: 97 | project_dir = args[1] 98 | 99 | if exists(join(project_dir, 'aioscpy.cfg')): 100 | self.exitcode = 1 101 | print(f'Error: aioscpy.cfg already exists in {abspath(project_dir)}') 102 | return 103 | 104 | if not self._is_valid_name(project_name): 105 | self.exitcode = 1 106 | return 107 | 108 | self._copytree(self.templates_dir, abspath(project_dir)) 109 | # move(join(project_dir, 'module'), join(project_dir, project_name)) 110 | for paths in TEMPLATES_TO_RENDER: 111 | path = join(*paths) 112 | tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name)) 113 | render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name)) 114 | print(f"New Aioscpy project '{project_name}', using template directory " 115 | f"'{self.templates_dir}', created in:") 116 | print(f" {abspath(project_dir)}\n") 117 | print("You can start your first spider with:") 118 | print(f" cd {project_dir}") 119 | print(" aioscpy genspider/onespider example") 120 | 121 | @property 122 | def templates_dir(self): 123 | return join( 124 | self.settings['TEMPLATES_DIR'] or join(aioscpy.__path__[0], 'templates'), 125 | 'project' 126 | ) 127 | -------------------------------------------------------------------------------- /aioscpy/commands/version.py: -------------------------------------------------------------------------------- 1 | import aioscpy 2 | from aioscpy.commands import ASCommand 3 | 4 | 5 | class Command(ASCommand): 6 | 7 | default_settings = {'LOG_ENABLED': False, 8 | 'SPIDER_LOADER_WARN_ONLY': True} 9 | requires_process = False 10 | 11 | def syntax(self): 12 | return "[-v]" 13 | 14 | def short_desc(self): 15 | return "Print aioscpy version" 16 | 17 | def add_options(self, parser): 18 | ASCommand.add_options(self, parser) 19 | parser.add_argument("--verbose", "-v", dest="verbose", action="store_true", 20 | help="also display twisted/python/platform info (useful for bug reports)") 21 | 22 | def run(self, args, opts): 23 | # if opts.verbose: 24 | # versions = aioscpy_components_versions() 25 | # width = max(len(n) for (n, _) in versions) 26 | # for name, version in versions: 27 | # print(f"{name:<{width}} : {version}") 28 | # else: 29 | print(f"AIOSPCY {aioscpy.__version__}") 30 | -------------------------------------------------------------------------------- /aioscpy/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/core/__init__.py -------------------------------------------------------------------------------- /aioscpy/core/downloader/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 
import random 3 | 4 | from datetime import datetime 5 | from collections import deque 6 | 7 | from aioscpy import signals 8 | from aioscpy import call_grace_instance 9 | 10 | 11 | class Slot: 12 | """Downloader slot""" 13 | 14 | def __init__(self, concurrency, randomize_delay, delay=0): 15 | self.concurrency = concurrency 16 | self.delay = delay 17 | self.randomize_delay = randomize_delay 18 | 19 | self.active = set() 20 | self.queue = deque() 21 | self.transferring = set() 22 | self.lastseen = 0 23 | self.delay_run = False 24 | 25 | def free_transfer_slots(self): 26 | return self.concurrency - len(self.transferring) 27 | 28 | def download_delay(self): 29 | if self.randomize_delay: 30 | return random.uniform(0.5 * self.delay, 1.5 * self.delay) 31 | return self.delay 32 | 33 | def close(self): 34 | self.delay_run = True 35 | 36 | def __repr__(self): 37 | cls_name = self.__class__.__name__ 38 | return "%s(concurrency=%r, delay=%0.2f, randomize_delay=%r)" % ( 39 | cls_name, self.concurrency, self.delay, self.randomize_delay) 40 | 41 | def __str__(self): 42 | return ( 43 | "" % ( 45 | self.concurrency, self.delay, self.randomize_delay, 46 | len(self.active), len(self.queue), len(self.transferring), 47 | datetime.fromtimestamp(self.lastseen).isoformat() 48 | ) 49 | ) 50 | 51 | 52 | class Downloader(object): 53 | DOWNLOAD_SLOT = 'download_slot' 54 | 55 | def __init__(self, crawler): 56 | self.settings = crawler.settings 57 | self.crawler = crawler 58 | self.slot = None 59 | self.active = set() 60 | self.call_helper = self.di.get("tools").call_helper 61 | self.handlers = call_grace_instance('downloader_handler', self.settings, crawler) 62 | self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS') 63 | self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN') 64 | self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP') 65 | self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY') 66 | self.delay = self.settings.getfloat('DOWNLOAD_DELAY') 67 | self.middleware = call_grace_instance(self.di.get('downloader_middleware'), only_instance=True).from_crawler(crawler) 68 | self.process_queue_task = None 69 | self.engine = None 70 | 71 | crawler.signals.connect(self.close, signals.engine_stopped) 72 | 73 | @classmethod 74 | def from_crawler(cls, crawler): 75 | return cls(crawler) 76 | 77 | async def open(self, spider, engine): 78 | conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency 79 | self.slot = Slot(conc, self.randomize_delay, self.delay) 80 | self.engine = engine 81 | self.process_queue_task = asyncio.create_task(self._process_queue(spider, self.slot)) 82 | 83 | async def fetch(self, request): 84 | self.active.add(request) 85 | self.slot.active.add(request) 86 | self.slot.queue.append(request) 87 | 88 | async def _process_queue(self, spider, slot): 89 | while True: 90 | await asyncio.sleep(0.1) 91 | while slot.queue and slot.free_transfer_slots() > 0: 92 | request = slot.queue.popleft() 93 | asyncio.create_task(self._download(slot, request, spider)) 94 | slot.transferring.add(request) 95 | slot.active.remove(request) 96 | self.active.remove(request) 97 | if slot.download_delay(): 98 | await asyncio.sleep(slot.download_delay()) 99 | 100 | async def _download(self, slot, request, spider): 101 | try: 102 | response = None 103 | response = await self.middleware.process_request(spider, request) 104 | process_request_method = getattr(spider, "process_request", None) 105 | if process_request_method: 106 | response = 
await self.call_helper(process_request_method, request) 107 | if response is None or isinstance(response, self.di.get('request')): 108 | request = response or request 109 | response = await self.handlers.download_request(request, spider) 110 | except (Exception, BaseException, asyncio.TimeoutError) as exc: 111 | response = await self.middleware.process_exception(spider, request, exc) 112 | process_exception_method = getattr(spider, "process_exception", None) 113 | if process_exception_method: 114 | response = await self.call_helper(process_exception_method, request, exc) 115 | else: 116 | try: 117 | response = await self.middleware.process_response(spider, request, response) 118 | process_response_method = getattr(spider, "process_response", None) 119 | if process_response_method: 120 | response = await self.call_helper(process_response_method, request, response) 121 | except (Exception, BaseException) as exc: 122 | response = exc 123 | finally: 124 | slot.transferring.discard(request) 125 | if isinstance(response, self.di.get('response')): 126 | response.request = request 127 | await self.engine._handle_downloader_output(response, request, spider) 128 | 129 | async def close(self): 130 | try: 131 | if self.slot is not None: 132 | self.slot.close() 133 | await self.handlers.close() 134 | if self.process_queue_task: 135 | self.process_queue_task.cancel() 136 | except (asyncio.CancelledError, Exception, BaseException) as exc: 137 | pass 138 | 139 | def needs_backout(self): 140 | return len(self.active) >= self.total_concurrency 141 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/core/downloader/handlers/__init__.py -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/aiohttp.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import ssl 3 | import aiohttp 4 | import ujson 5 | import json 6 | 7 | from anti_header import Headers 8 | from anti_useragent.utils.cipers import generate_cipher 9 | 10 | 11 | class AioHttpDownloadHandler(object): 12 | 13 | def __init__(self, settings, crawler): 14 | self.settings = settings 15 | self.crawler = crawler 16 | self.aiohttp_client_session = { 17 | 'timeout': aiohttp.ClientTimeout(total=20), 18 | 'trust_env': True, 19 | 'json_serialize': ujson.dumps, 20 | "connector": aiohttp.TCPConnector( 21 | verify_ssl=False, 22 | limit=1000, 23 | force_close=True, 24 | use_dns_cache=False, 25 | limit_per_host=200, 26 | enable_cleanup_closed=True 27 | ) 28 | } 29 | self.session_stats = self.settings.getbool("REQUESTS_SESSION_STATS", False) 30 | self.session = None 31 | self.context = None 32 | 33 | @classmethod 34 | def from_settings(cls, settings, crawler): 35 | return cls(settings, crawler) 36 | 37 | @classmethod 38 | def from_crawler(cls, crawler): 39 | return cls.from_settings(crawler.settings, crawler) 40 | 41 | async def download_request(self, request, spider): 42 | session_kwargs = { 43 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 44 | 'cookies': dict(request.cookies), 45 | "data": request.body, 46 | "json": request.json 47 | } 48 | headers = request.headers 49 | if isinstance(headers, Headers): 50 | headers = headers.to_unicode_dict() 51 | session_kwargs['headers'] = headers 52 | 
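        # When TLS_CIPHERS is enabled via request.meta or settings, the block below builds
        # an SSL context whose cipher list comes from anti_useragent's generate_cipher(),
        # letting the session present a non-default TLS fingerprint.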
53 | if request.meta.get('TLS_CIPHERS') or self.settings.get('TLS_CIPHERS'): 54 | self.context = ssl.create_default_context() 55 | self.context.set_ciphers(generate_cipher()) 56 | session_kwargs['ssl'] = self.context 57 | 58 | if request.meta.get("proxy"): 59 | session_kwargs["proxy"] = request.meta['proxy'] 60 | self.logger.debug(f"use {request.meta['proxy']} crawling: {request.url}") 61 | 62 | if self.session_stats: 63 | if self.session is None: 64 | self.session = aiohttp.ClientSession(**self.aiohttp_client_session) 65 | response = await self.session.request(request.method, request.url, **session_kwargs) 66 | content = await response.read() 67 | else: 68 | async with aiohttp.ClientSession( 69 | timeout=aiohttp.ClientTimeout(total=20), 70 | trust_env=True, 71 | connector=aiohttp.TCPConnector(verify_ssl=False)) as session: 72 | async with session.request(request.method, request.url, **session_kwargs) as response: 73 | content = await response.read() 74 | 75 | return self.di.get("response")( 76 | str(response.url), 77 | status=response.status, 78 | headers=response.headers, 79 | body=content, 80 | cookies=response.cookies, 81 | _response=response) 82 | 83 | async def close(self): 84 | if self.session is not None: 85 | await self.session.close() 86 | 87 | # Wait 250 ms for the underlying SSL connections to close 88 | # https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown 89 | await asyncio.sleep(0.250) 90 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/curl_cffi.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import random 3 | 4 | from curl_cffi.requests import AsyncSession 5 | 6 | from anti_header import Headers 7 | 8 | 9 | class CurlCffiDownloadHandler(object): 10 | 11 | def __init__(self, settings, crawler): 12 | self.settings = settings 13 | self.crawler = crawler 14 | self.context = None 15 | self.browsers = [ 16 | "chrome99", 17 | "chrome100", 18 | "chrome101", 19 | "chrome104", 20 | "chrome107", 21 | "chrome110", 22 | # "chrome116", 23 | "chrome99_android", 24 | "edge99", 25 | "edge101", 26 | # "ff91esr", 27 | # "ff95", 28 | # "ff98", 29 | # "ff100", 30 | # "ff102", 31 | # "ff109", 32 | # "ff117", 33 | "safari15_3", 34 | "safari15_5", 35 | ] 36 | 37 | @classmethod 38 | def from_settings(cls, settings, crawler): 39 | return cls(settings, crawler) 40 | 41 | @classmethod 42 | def from_crawler(cls, crawler): 43 | return cls.from_settings(crawler.settings, crawler) 44 | 45 | async def download_request(self, request, spider): 46 | headers = request.headers 47 | if isinstance(headers, Headers): 48 | headers = headers.to_unicode_dict() 49 | session_kwargs = { 50 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 51 | 'cookies': dict(request.cookies), 52 | 'headers': headers, 53 | 'allow_redirects': True, 54 | "data": request.body, 55 | "json": request.json 56 | } 57 | 58 | if request.meta.get('TLS_CIPHERS') or self.settings.get('TLS_CIPHERS'): 59 | session_kwargs['impersonate'] = random.choice(self.browsers) 60 | 61 | if request.meta.get("proxy"): 62 | session_kwargs['proxies'] = { 63 | 'http': request.meta["proxy"], 64 | 'https': request.meta["proxy"] 65 | } 66 | self.logger.debug(f"use {request.meta['proxy']} crawling: {request.url}") 67 | 68 | async with AsyncSession() as session: 69 | response = await session.request(request.method, request.url, **session_kwargs) 70 | content = response.content 71 | 72 | return self.di.get("response")( 73 
| str(response.url), 74 | status=response.status_code, 75 | headers=response.headers, 76 | body=content, 77 | cookies=response.cookies, 78 | _response=response) 79 | 80 | async def close(self): 81 | await asyncio.sleep(0.1) 82 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/httpx.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import ssl 3 | import httpx 4 | 5 | from anti_header import Headers 6 | from anti_useragent.utils.cipers import generate_cipher 7 | 8 | 9 | class HttpxDownloadHandler(object): 10 | 11 | def __init__(self, settings, crawler): 12 | self.settings = settings 13 | self.crawler = crawler 14 | self.context = None 15 | 16 | @classmethod 17 | def from_settings(cls, settings, crawler): 18 | return cls(settings, crawler) 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls.from_settings(crawler.settings, crawler) 23 | 24 | async def download_request(self, request, spider): 25 | headers = request.headers 26 | if isinstance(headers, Headers): 27 | headers = headers.to_unicode_dict() 28 | httpx_client_session = {} 29 | 30 | # Configure TLS settings if needed 31 | if request.meta.get('TLS_CIPHERS') or self.settings.get('TLS_CIPHERS'): 32 | try: 33 | self.context = ssl.create_default_context() 34 | self.context.set_ciphers(generate_cipher()) 35 | httpx_client_session['verify'] = self.context 36 | except Exception as e: 37 | self.logger.warning(f"Error configuring TLS for {request.url}: {str(e)}") 38 | 39 | # Configure proxy if specified 40 | if request.meta.get("proxy"): 41 | httpx_client_session['proxies'] = request.meta["proxy"] 42 | self.logger.debug(f"Using proxy {request.meta['proxy']} for: {request.url}") 43 | 44 | # Prepare session arguments 45 | session_kwargs = { 46 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 47 | 'cookies': dict(request.cookies), 48 | 'headers': headers, 49 | 'follow_redirects': True, 50 | "data": request.body, 51 | "json": request.json 52 | } 53 | 54 | try: 55 | async with httpx.AsyncClient(**httpx_client_session) as session: 56 | response = await session.request(request.method, request.url, **session_kwargs) 57 | content = response.read() 58 | 59 | return self.di.get("response")( 60 | str(response.url), 61 | status=response.status_code, 62 | headers=response.headers, 63 | body=content, 64 | cookies=response.cookies, 65 | _response=response) 66 | 67 | except httpx.TimeoutException as e: 68 | self.logger.warning(f"Request to {request.url} timed out: {str(e)}") 69 | raise self.di.get("exceptions").TimeoutError(f"Request to {request.url} timed out") 70 | 71 | except httpx.RequestError as e: 72 | self.logger.warning(f"Request to {request.url} failed: {str(e)}") 73 | raise self.di.get("exceptions").ConnectionError(f"Request to {request.url} failed: {str(e)}") 74 | 75 | except Exception as e: 76 | self.logger.error(f"Unexpected error when downloading {request.url}: {str(e)}") 77 | raise self.di.get("exceptions").DownloadError(f"Unexpected error: {str(e)}") 78 | 79 | async def close(self): 80 | await asyncio.sleep(0.1) 81 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/pyhttpx.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pyhttpx 3 | 4 | from anti_header import Headers 5 | 6 | 7 | class PyHttpxDownloadHandler(object): 8 | 9 | def __init__(self, settings, crawler): 10 | 
self.settings = settings 11 | self.crawler = crawler 12 | self.context = None 13 | 14 | @classmethod 15 | def from_settings(cls, settings, crawler): 16 | return cls(settings, crawler) 17 | 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | return cls.from_settings(crawler.settings, crawler) 21 | 22 | async def download_request(self, request, spider): 23 | headers = request.headers 24 | if isinstance(headers, Headers): 25 | headers = headers.to_unicode_dict() 26 | pyhttpx_client_session = { 27 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 28 | 'cookies': dict(request.cookies), 29 | 'headers': headers, 30 | 'allow_redirects': True, 31 | "data": request.body, 32 | "json": request.json 33 | } 34 | 35 | if request.meta.get("proxy"): 36 | pyhttpx_client_session['proxies'] = {'https': request.meta["proxy"]} 37 | self.logger.debug(f"use {request.meta['proxy']} crawling: {request.url}") 38 | 39 | session_args = {'http2': True} 40 | with pyhttpx.HttpSession(**session_args) as session: 41 | response = await asyncio.to_thread(session.request, request.method, request.url, **pyhttpx_client_session) 42 | 43 | return self.di.get("response")( 44 | str(request.url), 45 | status=response.status_code, 46 | headers=response.headers, 47 | body=response.content, 48 | cookies=response.cookies, 49 | _response=response) 50 | 51 | async def close(self): 52 | await asyncio.sleep(0.1) 53 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/requests.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import requests 3 | 4 | from anti_header import Headers 5 | from anti_useragent.utils.cipers import generate_cipher 6 | 7 | 8 | class RequestsDownloadHandler(object): 9 | 10 | def __init__(self, settings, crawler): 11 | self.settings = settings 12 | self.crawler = crawler 13 | 14 | @classmethod 15 | def from_settings(cls, settings, crawler): 16 | return cls(settings, crawler) 17 | 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | return cls.from_settings(crawler.settings, crawler) 21 | 22 | async def download_request(self, request, spider): 23 | headers = request.headers 24 | if isinstance(headers, Headers): 25 | headers = headers.to_unicode_dict() 26 | requests_client_session = { 27 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 28 | 'cookies': dict(request.cookies), 29 | 'headers': headers, 30 | 'allow_redirects': request.meta.get("allow_redirects", True), 31 | "data": request.body, 32 | "json": request.json, 33 | } 34 | 35 | if request.meta.get('TLS_CIPHERS') or self.settings.get('TLS_CIPHERS'): 36 | requests.adapters.DEFAULT_RETRIES = 10 37 | requests.packages.urllib3.disable_warnings() 38 | cipers_real = generate_cipher() 39 | self.logger.debug(cipers_real) 40 | requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = cipers_real 41 | 42 | if request.meta.get("proxy"): 43 | requests_client_session['proxies'] = { 44 | 'http': request.meta["proxy"], 45 | 'https': request.meta["proxy"] 46 | } 47 | self.logger.debug(f"use {request.meta['proxy']} crawling: {request.url}") 48 | 49 | 50 | 51 | response = await asyncio.to_thread(requests.request, request.method, request.url, **requests_client_session) 52 | 53 | return self.di.get("response")( 54 | str(response.url), 55 | status=response.status_code, 56 | headers=response.headers, 57 | body=response.content, 58 | cookies=response.cookies, 59 | _response=response) 60 | 61 | async def close(self): 62 | await asyncio.sleep(0.1) 63 | 
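# Note: each handler in this package exposes the same interface -- from_crawler(),
# an awaitable download_request(request, spider) that returns a Response built via
# self.di.get("response"), and close(). The Downloader obtains the active handler
# through call_grace_instance('downloader_handler', settings, crawler), so switching
# between the aiohttp/httpx/curl_cffi/pyhttpx/requests backends is a configuration
# choice rather than a code change (the exact setting name is not shown in this excerpt).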
-------------------------------------------------------------------------------- /aioscpy/core/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | class Scheduler(object): 5 | 6 | def __init__(self, _queue_df, spider, stats): 7 | self.queue = _queue_df 8 | self.stats = stats 9 | self.spider = spider 10 | 11 | @classmethod 12 | def from_crawler(cls, crawler): 13 | raise NotImplementedError( 14 | '{} from_crawler method must define'.format(cls.__class__.__name__)) 15 | 16 | async def enqueue_request(self, request): 17 | if self.stats: 18 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 19 | await self.queue.push(request) 20 | return True 21 | 22 | async def async_next_request(self, count=None): 23 | # Use the provided count or get from settings 24 | if count is None: 25 | count = getattr(self.spider, 'settings', {}).get('TASK_BEAT_BATCH_SIZE', 100) 26 | 27 | _results = await self.queue.pop(count=count) 28 | if self.stats and _results: 29 | self.stats.inc_value('scheduler/dequeued/redis', count=len(_results), spider=self.spider) 30 | return _results 31 | 32 | async def open(self, start_requests): 33 | if asyncio.iscoroutine(self.queue): 34 | self.queue = await self.queue 35 | async for request in start_requests: 36 | await self.enqueue_request(request) 37 | 38 | async def close(self, slot): 39 | if slot.inprogress: 40 | for request in slot.inprogress: 41 | await self.enqueue_request(request) 42 | await self.queue.close() 43 | 44 | def __len__(self): 45 | return self.queue.qsize() 46 | 47 | async def has_pending_requests(self): 48 | return len(self) > 0 49 | -------------------------------------------------------------------------------- /aioscpy/core/scheduler/memory.py: -------------------------------------------------------------------------------- 1 | from aioscpy.core.scheduler import Scheduler 2 | from aioscpy.queue.memory import memory_queue 3 | 4 | 5 | class MemoryScheduler(Scheduler): 6 | 7 | @classmethod 8 | def from_crawler(cls, crawler): 9 | return cls(_queue_df=memory_queue(crawler.spider), stats=crawler.stats, spider=crawler.spider) 10 | -------------------------------------------------------------------------------- /aioscpy/core/scheduler/redis.py: -------------------------------------------------------------------------------- 1 | from aioscpy.core.scheduler import Scheduler 2 | from aioscpy.queue.redis import aio_priority_queue 3 | 4 | 5 | class RedisScheduler(Scheduler): 6 | 7 | @classmethod 8 | def from_crawler(cls, crawler): 9 | redis_tcp = crawler.settings.get('REDIS_URI') or \ 10 | crawler.settings.get('REDIS_TCP') 11 | queue_key = crawler.settings.get('QUEUE_KEY') % {'spider': crawler.spider.name} 12 | return cls(_queue_df=aio_priority_queue(queue_key, redis_tcp, crawler.spider), spider=crawler.spider, stats=crawler.stats) 13 | 14 | async def has_pending_requests(self): 15 | return await self.queue.qsize() > 0 16 | -------------------------------------------------------------------------------- /aioscpy/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Aioscpy core exceptions 3 | 4 | These exceptions are documented in docs/topics/exceptions.rst. Please don't add 5 | new exceptions here without documenting them there. 6 | """ 7 | 8 | # Internal 9 | 10 | 11 | class StopDownload(Exception): 12 | """ 13 | Stop the download of the body for a given response. 
14 | The 'fail' boolean parameter indicates whether or not the resulting partial response 15 | should be handled by the request errback. Note that 'fail' is a keyword-only argument. 16 | """ 17 | 18 | def __init__(self, *, fail=True): 19 | super().__init__() 20 | self.fail = fail 21 | 22 | 23 | class NotConfigured(Exception): 24 | """Indicates a missing configuration situation""" 25 | pass 26 | 27 | 28 | class _InvalidOutput(TypeError): 29 | """ 30 | Indicates an invalid value has been returned by a middleware's processing method. 31 | Internal and undocumented, it should not be raised or caught by user code. 32 | """ 33 | pass 34 | 35 | 36 | # HTTP and crawling 37 | 38 | 39 | class IgnoreRequest(Exception): 40 | """Indicates a decision was made not to process a request""" 41 | 42 | 43 | class DontCloseSpider(Exception): 44 | """Request the spider not to be closed yet""" 45 | pass 46 | 47 | 48 | class CloseSpider(Exception): 49 | """Raise this from callbacks to request the spider to be closed""" 50 | 51 | def __init__(self, reason='cancelled'): 52 | super(CloseSpider, self).__init__() 53 | self.reason = reason 54 | 55 | 56 | # Items 57 | 58 | 59 | class DropItem(Exception): 60 | """Drop item from the item pipeline""" 61 | pass 62 | 63 | 64 | class NotSupported(Exception): 65 | """Indicates a feature or method is not supported""" 66 | pass 67 | 68 | 69 | # Commands 70 | 71 | 72 | class UsageError(Exception): 73 | """To indicate a command-line usage error""" 74 | 75 | def __init__(self, *a, **kw): 76 | self.print_help = kw.pop('print_help', True) 77 | super(UsageError, self).__init__(*a, **kw) 78 | 79 | 80 | class AioscpyDeprecationWarning(Warning): 81 | """Warning category for deprecated features, since the default 82 | DeprecationWarning is silenced on Python 2.7+ 83 | """ 84 | pass 85 | 86 | 87 | class ContractFail(AssertionError): 88 | """Error raised in case of a failing contract""" 89 | pass 90 | -------------------------------------------------------------------------------- /aioscpy/http/__init__.py: -------------------------------------------------------------------------------- 1 | from aioscpy.http.request import Request 2 | from aioscpy.http.request.form import FormRequest 3 | from aioscpy.http.request.json import JsonRequest 4 | 5 | from aioscpy.http.response import Response 6 | from aioscpy.http.response.text import TextResponse 7 | 8 | 9 | __all__ = [ 10 | Request, 11 | FormRequest, 12 | JsonRequest, 13 | Response, 14 | TextResponse 15 | ] 16 | -------------------------------------------------------------------------------- /aioscpy/http/request/__init__.py: -------------------------------------------------------------------------------- 1 | from w3lib.url import safe_url_string 2 | 3 | 4 | class Request(object): 5 | 6 | def __init__(self, url, 7 | callback=None, 8 | method='GET', 9 | headers=None, 10 | body=None, 11 | json=None, 12 | cookies=None, 13 | meta=None, 14 | encoding='utf-8', 15 | priority=0, 16 | dont_filter=False, 17 | errback=None, flags=None, cb_kwargs=None): 18 | self._encoding = encoding 19 | self.method = str(method).upper() 20 | self._set_url(url) 21 | self._set_body(body) 22 | self._set_json(json) 23 | 24 | assert isinstance(priority, int), "Request priority not an integer: %r" % priority 25 | self.priority = priority 26 | 27 | if callback is not None and not callable(callback): 28 | raise TypeError('callback must be a callable, got %s' % 29 | type(callback).__name__) 30 | if errback is not None and not callable(errback): 31 | raise TypeError('errback must 
be a callable, got %s' % 32 | type(errback).__name__) 33 | self.callback = callback 34 | self.errback = errback 35 | 36 | self.cookies = cookies or {} 37 | self.headers = headers or {} 38 | self.dont_filter = dont_filter 39 | 40 | self._meta = dict(meta) if meta else None 41 | self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None 42 | self.flags = [] if flags is None else list(flags) 43 | 44 | @property 45 | def cb_kwargs(self): 46 | if self._cb_kwargs is None: 47 | self._cb_kwargs = {} 48 | return self._cb_kwargs 49 | 50 | @property 51 | def meta(self): 52 | if self._meta is None: 53 | self._meta = {} 54 | return self._meta 55 | 56 | def get(self, key, default): 57 | return self.meta.get(key, default) 58 | 59 | def _get_url(self): 60 | return self._url 61 | 62 | def _set_url(self, url): 63 | if not isinstance(url, str): 64 | raise TypeError( 65 | 'Request url must be str or unicode, got %s:' % type(url).__name__) 66 | 67 | s = safe_url_string(url, self.encoding) 68 | self._url = s 69 | 70 | if ('://' not in self._url) and (not self._url.startswith('data:')): 71 | raise ValueError('Missing scheme in request url: %s' % self._url) 72 | 73 | url = property(_get_url, _set_url) 74 | 75 | def _get_body(self): 76 | return self._body 77 | 78 | def _set_body(self, body): 79 | self._body = body or None 80 | 81 | body = property(_get_body, _set_body) 82 | 83 | def _get_json(self): 84 | return self._json 85 | 86 | def _set_json(self, json): 87 | self._json = json or None 88 | 89 | json = property(_get_json, _set_json) 90 | 91 | @property 92 | def encoding(self): 93 | return self._encoding 94 | 95 | def __str__(self): 96 | return "<%s %s>" % (self.method, self.url) 97 | 98 | __repr__ = __str__ 99 | 100 | def copy(self): 101 | """Return a copy of this Request""" 102 | return self.replace() 103 | 104 | def replace(self, *args, **kwargs): 105 | """Create a new Request with the same attributes except for those 106 | given new values. 
107 | """ 108 | for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags', 109 | 'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']: 110 | kwargs.setdefault(x, getattr(self, x)) 111 | cls = kwargs.pop('cls', self.__class__) 112 | return cls(*args, **kwargs) 113 | -------------------------------------------------------------------------------- /aioscpy/http/request/form.py: -------------------------------------------------------------------------------- 1 | from aioscpy.http.request import Request 2 | 3 | 4 | class FormRequest(Request): 5 | valid_form_methods = ['POST'] 6 | 7 | def __init__(self, *args, **kwargs): 8 | formdata = kwargs.pop('formdata', None) 9 | if formdata and kwargs.get('method') is None: 10 | kwargs['method'] = 'POST' 11 | 12 | super(FormRequest, self).__init__(*args, **kwargs) 13 | 14 | if formdata: 15 | if self.method == 'POST': 16 | self.headers.setdefault( 17 | b'Content-Type', [b'application/x-www-form-urlencoded']) 18 | self._set_body(formdata) 19 | -------------------------------------------------------------------------------- /aioscpy/http/request/json.py: -------------------------------------------------------------------------------- 1 | from aioscpy.http.request import Request 2 | 3 | 4 | class JsonRequest(Request): 5 | valid_form_methods = ['POST'] 6 | 7 | def __init__(self, *args, **kwargs): 8 | jsondata = kwargs.pop('jsondata', None) 9 | if jsondata and kwargs.get('method') is None: 10 | kwargs['method'] = 'POST' 11 | 12 | super(JsonRequest, self).__init__(*args, **kwargs) 13 | 14 | if jsondata: 15 | if self.method == 'POST': 16 | self.headers.setdefault( 17 | b'Content-Type', [b'application/json']) 18 | self._set_json(jsondata) 19 | -------------------------------------------------------------------------------- /aioscpy/http/response/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | from urllib.parse import urljoin 3 | 4 | from aioscpy.http.request import Request 5 | from aioscpy.http.request.form import FormRequest 6 | from aioscpy import call_grace_instance 7 | from aioscpy.utils.tools import obsolete_setter 8 | 9 | 10 | class Response(object): 11 | 12 | def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, certificate=None, _response=None): 13 | self.headers = headers or {} 14 | self.status = int(status) 15 | self._set_body(body) 16 | self._set_url(url) 17 | self.request = request 18 | self.flags = [] if flags is None else list(flags) 19 | self.certificate = certificate 20 | self._response = _response 21 | 22 | @property 23 | def cb_kwargs(self): 24 | try: 25 | return self.request.cb_kwargs 26 | except AttributeError: 27 | raise AttributeError( 28 | "Response.cb_kwargs not available, this response " 29 | "is not tied to any request" 30 | ) 31 | 32 | @property 33 | def meta(self): 34 | try: 35 | return self.request.meta 36 | except AttributeError: 37 | raise AttributeError( 38 | "Response.meta not available, this response " 39 | "is not tied to any request" 40 | ) 41 | 42 | def _get_url(self): 43 | return self._url 44 | 45 | def _set_url(self, url): 46 | if isinstance(url, str): 47 | self._url = url 48 | else: 49 | raise TypeError('%s url must be str, got %s:' % 50 | (type(self).__name__, type(url).__name__)) 51 | 52 | url = property(_get_url, obsolete_setter(_set_url, 'url')) 53 | 54 | def _get_body(self): 55 | return self._body 56 | 57 | def _set_body(self, body): 58 | if body is None: 59 | self._body 
= b'' 60 | elif not isinstance(body, bytes): 61 | raise TypeError( 62 | "Response body must be bytes. " 63 | "If you want to pass unicode body use TextResponse " 64 | "or HtmlResponse.") 65 | else: 66 | self._body = body 67 | 68 | body = property(_get_body, obsolete_setter(_set_body, 'body')) 69 | 70 | def __str__(self): 71 | return "<%d %s>" % (self.status, self.url) 72 | 73 | __repr__ = __str__ 74 | 75 | def copy(self): 76 | """Return a copy of this Response""" 77 | return self.replace() 78 | 79 | def replace(self, *args, **kwargs): 80 | """Create a new Response with the same attributes except for those 81 | given new values. 82 | """ 83 | for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'certificate']: 84 | kwargs.setdefault(x, getattr(self, x)) 85 | cls = kwargs.pop('cls', self.__class__) 86 | return cls(*args, **kwargs) 87 | 88 | def urljoin(self, url: str) -> str: 89 | """Join this Response's url with a possible relative url to form an 90 | absolute interpretation of the latter.""" 91 | return urljoin(self.url, url) 92 | 93 | @property 94 | def text(self): 95 | """For subclasses of TextResponse, this will return the body 96 | as str 97 | """ 98 | raise AttributeError("Response content isn't text") 99 | 100 | def css(self, *a, **kw): 101 | """Shortcut method implemented only by responses whose content 102 | is text (subclasses of TextResponse). 103 | """ 104 | # raise NotSupported("Response content isn't text") 105 | raise NotImplementedError 106 | 107 | def xpath(self, *a, **kw): 108 | """Shortcut method implemented only by responses whose content 109 | is text (subclasses of TextResponse). 110 | """ 111 | # raise NotSupported("Response content isn't text") 112 | raise NotImplementedError 113 | 114 | def follow(self, url, callback=None, method='GET', formdata=None, headers=None, body=None, 115 | cookies=None, meta=None, encoding='utf-8', priority=0, 116 | dont_filter=False, errback=None, cb_kwargs=None, flags=None, **kwargs) -> Request: 117 | 118 | url = self.urljoin(url) 119 | method_request = Request 120 | if method == "POST": 121 | method_request = FormRequest 122 | kwargs['formdata'] = formdata 123 | 124 | return call_grace_instance( 125 | method_request, 126 | url=url, 127 | callback=callback, 128 | method=method, 129 | headers=headers, 130 | body=body, 131 | cookies=cookies, 132 | meta=meta, 133 | encoding=encoding, 134 | priority=priority, 135 | dont_filter=dont_filter, 136 | errback=errback, 137 | cb_kwargs=cb_kwargs, 138 | flags=flags, 139 | **kwargs 140 | ) 141 | 142 | def follow_all(self, urls, callback=None, method='GET', headers=None, body=None, 143 | cookies=None, meta=None, encoding='utf-8', priority=0, 144 | dont_filter=False, errback=None, cb_kwargs=None, flags=None) -> Generator: 145 | if not hasattr(urls, '__iter__'): 146 | raise TypeError("'urls' argument must be an iterable") 147 | return ( 148 | self.follow( 149 | url=url, 150 | callback=callback, 151 | method=method, 152 | headers=headers, 153 | body=body, 154 | cookies=cookies, 155 | meta=meta, 156 | encoding=encoding, 157 | priority=priority, 158 | dont_filter=dont_filter, 159 | errback=errback, 160 | cb_kwargs=cb_kwargs, 161 | flags=flags, 162 | ) 163 | for url in urls 164 | ) 165 | -------------------------------------------------------------------------------- /aioscpy/inject.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from importlib import import_module 4 | from pkgutil import iter_modules 5 | 6 | from aioscpy.settings 
import Settings 7 | from aioscpy.utils.tools import singleton, get_project_settings 8 | 9 | 10 | @singleton 11 | class CSlot: 12 | 13 | def __init__(self): 14 | self._object_slot_cls = {} 15 | 16 | def get(self, sets: str, default=None) -> object: 17 | return self._object_slot_cls.get(sets, default) 18 | 19 | def set(self, sets: str, obj: object): 20 | self._object_slot_cls.__setitem__(sets, obj) 21 | 22 | def empty(self): 23 | return not bool(len(self._object_slot_cls)) 24 | 25 | 26 | class Slot: 27 | 28 | def __init__(self, settings, crawler): 29 | self._objects_slot = { 30 | 'settings': settings, 31 | 'crawler': crawler 32 | } 33 | self._modules_slot = [] 34 | self._close = None 35 | self.live_beat = None 36 | 37 | @property 38 | def is_live(self): 39 | return bool(self._close) 40 | 41 | def get(self, sets: str, default=None) -> object: 42 | return self._objects_slot.get(sets, default) 43 | 44 | def set(self, sets: str, obj: object): 45 | self._objects_slot.__setitem__(sets, obj) 46 | 47 | def clear(self): 48 | del self._objects_slot 49 | self._modules_slot = [] 50 | self._close = True 51 | 52 | def close(self): 53 | if self.live_beat: 54 | self.live_beat.cancel() 55 | 56 | 57 | class DependencyInjection(object): 58 | def __init__(self, settings: Settings = None, crawler=None): 59 | if not settings: 60 | settings = Settings() 61 | self.settings = settings 62 | self.crawler = crawler 63 | self.slot = Slot(settings, crawler) 64 | 65 | @classmethod 66 | def from_settings(cls, settings, crawler): 67 | return cls(settings, crawler) 68 | 69 | @classmethod 70 | def from_crawler(cls, crawler): 71 | return cls.from_settings(crawler.settings, crawler) 72 | 73 | def load(self, key: str): 74 | return self.slot.get(key) 75 | 76 | @staticmethod 77 | def load_all_spider(dirname): 78 | _class_objects = {} 79 | 80 | def load_all_spider_inner(dirname): 81 | for importer, package_name, ispkg in iter_modules([dirname]): 82 | if ispkg: 83 | load_all_spider_inner(dirname + '/' + package_name) 84 | else: 85 | module = importer.find_module(package_name) 86 | module = module.load_module(package_name) 87 | for cls_name in module.__dir__(): 88 | if cls_name == "__spiders__": 89 | class_object = getattr(module, cls_name) 90 | for co in class_object: 91 | _class_objects[co.name] = co 92 | if not cls_name.startswith('__'): 93 | class_object = getattr(module, cls_name) 94 | if hasattr(class_object, "name") and getattr(class_object, "name"): 95 | _class_objects[class_object.name] = class_object 96 | 97 | 98 | load_all_spider_inner(dirname) 99 | return _class_objects 100 | 101 | @staticmethod 102 | def load_object(path: str): 103 | try: 104 | dot = path.rindex('.') 105 | except ValueError: 106 | raise ValueError("Error loading object '%s': not a full path" % path) 107 | 108 | module, name = path[:dot], path[dot + 1:] 109 | mod = import_module(module) 110 | 111 | try: 112 | obj = getattr(mod, name) 113 | except AttributeError: 114 | raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name)) 115 | else: 116 | return obj 117 | 118 | def load_object_slot(self, key: str, path: str, cls=None): 119 | obj = self.load_object(path) 120 | if cls is None: 121 | obj = self.create_instance(obj, self.settings, self.crawler) 122 | self.slot.set(key, obj) 123 | else: 124 | self.c_slot.set(key, obj) 125 | return obj 126 | 127 | def walk_modules(self, path: str): 128 | mods = [] 129 | if hasattr(self, "slot"): 130 | mods = self.slot._modules_slot 131 | mod = import_module(path) 132 | mods.append(mod) 133 | if 
hasattr(mod, '__path__'): 134 | for _, subpath, ispkg in iter_modules(mod.__path__): 135 | fullpath = path + '.' + subpath 136 | if ispkg: 137 | mods += self.walk_modules(fullpath) 138 | else: 139 | submod = import_module(fullpath) 140 | mods.append(submod) 141 | return mods 142 | 143 | def create_instance(self, objcls, settings, crawler, *args, **kwargs): 144 | if settings is None: 145 | if crawler is None: 146 | raise ValueError("Specify at least one of settings and crawler.") 147 | settings = crawler.settings 148 | if not (type(objcls) == "function"): 149 | objcls = call_grace_instance(objcls, only_instance=True) 150 | if crawler and hasattr(objcls, 'from_crawler'): 151 | return objcls.from_crawler(crawler, *args, **kwargs) 152 | elif hasattr(objcls, 'from_settings'): 153 | return objcls.from_settings(settings, *args, **kwargs) 154 | else: 155 | return objcls(*args, **kwargs) 156 | 157 | async def inject_runner(self): 158 | if any([not self.settings.get('DI_CONFIG'), not self.settings.get('DI_CONFIG_CLS')]): 159 | raise KeyError('Settings DI_CONFIG/DI_CONFIG_CLS not be None') 160 | for key, value in self.settings['DI_CONFIG'].items(): 161 | self.load_object_slot(key, value) 162 | # self.slot.live_beat = asyncio.create_task(self.live_beat()) 163 | 164 | async def live_beat(self): 165 | while 1: 166 | if not self.slot.is_live: 167 | await asyncio.sleep(20) 168 | break 169 | asyncio.create_task(self.inject_runner()) 170 | 171 | 172 | class DependencyInjectionCls(DependencyInjection): 173 | 174 | def __init__(self): 175 | self.c_slot = CSlot() 176 | self.settings = get_project_settings() 177 | 178 | def inject(self): 179 | if self.c_slot.empty(): 180 | for key, value in self.settings['DI_CONFIG_CLS'].items(): 181 | self.load_object_slot(key, value, cls=True) 182 | return self.c_slot 183 | 184 | 185 | _create_dependency = DependencyInjectionCls() 186 | load_object = _create_dependency.load_object 187 | walk_modules = _create_dependency.walk_modules 188 | settings_ins = _create_dependency.settings 189 | 190 | 191 | class object_ref(type): 192 | def __init__(msc, *args, **kwargs): 193 | msc.di = _create_dependency.inject() 194 | msc.logger = msc.di.get("log").logger 195 | super().__init__(*args, **kwargs) 196 | 197 | 198 | def call_grace_instance(obj, *args, only_instance=None, **kwargs): 199 | 200 | if isinstance(obj, str): 201 | obj = load_object(settings_ins['DI_CREATE_CLS'].get(obj)) 202 | 203 | class Inner(obj, metaclass=object_ref): 204 | pass 205 | if only_instance is None: 206 | return Inner(*args, **kwargs) 207 | else: 208 | return Inner 209 | -------------------------------------------------------------------------------- /aioscpy/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/libs/__init__.py -------------------------------------------------------------------------------- /aioscpy/libs/downloadermiddlewares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/libs/downloadermiddlewares/__init__.py -------------------------------------------------------------------------------- /aioscpy/libs/downloadermiddlewares/stats.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlunparse 2 | 3 | from aioscpy.exceptions import NotConfigured 
4 | from aioscpy.utils.tools import to_bytes 5 | from aioscpy.utils.othtypes import urlparse_cached 6 | 7 | 8 | def global_object_name(obj): 9 | return f"{obj.__module__}.{obj.__name__}" 10 | 11 | 12 | def request_httprepr(request: "Request") -> bytes: 13 | """Return the raw HTTP representation (as bytes) of the given request. 14 | This is provided only for reference since it's not the actual stream of 15 | bytes that will be sent when performing the request (that's controlled 16 | by the download handler). 17 | """ 18 | parsed = urlparse_cached(request) 19 | path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, '')) 20 | s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n" 21 | s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n" 22 | if request.headers: 23 | s += request.headers.to_string() + b"\r\n" 24 | s += b"\r\n" 25 | s += str(request.body).encode() if request.body and isinstance(request.body, dict) else b"" 26 | return s 27 | 28 | 29 | def get_header_size(headers): 30 | size = 0 31 | for key, value in headers.items(): 32 | if isinstance(value, (list, tuple)): 33 | for v in value: 34 | size += len(b": ") + len(key) + len(v) 35 | return size + len(b'\r\n') * (len(headers.keys()) - 1) 36 | 37 | 38 | class DownloaderStats: 39 | 40 | def __init__(self, stats): 41 | self.stats = stats 42 | 43 | @classmethod 44 | def from_crawler(cls, crawler): 45 | if not crawler.settings.getbool('DOWNLOADER_STATS'): 46 | raise NotConfigured 47 | return cls(crawler.stats) 48 | 49 | def process_request(self, request, spider): 50 | self.stats.inc_value('downloader/request_count', spider=spider) 51 | self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider) 52 | reqlen = len(request_httprepr(request)) 53 | self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider) 54 | 55 | def process_response(self, request, response, spider): 56 | self.stats.inc_value('downloader/response_count', spider=spider) 57 | self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider) 58 | reslen = len(response.body) + get_header_size(response.headers) + 4 59 | # response.body + b"\r\n"+ response.header + b"\r\n" + response.status 60 | self.stats.inc_value('downloader/response_bytes', reslen, spider=spider) 61 | return response 62 | 63 | def process_exception(self, request, exception, spider): 64 | ex_class = global_object_name(exception.__class__) 65 | self.stats.inc_value('downloader/exception_count', spider=spider) 66 | self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider) 67 | -------------------------------------------------------------------------------- /aioscpy/libs/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/libs/extensions/__init__.py -------------------------------------------------------------------------------- /aioscpy/libs/extensions/corestats.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from aioscpy import signals 4 | 5 | 6 | class CoreStats: 7 | 8 | def __init__(self, stats): 9 | self.stats = stats 10 | self.start_time = None 11 | 12 | @classmethod 13 | def from_crawler(cls, crawler): 14 | o = cls(crawler.stats) 15 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) 16 | crawler.signals.connect(o.spider_closed, 
signal=signals.spider_closed) 17 | crawler.signals.connect(o.item_scraped, signal=signals.item_scraped) 18 | crawler.signals.connect(o.item_dropped, signal=signals.item_dropped) 19 | crawler.signals.connect(o.response_received, signal=signals.response_received) 20 | return o 21 | 22 | def spider_opened(self, spider): 23 | self.start_time = datetime.utcnow() 24 | self.stats.set_value('start_time', self.start_time, spider=spider) 25 | 26 | def spider_closed(self, spider, reason): 27 | finish_time = datetime.utcnow() 28 | elapsed_time = finish_time - self.start_time 29 | elapsed_time_seconds = elapsed_time.total_seconds() 30 | self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider) 31 | self.stats.set_value('finish_time', finish_time, spider=spider) 32 | self.stats.set_value('finish_reason', reason, spider=spider) 33 | 34 | def item_scraped(self, item, spider): 35 | self.stats.inc_value('item_scraped_count', spider=spider) 36 | 37 | def response_received(self, spider): 38 | self.stats.inc_value('response_received_count', spider=spider) 39 | 40 | def item_dropped(self, item, spider, exception): 41 | reason = exception.__class__.__name__ 42 | self.stats.inc_value('item_dropped_count', spider=spider) 43 | self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider) 44 | -------------------------------------------------------------------------------- /aioscpy/libs/extensions/logstats.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.exceptions import NotConfigured 4 | from aioscpy import signals 5 | 6 | 7 | class LogStats: 8 | """Log basic scraping stats periodically""" 9 | 10 | def __init__(self, stats, interval=60.0): 11 | self.stats = stats 12 | self.interval = interval 13 | self.multiplier = 60.0 / self.interval 14 | self.task = None 15 | self._close_stats = 0 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | interval = crawler.settings.getfloat('LOGSTATS_INTERVAL') 20 | if not interval: 21 | raise NotConfigured 22 | o = cls(crawler.stats, interval) 23 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) 24 | crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) 25 | return o 26 | 27 | def spider_opened(self, spider): 28 | self.pagesprev = 0 29 | self.itemsprev = 0 30 | self.task = asyncio.create_task(self.log(spider)) 31 | 32 | async def log(self, spider): 33 | await asyncio.sleep(self.interval) 34 | items = self.stats.get_value('item_scraped_count', 0) 35 | pages = self.stats.get_value('response_received_count', 0) 36 | irate = (items - self.itemsprev) * self.multiplier 37 | prate = (pages - self.pagesprev) * self.multiplier 38 | self.pagesprev, self.itemsprev = pages, items 39 | 40 | msg = ("<{spider_name}> Crawled {pages} pages (at {pagerate} pages/min), " 41 | "scraped {items} items (at {itemrate} items/min)") 42 | log_args = {'pages': pages, 'pagerate': prate, 'spider_name': spider.name, 43 | 'items': items, 'itemrate': irate} 44 | self.logger.info(msg, **log_args, extra={'spider': spider}) 45 | self.task = asyncio.create_task(self.log(spider)) 46 | 47 | def spider_closed(self, spider, reason): 48 | if self.task and not self.task.done(): 49 | self.logger.warning(f'[{spider.name}] logstats received close signal! 
reason: {reason}') 50 | self.task.cancel() 51 | -------------------------------------------------------------------------------- /aioscpy/libs/statscollectors.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | 4 | class StatsCollector: 5 | 6 | @classmethod 7 | def from_crawler(cls, crawler): 8 | return cls(crawler) 9 | 10 | def __init__(self, crawler): 11 | self._dump = crawler.settings.getbool('STATS_DUMP') 12 | self._stats = {} 13 | 14 | def get_value(self, key, default=None, spider=None): 15 | return self._stats.get(key, default) 16 | 17 | def get_stats(self, spider=None): 18 | return self._stats 19 | 20 | def set_value(self, key, value, spider=None): 21 | self._stats[key] = value 22 | 23 | def set_stats(self, stats, spider=None): 24 | self._stats = stats 25 | 26 | def inc_value(self, key, count=1, start=0, spider=None): 27 | d = self._stats 28 | d[key] = d.setdefault(key, start) + count 29 | 30 | def max_value(self, key, value, spider=None): 31 | self._stats[key] = max(self._stats.setdefault(key, value), value) 32 | 33 | def min_value(self, key, value, spider=None): 34 | self._stats[key] = min(self._stats.setdefault(key, value), value) 35 | 36 | def clear_stats(self, spider=None): 37 | self._stats.clear() 38 | 39 | def open_spider(self, spider): 40 | pass 41 | 42 | def close_spider(self, spider, reason): 43 | if self._dump: 44 | self.logger.info("Dumping Aioscpy stats:\n {stats}", **{'stats': pprint.pformat(self._stats)}, 45 | extra={'spider': spider}) 46 | self._persist_stats(self._stats, spider) 47 | 48 | def _persist_stats(self, stats, spider): 49 | pass 50 | 51 | 52 | class MemoryStatsCollector(StatsCollector): 53 | 54 | def __init__(self, crawler): 55 | super().__init__(crawler) 56 | self.spider_stats = {} 57 | 58 | def _persist_stats(self, stats, spider): 59 | self.spider_stats[spider.name] = stats 60 | 61 | 62 | class DummyStatsCollector(StatsCollector): 63 | 64 | def get_value(self, key, default=None, spider=None): 65 | return default 66 | 67 | def set_value(self, key, value, spider=None): 68 | pass 69 | 70 | def set_stats(self, stats, spider=None): 71 | pass 72 | 73 | def inc_value(self, key, count=1, start=0, spider=None): 74 | pass 75 | 76 | def max_value(self, key, value, spider=None): 77 | pass 78 | 79 | def min_value(self, key, value, spider=None): 80 | pass 81 | -------------------------------------------------------------------------------- /aioscpy/logformatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from aioscpy.utils.tools import referer_str 4 | 5 | 6 | SCRAPEDMSG = "Scraped from {src}" + os.linesep + "{item}" 7 | DROPPEDMSG = "Dropped: {exception}" + os.linesep + "{item}" 8 | CRAWLEDMSG = "Crawled ({status}) {request}{request_flags} (referer: {referer}){response_flags}" 9 | ITEMERRORMSG = "Error processing {item}" 10 | SPIDERERRORMSG = "Spider error processing {request} (referer: {referer})" 11 | DOWNLOADERRORMSG_SHORT = "Error downloading {request}" 12 | DOWNLOADERRORMSG_LONG = "Error downloading {request}: {errmsg}" 13 | 14 | 15 | class LogFormatter: 16 | 17 | def crawled(self, request, response, spider): 18 | request_flags = f' {str(request.flags)}' if request.flags else '' 19 | response_flags = f' {str(response.flags)}' if response.flags else '' 20 | return { 21 | 'level': "DEBUG", 22 | 'msg': CRAWLEDMSG, 23 | 'args': { 24 | 'status': response.status, 25 | 'request': request, 26 | 'request_flags': request_flags, 27 | 'referer': 
referer_str(request), 28 | 'response_flags': response_flags, 29 | # backward compatibility with Aioscpy logformatter below 1.4 version 30 | 'flags': response_flags 31 | } 32 | } 33 | 34 | def scraped(self, item, response, spider): 35 | """Logs a message when an item is scraped by a spider.""" 36 | src = response 37 | return { 38 | 'level': "DEBUG", 39 | 'msg': SCRAPEDMSG, 40 | 'args': { 41 | 'src': src, 42 | 'item': item, 43 | } 44 | } 45 | 46 | def dropped(self, item, exception, response, spider): 47 | """Logs a message when an item is dropped while it is passing through the item pipeline.""" 48 | return { 49 | 'level': "WARNING", 50 | 'msg': DROPPEDMSG, 51 | 'args': { 52 | 'exception': exception, 53 | 'item': item, 54 | } 55 | } 56 | 57 | def item_error(self, item, exception, response, spider): 58 | """Logs a message when an item causes an error while it is passing 59 | through the item pipeline. 60 | 61 | .. versionadded:: 2.0 62 | """ 63 | return { 64 | 'level': "ERROR", 65 | 'msg': ITEMERRORMSG, 66 | 'args': { 67 | 'item': item, 68 | } 69 | } 70 | 71 | def spider_error(self, failure, request, response, spider): 72 | """Logs an error message from a spider. 73 | 74 | .. versionadded:: 2.0 75 | """ 76 | return { 77 | 'level': "ERROR", 78 | 'msg': SPIDERERRORMSG, 79 | 'args': { 80 | 'request': request, 81 | 'referer': referer_str(request), 82 | } 83 | } 84 | 85 | def download_error(self, failure, request, spider, errmsg=None): 86 | """Logs a download error message from a spider (typically coming from 87 | the engine). 88 | 89 | .. versionadded:: 2.0 90 | """ 91 | args = {'request': request} 92 | if errmsg: 93 | msg = DOWNLOADERRORMSG_LONG 94 | args['errmsg'] = errmsg 95 | else: 96 | msg = DOWNLOADERRORMSG_SHORT 97 | return { 98 | 'level': "ERROR", 99 | 'msg': msg, 100 | 'args': args, 101 | } 102 | 103 | @classmethod 104 | def from_crawler(cls, crawler): 105 | return cls() 106 | -------------------------------------------------------------------------------- /aioscpy/middleware/__init__.py: -------------------------------------------------------------------------------- 1 | from aioscpy.middleware.downloader import DownloaderMiddlewareManager 2 | from aioscpy.middleware.itempipeline import ItemPipelineManager 3 | from aioscpy.middleware.extension import ExtensionManager 4 | 5 | 6 | __all__ = [ 7 | "DownloaderMiddlewareManager", 8 | "ItemPipelineManager", 9 | "ExtensionManager", 10 | ] 11 | -------------------------------------------------------------------------------- /aioscpy/middleware/adaptive_concurrency.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import deque 3 | 4 | from aioscpy.middleware.manager import MiddlewareManager 5 | 6 | 7 | class AdaptiveConcurrencyMiddleware: 8 | """ 9 | Middleware that adjusts concurrency based on response times. 10 | 11 | This middleware monitors response times and adjusts the concurrency 12 | settings dynamically to maintain optimal performance. 
13 | """ 14 | 15 | def __init__(self, crawler): 16 | self.crawler = crawler 17 | self.settings = crawler.settings 18 | self.enabled = self.settings.getbool('ADAPTIVE_CONCURRENCY_ENABLED', False) 19 | 20 | if not self.enabled: 21 | return 22 | 23 | # Configuration 24 | self.target_response_time = self.settings.getfloat('ADAPTIVE_CONCURRENCY_TARGET_RESPONSE_TIME', 1.0) 25 | self.min_concurrency = self.settings.getint('ADAPTIVE_CONCURRENCY_MIN_REQUESTS', 8) 26 | self.max_concurrency = self.settings.getint('ADAPTIVE_CONCURRENCY_MAX_REQUESTS', 32) 27 | self.window_size = self.settings.getint('ADAPTIVE_CONCURRENCY_WINDOW_SIZE', 20) 28 | self.adjustment_interval = self.settings.getint('ADAPTIVE_CONCURRENCY_ADJUSTMENT_INTERVAL', 10) 29 | 30 | # State 31 | self.response_times = deque(maxlen=self.window_size) 32 | self.last_adjustment_time = time.time() 33 | self.current_concurrency = self.settings.getint('CONCURRENT_REQUESTS', 16) 34 | 35 | # Set initial concurrency 36 | self.crawler.settings.set('CONCURRENT_REQUESTS', self.current_concurrency) 37 | self.logger.info(f"Adaptive concurrency enabled. Initial concurrency: {self.current_concurrency}") 38 | 39 | @classmethod 40 | def from_crawler(cls, crawler): 41 | return cls(crawler) 42 | 43 | async def process_request(self, request, spider): 44 | if not self.enabled: 45 | return None 46 | 47 | # Store request start time 48 | request.meta['request_start_time'] = time.time() 49 | return None 50 | 51 | async def process_response(self, request, response, spider): 52 | if not self.enabled or 'request_start_time' not in request.meta: 53 | return response 54 | 55 | # Calculate response time 56 | response_time = time.time() - request.meta['request_start_time'] 57 | self.response_times.append(response_time) 58 | 59 | # Adjust concurrency if needed 60 | current_time = time.time() 61 | if (current_time - self.last_adjustment_time) >= self.adjustment_interval and len(self.response_times) >= self.window_size: 62 | self._adjust_concurrency() 63 | self.last_adjustment_time = current_time 64 | 65 | return response 66 | 67 | def _adjust_concurrency(self): 68 | """Adjust concurrency based on average response time""" 69 | avg_response_time = sum(self.response_times) / len(self.response_times) 70 | 71 | # Calculate adjustment factor 72 | adjustment_factor = self.target_response_time / avg_response_time 73 | 74 | # Apply adjustment with limits 75 | new_concurrency = int(self.current_concurrency * adjustment_factor) 76 | new_concurrency = max(self.min_concurrency, min(self.max_concurrency, new_concurrency)) 77 | 78 | # Only update if there's a significant change 79 | if new_concurrency != self.current_concurrency: 80 | self.current_concurrency = new_concurrency 81 | self.crawler.settings.set('CONCURRENT_REQUESTS', new_concurrency) 82 | self.logger.info( 83 | f"Adjusted concurrency to {new_concurrency} (avg response time: {avg_response_time:.2f}s, " 84 | f"target: {self.target_response_time:.2f}s)" 85 | ) 86 | -------------------------------------------------------------------------------- /aioscpy/middleware/downloader.py: -------------------------------------------------------------------------------- 1 | from asyncio import iscoroutinefunction 2 | 3 | from aioscpy.exceptions import _InvalidOutput 4 | from aioscpy.utils.common import build_component_list 5 | from aioscpy.middleware.manager import MiddlewareManager 6 | 7 | 8 | class DownloaderMiddlewareManager(MiddlewareManager): 9 | component_name = 'downloader middleware' 10 | 11 | @classmethod 12 | def 
_get_mwlist_from_settings(cls, settings): 13 | return build_component_list( 14 | settings.getwithbase('DOWNLOADER_MIDDLEWARES')) 15 | 16 | def _add_middleware(self, mw): 17 | if hasattr(mw, 'process_request'): 18 | self.methods['process_request'].append(mw.process_request) 19 | if hasattr(mw, 'process_response'): 20 | self.methods['process_response'].appendleft(mw.process_response) 21 | if hasattr(mw, 'process_exception'): 22 | self.methods['process_exception'].appendleft(mw.process_exception) 23 | 24 | async def process_request(self, spider, request): 25 | for method in self.methods['process_request']: 26 | if iscoroutinefunction(method): 27 | response = await method(request=request, spider=spider) 28 | else: 29 | response = method(request=request, spider=spider) 30 | if response is not None and not isinstance(response, (self.di.get("response"), self.di.get('request'))): 31 | raise _InvalidOutput( 32 | "Middleware %s.process_request must return None, Response or Request, got %s" 33 | % (method.__self__.__class__.__name__, response.__class__.__name__) 34 | ) 35 | if response: 36 | return response 37 | 38 | async def process_response(self, spider, request, response): 39 | if response is None: 40 | raise TypeError("Received None in process_response") 41 | elif isinstance(response, self.di.get('request')): 42 | return response 43 | 44 | for method in self.methods['process_response']: 45 | if iscoroutinefunction(method): 46 | response = await method(request=request, response=response, spider=spider) 47 | else: 48 | response = method(request=request, response=response, spider=spider) 49 | if not isinstance(response, (self.di.get("response"), self.di.get('request'))): 50 | raise _InvalidOutput( 51 | "Middleware %s.process_response must return Response or Request, got %s" 52 | % (method.__self__.__class__.__name__, type(response)) 53 | ) 54 | if isinstance(response, self.di.get('request')): 55 | return response 56 | return response 57 | 58 | async def process_exception(self, spider, request, exception): 59 | for method in self.methods['process_exception']: 60 | if iscoroutinefunction(method): 61 | response = await method(request=request, exception=exception, spider=spider) 62 | else: 63 | response = method(request=request, exception=exception, spider=spider) 64 | if response is not None and not isinstance(response, (self.di.get('response'), self.di.get('request'))): 65 | raise _InvalidOutput( 66 | "Middleware %s.process_exception must return None, Response or Request, got %s" 67 | % (method.__self__.__class__.__name__, type(response)) 68 | ) 69 | if response: 70 | return response 71 | return exception 72 | 73 | -------------------------------------------------------------------------------- /aioscpy/middleware/extension.py: -------------------------------------------------------------------------------- 1 | from aioscpy.middleware.manager import MiddlewareManager 2 | from aioscpy.utils.common import build_component_list 3 | 4 | 5 | class ExtensionManager(MiddlewareManager): 6 | 7 | component_name = 'extension' 8 | 9 | @classmethod 10 | def _get_mwlist_from_settings(cls, settings): 11 | return build_component_list(settings.getwithbase('EXTENSIONS')) 12 | -------------------------------------------------------------------------------- /aioscpy/middleware/itempipeline.py: -------------------------------------------------------------------------------- 1 | from aioscpy.middleware.manager import MiddlewareManager 2 | from aioscpy.utils.common import build_component_list 3 | 4 | 5 | class 
ItemPipelineManager(MiddlewareManager): 6 | component_name = 'item pipeline' 7 | 8 | @classmethod 9 | def _get_mwlist_from_settings(cls, settings): 10 | return build_component_list(settings.getwithbase('ITEM_PIPELINES')) 11 | 12 | def _add_middleware(self, pipe): 13 | super()._add_middleware(pipe) 14 | if hasattr(pipe, 'process_item'): 15 | self.methods['process_item'].append(pipe.process_item) 16 | 17 | async def process_item(self, item, spider): 18 | return await self._process_chain('process_item', item, spider) 19 | -------------------------------------------------------------------------------- /aioscpy/middleware/manager.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | from asyncio import iscoroutinefunction 4 | from collections import defaultdict, deque 5 | 6 | from aioscpy.exceptions import NotConfigured 7 | 8 | 9 | class MiddlewareManager: 10 | """Base class for implementing middleware managers""" 11 | 12 | component_name = 'foo middleware' 13 | 14 | def __init__(self, crawler=None, middlewares=None): 15 | self.crawler = crawler 16 | self.middlewares = middlewares 17 | self.methods = defaultdict(deque) 18 | for mw in middlewares: 19 | self._add_middleware(mw) 20 | 21 | @classmethod 22 | def _get_mwlist_from_settings(cls, settings): 23 | raise NotImplementedError 24 | 25 | @classmethod 26 | def from_settings(cls, settings, crawler=None): 27 | mwlist = cls._get_mwlist_from_settings(settings) 28 | middlewares = [] 29 | enabled = [] 30 | for clspath in mwlist: 31 | try: 32 | mw = crawler.DI.load_object_slot(clspath.split('.')[-2], clspath) 33 | middlewares.append(mw) 34 | enabled.append(clspath) 35 | except NotConfigured as e: 36 | if e.args: 37 | clsname = clspath.split('.')[-1] 38 | cls.logger.warning("Disabled {clsname}: {eargs}", 39 | **{'clsname': clsname, 'eargs': e.args[0]}, 40 | extra={'crawler': crawler}) 41 | if enabled: 42 | cls.logger.info("Enabled {name} {componentname}s:\n{enabledlist}", 43 | **{'componentname': cls.component_name, 44 | 'enabledlist': pprint.pformat(enabled), 45 | 'name': crawler.spider.name}, 46 | extra={'crawler': crawler}) 47 | return cls(crawler=crawler, middlewares=middlewares) 48 | 49 | @classmethod 50 | def from_crawler(cls, crawler): 51 | return cls.from_settings(crawler.settings, crawler) 52 | 53 | def _add_middleware(self, mw): 54 | if hasattr(mw, 'open_spider'): 55 | self.methods['open_spider'].append(mw.open_spider) 56 | if hasattr(mw, 'close_spider'): 57 | self.methods['close_spider'].appendleft(mw.close_spider) 58 | 59 | async def _process_parallel(self, methodname, obj, *args): 60 | return await self.process_parallel(self.methods[methodname], obj, *args) 61 | 62 | async def _process_chain(self, methodname, obj, *args): 63 | return await self.process_chain(self.methods[methodname], obj, *args) 64 | 65 | async def _process_chain_both(self, cb_methodname, eb_methodname, obj, *args): 66 | return await self.process_chain_both(self.methods[cb_methodname], 67 | self.methods[eb_methodname], obj, *args) 68 | 69 | async def open_spider(self, spider): 70 | return await self._process_parallel('open_spider', spider) 71 | 72 | async def close_spider(self, spider): 73 | return await self._process_parallel('close_spider', spider) 74 | 75 | @staticmethod 76 | async def process_parallel(callbacks, input_, *a, **kw): 77 | for callback in callbacks: 78 | if iscoroutinefunction(callback): 79 | await callback(input_, *a, **kw) 80 | else: 81 | callback(input_, *a, **kw) 82 | 83 | @staticmethod 84 | 
async def process_chain(callbacks, input_, *a, **kw): 85 | for callback in callbacks: 86 | if iscoroutinefunction(callback): 87 | input_result = await callback(input_, *a, **kw) 88 | else: 89 | input_result = callback(input_, *a, **kw) 90 | if input_result is not None: 91 | input_ = input_result 92 | return input_ 93 | 94 | @staticmethod 95 | async def process_chain_both(callbacks, errbacks, input_, *a, **kw): 96 | for cb, eb in zip(callbacks, errbacks): 97 | try: 98 | if iscoroutinefunction(cb): 99 | input_ = await cb(input_, *a, **kw) 100 | else: 101 | input_ = cb(input_, *a, **kw) 102 | except(Exception, BaseException) as e: 103 | if iscoroutinefunction(cb): 104 | input_ = await eb(input_, *a, **kw) 105 | else: 106 | input_ = eb(input_, *a, **kw) 107 | return input_ 108 | -------------------------------------------------------------------------------- /aioscpy/queue/__init__.py: -------------------------------------------------------------------------------- 1 | from aioscpy.queue.compat import COMPAT_TYPE 2 | 3 | from aioscpy.queue.convert import request_from_dict, request_to_dict 4 | 5 | 6 | class BaseQueue(object): 7 | 8 | __slots__ = ["server", "key", "serializer", "spider"] 9 | __compat__ = COMPAT_TYPE 10 | 11 | def __init__(self, server, spider=None, key=None, serializer=None): 12 | if serializer is None: 13 | serializer = self.__compat__[serializer or "json"] 14 | 15 | if not hasattr(serializer, 'loads'): 16 | raise TypeError("serializer does not implement 'loads' function: %r" 17 | % serializer) 18 | if not hasattr(serializer, 'dumps'): 19 | raise TypeError("serializer does not implement 'dumps' function: %r" 20 | % serializer) 21 | 22 | self.server = server 23 | self.key = key or 'sp:requests' 24 | self.serializer = serializer 25 | self.spider = spider 26 | 27 | def _encode_request(self, request) -> bytes: 28 | obj = request_to_dict(request, self.spider) 29 | return self.serializer.dumps(obj) 30 | 31 | def _decode_request(self, encoded_request: bytes) -> dict: 32 | obj = self.serializer.loads(encoded_request) 33 | return request_from_dict(obj, self.spider) 34 | # return obj 35 | 36 | def __len__(self): 37 | raise Exception('please use function len()') 38 | 39 | async def qsize(self): 40 | raise NotImplementedError 41 | 42 | async def push(self, request): 43 | raise NotImplementedError 44 | 45 | async def pop(self, timeout=0): 46 | raise NotImplementedError 47 | 48 | async def clear(self): 49 | await self.server.delete(self.key) 50 | 51 | async def close(self): 52 | if hasattr(self.server, "close"): 53 | await self.server.close() 54 | -------------------------------------------------------------------------------- /aioscpy/queue/compat.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | 4 | from aioscpy.utils.tools import to_unicode 5 | 6 | 7 | def _request_byte2str(obj): 8 | _encoding = obj.get('_encoding', 'utf-8') 9 | if isinstance(obj['body'], bytes): 10 | _body = obj['body'].decode(_encoding) 11 | elif isinstance(obj['body'], dict): 12 | _body = json.dumps(obj['body']) 13 | else: 14 | _body = obj['body'] 15 | _headers = {} 16 | for k, v in obj['headers'].items(): 17 | if isinstance(k, bytes) or isinstance(v, bytes): 18 | _headers.update({to_unicode(k, encoding=_encoding): to_unicode(b','.join(v), encoding=_encoding)}) 19 | else: 20 | _headers.update({k: v}) 21 | obj.update({ 22 | 'body': _body, 23 | 'headers': _headers 24 | }) 25 | return obj 26 | 27 | 28 | class PickleCompat: 29 | 30 | @staticmethod 31 
| def loads(s: bytes) -> dict: 32 | return pickle.loads(s) 33 | 34 | @staticmethod 35 | def dumps(obj) -> bytes: 36 | return pickle.dumps(obj, protocol=-1) 37 | 38 | 39 | class JsonCompat: 40 | 41 | @staticmethod 42 | def loads(s: bytes) -> dict: 43 | return json.loads(s) 44 | 45 | @staticmethod 46 | def dumps(obj) -> str: 47 | return json.dumps(_request_byte2str(obj)) 48 | 49 | 50 | COMPAT_TYPE = { 51 | "pickle": PickleCompat, 52 | "json": JsonCompat 53 | } 54 | 55 | __all__ = [ 56 | COMPAT_TYPE, 57 | ] 58 | -------------------------------------------------------------------------------- /aioscpy/queue/convert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for serializing (and deserializing) requests. 3 | """ 4 | import inspect 5 | import json 6 | 7 | from aioscpy import call_grace_instance 8 | from aioscpy.http import Request 9 | from aioscpy.utils.tools import to_unicode 10 | from aioscpy.inject import load_object 11 | from anti_header import Headers 12 | 13 | 14 | def request_to_dict(request, spider=None): 15 | """Convert Request object to a dict. 16 | 17 | If a spider is given, it will try to find out the name of the spider method 18 | used in the callback and store that as the callback. 19 | """ 20 | cb = request.callback 21 | if callable(cb): 22 | cb = _find_method(spider, cb) 23 | eb = request.errback 24 | if callable(eb): 25 | eb = _find_method(spider, eb) 26 | d = { 27 | 'url': to_unicode(request.url), # urls should be safe (safe_string_url) 28 | 'callback': cb, 29 | 'errback': eb, 30 | 'method': request.method, 31 | 'headers': dict(request.headers), 32 | 'body': request.body, 33 | 'json': request.json, 34 | 'cookies': request.cookies, 35 | 'meta': request.meta, 36 | '_encoding': request._encoding, 37 | 'priority': request.priority, 38 | 'dont_filter': request.dont_filter, 39 | 'flags': request.flags, 40 | 'cb_kwargs': request.cb_kwargs, 41 | } 42 | _body = getattr(request, "body") 43 | _json = getattr(request, "json") 44 | if _body and isinstance(_body, dict) or _json and isinstance(_json, dict): 45 | base_cls = request.__class__.__bases__[0] 46 | d['_class'] = base_cls.__module__ + '.' + base_cls.__name__ 47 | return d 48 | 49 | 50 | def request_from_dict(d, spider=None): 51 | """Create Request object from a dict. 52 | 53 | If a spider is given, it will try to resolve the callbacks looking at the 54 | spider for methods with the same name. 
55 | """ 56 | cb = d.get('callback', 'parse') 57 | if cb and spider: 58 | cb = _get_method(spider, cb) 59 | eb = d.get('errback') 60 | if eb and spider: 61 | eb = _get_method(spider, eb) 62 | request_cls = load_object(d['_class']) if '_class' in d else Request 63 | 64 | _json, _body = None, None 65 | if request_cls.__name__ in ["FormRequest"]: 66 | if d.get('body') and isinstance(d.get('body'), dict): 67 | _body = d['body'] 68 | elif d.get('body') and isinstance(d.get('body'), str): 69 | _body = json.loads(d['body']) 70 | elif request_cls.__name__ in ["JsonRequest"]: 71 | if d.get('json') and isinstance(d.get('json'), dict): 72 | _json = d['json'] 73 | elif d.get('json') and isinstance(d.get('json'), str): 74 | _json = json.loads(d['json']) 75 | 76 | return call_grace_instance( 77 | request_cls, 78 | url=to_unicode(d['url']), 79 | callback=cb, 80 | errback=eb, 81 | method=d.get('method', 'GET'), 82 | headers=Headers(d.get('headers', {})), 83 | body=_body, 84 | json=_json, 85 | cookies=d.get('cookies'), 86 | meta=d.get('meta'), 87 | encoding=d.get('_encoding', 'utf-8'), 88 | priority=d.get('priority', 0), 89 | dont_filter=d.get('dont_filter', True), 90 | flags=d.get('flags'), 91 | cb_kwargs=d.get('cb_kwargs'), 92 | ) 93 | 94 | 95 | def _find_method(obj, func): 96 | # Only instance methods contain ``__func__`` 97 | if obj and hasattr(func, '__func__'): 98 | members = inspect.getmembers(obj, predicate=inspect.ismethod) 99 | for name, obj_func in members: 100 | # We need to use __func__ to access the original 101 | # function object because instance method objects 102 | # are generated each time attribute is retrieved from 103 | # instance. 104 | # 105 | # Reference: The standard type hierarchy 106 | # https://docs.python.org/3/reference/datamodel.html 107 | if obj_func.__func__ is func.__func__: 108 | return name 109 | raise ValueError(f"Function {func} is not an instance method in: {obj}") 110 | 111 | 112 | def _get_method(obj, name): 113 | name = str(name) 114 | try: 115 | return getattr(obj, name) 116 | except AttributeError: 117 | raise ValueError(f"Method {name!r} not found in: {obj}") 118 | -------------------------------------------------------------------------------- /aioscpy/queue/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from ._queue import spider_queue, memory_queue 2 | 3 | 4 | __all__ = [ 5 | spider_queue, 6 | memory_queue 7 | ] 8 | -------------------------------------------------------------------------------- /aioscpy/queue/memory/_queue.py: -------------------------------------------------------------------------------- 1 | from asyncio import Queue 2 | 3 | from aioscpy.queue import BaseQueue 4 | 5 | 6 | class PriorityQueue(BaseQueue): 7 | 8 | def __init__(self, server, spider, serializer="pickle"): 9 | super().__init__(server, spider) 10 | self.serializer = self.__compat__[serializer] 11 | 12 | def qsize(self) -> int: 13 | """Return the length of the queue""" 14 | return self.server.qsize() 15 | 16 | async def push(self, request): 17 | data = self._encode_request(request) 18 | await self.server.put(data) 19 | 20 | async def pop(self, timeout: int = 0, count: int = 0) -> list: 21 | _item = await self.server.get() 22 | return [self._decode_request(_item)] 23 | 24 | 25 | def memory_queue(spider) -> PriorityQueue: 26 | """ 27 | async def run(): 28 | queue = memery_queue('message:queue') 29 | await queue.push({"url": "https://www.baidu.com/?kw=1", "task_id": '123'}) 30 | print(await queue.pop()) 31 | 32 | 33 | if 
__name__ == "__main__": 34 | import asyncio 35 | asyncio.run(run()) 36 | 37 | """ 38 | server = Queue() 39 | return PriorityQueue(server=server, spider=spider) 40 | 41 | 42 | spider_queue = memory_queue 43 | -------------------------------------------------------------------------------- /aioscpy/queue/rabbitmq/__init__.py: -------------------------------------------------------------------------------- 1 | from ._queue import spider_priority_queue, priority_queue 2 | 3 | 4 | __all__ = [ 5 | spider_priority_queue, 6 | priority_queue, 7 | ] 8 | -------------------------------------------------------------------------------- /aioscpy/queue/rabbitmq/_queue.py: -------------------------------------------------------------------------------- 1 | import pika 2 | 3 | from aioscpy.queue import BaseQueue 4 | 5 | 6 | class PriorityQueue(BaseQueue): 7 | def qsize(self) -> int: 8 | return self.server.get_waiting_message_count() 9 | 10 | def push(self, request: dict): 11 | data = self._encode_request(request) 12 | score = request.get('priority', 1) 13 | 14 | self.server.basic_publish( 15 | properties=pika.BasicProperties(priority=score), 16 | exchange='', 17 | routing_key=self.key, 18 | body=data 19 | ) 20 | 21 | def on_message(self, ch, method, properties, body): 22 | pass 23 | 24 | def m_pop(self, on_message_callback=None, auto_ack=False): 25 | if not on_message_callback: 26 | on_message_callback = self.on_message 27 | self.server.basic_consume( 28 | on_message_callback=on_message_callback, 29 | queue=self.key, 30 | auto_ack=auto_ack 31 | ) 32 | self.server.start_consuming() 33 | 34 | def pop(self, auto_ack=False): 35 | _method, _, _body = self.server.basic_get(queue=self.key, auto_ack=auto_ack) 36 | if all([isinstance(_body, bytes), _body is not None]): 37 | return _method, self._decode_request(_body) 38 | return None, None 39 | 40 | def finish(self, method): 41 | self.server.basic_ack(delivery_tag=method.delivery_tag) 42 | 43 | 44 | class RabbitMq: 45 | __mq_instance = None 46 | __mq_connection_instance = None 47 | 48 | def __init__(self, *args, **kwargs): 49 | self.args = args 50 | self.kwargs = self.validator(kwargs) 51 | 52 | @staticmethod 53 | def validator(params: dict) -> dict: 54 | params.setdefault('host', '127.0.0.1') 55 | params.setdefault('port', 5672) 56 | params.setdefault('username', 'admin') 57 | params.setdefault('password', 'admin') 58 | params.setdefault('max_priority', 100) 59 | params.setdefault('key', 'rabbitmq:queue') 60 | return params 61 | 62 | @property 63 | def get_channel(self): 64 | if not self.__mq_instance: 65 | connection = pika.BlockingConnection( 66 | pika.ConnectionParameters( 67 | host=self.kwargs['host'], 68 | port=self.kwargs['port'], 69 | credentials=pika.PlainCredentials( 70 | username=self.kwargs['username'], 71 | password=self.kwargs['password'] 72 | ) 73 | ) 74 | ) 75 | channel = connection.channel() 76 | channel.queue_declare( 77 | queue=self.kwargs['key'], 78 | arguments={"x-max-priority": self.kwargs['max_priority']} 79 | ) 80 | self.__mq_instance, self.__mq_connection_instance = channel, connection 81 | return self.__mq_instance 82 | 83 | def close(self): 84 | if self.__mq_instance: 85 | self.__mq_instance.close() 86 | self.__mq_connection_instance.close() 87 | 88 | 89 | def priority_queue(key: str, mq: dict) -> PriorityQueue: 90 | """ 91 | # unit test example 92 | def run(): 93 | queue = rabbitmq_client('message:queue') 94 | for i in range(5): 95 | queue.push({"url": f"https://www.baidu.com/?kw={i}", "task_id": '123'}) 96 | while 1: 97 | method, msg = 
queue.pop() 98 | print(msg) 99 | if not msg: 100 | break 101 | time.sleep(1) 102 | if method: 103 | queue.finish(method) 104 | 105 | run() 106 | 107 | """ 108 | server = RabbitMq(**mq).get_channel 109 | return PriorityQueue(server=server, key=key) 110 | 111 | 112 | spider_priority_queue = priority_queue 113 | -------------------------------------------------------------------------------- /aioscpy/queue/redis/__init__.py: -------------------------------------------------------------------------------- 1 | from ._queue import spider_priority_queue, priority_queue 2 | from ._queue_async import spider_aio_priority_queue, aio_priority_queue 3 | 4 | 5 | __all__ = [ 6 | spider_priority_queue, 7 | priority_queue, 8 | spider_aio_priority_queue, 9 | aio_priority_queue, 10 | ] 11 | -------------------------------------------------------------------------------- /aioscpy/queue/redis/_queue.py: -------------------------------------------------------------------------------- 1 | from redis import ConnectionPool, StrictRedis 2 | 3 | from aioscpy.queue import BaseQueue 4 | 5 | 6 | class PriorityQueue(BaseQueue): 7 | def qsize(self) -> int: 8 | """Return the length of the queue""" 9 | return self.server.zcard(self.key) 10 | 11 | def push(self, request: dict): 12 | data = self._encode_request(request) 13 | score = -request.get('priority', 1) 14 | self.server.zadd(self.key, {data: score}) 15 | 16 | def pop(self, timeout: int = 0) -> dict: 17 | pipe = self.server.pipeline() 18 | pipe.multi() 19 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 20 | results, count = pipe.execute() 21 | if results: 22 | return self._decode_request(results[0]) 23 | 24 | 25 | class Redis: 26 | 27 | __redis_instance = None 28 | 29 | def __init__(self, *args, **kwargs): 30 | self.args = args 31 | self.kwargs = self.validator(kwargs) 32 | 33 | @staticmethod 34 | def validator(params: dict) -> dict: 35 | params.setdefault('host', '127.0.0.1') 36 | params.setdefault('port', 6379) 37 | params.setdefault('db', 1) 38 | params.setdefault('password', 'admin') 39 | return params 40 | 41 | @property 42 | def format_url(self) -> str: 43 | """REDIS_URL = 'redis://:123456@172.16.8.147:6379/1'""" 44 | _format_url = f"redis://:{self.kwargs['password']}@{self.kwargs['host']}:{self.kwargs['port']}/{self.kwargs['db']}"\ 45 | if not self.kwargs.get('redis_url') else self.kwargs['redis_url'] 46 | return _format_url 47 | 48 | @property 49 | def get_redis_pool(self) -> StrictRedis: 50 | if not self.__redis_instance: 51 | pool = ConnectionPool(**self.kwargs) 52 | self.__redis_instance = StrictRedis(connection_pool=pool) 53 | 54 | return self.__redis_instance 55 | 56 | def close(self): 57 | if self.__redis_instance: 58 | self.__redis_instance.close() 59 | 60 | 61 | def priority_queue(key: str, redis_tcp: dict) -> PriorityQueue: 62 | """ 63 | def run(): 64 | queue = redis_client('message:queue') 65 | # queue.push({"url": "https://www.baidu.com/?kw=1", "task_id": '123'}) 66 | print(queue.pop()) 67 | 68 | run() 69 | """ 70 | server = Redis(**redis_tcp).get_redis_pool 71 | return PriorityQueue(server=server, key=key) 72 | 73 | 74 | spider_priority_queue = priority_queue 75 | -------------------------------------------------------------------------------- /aioscpy/queue/redis/_queue_async.py: -------------------------------------------------------------------------------- 1 | from redis.asyncio import Redis, BlockingConnectionPool 2 | 3 | from aioscpy.queue import BaseQueue 4 | 5 | 6 | class PriorityQueue(BaseQueue): 7 | def __init__(self, server, 
spider, key=None, serializer="pickle"): 8 | super().__init__(server, spider, key) 9 | self.serializer = self.__compat__[serializer] 10 | 11 | async def qsize(self) -> int: 12 | return await self.server.zcard(self.key) 13 | 14 | async def push(self, request): 15 | data = self._encode_request(request) 16 | score = -request.get('priority', 1) 17 | await self.server.zadd(self.key, {data: score}) 18 | 19 | async def mpush(self, requests: list): 20 | async with self.server.pipeline() as pipe: 21 | for request in requests: 22 | data = self._encode_request(request) 23 | score = -request.get('priority', 1) 24 | pipe.zadd(self.key, {data: score}) 25 | await pipe.execute() 26 | 27 | async def pop(self, timeout: int = 0, count: int = 0): 28 | async with self.server.pipeline(transaction=True) as pipe: 29 | results, _ = await ( 30 | pipe.zrange(self.key, 0, count) 31 | .zremrangebyrank(self.key, 0, count) 32 | .execute() 33 | ) 34 | _results = [] 35 | for result in results: 36 | _results.append(self._decode_request(result)) 37 | return _results 38 | 39 | 40 | class AsyncRedis: 41 | __redis_instance = None 42 | 43 | def __init__(self, *args, **kwargs): 44 | self.args = args 45 | if not kwargs: 46 | self.kwargs = self.validator(kwargs) 47 | self.kwargs = kwargs 48 | 49 | @staticmethod 50 | def validator(params: dict) -> dict: 51 | params.setdefault('host', '127.0.0.1') 52 | params.setdefault('port', 6379) 53 | params.setdefault('db', 1) 54 | params.setdefault('password', 'admin') 55 | return params 56 | 57 | @property 58 | async def get_redis_pool(self) -> Redis: 59 | if not self.__redis_instance: 60 | url = self.kwargs.pop('url', None) 61 | if url: 62 | connection_pool = BlockingConnectionPool.from_url(url, **self.kwargs) 63 | else: 64 | connection_pool = BlockingConnectionPool(**self.kwargs) 65 | self.__redis_instance = Redis(connection_pool=connection_pool) 66 | return self.__redis_instance 67 | 68 | async def close(self): 69 | if self.__redis_instance: 70 | await self.__redis_instance.close() 71 | 72 | 73 | async def aio_priority_queue(key: str, redis_tcp, spider) -> PriorityQueue: 74 | """ 75 | # unit test example 76 | async def run(): 77 | REDIS_TCP = { 78 | "host": "172.16.7.172", 79 | "port": 6379, 80 | "password": "123456", 81 | "db": 15 82 | } 83 | queue = await aio_priority_queue('message:queue', REDIS_TCP) 84 | # await queue.push({"url": "https://www.baidu.com/?kw=1", "task_id": '123'}) 85 | print(await queue.pop()) 86 | 87 | 88 | if __name__ == "__main__": 89 | import asyncio 90 | asyncio.run(run()) 91 | """ 92 | 93 | if isinstance(redis_tcp, str): 94 | redis_tcp = {'url': redis_tcp} 95 | server = await AsyncRedis(**redis_tcp).get_redis_pool 96 | return PriorityQueue(server=server, spider=spider, key=key, serializer='json') 97 | 98 | 99 | spider_aio_priority_queue = aio_priority_queue 100 | -------------------------------------------------------------------------------- /aioscpy/settings/default_settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = "aioscpy" 2 | 3 | # Concurrency settings 4 | CONCURRENT_REQUESTS = 16 5 | CONCURRENT_REQUESTS_PER_DOMAIN = 8 6 | CONCURRENT_REQUESTS_PER_IP = 0 7 | CONCURRENT_ITEMS = 16 8 | 9 | # Adaptive concurrency settings 10 | ADAPTIVE_CONCURRENCY_ENABLED = False 11 | ADAPTIVE_CONCURRENCY_TARGET_RESPONSE_TIME = 1.0 # seconds 12 | ADAPTIVE_CONCURRENCY_MIN_REQUESTS = 8 13 | ADAPTIVE_CONCURRENCY_MAX_REQUESTS = 32 14 | ADAPTIVE_CONCURRENCY_WINDOW_SIZE = 20 15 | ADAPTIVE_CONCURRENCY_ADJUSTMENT_INTERVAL = 10 # 
seconds 16 | 17 | # Download settings 18 | DOWNLOAD_DELAY = 0 19 | DOWNLOAD_TIMEOUT = 20 20 | RANDOMIZE_DOWNLOAD_DELAY = True 21 | 22 | # Memory optimization settings 23 | GC_ENABLED = True 24 | GC_FREQUENCY = 10 # Run garbage collection every 10 heartbeats 25 | 26 | # Task beat settings 27 | TASK_BEAT_ACTIVE_SLEEP = 0.2 # Sleep when active (seconds) 28 | TASK_BEAT_IDLE_SLEEP = 1.0 # Sleep when idle (seconds) 29 | TASK_BEAT_BATCH_SIZE = 100 # Max requests per batch 30 | 31 | # Handler and scheduler settings 32 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler" 33 | DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.httpx.HttpxDownloadHandler" 34 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.requests.RequestsDownloadHandler" 35 | # SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 36 | SCHEDULER = "aioscpy.core.scheduler.memory.MemoryScheduler" 37 | REQUESTS_SESSION_STATS = False 38 | 39 | SPIDER_IDLE = False 40 | 41 | # LOG CONFIG 42 | LOG_LEVEL = "DEBUG" 43 | LOG_FILE = False 44 | LOG_FILENAME = f"{BOT_NAME}.log" 45 | LOG_ENCODING = "utf-8" 46 | LOG_ROTATION = "1 week" 47 | LOG_RETENTION = "30 days" 48 | 49 | DI_CONFIG = { 50 | "scheduler": f"{SCHEDULER}", 51 | "log_formatter": "aioscpy.logformatter.LogFormatter", 52 | "extension": "aioscpy.middleware.ExtensionManager", 53 | 54 | } 55 | DI_CONFIG_CLS = { 56 | "request": "aioscpy.http.Request", 57 | "response": "aioscpy.http.TextResponse", 58 | "form_request": "aioscpy.http.FormRequest", 59 | "json_request": "aioscpy.http.JsonRequest", 60 | "logger": "aioscpy.utils.log.logger", 61 | "log": "aioscpy.utils.log", 62 | "exceptions": "aioscpy.exceptions", 63 | "tools": "aioscpy.utils.tools", 64 | 'downloader_middleware': 'aioscpy.middleware.DownloaderMiddlewareManager', 65 | "item_processor": "aioscpy.middleware.ItemPipelineManager", 66 | } 67 | DI_CREATE_CLS = { 68 | 'crawler': 'aioscpy.crawler.Crawler', 69 | 'crawler_process': 'aioscpy.crawler.CrawlerProcess', 70 | 'engine': 'aioscpy.core.engine.ExecutionEngine', 71 | 'spider': 'aioscpy.spider.Spider', 72 | 'downloader_handler': f'{DOWNLOAD_HANDLER}', 73 | 'stats': 'aioscpy.libs.statscollectors.MemoryStatsCollector', 74 | 'scraper': 'aioscpy.core.scraper.Scraper', 75 | "downloader": "aioscpy.core.downloader.Downloader", 76 | } 77 | 78 | # message config 79 | # RABBITMQ_TCP = { 80 | # "host": "172.16.8.147", 81 | # # "port": 5672, 82 | # # "username": "admin", 83 | # # "password": "admin", 84 | # # "key": "message:queue", 85 | # # "max_priority": 100 86 | # } 87 | QUEUE_KEY = '%(spider)s:requests' 88 | 89 | # REDIS_TCP = { 90 | # "host": "172.16.7.172", 91 | # "port": 6379, 92 | # "password": "123456", 93 | # "db": 15 94 | # } 95 | # REDIS_URI = "redis://:123456@172.16.7.172:6379/1" 96 | 97 | 98 | EXTENSIONS_BASE = { 99 | 'aioscpy.libs.extensions.corestats.CoreStats': 0, 100 | 'aioscpy.libs.extensions.logstats.LogStats': 0, 101 | 102 | } 103 | 104 | DOWNLOADER_MIDDLEWARES_BASE = { 105 | # Engine side 106 | 'aioscpy.middleware.adaptive_concurrency.AdaptiveConcurrencyMiddleware': 500, 107 | 'aioscpy.libs.downloadermiddlewares.stats.DownloaderStats': 850, 108 | # Downloader side 109 | } 110 | DOWNLOADER_STATS = True 111 | 112 | LOGSTATS_INTERVAL = 60.0 113 | STATS_CLASS = 'aioscpy.libs.statscollectors.MemoryStatsCollector' 114 | STATS_DUMP = True 115 | SCRAPER_SLOT_MAX_ACTIVE_SIZE = 5000000 116 | 117 | TLS_CIPHERS = False 118 | 119 | -------------------------------------------------------------------------------- 
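Not a file from this tree, but a minimal usage sketch of how the defaults in aioscpy/settings/default_settings.py above are normally overridden per spider: `Spider.update_settings` merges `custom_settings` at 'spider' priority, so a crawler can opt into the Redis scheduler and adaptive concurrency without editing the defaults. The spider name, start URL, and Redis URI below are placeholders; the setting keys are the ones defined above.

from aioscpy.spider import Spider


class TunedSpider(Spider):
    name = 'tuned_example'
    custom_settings = {
        # swap the in-memory scheduler for the Redis-backed one
        "SCHEDULER": "aioscpy.core.scheduler.redis.RedisScheduler",
        "REDIS_URI": "redis://:password@127.0.0.1:6379/1",  # placeholder credentials
        "QUEUE_KEY": "%(spider)s:requests",
        # let AdaptiveConcurrencyMiddleware tune CONCURRENT_REQUESTS between these bounds
        "ADAPTIVE_CONCURRENCY_ENABLED": True,
        "ADAPTIVE_CONCURRENCY_MIN_REQUESTS": 8,
        "ADAPTIVE_CONCURRENCY_MAX_REQUESTS": 64,
    }
    start_urls = ['https://quotes.toscrape.com/']

    async def parse(self, response):
        yield {'title': response.xpath('//title/text()').get()}


if __name__ == '__main__':
    TunedSpider().start()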
/aioscpy/signalmanager.py: -------------------------------------------------------------------------------- 1 | from pydispatch import dispatcher 2 | from aioscpy.utils import signal as _signal 3 | 4 | 5 | class SignalManager: 6 | 7 | def __init__(self, sender=dispatcher.Anonymous): 8 | self.sender = sender 9 | 10 | def connect(self, receiver, signal, **kwargs): 11 | """ 12 | Connect a receiver function to a signal. 13 | 14 | The signal can be any object, although Aioscpy comes with some 15 | predefined signals that are documented in the :ref:`topics-signals` 16 | section. 17 | 18 | :param receiver: the function to be connected 19 | :type receiver: callable 20 | 21 | :param signal: the signal to connect to 22 | :type signal: object 23 | """ 24 | kwargs.setdefault('sender', self.sender) 25 | return dispatcher.connect(receiver, signal, **kwargs) 26 | 27 | def disconnect(self, receiver, signal, **kwargs): 28 | """ 29 | Disconnect a receiver function from a signal. This has the 30 | opposite effect of the :meth:`connect` method, and the arguments 31 | are the same. 32 | """ 33 | kwargs.setdefault('sender', self.sender) 34 | return dispatcher.disconnect(receiver, signal, **kwargs) 35 | 36 | async def send_catch_log(self, signal, **kwargs): 37 | """ 38 | Send a signal, catch exceptions and log them. 39 | 40 | The keyword arguments are passed to the signal handlers (connected 41 | through the :meth:`connect` method). 42 | """ 43 | kwargs.setdefault('sender', self.sender) 44 | return await _signal.send_catch_log(signal, **kwargs) 45 | 46 | async def send_catch_log_coroutine(self, signal, **kwargs): 47 | """ 48 | Like :meth:`send_catch_log` but runs the connected handlers 49 | concurrently as asyncio tasks and awaits coroutine handlers. 50 | 51 | Returns once all signal handlers have finished. Send a signal, 52 | catch exceptions and log them. 53 | 54 | The keyword arguments are passed to the signal handlers (connected 55 | through the :meth:`connect` method). 56 | """ 57 | kwargs.setdefault('sender', self.sender) 58 | return await _signal.send_catch_log_coroutine(signal, **kwargs) 59 | 60 | def disconnect_all(self, signal, **kwargs): 61 | """ 62 | Disconnect all receivers from the given signal. 63 | 64 | :param signal: the signal to disconnect from 65 | :type signal: object 66 | """ 67 | kwargs.setdefault('sender', self.sender) 68 | return _signal.disconnect_all(signal, **kwargs) 69 | -------------------------------------------------------------------------------- /aioscpy/signals.py: -------------------------------------------------------------------------------- 1 | """ 2 | Signals used by Aioscpy 3 | 4 | These signals are documented in docs/topics/signals.rst. Please don't add new 5 | signals here without documenting them there.
6 | """ 7 | 8 | engine_started = object() 9 | engine_stopped = object() 10 | spider_opened = object() 11 | spider_idle = object() 12 | spider_closed = object() 13 | spider_error = object() 14 | request_scheduled = object() 15 | request_dropped = object() 16 | request_reached_downloader = object() 17 | request_left_downloader = object() 18 | response_received = object() 19 | response_downloaded = object() 20 | item_scraped = object() 21 | item_dropped = object() 22 | item_error = object() 23 | 24 | # for backward compatibility 25 | stats_spider_opened = spider_opened 26 | stats_spider_closing = spider_closed 27 | stats_spider_closed = spider_closed 28 | 29 | item_passed = item_scraped 30 | 31 | request_received = request_scheduled 32 | -------------------------------------------------------------------------------- /aioscpy/spider.py: -------------------------------------------------------------------------------- 1 | from aioscpy import signals 2 | from aioscpy import call_grace_instance 3 | 4 | 5 | class Spider(object): 6 | name = None 7 | custom_settings = None 8 | 9 | def __init__(self, name=None, **kwargs): 10 | if name is not None: 11 | self.name = name 12 | self.__dict__.update(kwargs) 13 | if not hasattr(self, 'start_urls'): 14 | self.start_urls = [] 15 | 16 | def log(self, message, level='DEBUG', **kw): 17 | self.logger.log(level, message, **kw) 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler, *args, **kwargs): 21 | spider = cls(*args, **kwargs) 22 | spider._set_crawler(crawler) 23 | return spider 24 | 25 | def _set_crawler(self, crawler): 26 | self.crawler = crawler 27 | self.settings = crawler.settings 28 | crawler.signals.connect(self.close, signals.spider_closed) 29 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 30 | 31 | async def start_requests(self): 32 | for url in self.start_urls: 33 | yield self.di.get('request')(url, dont_filter=True) 34 | 35 | async def _parse(self, response, **kwargs): 36 | return self.parse(response) 37 | 38 | async def parse(self, response): 39 | raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined') 40 | 41 | @classmethod 42 | def update_settings(cls, settings): 43 | settings.setdict(cls.custom_settings or {}, priority='spider') 44 | 45 | @staticmethod 46 | def close(spider, reason): 47 | closed = getattr(spider, 'closed', None) 48 | if callable(closed): 49 | return closed(reason) 50 | 51 | @classmethod 52 | def start(cls): 53 | from aioscpy.crawler import CrawlerProcess 54 | from aioscpy.utils.tools import get_project_settings 55 | 56 | process = call_grace_instance(CrawlerProcess, get_project_settings()) 57 | process.crawl(cls) 58 | process.start() 59 | 60 | def spider_idle(self): 61 | if self.settings.get("SPIDER_IDLE", True): 62 | raise self.di.get('exceptions').DontCloseSpider 63 | 64 | def __str__(self): 65 | return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self)) 66 | 67 | __repr__ = __str__ 68 | 69 | 70 | Spider = call_grace_instance('spider', only_instance=True) 71 | -------------------------------------------------------------------------------- /aioscpy/templates/project/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/templates/project/__init__.py -------------------------------------------------------------------------------- /aioscpy/templates/project/aioscpy.cfg: 
-------------------------------------------------------------------------------- 1 | [package_env] 2 | path = ../ 3 | 4 | [settings] 5 | default = settings 6 | 7 | [deploy] 8 | #url = http://localhost:6800/ 9 | project = ${project_name} 10 | -------------------------------------------------------------------------------- /aioscpy/templates/project/middlewares.py.tmpl: -------------------------------------------------------------------------------- 1 | from aioscpy import signals 2 | class ${ProjectName}DownloaderMiddleware: 3 | 4 | @classmethod 5 | def from_crawler(cls, crawler): 6 | # This method is used by Aioscpy to create your middlewares. 7 | s = cls() 8 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 9 | return s 10 | 11 | def process_request(self, request, spider): 12 | # Called for each request that goes through the downloader 13 | # middleware. 14 | 15 | # Must either: 16 | # - return None: continue processing this request 17 | # - or return a Response object 18 | # - or return a Request object 19 | # - or raise IgnoreRequest: process_exception() methods of 20 | # installed downloader middleware will be called 21 | return None 22 | 23 | def process_response(self, request, response, spider): 24 | # Called with the response returned from the downloader. 25 | 26 | # Must either: 27 | # - return a Response object 28 | # - return a Request object 29 | # - or raise IgnoreRequest 30 | return response 31 | 32 | def process_exception(self, request, exception, spider): 33 | # Called when a download handler or a process_request() 34 | # (from other downloader middleware) raises an exception. 35 | 36 | # Must either: 37 | # - return None: continue processing this exception 38 | # - return a Response object: stops process_exception() chain 39 | # - return a Request object: stops process_exception() chain 40 | pass 41 | 42 | def spider_opened(self, spider): 43 | spider.logger.info('Spider opened: %s' % spider.name) 44 | -------------------------------------------------------------------------------- /aioscpy/templates/project/pipelines.py.tmpl: -------------------------------------------------------------------------------- 1 | 2 | class ${ProjectName}Pipeline: 3 | def process_item(self, item, spider): 4 | return item 5 | -------------------------------------------------------------------------------- /aioscpy/templates/project/settings.py.tmpl: -------------------------------------------------------------------------------- 1 | BOT_NAME = '$project_name' 2 | 3 | SPIDER_MODULES = ['spiders'] 4 | NEWSPIDER_MODULE = 'spiders' 5 | 6 | # CONCURRENT_ITEMS = 100 7 | # CONCURRENT_REQUESTS = 16 8 | # CONCURRENT_REQUESTS_PER_DOMAIN = 8 9 | # CONCURRENT_REQUESTS_PER_IP = 0 10 | # RANDOMIZE_DOWNLOAD_DELAY = True 11 | 12 | # DOWNLOAD_DELAY = 0 13 | # DOWNLOAD_TIMEOUT = 20 14 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler" 15 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.httpx.HttpxDownloadHandler" 16 | # SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 17 | # SCHEDULER = "aioscpy.core.scheduler.memory.MemoryScheduler" 18 | # REQUESTS_SESSION_STATS = False 19 | 20 | # SCRAPER_SLOT_MAX_ACTIVE_SIZE = 500000 21 | 22 | 23 | # SPIDER_IDLE = False 24 | 25 | # :LOG CONFIG 26 | # LOG_LEVEL = "DEBUG" 27 | # LOG_FILE = False 28 | # LOG_FILENAME = f"{BOT_NAME}.log" 29 | # LOG_ENCODING = "utf-8" 30 | # LOG_ROTATION = "1 week" 31 | # LOG_RETENTION = "30 days" 32 | 33 | # message config 34 | # RABBITMQ_TCP = { 35 | # "host": "172.16.8.147", 36 | # # "port": 5672, 37 | # #
"username": "admin", 38 | # # "password": "admin", 39 | # # "key": "message:queue", 40 | # # "max_priority": 100 41 | # } 42 | # QUEUE_KEY = '%(spider)s:requests' 43 | 44 | # REDIS_TCP = { 45 | # "host": "172.16.7.172", 46 | # "port": 6379, 47 | # "password": "123456", 48 | # "db": 15 49 | # } 50 | # REDIS_URI = "redis://:123456@172.16.7.172:6379/1" 51 | 52 | 53 | # DOWNLOADER_STATS = True 54 | 55 | # LOGSTATS_INTERVAL = 60.0 56 | # STATS_CLASS = 'aioscpy.libs.statscollectors.MemoryStatsCollector' 57 | # STATS_DUMP = True 58 | 59 | # DOWNLOADER_MIDDLEWARES = { 60 | # '$project_name.middlewares.${ProjectName}DownloaderMiddleware': 543, 61 | # } 62 | 63 | # EXTENSIONS = { 64 | # } 65 | 66 | # ITEM_PIPELINES = { 67 | # '$project_name.pipelines.${ProjectName}Pipeline': 300, 68 | # } 69 | -------------------------------------------------------------------------------- /aioscpy/templates/project/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Aioscpy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /aioscpy/templates/project/start.py.tmpl: -------------------------------------------------------------------------------- 1 | from aioscpy.crawler import call_grace_instance 2 | from aioscpy.utils.tools import get_project_settings 3 | 4 | """start spider method one: 5 | from cegex.baidu import BaiduSpider 6 | from cegex.httpbin import HttpBinSpider 7 | 8 | process = CrawlerProcess() 9 | process.crawl(HttpBinSpider) 10 | process.crawl(BaiduSpider) 11 | process.start() 12 | """ 13 | 14 | 15 | def load_file_to_execute(): 16 | process = call_grace_instance("crawler_process", get_project_settings()) 17 | process.load_spider(path='[spiders path]]', spider_like='[spider name]') 18 | process.start() 19 | 20 | 21 | def load_name_to_execute(): 22 | process = call_grace_instance("crawler_process", get_project_settings()) 23 | process.crawl('[spider name]', path="[spiders path]") 24 | process.start() 25 | -------------------------------------------------------------------------------- /aioscpy/templates/spiders/basic.tmpl: -------------------------------------------------------------------------------- 1 | from aioscpy.spider import Spider 2 | 3 | 4 | class $classname(Spider): 5 | name = '$name' 6 | custom_settings = { 7 | "SPIDER_IDLE": False 8 | } 9 | start_urls = [] 10 | 11 | async def parse(self, response): 12 | item = { 13 | 'hot': '\n'.join(response.xpath('//span[@class="title-content-title"]/text()').extract()), 14 | } 15 | yield item 16 | -------------------------------------------------------------------------------- /aioscpy/templates/spiders/crawl.tmpl: -------------------------------------------------------------------------------- 1 | from aioscpy.spider import Spider 2 | from anti_header import Header 3 | from pprint import pprint, pformat 4 | 5 | 6 | class $classname(Spider): 7 | name = '$name' 8 | custom_settings = { 9 | "SPIDER_IDLE": False 10 | } 11 | start_urls = [] 12 | 13 | async def process_request(self, request): 14 | request.headers = Header(url=request.url, platform='windows', connection=True).random 15 | return request 16 | 17 | async def process_response(self, request, response): 18 | if response.status in [404, 503]: 19 | return request 20 | return response 21 | 22 | async def parse(self, response): 23 | item = { 24 | # 'hot': 
'\n'.join(response.xpath('//span[@class="title-content-title"]/text()').extract()), 25 | } 26 | yield item 27 | 28 | async def process_item(self, item): 29 | pass 30 | # self.logger.info("{item}", **{'item': pformat(item)}) 31 | 32 | 33 | if __name__ == '__main__': 34 | sp = $classname() 35 | sp.start() 36 | -------------------------------------------------------------------------------- /aioscpy/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/utils/__init__.py -------------------------------------------------------------------------------- /aioscpy/utils/common.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import warnings 3 | import os 4 | 5 | from operator import itemgetter 6 | from importlib import import_module 7 | 8 | from aioscpy.settings import BaseSettings 9 | 10 | 11 | def without_none_values(iterable): 12 | """Return a copy of ``iterable`` with all ``None`` entries removed. 13 | 14 | If ``iterable`` is a mapping, return a dictionary where all pairs that have 15 | value ``None`` have been removed. 16 | """ 17 | try: 18 | return {k: v for k, v in iterable.items() if v is not None} 19 | except AttributeError: 20 | return type(iterable)((v for v in iterable if v is not None)) 21 | 22 | 23 | def build_component_list(compdict, custom=None): 24 | """Compose a component list from a { class: order } dictionary.""" 25 | 26 | def _check_components(complist): 27 | if len({c for c in complist}) != len(complist): 28 | raise ValueError(f'Some paths in {complist!r} convert to the same object, ' 29 | 'please update your settings') 30 | 31 | def _map_keys(compdict): 32 | if isinstance(compdict, BaseSettings): 33 | compbs = BaseSettings() 34 | for k, v in compdict.items(): 35 | prio = compdict.getpriority(k) 36 | if compbs.getpriority(k) == prio: 37 | raise ValueError(f'Some paths in {list(compdict.keys())!r} ' 38 | 'convert to the same ' 39 | 'object, please update your settings' 40 | ) 41 | else: 42 | compbs.set(k, v, priority=prio) 43 | return compbs 44 | else: 45 | _check_components(compdict) 46 | return {k: v for k, v in compdict.items()} 47 | 48 | def _validate_values(compdict): 49 | """Fail if a value in the components dict is not a real number or None.""" 50 | for name, value in compdict.items(): 51 | if value is not None and not isinstance(value, numbers.Real): 52 | raise ValueError(f'Invalid value {value} for component {name}, ' 53 | 'please provide a real number or None instead') 54 | 55 | # BEGIN Backward compatibility for old (base, custom) call signature 56 | if isinstance(custom, (list, tuple)): 57 | _check_components(custom) 58 | return type(custom)(c for c in custom) 59 | 60 | if custom is not None: 61 | compdict.update(custom) 62 | # END Backward compatibility 63 | 64 | _validate_values(compdict) 65 | compdict = without_none_values(_map_keys(compdict)) 66 | return [k for k, v in sorted(compdict.items(), key=itemgetter(1))] 67 | 68 | 69 | def arglist_to_dict(arglist): 70 | """Convert a list of arguments like ['arg1=val1', 'arg2=val2', ...] 
to a 71 | dict 72 | """ 73 | return dict(x.split('=', 1) for x in arglist) 74 | 75 | 76 | def inside_project(): 77 | aioscpy_module = os.environ.get('AIOSCPY_SETTINGS_MODULE') 78 | if aioscpy_module is not None: 79 | try: 80 | import_module(aioscpy_module) 81 | except ImportError as exc: 82 | warnings.warn(f"Cannot import aioscpy settings module {aioscpy_module}: {exc}") 83 | else: 84 | return True 85 | return bool(closest_aioscpy_cfg()) 86 | 87 | 88 | def closest_aioscpy_cfg(path='.', prevpath=None): 89 | """Return the path to the closest aioscpy.cfg file by traversing the current 90 | directory and its parents 91 | """ 92 | if path == prevpath: 93 | return '' 94 | path = os.path.abspath(path) 95 | cfgfile = os.path.join(path, 'aioscpy.cfg') 96 | if os.path.exists(cfgfile): 97 | return cfgfile 98 | return closest_aioscpy_cfg(os.path.dirname(path), path) 99 | -------------------------------------------------------------------------------- /aioscpy/utils/curl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import warnings 3 | from shlex import split 4 | from http.cookies import SimpleCookie 5 | from urllib.parse import urlparse 6 | 7 | from w3lib.http import basic_auth_header 8 | 9 | 10 | class CurlParser(argparse.ArgumentParser): 11 | def error(self, message): 12 | error_msg = f'There was an error parsing the curl command: {message}' 13 | raise ValueError(error_msg) 14 | 15 | 16 | curl_parser = CurlParser() 17 | curl_parser.add_argument('url') 18 | curl_parser.add_argument('-H', '--header', dest='headers', action='append') 19 | curl_parser.add_argument('-X', '--request', dest='method') 20 | curl_parser.add_argument('-d', '--data', '--data-raw', dest='data') 21 | curl_parser.add_argument('-u', '--user', dest='auth') 22 | 23 | 24 | safe_to_ignore_arguments = [ 25 | ['--compressed'], 26 | # `--compressed` argument is not safe to ignore, but it's included here 27 | # because the `HttpCompressionMiddleware` is enabled by default 28 | ['-s', '--silent'], 29 | ['-v', '--verbose'], 30 | ['-#', '--progress-bar'] 31 | ] 32 | 33 | for argument in safe_to_ignore_arguments: 34 | curl_parser.add_argument(*argument, action='store_true') 35 | 36 | 37 | def _parse_headers_and_cookies(parsed_args): 38 | headers = [] 39 | cookies = {} 40 | for header in parsed_args.headers or (): 41 | name, val = header.split(':', 1) 42 | name = name.strip() 43 | val = val.strip() 44 | if name.title() == 'Cookie': 45 | for name, morsel in SimpleCookie(val).items(): 46 | cookies[name] = morsel.value 47 | else: 48 | headers.append((name, val)) 49 | 50 | if parsed_args.auth: 51 | user, password = parsed_args.auth.split(':', 1) 52 | headers.append(('Authorization', basic_auth_header(user, password))) 53 | 54 | return headers, cookies 55 | 56 | 57 | def curl_to_request_kwargs(curl_command: str, ignore_unknown_options: bool = True) -> dict: 58 | """Convert a cURL command syntax to Request kwargs. 59 | 60 | :param str curl_command: string containing the curl command 61 | :param bool ignore_unknown_options: If true, only a warning is emitted when 62 | cURL options are unknown. Otherwise 63 | raises an error. 
(default: True) 64 | :return: dictionary of Request kwargs 65 | """ 66 | 67 | curl_args = split(curl_command) 68 | 69 | if curl_args[0] != 'curl': 70 | raise ValueError('A curl command must start with "curl"') 71 | 72 | parsed_args, argv = curl_parser.parse_known_args(curl_args[1:]) 73 | 74 | if argv: 75 | msg = f'Unrecognized options: {", ".join(argv)}' 76 | if ignore_unknown_options: 77 | warnings.warn(msg) 78 | else: 79 | raise ValueError(msg) 80 | 81 | url = parsed_args.url 82 | 83 | # curl automatically prepends 'http' if the scheme is missing, but Request 84 | # needs the scheme to work 85 | parsed_url = urlparse(url) 86 | if not parsed_url.scheme: 87 | url = 'http://' + url 88 | 89 | method = parsed_args.method or 'GET' 90 | 91 | result = {'method': method.upper(), 'url': url} 92 | 93 | headers, cookies = _parse_headers_and_cookies(parsed_args) 94 | 95 | if headers: 96 | result['headers'] = headers 97 | if cookies: 98 | result['cookies'] = cookies 99 | if parsed_args.data: 100 | result['body'] = parsed_args.data 101 | if not parsed_args.method: 102 | # if the "data" is specified but the "method" is not specified, 103 | # the default method is 'POST' 104 | result['method'] = 'POST' 105 | 106 | return result 107 | 108 | 109 | if __name__ == '__main__': 110 | curl_str = """ 111 | curl 'https://quotes.toscrape.com/api/quotes?page=10' \ 112 | -H 'authority: quotes.toscrape.com' \ 113 | -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \ 114 | -H 'accept-language: zh-CN,zh;q=0.9,en;q=0.8' \ 115 | -H 'cache-control: no-cache' \ 116 | -H 'pragma: no-cache' \ 117 | -H 'referer: https://docs.scrapy.org/en/latest/topics/developer-tools.html?highlight=curl' \ 118 | -H 'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"' \ 119 | -H 'sec-ch-ua-mobile: ?0' \ 120 | -H 'sec-ch-ua-platform: "Windows"' \ 121 | -H 'sec-fetch-dest: document' \ 122 | -H 'sec-fetch-mode: navigate' \ 123 | -H 'sec-fetch-site: cross-site' \ 124 | -H 'sec-fetch-user: ?1' \ 125 | -H 'upgrade-insecure-requests: 1' \ 126 | -H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36' \ 127 | --compressed 128 | """ 129 | 130 | res = curl_to_request_kwargs(curl_str) 131 | print(res) 132 | -------------------------------------------------------------------------------- /aioscpy/utils/log.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | 3 | import sys 4 | import socket 5 | import warnings 6 | import aioscpy 7 | 8 | from loguru import logger 9 | 10 | from aioscpy.exceptions import AioscpyDeprecationWarning 11 | # from aioscpy.settings import Settings 12 | from aioscpy.utils.tools import get_project_settings 13 | 14 | 15 | def set_log_config(formatter: str, settings): 16 | _log_config = { 17 | "default": { 18 | "handlers": [ 19 | { 20 | "sink": sys.stdout, 21 | "format": formatter, 22 | "level": settings.get('LOG_LEVEL', "TRACE") 23 | } 24 | ], 25 | "extra": { 26 | "host": socket.gethostbyname(socket.gethostname()), 27 | 'log_name': settings.get("BOT_NAME", 'default'), 28 | 'type': 'None' 29 | }, 30 | "levels": [ 31 | dict(name="TRACE", icon="✏️", color=""), 32 | dict(name="DEBUG", icon="❄️", color=""), 33 | dict(name="INFO", icon="♻️", color=""), 34 | dict(name="SUCCESS", icon="✔️", color=""), 35 | dict(name="WARNING", icon="⚠️", color=""), 36 
| dict(name="ERROR", icon="❌️", color=""), 37 | dict(name="CRITICAL", icon="☠️", color=""), 38 | ] 39 | } 40 | } 41 | if settings.get('LOG_FILE', False): 42 | _log_config['default']['handlers'].append({ 43 | "sink": settings.get('LOG_FILENAME', __file__), 44 | "format": formatter, 45 | "level": settings.get('LOG_LEVEL', "DEBUG"), 46 | "rotation": settings.get("LOG_ROTATION", '1 week'), 47 | "retention": settings.get("LOG_RETENTION", '30 days'), 48 | 'encoding': settings.get("LOG_ENCODING", "utf-8") 49 | }) 50 | return _log_config 51 | 52 | 53 | class LogFormatter(object): 54 | simple_formatter = '{time:YYYY-MM-DD HH:mm:ss} ' \ 55 | '[{name}] ' \ 56 | '{level.icon}{level}: ' \ 57 | '{message} ' 58 | 59 | default_formatter = '{time:YYYY-MM-DD HH:mm:ss,SSS} | ' \ 60 | '[{extra[log_name]}] {module}:{name}:{function}:{line} | ' \ 61 | '{extra[host]} | ' \ 62 | '{level.icon}{level: <5} | ' \ 63 | '{level.no} | ' \ 64 | '{extra[type]} | ' \ 65 | '{message} ' 66 | 67 | kafka_formatter = '{time:YYYY-MM-DD HH:mm:ss,SSS}| ' \ 68 | '[{extra[log_name]}] {module}:{name}:{function}:{line} | ' \ 69 | '{extra[host]} | ' \ 70 | '{process} | ' \ 71 | '{thread} | ' \ 72 | '{level: <5} | ' \ 73 | '{level.no} | ' \ 74 | '{extra[type]}| ' \ 75 | '{message} ' 76 | 77 | @classmethod 78 | def setter_log_handler(cls, log, callback=None): 79 | assert callable(callback), 'callback must be a callable object' 80 | log.add(callback, format=cls.kafka_formatter) 81 | 82 | @classmethod 83 | def get_logger(cls, log, name=None): 84 | settings = get_project_settings() 85 | log_config = set_log_config(cls.simple_formatter, settings) 86 | config = log_config.pop('default', {}) 87 | if name: 88 | config['extra']['log_name'] = name 89 | log.configure(**config) 90 | return log 91 | 92 | @staticmethod 93 | def format(spider, meta): 94 | if hasattr(spider, 'logging_keys'): 95 | logging_txt = [] 96 | for key in spider.logging_keys: 97 | if meta.get(key, None) is not None: 98 | logging_txt.append(u'{0}:{1} '.format(key, meta[key])) 99 | logging_txt.append('successfully') 100 | return ' '.join(logging_txt) 101 | 102 | 103 | def logformatter_adapter(logkws): 104 | if not {'level', 'msg', 'args'} <= set(logkws): 105 | warnings.warn('Missing keys in LogFormatter method', 106 | AioscpyDeprecationWarning) 107 | 108 | if 'format' in logkws: 109 | warnings.warn('`format` key in LogFormatter methods has been ' 110 | 'deprecated, use `msg` instead', 111 | AioscpyDeprecationWarning) 112 | 113 | level = logkws.get('level', 'INFO') 114 | message = logkws.get('format', logkws.get('msg')) 115 | args = logkws if not logkws.get('args') else logkws['args'] 116 | 117 | return level, message, args 118 | 119 | 120 | def std_log_aioscpy_info(settings): 121 | from pprint import pprint, pformat 122 | 123 | icon = """ 124 | (_) 125 | __ _ _ ___ ___ ___ _ __ _ _ 126 | / _` | |/ _ \/ __|/ __| '_ \| | | | 127 | | (_| | | (_) \__ \ (__| |_) | |_| | 128 | \__,_|_|\___/|___/\___| .__/ \__, | 129 | | | __/ | 130 | |_| |___/ 131 | """ 132 | logger.info("{item}", **{'item': icon}) 133 | logger.info("aioscpy {version} started (bot: {bot})", 134 | **{'version': aioscpy.__version__, 'bot': settings['BOT_NAME']}) 135 | 136 | 137 | lof = LogFormatter 138 | 139 | logger = lof.get_logger(logger) 140 | -------------------------------------------------------------------------------- /aioscpy/utils/ossignal.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | 4 | signal_names = {} 5 | for signame in dir(signal): 6 | if 
signame.startswith('SIG') and not signame.startswith('SIG_'): 7 | signum = getattr(signal, signame) 8 | if isinstance(signum, int): 9 | signal_names[signum] = signame 10 | 11 | 12 | def install_shutdown_handlers(function, override_sigint=True): 13 | """Install the given function as a signal handler for all common shutdown 14 | signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the 15 | SIGINT handler won't be install if there is already a handler in place 16 | (e.g. Pdb) 17 | """ 18 | signal.signal(signal.SIGTERM, function) 19 | if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint: 20 | signal.signal(signal.SIGINT, function) 21 | # Catch Ctrl-Break in windows 22 | if hasattr(signal, 'SIGBREAK'): 23 | signal.signal(signal.SIGBREAK, function) 24 | -------------------------------------------------------------------------------- /aioscpy/utils/othtypes.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import weakref 3 | 4 | from collections.abc import Mapping 5 | from typing import Union 6 | from urllib.parse import urlparse, ParseResult 7 | from weakref import WeakKeyDictionary 8 | 9 | from aioscpy.http import Request, Response 10 | 11 | 12 | _urlparse_cache: "WeakKeyDictionary[Union[Request, Response], ParseResult]" = WeakKeyDictionary() 13 | 14 | 15 | def urlparse_cached(request_or_response: Union[Request, Response]) -> ParseResult: 16 | """Return urlparse.urlparse caching the result, where the argument can be a 17 | Request or Response object 18 | """ 19 | if request_or_response not in _urlparse_cache: 20 | _urlparse_cache[request_or_response] = urlparse(request_or_response.url) 21 | return _urlparse_cache[request_or_response] 22 | 23 | 24 | class CaselessDict(dict): 25 | 26 | __slots__ = () 27 | 28 | def __init__(self, seq=None): 29 | super().__init__() 30 | if seq: 31 | self.update(seq) 32 | 33 | def __getitem__(self, key): 34 | return dict.__getitem__(self, self.normkey(key)) 35 | 36 | def __setitem__(self, key, value): 37 | dict.__setitem__(self, self.normkey(key), self.normvalue(value)) 38 | 39 | def __delitem__(self, key): 40 | dict.__delitem__(self, self.normkey(key)) 41 | 42 | def __contains__(self, key): 43 | return dict.__contains__(self, self.normkey(key)) 44 | has_key = __contains__ 45 | 46 | def __copy__(self): 47 | return self.__class__(self) 48 | copy = __copy__ 49 | 50 | def normkey(self, key): 51 | """Method to normalize dictionary key access""" 52 | return key.lower() 53 | 54 | def normvalue(self, value): 55 | """Method to normalize values prior to be set""" 56 | return value 57 | 58 | def get(self, key, def_val=None): 59 | return dict.get(self, self.normkey(key), self.normvalue(def_val)) 60 | 61 | def setdefault(self, key, def_val=None): 62 | return dict.setdefault(self, self.normkey(key), self.normvalue(def_val)) 63 | 64 | def update(self, seq): 65 | seq = seq.items() if isinstance(seq, Mapping) else seq 66 | iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq) 67 | super().update(iseq) 68 | 69 | @classmethod 70 | def fromkeys(cls, keys, value=None): 71 | return cls((k, value) for k in keys) 72 | 73 | def pop(self, key, *args): 74 | return dict.pop(self, self.normkey(key), *args) 75 | 76 | 77 | class LocalCache(collections.OrderedDict): 78 | """Dictionary with a finite number of keys. 79 | 80 | Older items expires first. 
81 | """ 82 | 83 | def __init__(self, limit=None): 84 | super().__init__() 85 | self.limit = limit 86 | 87 | def __setitem__(self, key, value): 88 | if self.limit: 89 | while len(self) >= self.limit: 90 | self.popitem(last=False) 91 | super().__setitem__(key, value) 92 | 93 | 94 | class LocalWeakReferencedCache(weakref.WeakKeyDictionary): 95 | """ 96 | A weakref.WeakKeyDictionary implementation that uses LocalCache as its 97 | underlying data structure, making it ordered and capable of being size-limited. 98 | 99 | Useful for memoization, while avoiding keeping received 100 | arguments in memory only because of the cached references. 101 | 102 | Note: like LocalCache and unlike weakref.WeakKeyDictionary, 103 | it cannot be instantiated with an initial dictionary. 104 | """ 105 | 106 | def __init__(self, limit=None): 107 | super().__init__() 108 | self.data = LocalCache(limit=limit) 109 | 110 | def __setitem__(self, key, value): 111 | try: 112 | super().__setitem__(key, value) 113 | except TypeError: 114 | pass # key is not weak-referenceable, skip caching 115 | 116 | def __getitem__(self, key): 117 | try: 118 | return super().__getitem__(key) 119 | except (TypeError, KeyError): 120 | return None # key is either not weak-referenceable or not cached 121 | 122 | 123 | class SequenceExclude: 124 | """Object to test if an item is NOT within some sequence.""" 125 | 126 | def __init__(self, seq): 127 | self.seq = seq 128 | 129 | def __contains__(self, item): 130 | return item not in self.seq 131 | 132 | 133 | dnscache = LocalCache(10000) 134 | -------------------------------------------------------------------------------- /aioscpy/utils/signal.py: -------------------------------------------------------------------------------- 1 | """Helper functions for working with signals""" 2 | import asyncio 3 | 4 | from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers 5 | from pydispatch.robustapply import robustApply 6 | from aioscpy.exceptions import StopDownload 7 | from aioscpy.utils.log import logger 8 | 9 | 10 | class _IgnoredException(Exception): 11 | pass 12 | 13 | 14 | async def robustApplyWrap(f, recv, *args, **kw): 15 | dont_log = kw.pop('dont_log', None) 16 | spider = kw.get('spider', None) 17 | try: 18 | result = f(recv, *args, **kw) 19 | if asyncio.iscoroutine(result): 20 | return await result 21 | except (Exception, BaseException) as exc: # noqa: E722 22 | if dont_log is None or not isinstance(exc, dont_log): 23 | logger.error("Error caught on signal handler: {receiver}", 24 | **{'receiver': recv}, 25 | exc_info=exc, 26 | extra={'spider': spider}) 27 | return exc 28 | 29 | 30 | async def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named): 31 | """Like pydispatcher.robust.sendRobust but it also logs errors and returns 32 | Failures instead of exceptions. 33 | """ 34 | named['dont_log'] = (named.pop('dont_log', _IgnoredException), StopDownload) 35 | responses = [] 36 | for receiver in liveReceivers(getAllReceivers(sender, signal)): 37 | result = await robustApplyWrap(robustApply, receiver, signal=signal, sender=sender, *arguments, **named) 38 | responses.append((receiver, result)) 39 | return responses 40 | 41 | 42 | async def send_catch_log_coroutine(signal=Any, sender=Anonymous, *arguments, **named): 43 | """Like send_catch_log but supports returning deferreds on signal handlers. 44 | Returns a deferred that gets fired once all signal handlers deferreds were 45 | fired. 
46 | """ 47 | dfds = [] 48 | for receiver in liveReceivers(getAllReceivers(sender, signal)): 49 | dfds.append(asyncio.create_task( 50 | robustApplyWrap(robustApply, receiver, signal=signal, sender=sender, *arguments, **named))) 51 | res = await asyncio.gather(*dfds) 52 | return res 53 | 54 | 55 | def disconnect_all(signal=Any, sender=Any): 56 | """Disconnect all signal handlers. Useful for cleaning up after running 57 | tests 58 | """ 59 | for receiver in liveReceivers(getAllReceivers(sender, signal)): 60 | disconnect(receiver, signal=signal, sender=sender) 61 | 62 | -------------------------------------------------------------------------------- /aioscpy/utils/template.py: -------------------------------------------------------------------------------- 1 | """Helper functions for working with templates_bak1""" 2 | 3 | import os 4 | import re 5 | import string 6 | 7 | 8 | def render_templatefile(path, **kwargs): 9 | with open(path, 'rb') as fp: 10 | raw = fp.read().decode('utf8') 11 | 12 | content = string.Template(raw).substitute(**kwargs) 13 | 14 | render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path 15 | 16 | if path.endswith('.tmpl'): 17 | os.rename(path, render_path) 18 | 19 | with open(render_path, 'wb') as fp: 20 | fp.write(content.encode('utf8')) 21 | 22 | 23 | CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]') 24 | 25 | 26 | def string_camelcase(string): 27 | """ Convert a word to its CamelCase version and remove invalid chars 28 | 29 | >>> string_camelcase('lost-pound') 30 | 'LostPound' 31 | 32 | >>> string_camelcase('missing_images') 33 | 'MissingImages' 34 | 35 | """ 36 | return CAMELCASE_INVALID_CHARS.sub('', string.title()) 37 | -------------------------------------------------------------------------------- /cegex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/cegex/__init__.py -------------------------------------------------------------------------------- /cegex/baidu.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.spider import Spider 4 | from anti_header import Header 5 | from pprint import pprint, pformat 6 | 7 | 8 | class BaiduSpider(Spider): 9 | name = 'baidu' 10 | custom_settings = { 11 | "SPIDER_IDLE": False, 12 | 'TLS_CIPHERS': True, 13 | "DOWNLOAD_HANDLER": "aioscpy.core.downloader.handlers.requests.RequestsDownloadHandler" 14 | } 15 | start_urls = [f'https://www.baidu.com/?a{i}' for i in range(10)] 16 | 17 | async def process_request(self, request): 18 | request.headers = Header(url=request.url, platform='windows', connection=True).random 19 | return request 20 | 21 | async def process_response(self, request, response): 22 | return response 23 | 24 | async def process_exception(self, request, exc): 25 | raise exc 26 | 27 | async def parse(self, response): 28 | item = { 29 | 'hot': '\n'.join(response.xpath('//span[@class="title-content-title"]/text()').extract()), 30 | } 31 | yield item 32 | 33 | async def process_item(self, item): 34 | pass 35 | # self.logger.info("{item}", **{'item': pformat(item)}) 36 | 37 | 38 | if __name__ == '__main__': 39 | baidu = BaiduSpider() 40 | baidu.start() 41 | -------------------------------------------------------------------------------- /cegex/httpbin.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.spider import Spider 4 
| 5 | 6 | class HttpBinSpider(Spider): 7 | name = 'httpbin' 8 | custom_settings = { 9 | 'CONCURRENT_REQUESTS': 10 10 | } 11 | start_urls = [f'http://httpbin.org/get?a{i}' for i in range(20)] 12 | 13 | async def parse(self, response): 14 | item = await response.json 15 | await asyncio.sleep(2) 16 | yield item 17 | 18 | async def process_item(self, item): 19 | pass 20 | # self.logger.info(item) 21 | 22 | 23 | if __name__ == '__main__': 24 | q = HttpBinSpider() 25 | q.start() 26 | -------------------------------------------------------------------------------- /cegex/httpbin_post.py: -------------------------------------------------------------------------------- 1 | from aioscpy.spider import Spider 2 | from aioscpy import call_grace_instance 3 | from aioscpy.http import FormRequest 4 | 5 | 6 | class HttpBinPostSpider(Spider): 7 | name = 'httpbin_post' 8 | custom_settings = { 9 | 'CONCURRENT_REQUESTS': 10 10 | } 11 | 12 | start_urls = ['http://httpbin.org/post' for _ in range(20)] 13 | 14 | async def start_requests(self): 15 | """ 16 | : request usage description: 17 | : data = body 18 | [header]: Content-Type: application/x-www-form-urlencoded 19 | [method]: POST 20 | [body]: 21 | { 22 | 'a': 1, 23 | 'b': 2 24 | } 25 | # supported special scenarios about json request 26 | : json = body 27 | [header]: Content-Type: application/json 28 | [method]: POST 29 | [body]: { 30 | 'a': 1, 31 | 'b': 2 32 | } 33 | """ 34 | for url in self.start_urls: 35 | yield call_grace_instance( 36 | FormRequest, 37 | # self.di.get('form_request'), 38 | # self.di.get('json_request'), 39 | url, 40 | method='POST', 41 | formdata={"b": '11'} 42 | ) 43 | 44 | async def parse(self, response): 45 | item = await response.json 46 | yield item 47 | 48 | async def process_item(self, item): 49 | self.logger.info(item) 50 | 51 | 52 | if __name__ == '__main__': 53 | q = HttpBinPostSpider() 54 | q.start() 55 | -------------------------------------------------------------------------------- /cegex/ja3.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.spider import Spider 4 | from anti_header import Header 5 | from pprint import pprint, pformat 6 | 7 | 8 | class Ja3Spider(Spider): 9 | name = 'ja3' 10 | custom_settings = { 11 | "SPIDER_IDLE": False, 12 | 'TLS_CIPHERS': True, 13 | "DOWNLOAD_HANDLER": "aioscpy.core.downloader.handlers.requests.AiohttpDownloadHandler" 14 | } 15 | start_urls = [f'https://tls.browserleaks.com/json?a{i}' for i in range(10)] 16 | 17 | async def process_request(self, request): 18 | request.headers = Header(url=request.url, platform='windows', connection=True).random 19 | return request 20 | 21 | async def process_response(self, request, response): 22 | return response 23 | 24 | async def process_exception(self, request, exc): 25 | raise exc 26 | 27 | async def parse(self, response): 28 | _ja = await response.json 29 | item = { 30 | 'ja3': _ja['ja3_hash'], 31 | } 32 | yield item 33 | 34 | async def process_item(self, item): 35 | pass 36 | # self.logger.info("{item}", **{'item': pformat(item)}) 37 | 38 | 39 | if __name__ == '__main__': 40 | ja3 = Ja3Spider() 41 | ja3.start() 42 | -------------------------------------------------------------------------------- /doc/README_ZH.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ![aioscpy](./images/aioscpy.png) 4 | 5 | # Aioscpy 6 | 7 | 一个强大的、高性能的异步Web爬取和抓取框架,基于Python的asyncio生态系统构建。 8 | 9 | [英文](../README.md) | 中文 10 | 11 | ## 概述 12 | 13 | 
Aioscpy是一个快速的高级web爬行和web抓取框架,用于抓取网站并从其页面提取结构化数据。它受到Scrapy和scrapy_redis的启发,但从头开始设计,充分利用异步编程的全部功能。 14 | 15 | ### 主要特点 16 | 17 | - **完全异步**:基于Python的asyncio,实现高性能并发操作 18 | - **Scrapy风格的API**:为来自Scrapy的用户提供熟悉的API 19 | - **分布式爬取**:支持使用Redis进行分布式爬取 20 | - **多种HTTP后端**:支持aiohttp、httpx和requests 21 | - **动态变量注入**:强大的依赖注入系统 22 | - **灵活的中间件系统**:可定制的请求/响应处理管道 23 | - **强大的数据处理**:用于处理爬取数据的管道 24 | 25 | ## 系统要求 26 | 27 | - Python 3.8+ 28 | - 支持Linux、Windows、macOS、BSD 29 | 30 | ## 安装 31 | 32 | ### 基本安装 33 | 34 | ```shell 35 | pip install aioscpy 36 | ``` 37 | 38 | ### 安装所有依赖 39 | 40 | ```shell 41 | pip install aioscpy[all] 42 | ``` 43 | 44 | ### 安装特定HTTP后端 45 | 46 | ```shell 47 | pip install aioscpy[aiohttp,httpx] 48 | ``` 49 | 50 | ### 从最新版本安装 51 | 52 | ```shell 53 | pip install git+https://github.com/ihandmine/aioscpy 54 | ``` 55 | 56 | ## 快速开始 57 | 58 | ### 创建新项目 59 | 60 | ```shell 61 | aioscpy startproject myproject 62 | cd myproject 63 | ``` 64 | 65 | ### 创建爬虫 66 | 67 | ```shell 68 | aioscpy genspider myspider 69 | ``` 70 | 71 | 这将在`spiders`目录中创建一个基本爬虫。 72 | 73 | ![tree](./images/tree.png) 74 | 75 | ### 示例爬虫 76 | 77 | ```python 78 | from aioscpy.spider import Spider 79 | 80 | 81 | class QuotesSpider(Spider): 82 | name = 'quotes' 83 | custom_settings = { 84 | "SPIDER_IDLE": False 85 | } 86 | start_urls = [ 87 | 'https://quotes.toscrape.com/tag/humor/', 88 | ] 89 | 90 | async def parse(self, response): 91 | for quote in response.css('div.quote'): 92 | yield { 93 | 'author': quote.xpath('span/small/text()').get(), 94 | 'text': quote.css('span.text::text').get(), 95 | } 96 | 97 | next_page = response.css('li.next a::attr("href")').get() 98 | if next_page is not None: 99 | yield response.follow(next_page, self.parse) 100 | ``` 101 | 102 | ### 创建单个爬虫脚本 103 | 104 | ```shell 105 | aioscpy onespider single_quotes 106 | ``` 107 | 108 | ### 高级爬虫示例 109 | 110 | ```python 111 | from aioscpy.spider import Spider 112 | from anti_header import Header 113 | from pprint import pprint, pformat 114 | 115 | 116 | class SingleQuotesSpider(Spider): 117 | name = 'single_quotes' 118 | custom_settings = { 119 | "SPIDER_IDLE": False 120 | } 121 | start_urls = [ 122 | 'https://quotes.toscrape.com/', 123 | ] 124 | 125 | async def process_request(self, request): 126 | request.headers = Header(url=request.url, platform='windows', connection=True).random 127 | return request 128 | 129 | async def process_response(self, request, response): 130 | if response.status in [404, 503]: 131 | return request 132 | return response 133 | 134 | async def process_exception(self, request, exc): 135 | raise exc 136 | 137 | async def parse(self, response): 138 | for quote in response.css('div.quote'): 139 | yield { 140 | 'author': quote.xpath('span/small/text()').get(), 141 | 'text': quote.css('span.text::text').get(), 142 | } 143 | 144 | next_page = response.css('li.next a::attr("href")').get() 145 | if next_page is not None: 146 | yield response.follow(next_page, callback=self.parse) 147 | 148 | async def process_item(self, item): 149 | self.logger.info("{item}", **{'item': pformat(item)}) 150 | 151 | 152 | if __name__ == '__main__': 153 | quotes = SingleQuotesSpider() 154 | quotes.start() 155 | ``` 156 | 157 | ### 运行爬虫 158 | 159 | ```shell 160 | # 从项目中运行爬虫 161 | aioscpy crawl quotes 162 | 163 | # 运行单个爬虫脚本 164 | aioscpy runspider quotes.py 165 | ``` 166 | 167 | ![run](./images/run.png) 168 | 169 | ### 从代码中运行 170 | 171 | ```python 172 | from aioscpy.crawler import call_grace_instance 173 | from aioscpy.utils.tools import get_project_settings 174 | 
175 | # 方法1:从目录中加载所有爬虫 176 | def load_spiders_from_directory(): 177 | process = call_grace_instance("crawler_process", get_project_settings()) 178 | process.load_spider(path='./spiders') 179 | process.start() 180 | 181 | # 方法2:按名称运行特定爬虫 182 | def run_specific_spider(): 183 | process = call_grace_instance("crawler_process", get_project_settings()) 184 | process.crawl('myspider') 185 | process.start() 186 | 187 | if __name__ == '__main__': 188 | run_specific_spider() 189 | ``` 190 | 191 | ## 配置 192 | 193 | Aioscpy可以通过项目中的`settings.py`文件进行配置。以下是最重要的设置: 194 | 195 | ### 并发设置 196 | 197 | ```python 198 | # 最大并发处理项目数 199 | CONCURRENT_ITEMS = 100 200 | 201 | # 最大并发请求数 202 | CONCURRENT_REQUESTS = 16 203 | 204 | # 每个域名的最大并发请求数 205 | CONCURRENT_REQUESTS_PER_DOMAIN = 8 206 | 207 | # 每个IP的最大并发请求数 208 | CONCURRENT_REQUESTS_PER_IP = 0 209 | ``` 210 | 211 | ### 下载设置 212 | 213 | ```python 214 | # 请求间的延迟(秒) 215 | DOWNLOAD_DELAY = 0 216 | 217 | # 请求超时时间(秒) 218 | DOWNLOAD_TIMEOUT = 20 219 | 220 | # 是否随机化下载延迟 221 | RANDOMIZE_DOWNLOAD_DELAY = True 222 | 223 | # 使用的HTTP后端 224 | DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.httpx.HttpxDownloadHandler" 225 | # 其他选项: 226 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler" 227 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.requests.RequestsDownloadHandler" 228 | ``` 229 | 230 | ### 调度器设置 231 | 232 | ```python 233 | # 使用的调度器(基于内存或Redis) 234 | SCHEDULER = "aioscpy.core.scheduler.memory.MemoryScheduler" 235 | # 分布式爬取: 236 | # SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 237 | 238 | # Redis连接设置(用于Redis调度器) 239 | REDIS_URI = "redis://localhost:6379" 240 | QUEUE_KEY = "%(spider)s:queue" 241 | ``` 242 | 243 | ## 响应API 244 | 245 | Aioscpy提供了丰富的API来处理响应: 246 | 247 | ### 提取数据 248 | 249 | ```python 250 | # 使用CSS选择器 251 | title = response.css('title::text').get() 252 | all_links = response.css('a::attr(href)').getall() 253 | 254 | # 使用XPath 255 | title = response.xpath('//title/text()').get() 256 | all_links = response.xpath('//a/@href').getall() 257 | ``` 258 | 259 | ### 跟踪链接 260 | 261 | ```python 262 | # 跟踪链接 263 | yield response.follow('next-page.html', self.parse) 264 | 265 | # 使用回调跟踪链接 266 | yield response.follow('details.html', self.parse_details) 267 | 268 | # 跟踪所有匹配的CSS选择器的链接 269 | yield from response.follow_all(css='a.product::attr(href)', callback=self.parse_product) 270 | ``` 271 | 272 | ## 更多命令 273 | 274 | ```shell 275 | aioscpy -h 276 | ``` 277 | 278 | ## 分布式爬取 279 | 280 | 要启用基于Redis的分布式爬取: 281 | 282 | 1. 在设置中配置Redis: 283 | 284 | ```python 285 | SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 286 | REDIS_URI = "redis://localhost:6379" 287 | QUEUE_KEY = "%(spider)s:queue" 288 | ``` 289 | 290 | 2. 
在不同的机器上运行多个爬虫实例,全部连接到同一个Redis服务器。 291 | 292 | ## 贡献 293 | 294 | 请通过创建issue向项目所有者提交您的建议。 295 | 296 | ## 感谢 297 | 298 | [aiohttp](https://github.com/aio-libs/aiohttp/) 299 | 300 | [scrapy](https://github.com/scrapy/scrapy) 301 | 302 | [loguru](https://github.com/Delgan/loguru) 303 | 304 | [httpx](https://github.com/encode/httpx) 305 | -------------------------------------------------------------------------------- /doc/images/aioscpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/doc/images/aioscpy.png -------------------------------------------------------------------------------- /doc/images/run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/doc/images/run.png -------------------------------------------------------------------------------- /doc/images/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/doc/images/tree.png -------------------------------------------------------------------------------- /example/project_quotes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/example/project_quotes/__init__.py -------------------------------------------------------------------------------- /example/project_quotes/aioscpy.cfg: -------------------------------------------------------------------------------- 1 | 2 | [settings] 3 | default = settings 4 | 5 | [deploy] 6 | #url = http://localhost:6800/ 7 | project = project_quotes 8 | -------------------------------------------------------------------------------- /example/project_quotes/middlewares.py: -------------------------------------------------------------------------------- 1 | 2 | class ProjectQuotesDownloaderMiddleware: 3 | 4 | @classmethod 5 | def from_crawler(cls, crawler): 6 | # This method is used by Aioscpy to create your spiders. 7 | s = cls() 8 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 9 | return s 10 | 11 | def process_request(self, request, spider): 12 | # Called for each request that goes through the downloader 13 | # middleware. 14 | 15 | # Must either: 16 | # - return None: continue processing this request 17 | # - or return a Response object 18 | # - or return a Request object 19 | # - or raise IgnoreRequest: process_exception() methods of 20 | # installed downloader middleware will be called 21 | return None 22 | 23 | def process_response(self, request, response, spider): 24 | # Called with the response returned from the downloader. 25 | 26 | # Must either; 27 | # - return a Response object 28 | # - return a Request object 29 | # - or raise IgnoreRequest 30 | return response 31 | 32 | def process_exception(self, request, exception, spider): 33 | # Called when a download handler or a process_request() 34 | # (from other downloader middleware) raises an exception. 
35 | 36 | # Must either: 37 | # - return None: continue processing this exception 38 | # - return a Response object: stops process_exception() chain 39 | # - return a Request object: stops process_exception() chain 40 | pass 41 | 42 | def spider_opened(self, spider): 43 | spider.logger.info('Spider opened: %s' % spider.name) 44 | -------------------------------------------------------------------------------- /example/project_quotes/pipelines.py: -------------------------------------------------------------------------------- 1 | 2 | class ProjectQuotesPipeline: 3 | def process_item(self, item, spider): 4 | return item 5 | -------------------------------------------------------------------------------- /example/project_quotes/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = 'project_quotes' 2 | 3 | SPIDER_MODULES = ['spiders'] 4 | NEWSPIDER_MODULE = 'spiders' 5 | 6 | # CONCURRENT_REQUESTS = 16 7 | # CONCURRENT_REQUESTS_PER_DOMAIN = 8 8 | # CONCURRENT_REQUESTS_PER_IP = 0 9 | # RANDOMIZE_DOWNLOAD_DELAY = True 10 | 11 | # DOWNLOAD_DELAY = 0 12 | # DOWNLOAD_TIMEOUT = 20 13 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.http.AioHttpDownloadHandler" 14 | # SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 15 | # SCHEDULER = "aioscpy.core.scheduler.memory.MemoryScheduler" 16 | 17 | 18 | # SPIDER_IDLE = False 19 | 20 | # :LOG CONFIG 21 | # LOG_LEVEL = "DEBUG" 22 | # LOG_FILE = False 23 | # LOG_FILENAME = f"{BOT_NAME}.log" 24 | # LOG_ENCODING = "utf-8" 25 | # LOG_ROTATION = "1 week" 26 | # LOG_RETENTION = "30 days" 27 | 28 | # message config 29 | # RABBITMQ_TCP = { 30 | # "host": "172.16.8.147", 31 | # # "port": 5672, 32 | # # "username": "admin", 33 | # # "password": "admin", 34 | # # "key": "message:queue", 35 | # # "max_priority": 100 36 | # } 37 | # QUEUE_KEY = '%(spider)s:requests' 38 | 39 | # REDIS_TCP = { 40 | # "host": "172.16.7.172", 41 | # "port": 6379, 42 | # "password": "123456", 43 | # "db": 15 44 | # } 45 | # REDIS_URI = "redis://:123456@172.16.7.172:6379/1" 46 | 47 | 48 | # DOWNLOADER_STATS = True 49 | 50 | # LOGSTATS_INTERVAL = 60.0 51 | # STATS_CLASS = 'aioscpy.libs.statscollectors.MemoryStatsCollector' 52 | # STATS_DUMP = True 53 | 54 | # DOWNLOADER_MIDDLEWARES = { 55 | # 'project_quotes.middlewares.ProjectQuotesDownloaderMiddleware': 543, 56 | # } 57 | 58 | # EXTENSIONS = { 59 | # } 60 | 61 | # ITEM_PIPELINES = { 62 | # 'project_quotes.pipelines.ProjectQuotesPipeline': 300, 63 | # } 64 | -------------------------------------------------------------------------------- /example/project_quotes/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Aioscpy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /example/project_quotes/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.spider import Spider 4 | from aioscpy import call_grace_instance 5 | 6 | 7 | class QuotesSpider(Spider): 8 | name = 'quotes' 9 | custom_settings = { 10 | "SPIDER_IDLE": False 11 | } 12 | start_urls = [ 13 | 'https://quotes.toscrape.com/', 14 | ] 15 | 16 | async def parse(self, response): 17 | 18 | for quote in response.css('div.quote'): 19 | yield { 20 | 'author': quote.xpath('span/small/text()').get(), 21 | 'text': quote.css('span.text::text').get(), 22 | } 23 | 24 | next_page = response.css('li.next a::attr("href")').get() 25 | if next_page is not None: 26 | # first next_page method: 27 | yield response.follow(next_page, callback=self.parse) 28 | 29 | # second next_page method: 30 | # next_page_url = 'https://quotes.toscrape.com' + next_page 31 | # yield call_grace_instance(self.di.get("request"), next_page_url, callback=self.parse) 32 | 33 | 34 | if __name__ == '__main__': 35 | q = QuotesSpider() 36 | q.start() 37 | -------------------------------------------------------------------------------- /example/project_quotes/start.py: -------------------------------------------------------------------------------- 1 | from aioscpy import call_grace_instance 2 | from aioscpy.utils.tools import get_project_settings 3 | 4 | 5 | def load_file_to_execute(): 6 | process = call_grace_instance("crawler_process", get_project_settings()) 7 | process.load_spider(path='./spiders') 8 | process.start() 9 | 10 | 11 | def load_name_to_execute(): 12 | process = call_grace_instance("crawler_process", get_project_settings()) 13 | process.crawl('quotes') 14 | process.start() 15 | 16 | 17 | if __name__ == '__main__': 18 | load_name_to_execute() 19 | -------------------------------------------------------------------------------- /example/single_quotes.py: -------------------------------------------------------------------------------- 1 | from aioscpy.spider import Spider 2 | from anti_header import Header 3 | from pprint import pprint, pformat 4 | 5 | 6 | class SingleQuotesSpider(Spider): 7 | name = 'single_quotes' 8 | custom_settings = { 9 | "SPIDER_IDLE": False 10 | } 11 | start_urls = [ 12 | 'https://quotes.toscrape.com/', 13 | ] 14 | 15 | async def process_request(self, request): 16 | request.headers = Header(url=request.url, platform='windows', connection=True).random 17 | return request 18 | 19 | async def process_response(self, request, response): 20 | if response.status in [404, 503]: 21 | return request 22 | return response 23 | 24 | async def process_exception(self, request, exc): 25 | raise exc 26 | 27 | async def parse(self, response): 28 | 29 | for quote in response.css('div.quote'): 30 | yield { 31 | 'author': quote.xpath('span/small/text()').get(), 32 | 'text': quote.css('span.text::text').get(), 33 | } 34 | 35 | next_page = response.css('li.next a::attr("href")').get() 36 | if next_page is not None: 37 | # first next_page method: 38 | yield response.follow(next_page, callback=self.parse) 39 | 40 | # second next_page method: 41 | # next_page_url = 'https://quotes.toscrape.com' + next_page 42 | # yield call_grace_instance(self.di.get("request"), next_page_url, callback=self.parse) 43 | 44 | async def process_item(self, item): 45 | pass 46 | # self.logger.info("{item}", **{'item': pformat(item)}) 47 | 48 | 49 | if __name__ == '__main__': 50 | q = 
SingleQuotesSpider() 51 | q.start() 52 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp~=3.8.1 2 | w3lib~=1.22.0 3 | parsel~=1.6.0 4 | redis~=4.3.1 5 | pika~=1.2.0 6 | loguru~=0.5.3 7 | PyDispatcher~=2.0.5 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from shutil import rmtree 4 | from os.path import dirname, join 5 | 6 | from setuptools import setup, Command, find_packages 7 | 8 | # Package meta-data. 9 | NAME = "aioscpy" 10 | DESCRIPTION = "An asyncio + aiolibs crawler imitate scrapy framework" 11 | URL = "https://github.com/ihandmine/aioscpy" 12 | EMAIL = "handmine@outlook.com" 13 | AUTHOR = "handmine" 14 | REQUIRES_PYTHON = ">=3.8.0" 15 | 16 | here = os.path.abspath(os.path.dirname(__file__)) 17 | with open(f"{here}/README.md", encoding='utf-8') as f: 18 | long_description = f.read() 19 | 20 | with open(join(dirname(__file__), 'aioscpy/VERSION'), 'rb') as f: 21 | old_version = f.read().decode('ascii').strip() 22 | maxv, midv, minv = [int(v) for v in old_version.split('.')] 23 | if minv <= 24: 24 | minv += 1 25 | else: 26 | midv += 1 27 | minv = 0 28 | VERSION = '.'.join([str(v) for v in [maxv, midv, minv]]) 29 | print(f'old version: {old_version}, new version: {VERSION}') 30 | 31 | 32 | class UploadCommand(Command): 33 | """Support setup_bak.py upload.""" 34 | 35 | description = "Build and publish the package." 36 | user_options = [] 37 | 38 | @staticmethod 39 | def status(s): 40 | """Prints things in bold.""" 41 | print("\033[1m{0}\033[0m".format(s)) 42 | 43 | def initialize_options(self): 44 | pass 45 | 46 | def finalize_options(self): 47 | pass 48 | 49 | def run(self): 50 | try: 51 | self.status("Removing previous builds...") 52 | rmtree(os.path.join(here, "dist")) 53 | except OSError: 54 | pass 55 | 56 | self.status("Building Source and Wheel distribution...") 57 | os.system("{0} setup.py sdist bdist_wheel".format(sys.executable)) 58 | 59 | self.status("Uploading the package to PyPI via Twine...") 60 | os.system("twine upload dist/*") 61 | 62 | with open(join(dirname(__file__), 'aioscpy/VERSION'), 'w') as f: 63 | f.write(VERSION + '\n') 64 | 65 | self.status("git option [add]") 66 | os.system("git add aioscpy/VERSION") 67 | 68 | self.status("git option [commit][push]") 69 | os.system(f'git commit -m "{VERSION}"') 70 | os.system("git push") 71 | sys.exit() 72 | 73 | 74 | extras_require = { 75 | "all": [ 76 | "aiohttp", 77 | "httpx", 78 | "anti-header", 79 | "w3lib", 80 | "parsel", 81 | "PyDispatcher", 82 | "redis", 83 | "anyio", 84 | "ujson" 85 | ], 86 | "aiohttp": ["aiohttp", "cryptography"], 87 | "httpx": ["httpx[http2]>=0.23.0"], 88 | } 89 | 90 | 91 | setup( 92 | name=NAME, 93 | version=VERSION, 94 | author=AUTHOR, 95 | packages=find_packages(), 96 | include_package_data=True, 97 | package_data={"": ["*.py", "*.tmpl", '*.cfg']}, 98 | install_requires=[ 99 | "aiohttp", 100 | "httpx", 101 | "anti-header", 102 | "w3lib", 103 | "parsel", 104 | "PyDispatcher", 105 | "redis", 106 | "anyio", 107 | "ujson" 108 | ], 109 | extras_require=extras_require, 110 | description=DESCRIPTION, 111 | long_description=long_description, 112 | long_description_content_type='text/markdown', 113 | url=URL, 114 | author_email=EMAIL, 115 | license="MIT", 116 | keywords=""" 117 | crawler 118 | scrapy 119 
| asyncio 120 | aiohttp 121 | anti-header 122 | anti-useragent 123 | python3 124 | """, 125 | python_requires=REQUIRES_PYTHON, 126 | zip_safe=False, 127 | entry_points={ 128 | 'console_scripts': ['aioscpy = aioscpy.cmdline:execute'] 129 | }, 130 | classifiers=[ 131 | "License :: OSI Approved :: MIT License", 132 | "Programming Language :: Python", 133 | "Programming Language :: Python :: 3.7", 134 | "Development Status :: 3 - Alpha", 135 | "Framework :: AsyncIO", 136 | "Operating System :: Unix", 137 | "Operating System :: Microsoft :: Windows", 138 | "Operating System :: MacOS", 139 | ], 140 | # Build and upload package: python3 setup_bak.py upload 141 | cmdclass={"upload": UploadCommand}, 142 | ) 143 | -------------------------------------------------------------------------------- /start.py: -------------------------------------------------------------------------------- 1 | from aioscpy.crawler import call_grace_instance 2 | from aioscpy.utils.tools import get_project_settings 3 | 4 | """start spider method one: 5 | from cegex.baidu import BaiduSpider 6 | from cegex.httpbin import HttpBinSpider 7 | 8 | process = CrawlerProcess() 9 | process.crawl(HttpBinSpider) 10 | process.crawl(BaiduSpider) 11 | process.start() 12 | """ 13 | 14 | 15 | def load_file_to_execute(): 16 | process = call_grace_instance("crawler_process", get_project_settings()) 17 | process.load_spider(path='./cegex', spider_like='httpbin') 18 | process.start() 19 | 20 | 21 | def load_name_to_execute(): 22 | process = call_grace_instance("crawler_process", get_project_settings()) 23 | process.crawl('ja3', path="./cegex") 24 | process.start() 25 | 26 | 27 | if __name__ == '__main__': 28 | load_name_to_execute() 29 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Aioscpy Tests 2 | 3 | This directory contains unit tests for the Aioscpy framework. 4 | 5 | ## Running the Tests 6 | 7 | To run all tests, use the following command from the project root: 8 | 9 | ```bash 10 | python -m tests.run_tests 11 | ``` 12 | 13 | To run a specific test file: 14 | 15 | ```bash 16 | python -m tests.test_engine_memory_management 17 | ``` 18 | 19 | ## Test Files 20 | 21 | - `test_engine_memory_management.py`: Tests for the memory management optimizations in the ExecutionEngine. 22 | - `test_engine_task_beat.py`: Tests for the task beat optimizations in the ExecutionEngine. 23 | - `test_httpx_handler.py`: Tests for the improved error handling in the HttpxDownloadHandler. 24 | - `test_adaptive_concurrency.py`: Tests for the AdaptiveConcurrencyMiddleware. 25 | 26 | ## Writing New Tests 27 | 28 | When writing new tests, follow these guidelines: 29 | 30 | 1. Create a new test file with a name that clearly indicates what is being tested. 31 | 2. Use the `unittest` framework. 32 | 3. Use mocks to isolate the code being tested. 33 | 4. Test both success and failure cases. 34 | 5. Add the new test to `run_tests.py`. 
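
For example, a new test module following these guidelines might look like the sketch below (the class under test and the setting name are placeholders for illustration, not real Aioscpy APIs):

```python
# tests/test_example.py - a minimal template for a new Aioscpy test module.
import unittest
from unittest.mock import MagicMock


class TestExample(unittest.TestCase):
    """Template for a new Aioscpy test case."""

    def setUp(self):
        # Isolate the code under test behind mocks instead of real components.
        self.crawler = MagicMock()
        self.crawler.settings.getint.return_value = 16

    def test_success_case(self):
        # Exercise the behaviour you expect to succeed.
        self.assertEqual(self.crawler.settings.getint('CONCURRENT_REQUESTS'), 16)

    def test_failure_case(self):
        # Cover the failure path as well.
        self.crawler.settings.getint.side_effect = ValueError("bad setting")
        with self.assertRaises(ValueError):
            self.crawler.settings.getint('CONCURRENT_REQUESTS')


if __name__ == '__main__':
    unittest.main()
```

Once the module exists, import its test case in `run_tests.py` and add it to the suite there, as described in step 5.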
35 | 36 | ## Test Coverage 37 | 38 | To generate a test coverage report, install the `coverage` package: 39 | 40 | ```bash 41 | pip install coverage 42 | ``` 43 | 44 | Then run the tests with coverage: 45 | 46 | ```bash 47 | coverage run -m tests.run_tests 48 | ``` 49 | 50 | And generate a report: 51 | 52 | ```bash 53 | coverage report 54 | ``` 55 | 56 | Or an HTML report: 57 | 58 | ```bash 59 | coverage html 60 | ``` 61 | -------------------------------------------------------------------------------- /tests/run_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | import os 4 | 5 | # Add the parent directory to the path so we can import the modules 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 7 | 8 | # Import the test modules 9 | from test_engine_memory_management import TestEngineMemoryManagement 10 | from test_engine_task_beat import TestEngineTaskBeat 11 | from test_httpx_handler import TestHttpxHandler 12 | from test_adaptive_concurrency import TestAdaptiveConcurrencyMiddleware 13 | 14 | 15 | def run_tests(): 16 | """Run all the tests.""" 17 | # Create a test suite 18 | test_suite = unittest.TestSuite() 19 | 20 | # Add the test cases 21 | test_suite.addTest(unittest.makeSuite(TestEngineMemoryManagement)) 22 | test_suite.addTest(unittest.makeSuite(TestEngineTaskBeat)) 23 | test_suite.addTest(unittest.makeSuite(TestHttpxHandler)) 24 | test_suite.addTest(unittest.makeSuite(TestAdaptiveConcurrencyMiddleware)) 25 | 26 | # Run the tests 27 | runner = unittest.TextTestRunner(verbosity=2) 28 | result = runner.run(test_suite) 29 | 30 | # Return the result 31 | return result.wasSuccessful() 32 | 33 | 34 | if __name__ == '__main__': 35 | success = run_tests() 36 | sys.exit(0 if success else 1) 37 | -------------------------------------------------------------------------------- /tests/test_adaptive_concurrency.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import time 3 | from unittest.mock import MagicMock, patch 4 | 5 | from aioscpy.middleware.adaptive_concurrency import AdaptiveConcurrencyMiddleware 6 | 7 | 8 | class TestAdaptiveConcurrencyMiddleware(unittest.TestCase): 9 | """Test the AdaptiveConcurrencyMiddleware.""" 10 | 11 | def setUp(self): 12 | # Create mocks 13 | self.crawler = MagicMock() 14 | self.crawler.settings = { 15 | 'ADAPTIVE_CONCURRENCY_ENABLED': True, 16 | 'ADAPTIVE_CONCURRENCY_TARGET_RESPONSE_TIME': 0.5, 17 | 'ADAPTIVE_CONCURRENCY_MIN_REQUESTS': 5, 18 | 'ADAPTIVE_CONCURRENCY_MAX_REQUESTS': 20, 19 | 'ADAPTIVE_CONCURRENCY_WINDOW_SIZE': 10, 20 | 'ADAPTIVE_CONCURRENCY_ADJUSTMENT_INTERVAL': 1, 21 | 'CONCURRENT_REQUESTS': 10, 22 | } 23 | self.crawler.settings.getbool = lambda key, default: self.crawler.settings.get(key, default) 24 | self.crawler.settings.getfloat = lambda key, default: self.crawler.settings.get(key, default) 25 | self.crawler.settings.getint = lambda key, default: self.crawler.settings.get(key, default) 26 | 27 | self.spider = MagicMock() 28 | self.spider.name = 'test_spider' 29 | 30 | # Create middleware 31 | self.middleware = AdaptiveConcurrencyMiddleware(self.crawler) 32 | self.middleware.logger = MagicMock() 33 | 34 | # Create request and response mocks 35 | self.request = MagicMock() 36 | self.request.meta = {} 37 | self.response = MagicMock() 38 | 39 | async def test_process_request_adds_start_time(self): 40 | """Test that process_request adds a start time to the request 
meta.""" 41 | result = await self.middleware.process_request(self.request, self.spider) 42 | 43 | # Verify that the result is None (middleware continues) 44 | self.assertIsNone(result) 45 | 46 | # Verify that a start time was added to the request meta 47 | self.assertIn('request_start_time', self.request.meta) 48 | self.assertIsInstance(self.request.meta['request_start_time'], float) 49 | 50 | async def test_process_response_calculates_time(self): 51 | """Test that process_response calculates the response time.""" 52 | # Set up a request with a start time 53 | start_time = time.time() - 0.3 # 300ms ago 54 | self.request.meta['request_start_time'] = start_time 55 | 56 | result = await self.middleware.process_response(self.request, self.response, self.spider) 57 | 58 | # Verify that the result is the response 59 | self.assertEqual(result, self.response) 60 | 61 | # Verify that a response time was added to the deque 62 | self.assertEqual(len(self.middleware.response_times), 1) 63 | self.assertGreaterEqual(self.middleware.response_times[0], 0.3) 64 | 65 | async def test_adjust_concurrency_faster_responses(self): 66 | """Test that concurrency is increased when responses are faster than target.""" 67 | # Fill the response times deque with fast responses (0.2s) 68 | self.middleware.response_times.extend([0.2] * self.middleware.window_size) 69 | self.middleware.current_concurrency = 10 70 | 71 | # Adjust concurrency 72 | self.middleware._adjust_concurrency() 73 | 74 | # Verify that concurrency was increased 75 | self.assertGreater(self.middleware.current_concurrency, 10) 76 | 77 | # Verify that the setting was updated 78 | self.crawler.settings.set.assert_called_with('CONCURRENT_REQUESTS', self.middleware.current_concurrency) 79 | 80 | # Verify that the change was logged 81 | self.middleware.logger.info.assert_called_once() 82 | 83 | async def test_adjust_concurrency_slower_responses(self): 84 | """Test that concurrency is decreased when responses are slower than target.""" 85 | # Fill the response times deque with slow responses (1.0s) 86 | self.middleware.response_times.extend([1.0] * self.middleware.window_size) 87 | self.middleware.current_concurrency = 10 88 | 89 | # Adjust concurrency 90 | self.middleware._adjust_concurrency() 91 | 92 | # Verify that concurrency was decreased 93 | self.assertLess(self.middleware.current_concurrency, 10) 94 | 95 | # Verify that the setting was updated 96 | self.crawler.settings.set.assert_called_with('CONCURRENT_REQUESTS', self.middleware.current_concurrency) 97 | 98 | # Verify that the change was logged 99 | self.middleware.logger.info.assert_called_once() 100 | 101 | async def test_adjust_concurrency_respects_min_max(self): 102 | """Test that concurrency adjustments respect the min and max limits.""" 103 | # Test minimum limit 104 | self.middleware.response_times.extend([2.0] * self.middleware.window_size) # Very slow responses 105 | self.middleware.current_concurrency = 6 106 | 107 | self.middleware._adjust_concurrency() 108 | 109 | # Verify that concurrency was not decreased below the minimum 110 | self.assertEqual(self.middleware.current_concurrency, 5) 111 | 112 | # Test maximum limit 113 | self.middleware.response_times.clear() 114 | self.middleware.response_times.extend([0.1] * self.middleware.window_size) # Very fast responses 115 | self.middleware.current_concurrency = 19 116 | 117 | self.middleware._adjust_concurrency() 118 | 119 | # Verify that concurrency was not increased above the maximum 120 | 
self.assertEqual(self.middleware.current_concurrency, 20) 121 | 122 | async def test_disabled_middleware(self): 123 | """Test that the middleware does nothing when disabled.""" 124 | # Disable the middleware 125 | self.middleware.enabled = False 126 | 127 | # Process a request 128 | result = await self.middleware.process_request(self.request, self.spider) 129 | 130 | # Verify that the result is None 131 | self.assertIsNone(result) 132 | 133 | # Verify that no start time was added 134 | self.assertNotIn('request_start_time', self.request.meta) 135 | 136 | # Process a response 137 | result = await self.middleware.process_response(self.request, self.response, self.spider) 138 | 139 | # Verify that the result is the response 140 | self.assertEqual(result, self.response) 141 | 142 | # Verify that no response times were recorded 143 | self.assertEqual(len(self.middleware.response_times), 0) 144 | 145 | 146 | if __name__ == '__main__': 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /tests/test_engine_memory_management.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import asyncio 3 | from unittest.mock import MagicMock, patch, AsyncMock 4 | 5 | from aioscpy.core.engine import ExecutionEngine 6 | 7 | 8 | class TestEngineMemoryManagement(unittest.TestCase): 9 | """Test the memory management optimizations in the ExecutionEngine.""" 10 | 11 | def setUp(self): 12 | # Create mocks 13 | self.crawler = MagicMock() 14 | self.crawler.settings = { 15 | 'GC_ENABLED': True, 16 | 'GC_FREQUENCY': 3, # Set to a small value for testing 17 | } 18 | self.crawler.settings.getint = lambda key, default: self.crawler.settings.get(key, default) 19 | self.crawler.settings.getbool = lambda key, default: self.crawler.settings.get(key, default) 20 | self.crawler.settings.getfloat = lambda key, default: self.crawler.settings.get(key, default) 21 | 22 | self.spider = MagicMock() 23 | self.spider.name = 'test_spider' 24 | 25 | self.slot = MagicMock() 26 | self.slot.close_if_idle = True 27 | 28 | # Create engine 29 | self.engine = ExecutionEngine(self.crawler, lambda: None) 30 | self.engine.logger = MagicMock() 31 | self.engine.spider_is_idle = AsyncMock(return_value=False) 32 | 33 | # Patch asyncio.sleep to avoid actual sleeping 34 | self.sleep_patch = patch('asyncio.sleep', new=AsyncMock()) 35 | self.mock_sleep = self.sleep_patch.start() 36 | 37 | # Patch gc.collect 38 | self.gc_patch = patch('gc.collect') 39 | self.mock_gc = self.gc_patch.start() 40 | 41 | def tearDown(self): 42 | self.sleep_patch.stop() 43 | self.gc_patch.stop() 44 | 45 | async def _run_heart_beat(self, iterations): 46 | """Helper to run the heart_beat method for a specific number of iterations.""" 47 | # Create a task for heart_beat 48 | task = asyncio.create_task(self.engine.heart_beat(0.1, self.spider, self.slot)) 49 | 50 | # Let it run for a few iterations 51 | for _ in range(iterations): 52 | await asyncio.sleep(0) 53 | 54 | # Cancel the task 55 | task.cancel() 56 | try: 57 | await task 58 | except asyncio.CancelledError: 59 | pass 60 | 61 | def test_gc_enabled(self): 62 | """Test that garbage collection runs when enabled.""" 63 | asyncio.run(self._run_heart_beat(10)) 64 | 65 | # With GC_FREQUENCY=3, we should have called gc.collect about 3 times in 10 iterations 66 | # (not exactly 3 because of the counter initialization and async nature) 67 | self.assertGreaterEqual(self.mock_gc.call_count, 2) 68 | 
self.assertLessEqual(self.mock_gc.call_count, 4) 69 | 70 | def test_gc_disabled(self): 71 | """Test that garbage collection doesn't run when disabled.""" 72 | self.crawler.settings['GC_ENABLED'] = False 73 | 74 | asyncio.run(self._run_heart_beat(10)) 75 | 76 | # With GC_ENABLED=False, gc.collect should never be called 77 | self.mock_gc.assert_not_called() 78 | 79 | def test_gc_frequency(self): 80 | """Test that garbage collection respects the frequency setting.""" 81 | # Set frequency to 5 82 | self.crawler.settings['GC_FREQUENCY'] = 5 83 | 84 | asyncio.run(self._run_heart_beat(15)) 85 | 86 | # With GC_FREQUENCY=5, we should have called gc.collect about 3 times in 15 iterations 87 | self.assertGreaterEqual(self.mock_gc.call_count, 2) 88 | self.assertLessEqual(self.mock_gc.call_count, 4) 89 | 90 | def test_gc_exception_handling(self): 91 | """Test that exceptions in garbage collection are handled properly.""" 92 | # Make gc.collect raise an exception 93 | self.mock_gc.side_effect = Exception("Test exception") 94 | 95 | asyncio.run(self._run_heart_beat(5)) 96 | 97 | # The exception should be caught and logged 98 | self.engine.logger.warning.assert_called() 99 | 100 | # The heart_beat should continue running despite the exception 101 | self.assertGreater(self.mock_sleep.call_count, 3) 102 | 103 | 104 | if __name__ == '__main__': 105 | unittest.main() 106 | -------------------------------------------------------------------------------- /tests/test_engine_task_beat.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import asyncio 3 | from unittest.mock import MagicMock, patch, AsyncMock 4 | 5 | from aioscpy.core.engine import ExecutionEngine 6 | 7 | 8 | class TestEngineTaskBeat(unittest.TestCase): 9 | """Test the task beat optimizations in the ExecutionEngine.""" 10 | 11 | def setUp(self): 12 | # Create mocks 13 | self.crawler = MagicMock() 14 | self.crawler.settings = { 15 | 'TASK_BEAT_ACTIVE_SLEEP': 0.1, 16 | 'TASK_BEAT_IDLE_SLEEP': 0.5, 17 | 'TASK_BEAT_BATCH_SIZE': 10, 18 | } 19 | self.crawler.settings.getint = lambda key, default: self.crawler.settings.get(key, default) 20 | self.crawler.settings.getbool = lambda key, default: self.crawler.settings.get(key, default) 21 | self.crawler.settings.getfloat = lambda key, default: self.crawler.settings.get(key, default) 22 | 23 | self.slot = MagicMock() 24 | self.slot.scheduler = MagicMock() 25 | self.slot.scheduler.async_next_request = AsyncMock() 26 | self.slot.add_request = MagicMock() 27 | 28 | # Create engine 29 | self.engine = ExecutionEngine(self.crawler, lambda: None) 30 | self.engine.logger = MagicMock() 31 | self.engine._needs_backout = MagicMock(return_value=False) 32 | self.engine.slot = self.slot 33 | self.engine.downloader = MagicMock() 34 | self.engine.downloader.fetch = AsyncMock() 35 | 36 | # Patch asyncio.sleep to avoid actual sleeping 37 | self.sleep_patch = patch('asyncio.sleep', new=AsyncMock()) 38 | self.mock_sleep = self.sleep_patch.start() 39 | 40 | def tearDown(self): 41 | self.sleep_patch.stop() 42 | 43 | async def _run_task_beat(self, iterations): 44 | """Helper to run the task_beat method for a specific number of iterations.""" 45 | # Create a task for task_beat 46 | task = asyncio.create_task(self.engine.task_beat()) 47 | 48 | # Let it run for a few iterations 49 | for _ in range(iterations): 50 | await asyncio.sleep(0) 51 | 52 | # Cancel the task 53 | task.cancel() 54 | try: 55 | await task 56 | except asyncio.CancelledError: 57 | pass 58 | 59 | def 
test_task_beat_with_requests(self): 60 | """Test that task_beat processes requests when available.""" 61 | # Set up mock to return some requests 62 | mock_requests = [MagicMock() for _ in range(3)] 63 | self.slot.scheduler.async_next_request.return_value = mock_requests 64 | 65 | asyncio.run(self._run_task_beat(2)) 66 | 67 | # Verify that scheduler.async_next_request was called with the batch size 68 | self.slot.scheduler.async_next_request.assert_called_with(count=10) 69 | 70 | # Verify that add_request and fetch were called for each request 71 | self.assertEqual(self.slot.add_request.call_count, 3) 72 | self.assertEqual(self.engine.downloader.fetch.call_count, 3) 73 | 74 | # Verify that we used the active sleep time 75 | self.mock_sleep.assert_called_with(0.1) 76 | 77 | def test_task_beat_no_requests(self): 78 | """Test that task_beat handles the case when no requests are available.""" 79 | # Set up mock to return no requests 80 | self.slot.scheduler.async_next_request.return_value = [] 81 | 82 | asyncio.run(self._run_task_beat(2)) 83 | 84 | # Verify that scheduler.async_next_request was called 85 | self.slot.scheduler.async_next_request.assert_called() 86 | 87 | # Verify that add_request and fetch were not called 88 | self.slot.add_request.assert_not_called() 89 | self.engine.downloader.fetch.assert_not_called() 90 | 91 | # Verify that we used the idle sleep time 92 | self.mock_sleep.assert_called_with(0.5) 93 | 94 | def test_task_beat_with_backout(self): 95 | """Test that task_beat respects the backout condition.""" 96 | # Set up mock to indicate backout is needed 97 | self.engine._needs_backout.return_value = True 98 | 99 | asyncio.run(self._run_task_beat(2)) 100 | 101 | # Verify that scheduler.async_next_request was not called 102 | self.slot.scheduler.async_next_request.assert_not_called() 103 | 104 | # Verify that we used the idle sleep time 105 | self.mock_sleep.assert_called_with(0.5) 106 | 107 | 108 | if __name__ == '__main__': 109 | unittest.main() 110 | -------------------------------------------------------------------------------- /tests/test_httpx_handler.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import asyncio 3 | from unittest.mock import MagicMock, patch, AsyncMock 4 | 5 | import httpx 6 | 7 | from aioscpy.core.downloader.handlers.httpx import HttpxDownloadHandler 8 | 9 | 10 | class TestHttpxHandler(unittest.TestCase): 11 | """Test the improved error handling in the HttpxDownloadHandler.""" 12 | 13 | def setUp(self): 14 | # Create mocks 15 | self.settings = { 16 | 'DOWNLOAD_TIMEOUT': 10, 17 | } 18 | self.settings.get = lambda key, default=None: self.settings.get(key, default) 19 | 20 | self.crawler = MagicMock() 21 | self.crawler.settings = self.settings 22 | 23 | self.spider = MagicMock() 24 | self.spider.name = 'test_spider' 25 | 26 | # Create request mock 27 | self.request = MagicMock() 28 | self.request.url = 'https://example.com' 29 | self.request.method = 'GET' 30 | self.request.headers = {} 31 | self.request.cookies = {} 32 | self.request.body = None 33 | self.request.json = None 34 | self.request.meta = {} 35 | 36 | # Create handler 37 | self.handler = HttpxDownloadHandler(self.settings, self.crawler) 38 | self.handler.logger = MagicMock() 39 | 40 | # Mock the dependency injection 41 | self.mock_response_cls = MagicMock() 42 | self.handler.di = MagicMock() 43 | self.handler.di.get.return_value = self.mock_response_cls 44 | 45 | # Patch httpx.AsyncClient 46 | self.client_patch = 
patch('httpx.AsyncClient') 47 | self.mock_client_cls = self.client_patch.start() 48 | self.mock_client = AsyncMock() 49 | self.mock_client_cls.return_value.__aenter__.return_value = self.mock_client 50 | 51 | # Create a mock response 52 | self.mock_http_response = MagicMock() 53 | self.mock_http_response.url = 'https://example.com' 54 | self.mock_http_response.status_code = 200 55 | self.mock_http_response.headers = {} 56 | self.mock_http_response.cookies = {} 57 | self.mock_http_response.read.return_value = b'response content' 58 | 59 | # Set up the client to return the mock response 60 | self.mock_client.request.return_value = self.mock_http_response 61 | 62 | def tearDown(self): 63 | self.client_patch.stop() 64 | 65 | async def test_successful_request(self): 66 | """Test that a successful request returns a response object.""" 67 | response = await self.handler.download_request(self.request, self.spider) 68 | 69 | # Verify that the client was called with the correct arguments 70 | self.mock_client.request.assert_called_once() 71 | args, kwargs = self.mock_client.request.call_args 72 | self.assertEqual(args[0], 'GET') 73 | self.assertEqual(args[1], 'https://example.com') 74 | 75 | # Verify that the response was created correctly 76 | self.mock_response_cls.assert_called_once() 77 | self.assertEqual(response, self.mock_response_cls.return_value) 78 | 79 | async def test_timeout_exception(self): 80 | """Test that a timeout exception is handled properly.""" 81 | # Make the client raise a timeout exception 82 | self.mock_client.request.side_effect = httpx.TimeoutException('Timeout') 83 | 84 | # Mock the exceptions 85 | mock_timeout_error = MagicMock() 86 | self.handler.di.get.side_effect = lambda x: mock_timeout_error if x == 'exceptions' else self.mock_response_cls 87 | 88 | with self.assertRaises(Exception): 89 | await self.handler.download_request(self.request, self.spider) 90 | 91 | # Verify that the error was logged 92 | self.handler.logger.warning.assert_called_once() 93 | 94 | # Verify that the correct exception was raised 95 | mock_timeout_error.TimeoutError.assert_called_once() 96 | 97 | async def test_request_error(self): 98 | """Test that a request error is handled properly.""" 99 | # Make the client raise a request error 100 | self.mock_client.request.side_effect = httpx.RequestError('Connection error') 101 | 102 | # Mock the exceptions 103 | mock_connection_error = MagicMock() 104 | self.handler.di.get.side_effect = lambda x: mock_connection_error if x == 'exceptions' else self.mock_response_cls 105 | 106 | with self.assertRaises(Exception): 107 | await self.handler.download_request(self.request, self.spider) 108 | 109 | # Verify that the error was logged 110 | self.handler.logger.warning.assert_called_once() 111 | 112 | # Verify that the correct exception was raised 113 | mock_connection_error.ConnectionError.assert_called_once() 114 | 115 | async def test_unexpected_error(self): 116 | """Test that an unexpected error is handled properly.""" 117 | # Make the client raise an unexpected error 118 | self.mock_client.request.side_effect = ValueError('Unexpected error') 119 | 120 | # Mock the exceptions 121 | mock_download_error = MagicMock() 122 | self.handler.di.get.side_effect = lambda x: mock_download_error if x == 'exceptions' else self.mock_response_cls 123 | 124 | with self.assertRaises(Exception): 125 | await self.handler.download_request(self.request, self.spider) 126 | 127 | # Verify that the error was logged 128 | self.handler.logger.error.assert_called_once() 129 | 130 | 
# Verify that the correct exception was raised 131 | mock_download_error.DownloadError.assert_called_once() 132 | 133 | async def test_proxy_configuration(self): 134 | """Test that proxy configuration is handled properly.""" 135 | # Set up a request with a proxy 136 | self.request.meta['proxy'] = 'http://proxy.example.com:8080' 137 | 138 | await self.handler.download_request(self.request, self.spider) 139 | 140 | # Verify that the client was created with the proxy 141 | args, kwargs = self.mock_client_cls.call_args 142 | self.assertEqual(kwargs['proxies'], 'http://proxy.example.com:8080') 143 | 144 | # Verify that the proxy usage was logged 145 | self.handler.logger.debug.assert_called_once() 146 | 147 | 148 | if __name__ == '__main__': 149 | unittest.main() 150 | --------------------------------------------------------------------------------