├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── aioscpy ├── VERSION ├── __init__.py ├── __main__.py ├── cmdline.py ├── commands │ ├── __init__.py │ ├── crawl.py │ ├── genspider.py │ ├── onespider.py │ ├── runspider.py │ ├── startproject.py │ └── version.py ├── core │ ├── __init__.py │ ├── downloader │ │ ├── __init__.py │ │ └── handlers │ │ │ ├── __init__.py │ │ │ ├── aiohttp.py │ │ │ ├── curl_cffi.py │ │ │ ├── httpx.py │ │ │ ├── pyhttpx.py │ │ │ └── requests.py │ ├── engine.py │ ├── scheduler │ │ ├── __init__.py │ │ ├── memory.py │ │ └── redis.py │ └── scraper.py ├── crawler.py ├── exceptions.py ├── http │ ├── __init__.py │ ├── request │ │ ├── __init__.py │ │ ├── form.py │ │ └── json.py │ └── response │ │ ├── __init__.py │ │ └── text.py ├── inject.py ├── libs │ ├── __init__.py │ ├── downloadermiddlewares │ │ ├── __init__.py │ │ └── stats.py │ ├── extensions │ │ ├── __init__.py │ │ ├── corestats.py │ │ └── logstats.py │ └── statscollectors.py ├── logformatter.py ├── middleware │ ├── __init__.py │ ├── adaptive_concurrency.py │ ├── downloader.py │ ├── extension.py │ ├── itempipeline.py │ └── manager.py ├── queue │ ├── __init__.py │ ├── compat.py │ ├── convert.py │ ├── memory │ │ ├── __init__.py │ │ └── _queue.py │ ├── rabbitmq │ │ ├── __init__.py │ │ └── _queue.py │ └── redis │ │ ├── __init__.py │ │ ├── _queue.py │ │ └── _queue_async.py ├── settings │ ├── __init__.py │ └── default_settings.py ├── signalmanager.py ├── signals.py ├── spider.py ├── templates │ ├── project │ │ ├── __init__.py │ │ ├── aioscpy.cfg │ │ ├── middlewares.py.tmpl │ │ ├── pipelines.py.tmpl │ │ ├── settings.py.tmpl │ │ ├── spiders │ │ │ └── __init__.py │ │ └── start.py.tmpl │ └── spiders │ │ ├── basic.tmpl │ │ └── crawl.tmpl └── utils │ ├── __init__.py │ ├── common.py │ ├── curl.py │ ├── log.py │ ├── ossignal.py │ ├── othtypes.py │ ├── signal.py │ ├── template.py │ └── tools.py ├── cegex ├── __init__.py ├── baidu.py ├── httpbin.py ├── httpbin_post.py └── ja3.py ├── doc ├── README_ZH.md └── images │ ├── aioscpy.png │ ├── run.png │ └── tree.png ├── example ├── project_quotes │ ├── __init__.py │ ├── aioscpy.cfg │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ └── quotes.py │ └── start.py └── single_quotes.py ├── requirements.txt ├── setup.py ├── start.py └── tests ├── README.md ├── run_tests.py ├── test_adaptive_concurrency.py ├── test_engine_memory_management.py ├── test_engine_task_beat.py └── test_httpx_handler.py /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | .vscode/ 3 | .idea/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 ihandmine 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include aioscpy VERSION 2 | recursive-include aioscpy/templates/project *.tmpl *.cfg *.py 3 | recursive-include aioscpy/templates/spiders *.tmpl -------------------------------------------------------------------------------- /aioscpy/VERSION: -------------------------------------------------------------------------------- 1 | 0.3.13 2 | -------------------------------------------------------------------------------- /aioscpy/__init__.py: -------------------------------------------------------------------------------- 1 | import pkgutil 2 | 3 | from aioscpy.inject import call_grace_instance 4 | 5 | __version__ = (pkgutil.get_data(__package__, "VERSION") or b"").decode("ascii").strip() 6 | 7 | __all__ = [ 8 | '__version__', 9 | 'call_grace_instance' 10 | ] 11 | -------------------------------------------------------------------------------- /aioscpy/__main__.py: -------------------------------------------------------------------------------- 1 | from aioscpy.cmdline import execute 2 | 3 | 4 | if __name__ == '__main__': 5 | execute() 6 | -------------------------------------------------------------------------------- /aioscpy/cmdline.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import cProfile 5 | import inspect 6 | import pkg_resources 7 | 8 | import aioscpy 9 | from aioscpy.inject import walk_modules 10 | from aioscpy.commands import ASCommand, ASHelpFormatter 11 | from aioscpy.exceptions import UsageError 12 | from aioscpy.utils.tools import get_project_settings 13 | from aioscpy.utils.common import inside_project 14 | from aioscpy import call_grace_instance 15 | 16 | 17 | def _iter_command_classes(module_name): 18 | # TODO: add `name` attribute to commands and and merge this function with 19 | for module in walk_modules(module_name): 20 | for obj in vars(module).values(): 21 | if ( 22 | inspect.isclass(obj) 23 | and issubclass(obj, ASCommand) 24 | and obj.__module__ == module.__name__ 25 | and not obj == ASCommand 26 | ): 27 | yield obj 28 | 29 | 30 | def _get_commands_from_module(module, inproject): 31 | d = {} 32 | for cmd in _iter_command_classes(module): 33 | if inproject or not cmd.requires_project: 34 | cmdname = cmd.__module__.split('.')[-1] 35 | d[cmdname] = cmd() 36 | return d 37 | 38 | 39 | def _get_commands_from_entry_points(inproject, group='aioscpy.commands'): 40 | cmds = {} 41 | for entry_point in pkg_resources.iter_entry_points(group): 42 | obj = entry_point.load() 43 | if inspect.isclass(obj): 44 | cmds[entry_point.name] = obj() 45 | else: 46 | raise Exception(f"Invalid entry point {entry_point.name}") 47 | return cmds 48 | 49 | 50 | def _get_commands_dict(settings, inproject): 51 | cmds = _get_commands_from_module('aioscpy.commands', inproject) 52 | cmds.update(_get_commands_from_entry_points(inproject)) 53 | cmds_module = settings['COMMANDS_MODULE'] 54 | if cmds_module: 55 | cmds.update(_get_commands_from_module(cmds_module, inproject)) 56 | return cmds 57 | 58 | 59 | def _pop_command_name(argv): 60 | i = 0 61 | for arg in argv[1:]: 62 | if not arg.startswith('-'): 63 | del argv[i] 64 | return arg 65 | i += 1 66 | 67 | 68 | def _print_header(settings, inproject): 69 | version = aioscpy.__version__ 70 | if inproject: 71 | print(f"aioscpy {version} - project: {settings['BOT_NAME']}\n") 72 
| else: 73 | print(f"aioscpy {version} - no active project\n") 74 | 75 | 76 | def _print_commands(settings, inproject): 77 | _print_header(settings, inproject) 78 | print("Usage:") 79 | print(" aioscpy [options] [args]\n") 80 | print("Available commands:") 81 | cmds = _get_commands_dict(settings, inproject) 82 | for cmdname, cmdclass in sorted(cmds.items()): 83 | print(f" {cmdname:<13} {cmdclass.short_desc()}") 84 | if not inproject: 85 | print() 86 | print(" [ more ] More commands available when run from project directory") 87 | print() 88 | print('Use "aioscpy -h" to see more info about a command') 89 | 90 | 91 | def _print_unknown_command(settings, cmdname, inproject): 92 | _print_header(settings, inproject) 93 | print(f"Unknown command: {cmdname}\n") 94 | print('Use "aioscpy" to see available commands') 95 | 96 | 97 | def _run_print_help(parser, func, *a, **kw): 98 | try: 99 | func(*a, **kw) 100 | except UsageError as e: 101 | if str(e): 102 | parser.error(str(e)) 103 | if e.print_help: 104 | parser.print_help() 105 | sys.exit(2) 106 | 107 | 108 | def execute(argv=None, settings=None): 109 | if argv is None: 110 | argv = sys.argv 111 | 112 | if settings is None: 113 | settings = get_project_settings() 114 | # set EDITOR from environment if available 115 | try: 116 | editor = os.environ['EDITOR'] 117 | except KeyError: 118 | pass 119 | else: 120 | settings['EDITOR'] = editor 121 | 122 | inproject = inside_project() 123 | cmds = _get_commands_dict(settings, inproject) 124 | cmdname = _pop_command_name(argv) 125 | if not cmdname: 126 | _print_commands(settings, inproject) 127 | sys.exit(0) 128 | elif cmdname not in cmds: 129 | _print_unknown_command(settings, cmdname, inproject) 130 | sys.exit(2) 131 | 132 | cmd = cmds[cmdname] 133 | parser = argparse.ArgumentParser(formatter_class=ASHelpFormatter, 134 | usage=f"aioscpy {cmdname} {cmd.syntax()}", 135 | conflict_handler='resolve', 136 | description=cmd.long_desc()) 137 | settings.setdict(cmd.default_settings, priority='command') 138 | cmd.settings = settings 139 | cmd.add_options(parser) 140 | opts, args = parser.parse_known_args(args=argv[1:]) 141 | _run_print_help(parser, cmd.process_options, args, opts) 142 | 143 | if getattr(cmd, "requires_process"): 144 | # cmd.crawler_process = CrawlerProcess(settings) 145 | cmd.crawler_process = call_grace_instance("crawler_process", settings) 146 | _run_print_help(parser, _run_command, cmd, args, opts) 147 | sys.exit(cmd.exitcode) 148 | 149 | 150 | def _run_command(cmd, args, opts): 151 | if opts.profile: 152 | _run_command_profiled(cmd, args, opts) 153 | else: 154 | cmd.run(args, opts) 155 | 156 | 157 | def _run_command_profiled(cmd, args, opts): 158 | if opts.profile: 159 | sys.stderr.write(f"aioscpy: writing cProfile stats to {opts.profile!r}\n") 160 | loc = locals() 161 | p = cProfile.Profile() 162 | p.runctx('cmd.run(args, opts)', globals(), loc) 163 | if opts.profile: 164 | p.dump_stats(opts.profile) 165 | 166 | 167 | if __name__ == '__main__': 168 | execute() 169 | 170 | -------------------------------------------------------------------------------- /aioscpy/commands/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | 6 | from aioscpy.utils.common import arglist_to_dict 7 | from aioscpy.exceptions import UsageError 8 | 9 | 10 | class ASCommand: 11 | 12 | requires_project = False 13 | crawler_process = None 14 | requires_process = True 15 | 16 | # default settings to be used 
for this command instead of global defaults 17 | default_settings: Dict[str, Any] = {} 18 | 19 | exitcode = 0 20 | 21 | def __init__(self): 22 | self.settings = None # set in aioscpy.cmdline 23 | 24 | def set_crawler(self, crawler): 25 | if hasattr(self, '_crawler'): 26 | raise RuntimeError("crawler already set") 27 | self._crawler = crawler 28 | 29 | def syntax(self): 30 | """ 31 | Command syntax (preferably one-line). Do not include command name. 32 | """ 33 | return "" 34 | 35 | def short_desc(self): 36 | """ 37 | A short description of the command 38 | """ 39 | return "" 40 | 41 | def long_desc(self): 42 | """A long description of the command. Return short description when not 43 | available. It cannot contain newlines since contents will be formatted 44 | by optparser which removes newlines and wraps text. 45 | """ 46 | return self.short_desc() 47 | 48 | def help(self): 49 | """An extensive help for the command. It will be shown when using the 50 | "help" command. It can contain newlines since no post-formatting will 51 | be applied to its contents. 52 | """ 53 | return self.long_desc() 54 | 55 | def add_options(self, parser): 56 | """ 57 | Populate option parse with options available for this command 58 | """ 59 | group = parser.add_argument_group(title='Global Options') 60 | group.add_argument("--logfile", metavar="FILE", 61 | help="log file. if omitted stderr will be used") 62 | group.add_argument("-L", "--loglevel", metavar="LEVEL", default=None, 63 | help=f"log level (default: {self.settings['LOG_LEVEL']})") 64 | group.add_argument("--nolog", action="store_true", 65 | help="disable logging completely") 66 | group.add_argument("--profile", metavar="FILE", default=None, 67 | help="write python cProfile stats to FILE") 68 | group.add_argument("--pidfile", metavar="FILE", 69 | help="write process ID to FILE") 70 | group.add_argument("-s", "--set", action="append", default=[], metavar="NAME=VALUE", 71 | help="set/override setting (may be repeated)") 72 | 73 | def process_options(self, args, opts): 74 | try: 75 | self.settings.setdict(arglist_to_dict(opts.set), 76 | priority='cmdline') 77 | except ValueError: 78 | raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False) 79 | 80 | if opts.logfile: 81 | self.settings.set('LOG_ENABLED', True, priority='cmdline') 82 | self.settings.set('LOG_FILE', opts.logfile, priority='cmdline') 83 | 84 | if opts.loglevel: 85 | self.settings.set('LOG_ENABLED', True, priority='cmdline') 86 | self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline') 87 | 88 | if opts.nolog: 89 | self.settings.set('LOG_ENABLED', False, priority='cmdline') 90 | 91 | if opts.pidfile: 92 | with open(opts.pidfile, "w") as f: 93 | f.write(str(os.getpid()) + os.linesep) 94 | 95 | def run(self, args, opts): 96 | """ 97 | Entry point for running commands 98 | """ 99 | raise NotImplementedError 100 | 101 | 102 | class BaseRunSpiderCommand(ASCommand): 103 | """ 104 | Common class used to share functionality between the crawl, parse and runspider commands 105 | """ 106 | def add_options(self, parser): 107 | ASCommand.add_options(self, parser) 108 | parser.add_argument("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 109 | help="set spider argument (may be repeated)") 110 | parser.add_argument("-o", "--output", metavar="FILE", action="append", 111 | help="append scraped items to the end of FILE (use - for stdout)") 112 | parser.add_argument("-O", "--overwrite-output", metavar="FILE", action="append", 113 | help="dump scraped items into 
FILE, overwriting any existing file") 114 | parser.add_argument("-t", "--output-format", metavar="FORMAT", 115 | help="format to use for dumping items") 116 | 117 | def process_options(self, args, opts): 118 | ASCommand.process_options(self, args, opts) 119 | try: 120 | opts.spargs = arglist_to_dict(opts.spargs) 121 | except ValueError: 122 | raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 123 | 124 | 125 | class ASHelpFormatter(argparse.HelpFormatter): 126 | """ 127 | Help Formatter for aioscpy command line help messages. 128 | """ 129 | def __init__(self, prog, indent_increment=2, max_help_position=24, width=None): 130 | super().__init__(prog, indent_increment=indent_increment, 131 | max_help_position=max_help_position, width=width) 132 | 133 | def _join_parts(self, part_strings): 134 | parts = self.format_part_strings(part_strings) 135 | return super()._join_parts(parts) 136 | 137 | def format_part_strings(self, part_strings): 138 | """ 139 | Underline and title case command line help message headers. 140 | """ 141 | if part_strings and part_strings[0].startswith("usage: "): 142 | part_strings[0] = "Usage\n=====\n " + part_strings[0][len('usage: '):] 143 | headings = [i for i in range(len(part_strings)) if part_strings[i].endswith(':\n')] 144 | for index in headings[::-1]: 145 | char = '-' if "Global Options" in part_strings[index] else '=' 146 | part_strings[index] = part_strings[index][:-2].title() 147 | underline = ''.join(["\n", (char * len(part_strings[index])), "\n"]) 148 | part_strings.insert(index + 1, underline) 149 | return part_strings 150 | -------------------------------------------------------------------------------- /aioscpy/commands/crawl.py: -------------------------------------------------------------------------------- 1 | from aioscpy.commands import BaseRunSpiderCommand 2 | from aioscpy.exceptions import UsageError 3 | 4 | 5 | class Command(BaseRunSpiderCommand): 6 | 7 | requires_project = True 8 | 9 | def syntax(self): 10 | return "[options] " 11 | 12 | def short_desc(self): 13 | return "Run a spider" 14 | 15 | def run(self, args, opts): 16 | if len(args) < 1: 17 | raise UsageError() 18 | elif len(args) > 1: 19 | raise UsageError("running 'aioscpy crawl' with more than one spider is not supported") 20 | spname = args[0] 21 | 22 | crawl_defer = self.crawler_process.crawl(spname, **opts.spargs) 23 | 24 | if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception): 25 | self.exitcode = 1 26 | else: 27 | self.crawler_process.start() 28 | 29 | if ( 30 | self.crawler_process.bootstrap_failed 31 | or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception 32 | ): 33 | self.exitcode = 1 34 | -------------------------------------------------------------------------------- /aioscpy/commands/genspider.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import string 4 | 5 | from importlib import import_module 6 | from os.path import join, dirname, abspath, exists, splitext 7 | 8 | import aioscpy 9 | from aioscpy.commands import ASCommand 10 | from aioscpy.utils.template import render_templatefile, string_camelcase 11 | from aioscpy.exceptions import UsageError 12 | 13 | 14 | def sanitize_module_name(module_name): 15 | """Sanitize the given module name, by replacing dashes and points 16 | with underscores and prefixing it with a letter if it doesn't start 17 | with one 18 | """ 19 | module_name = 
module_name.replace('-', '_').replace('.', '_') 20 | if module_name[0] not in string.ascii_letters: 21 | module_name = "a" + module_name 22 | return module_name 23 | 24 | 25 | class Command(ASCommand): 26 | 27 | requires_project = False 28 | default_settings = {'LOG_ENABLED': False} 29 | requires_process = False 30 | 31 | def syntax(self): 32 | return "[options] " 33 | 34 | def short_desc(self): 35 | return "Generate new spider in project using pre-defined templates" 36 | 37 | def add_options(self, parser): 38 | ASCommand.add_options(self, parser) 39 | parser.add_argument("-l", "--list", dest="list", action="store_true", 40 | help="List available templates") 41 | parser.add_argument("-d", "--dump", dest="dump", metavar="TEMPLATE", 42 | help="Dump template to standard output") 43 | parser.add_argument("-t", "--template", dest="template", default="basic", 44 | help="Uses a custom template.") 45 | parser.add_argument("--force", dest="force", action="store_true", 46 | help="If the spider already exists, overwrite it with the template") 47 | 48 | def run(self, args, opts): 49 | if opts.list: 50 | self._list_templates() 51 | return 52 | if opts.dump: 53 | template_file = self._find_template(opts.dump) 54 | if template_file: 55 | with open(template_file, "r") as f: 56 | print(f.read()) 57 | return 58 | if not args: 59 | raise UsageError() 60 | 61 | name = args[0] 62 | module = sanitize_module_name(name) 63 | 64 | if self.settings.get('BOT_NAME') == module: 65 | print("Cannot create a spider with the same name as your project") 66 | return 67 | 68 | if not opts.force and self._spider_exists(name): 69 | return 70 | 71 | template_file = self._find_template(opts.template) 72 | if template_file: 73 | self._genspider(module, name, opts.template, template_file) 74 | 75 | def _genspider(self, module, name, template_name, template_file): 76 | """Generate the spider module, based on the given template""" 77 | capitalized_module = ''.join(s.capitalize() for s in module.split('_')) 78 | tvars = { 79 | 'project_name': self.settings.get('BOT_NAME'), 80 | 'ProjectName': string_camelcase(self.settings.get('BOT_NAME')), 81 | 'module': module, 82 | 'name': name, 83 | 'classname': f'{capitalized_module}Spider' 84 | } 85 | if self.settings.get('NEWSPIDER_MODULE'): 86 | spiders_module = import_module(self.settings['NEWSPIDER_MODULE']) 87 | spiders_dir = abspath(dirname(spiders_module.__file__)) 88 | else: 89 | spiders_module = None 90 | spiders_dir = "." 
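        # Copy the chosen .tmpl file into the resolved spiders directory (or the current
        # directory when NEWSPIDER_MODULE is unset) and substitute the template variables
        # in place via render_templatefile.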
91 | spider_file = f"{join(spiders_dir, module)}.py" 92 | shutil.copyfile(template_file, spider_file) 93 | render_templatefile(spider_file, **tvars) 94 | print(f"Created spider {name!r} using template {template_name!r} ", 95 | end=('' if spiders_module else '\n')) 96 | if spiders_module: 97 | print(f"in module:\n {spiders_module.__name__}.{module}") 98 | 99 | def _find_template(self, template): 100 | template_file = join(self.templates_dir, f'{template}.tmpl') 101 | if exists(template_file): 102 | return template_file 103 | print(f"Unable to find template: {template}\n") 104 | print('Use "aioscpy genspider --list" to see all available templates.') 105 | 106 | def _list_templates(self): 107 | print("Available templates:") 108 | for filename in sorted(os.listdir(self.templates_dir)): 109 | if filename.endswith('.tmpl'): 110 | print(f" {splitext(filename)[0]}") 111 | 112 | def _spider_exists(self, name): 113 | if not self.settings.get('NEWSPIDER_MODULE'): 114 | # if run as a standalone command and file with same filename already exists 115 | if exists(name + ".py"): 116 | print(f"{abspath(name + '.py')} already exists") 117 | return True 118 | return False 119 | 120 | # a file with the same name exists in the target directory 121 | spiders_module = import_module(self.settings['NEWSPIDER_MODULE']) 122 | spiders_dir = dirname(spiders_module.__file__) 123 | spiders_dir_abs = abspath(spiders_dir) 124 | if exists(join(spiders_dir_abs, name + ".py")): 125 | print(f"{join(spiders_dir_abs, (name + '.py'))} already exists") 126 | return True 127 | 128 | return False 129 | 130 | @property 131 | def templates_dir(self): 132 | return join( 133 | self.settings['TEMPLATES_DIR'] or join(aioscpy.__path__[0], 'templates'), 134 | 'spiders' 135 | ) 136 | -------------------------------------------------------------------------------- /aioscpy/commands/onespider.py: -------------------------------------------------------------------------------- 1 | from aioscpy.commands import ASCommand 2 | from aioscpy.commands.genspider import Command 3 | 4 | 5 | class OCommand(Command): 6 | 7 | def short_desc(self): 8 | return "Generate new spider in xxx.py using pre-defined templates" 9 | 10 | def add_options(self, parser): 11 | ASCommand.add_options(self, parser) 12 | parser.add_argument("-l", "--list", dest="list", action="store_true", 13 | help="List available templates") 14 | parser.add_argument("-d", "--dump", dest="dump", metavar="TEMPLATE", 15 | help="Dump template to standard output") 16 | parser.add_argument("-t", "--template", dest="template", default="crawl", 17 | help="Uses a custom template.") 18 | parser.add_argument("--force", dest="force", action="store_true", 19 | help="If the spider already exists, overwrite it with the template") -------------------------------------------------------------------------------- /aioscpy/commands/runspider.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import inspect 4 | 5 | from importlib import import_module 6 | 7 | from aioscpy.exceptions import UsageError 8 | from aioscpy.commands import BaseRunSpiderCommand 9 | 10 | 11 | def iter_spider_classes(module): 12 | from aioscpy.spider import Spider 13 | 14 | for obj in vars(module).values(): 15 | if ( 16 | inspect.isclass(obj) 17 | and issubclass(obj, Spider) 18 | and obj.__module__ == module.__name__ 19 | and getattr(obj, 'name', None) 20 | ): 21 | yield obj 22 | 23 | 24 | def _import_file(filepath): 25 | abspath = os.path.abspath(filepath) 26 | dirname, 
file = os.path.split(abspath) 27 | fname, fext = os.path.splitext(file) 28 | if fext not in ('.py', '.pyw'): 29 | raise ValueError(f"Not a Python source file: {abspath}") 30 | if dirname: 31 | sys.path = [dirname] + sys.path 32 | try: 33 | module = import_module(fname) 34 | finally: 35 | if dirname: 36 | sys.path.pop(0) 37 | return module 38 | 39 | 40 | class Command(BaseRunSpiderCommand): 41 | 42 | requires_project = False 43 | default_settings = {'SPIDER_LOADER_WARN_ONLY': True} 44 | 45 | def syntax(self): 46 | return "[options] " 47 | 48 | def short_desc(self): 49 | return "Run a self-contained spider (without creating a project)" 50 | 51 | def long_desc(self): 52 | return "Run the spider defined in the given file" 53 | 54 | def run(self, args, opts): 55 | if len(args) != 1: 56 | raise UsageError() 57 | filename = args[0] 58 | if not os.path.exists(filename): 59 | raise UsageError(f"File not found: {filename}\n") 60 | try: 61 | module = _import_file(filename) 62 | except (ImportError, ValueError) as e: 63 | raise UsageError(f"Unable to load {filename!r}: {e}\n") 64 | spclasses = list(iter_spider_classes(module)) 65 | if not spclasses: 66 | raise UsageError(f"No spider found in file: {filename}\n") 67 | spidercls = spclasses.pop() 68 | 69 | self.crawler_process.crawl(spidercls, **opts.spargs) 70 | self.crawler_process.start() 71 | 72 | if self.crawler_process.bootstrap_failed: 73 | self.exitcode = 1 74 | -------------------------------------------------------------------------------- /aioscpy/commands/startproject.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import string 4 | from importlib.util import find_spec 5 | from os.path import join, exists, abspath 6 | from shutil import ignore_patterns, move, copy2, copystat 7 | from stat import S_IWUSR as OWNER_WRITE_PERMISSION 8 | 9 | import aioscpy 10 | from aioscpy.commands import ASCommand 11 | from aioscpy.utils.template import render_templatefile, string_camelcase 12 | from aioscpy.exceptions import UsageError 13 | 14 | 15 | TEMPLATES_TO_RENDER = ( 16 | ('aioscpy.cfg',), 17 | ('settings.py.tmpl',), 18 | ('pipelines.py.tmpl',), 19 | ('middlewares.py.tmpl',), 20 | ('start.py.tmpl',), 21 | ) 22 | 23 | IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn') 24 | 25 | 26 | def _make_writable(path): 27 | current_permissions = os.stat(path).st_mode 28 | os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION) 29 | 30 | 31 | class Command(ASCommand): 32 | 33 | requires_project = False 34 | default_settings = {'LOG_ENABLED': False, 35 | 'SPIDER_LOADER_WARN_ONLY': True} 36 | requires_process = False 37 | 38 | def syntax(self): 39 | return " [project_dir]" 40 | 41 | def short_desc(self): 42 | return "Create new project" 43 | 44 | def _is_valid_name(self, project_name): 45 | def _module_exists(module_name): 46 | spec = find_spec(module_name) 47 | return spec is not None and spec.loader is not None 48 | 49 | if not re.search(r'^[_a-zA-Z]\w*$', project_name): 50 | print('Error: Project names must begin with a letter and contain' 51 | ' only\nletters, numbers and underscores') 52 | elif _module_exists(project_name): 53 | print(f'Error: Module {project_name!r} already exists') 54 | else: 55 | return True 56 | return False 57 | 58 | def _copytree(self, src, dst): 59 | """ 60 | Since the original function always creates the directory, to resolve 61 | the issue a new function had to be created. It's a simple copy and 62 | was reduced for this case. 
63 | 64 | More info at: 65 | https://github.com/aioscpy/aioscpy/pull/2005 66 | """ 67 | ignore = IGNORE 68 | names = os.listdir(src) 69 | ignored_names = ignore(src, names) 70 | 71 | if not os.path.exists(dst): 72 | os.makedirs(dst) 73 | 74 | for name in names: 75 | if name in ignored_names: 76 | continue 77 | 78 | srcname = os.path.join(src, name) 79 | dstname = os.path.join(dst, name) 80 | if os.path.isdir(srcname): 81 | self._copytree(srcname, dstname) 82 | else: 83 | copy2(srcname, dstname) 84 | _make_writable(dstname) 85 | 86 | copystat(src, dst) 87 | _make_writable(dst) 88 | 89 | def run(self, args, opts): 90 | if len(args) not in (1, 2): 91 | raise UsageError() 92 | 93 | project_name = args[0] 94 | project_dir = args[0] 95 | 96 | if len(args) == 2: 97 | project_dir = args[1] 98 | 99 | if exists(join(project_dir, 'aioscpy.cfg')): 100 | self.exitcode = 1 101 | print(f'Error: aioscpy.cfg already exists in {abspath(project_dir)}') 102 | return 103 | 104 | if not self._is_valid_name(project_name): 105 | self.exitcode = 1 106 | return 107 | 108 | self._copytree(self.templates_dir, abspath(project_dir)) 109 | # move(join(project_dir, 'module'), join(project_dir, project_name)) 110 | for paths in TEMPLATES_TO_RENDER: 111 | path = join(*paths) 112 | tplfile = join(project_dir, string.Template(path).substitute(project_name=project_name)) 113 | render_templatefile(tplfile, project_name=project_name, ProjectName=string_camelcase(project_name)) 114 | print(f"New Aioscpy project '{project_name}', using template directory " 115 | f"'{self.templates_dir}', created in:") 116 | print(f" {abspath(project_dir)}\n") 117 | print("You can start your first spider with:") 118 | print(f" cd {project_dir}") 119 | print(" aioscpy genspider/onespider example") 120 | 121 | @property 122 | def templates_dir(self): 123 | return join( 124 | self.settings['TEMPLATES_DIR'] or join(aioscpy.__path__[0], 'templates'), 125 | 'project' 126 | ) 127 | -------------------------------------------------------------------------------- /aioscpy/commands/version.py: -------------------------------------------------------------------------------- 1 | import aioscpy 2 | from aioscpy.commands import ASCommand 3 | 4 | 5 | class Command(ASCommand): 6 | 7 | default_settings = {'LOG_ENABLED': False, 8 | 'SPIDER_LOADER_WARN_ONLY': True} 9 | requires_process = False 10 | 11 | def syntax(self): 12 | return "[-v]" 13 | 14 | def short_desc(self): 15 | return "Print aioscpy version" 16 | 17 | def add_options(self, parser): 18 | ASCommand.add_options(self, parser) 19 | parser.add_argument("--verbose", "-v", dest="verbose", action="store_true", 20 | help="also display twisted/python/platform info (useful for bug reports)") 21 | 22 | def run(self, args, opts): 23 | # if opts.verbose: 24 | # versions = aioscpy_components_versions() 25 | # width = max(len(n) for (n, _) in versions) 26 | # for name, version in versions: 27 | # print(f"{name:<{width}} : {version}") 28 | # else: 29 | print(f"AIOSPCY {aioscpy.__version__}") 30 | -------------------------------------------------------------------------------- /aioscpy/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/core/__init__.py -------------------------------------------------------------------------------- /aioscpy/core/downloader/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 
import random 3 | 4 | from datetime import datetime 5 | from collections import deque 6 | 7 | from aioscpy import signals 8 | from aioscpy import call_grace_instance 9 | 10 | 11 | class Slot: 12 | """Downloader slot""" 13 | 14 | def __init__(self, concurrency, randomize_delay, delay=0): 15 | self.concurrency = concurrency 16 | self.delay = delay 17 | self.randomize_delay = randomize_delay 18 | 19 | self.active = set() 20 | self.queue = deque() 21 | self.transferring = set() 22 | self.lastseen = 0 23 | self.delay_run = False 24 | 25 | def free_transfer_slots(self): 26 | return self.concurrency - len(self.transferring) 27 | 28 | def download_delay(self): 29 | if self.randomize_delay: 30 | return random.uniform(0.5 * self.delay, 1.5 * self.delay) 31 | return self.delay 32 | 33 | def close(self): 34 | self.delay_run = True 35 | 36 | def __repr__(self): 37 | cls_name = self.__class__.__name__ 38 | return "%s(concurrency=%r, delay=%0.2f, randomize_delay=%r)" % ( 39 | cls_name, self.concurrency, self.delay, self.randomize_delay) 40 | 41 | def __str__(self): 42 | return ( 43 | "" % ( 45 | self.concurrency, self.delay, self.randomize_delay, 46 | len(self.active), len(self.queue), len(self.transferring), 47 | datetime.fromtimestamp(self.lastseen).isoformat() 48 | ) 49 | ) 50 | 51 | 52 | class Downloader(object): 53 | DOWNLOAD_SLOT = 'download_slot' 54 | 55 | def __init__(self, crawler): 56 | self.settings = crawler.settings 57 | self.crawler = crawler 58 | self.slot = None 59 | self.active = set() 60 | self.call_helper = self.di.get("tools").call_helper 61 | self.handlers = call_grace_instance('downloader_handler', self.settings, crawler) 62 | self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS') 63 | self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN') 64 | self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP') 65 | self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY') 66 | self.delay = self.settings.getfloat('DOWNLOAD_DELAY') 67 | self.middleware = call_grace_instance(self.di.get('downloader_middleware'), only_instance=True).from_crawler(crawler) 68 | self.process_queue_task = None 69 | self.engine = None 70 | 71 | crawler.signals.connect(self.close, signals.engine_stopped) 72 | 73 | @classmethod 74 | def from_crawler(cls, crawler): 75 | return cls(crawler) 76 | 77 | async def open(self, spider, engine): 78 | conc = self.ip_concurrency if self.ip_concurrency else self.domain_concurrency 79 | self.slot = Slot(conc, self.randomize_delay, self.delay) 80 | self.engine = engine 81 | self.process_queue_task = asyncio.create_task(self._process_queue(spider, self.slot)) 82 | 83 | async def fetch(self, request): 84 | self.active.add(request) 85 | self.slot.active.add(request) 86 | self.slot.queue.append(request) 87 | 88 | async def _process_queue(self, spider, slot): 89 | while True: 90 | await asyncio.sleep(0.1) 91 | while slot.queue and slot.free_transfer_slots() > 0: 92 | request = slot.queue.popleft() 93 | asyncio.create_task(self._download(slot, request, spider)) 94 | slot.transferring.add(request) 95 | slot.active.remove(request) 96 | self.active.remove(request) 97 | if slot.download_delay(): 98 | await asyncio.sleep(slot.download_delay()) 99 | 100 | async def _download(self, slot, request, spider): 101 | try: 102 | response = None 103 | response = await self.middleware.process_request(spider, request) 104 | process_request_method = getattr(spider, "process_request", None) 105 | if process_request_method: 106 | response = 
await self.call_helper(process_request_method, request) 107 | if response is None or isinstance(response, self.di.get('request')): 108 | request = response or request 109 | response = await self.handlers.download_request(request, spider) 110 | except (Exception, BaseException, asyncio.TimeoutError) as exc: 111 | response = await self.middleware.process_exception(spider, request, exc) 112 | process_exception_method = getattr(spider, "process_exception", None) 113 | if process_exception_method: 114 | response = await self.call_helper(process_exception_method, request, exc) 115 | else: 116 | try: 117 | response = await self.middleware.process_response(spider, request, response) 118 | process_response_method = getattr(spider, "process_response", None) 119 | if process_response_method: 120 | response = await self.call_helper(process_response_method, request, response) 121 | except (Exception, BaseException) as exc: 122 | response = exc 123 | finally: 124 | slot.transferring.discard(request) 125 | if isinstance(response, self.di.get('response')): 126 | response.request = request 127 | await self.engine._handle_downloader_output(response, request, spider) 128 | 129 | async def close(self): 130 | try: 131 | if self.slot is not None: 132 | self.slot.close() 133 | await self.handlers.close() 134 | if self.process_queue_task: 135 | self.process_queue_task.cancel() 136 | except (asyncio.CancelledError, Exception, BaseException) as exc: 137 | pass 138 | 139 | def needs_backout(self): 140 | return len(self.active) >= self.total_concurrency 141 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/core/downloader/handlers/__init__.py -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/aiohttp.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import ssl 3 | import aiohttp 4 | import ujson 5 | import json 6 | 7 | from anti_header import Headers 8 | from anti_useragent.utils.cipers import generate_cipher 9 | 10 | 11 | class AioHttpDownloadHandler(object): 12 | 13 | def __init__(self, settings, crawler): 14 | self.settings = settings 15 | self.crawler = crawler 16 | self.aiohttp_client_session = { 17 | 'timeout': aiohttp.ClientTimeout(total=20), 18 | 'trust_env': True, 19 | 'json_serialize': ujson.dumps, 20 | "connector": aiohttp.TCPConnector( 21 | verify_ssl=False, 22 | limit=1000, 23 | force_close=True, 24 | use_dns_cache=False, 25 | limit_per_host=200, 26 | enable_cleanup_closed=True 27 | ) 28 | } 29 | self.session_stats = self.settings.getbool("REQUESTS_SESSION_STATS", False) 30 | self.session = None 31 | self.context = None 32 | 33 | @classmethod 34 | def from_settings(cls, settings, crawler): 35 | return cls(settings, crawler) 36 | 37 | @classmethod 38 | def from_crawler(cls, crawler): 39 | return cls.from_settings(crawler.settings, crawler) 40 | 41 | async def download_request(self, request, spider): 42 | session_kwargs = { 43 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 44 | 'cookies': dict(request.cookies), 45 | "data": request.body, 46 | "json": request.json 47 | } 48 | headers = request.headers 49 | if isinstance(headers, Headers): 50 | headers = headers.to_unicode_dict() 51 | session_kwargs['headers'] = headers 52 | 
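        # When TLS_CIPHERS is enabled via request.meta or settings, the block below builds
        # an SSL context whose cipher list comes from anti_useragent's generate_cipher(),
        # letting the session present a non-default TLS fingerprint.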
53 | if request.meta.get('TLS_CIPHERS') or self.settings.get('TLS_CIPHERS'): 54 | self.context = ssl.create_default_context() 55 | self.context.set_ciphers(generate_cipher()) 56 | session_kwargs['ssl'] = self.context 57 | 58 | if request.meta.get("proxy"): 59 | session_kwargs["proxy"] = request.meta['proxy'] 60 | self.logger.debug(f"use {request.meta['proxy']} crawling: {request.url}") 61 | 62 | if self.session_stats: 63 | if self.session is None: 64 | self.session = aiohttp.ClientSession(**self.aiohttp_client_session) 65 | response = await self.session.request(request.method, request.url, **session_kwargs) 66 | content = await response.read() 67 | else: 68 | async with aiohttp.ClientSession( 69 | timeout=aiohttp.ClientTimeout(total=20), 70 | trust_env=True, 71 | connector=aiohttp.TCPConnector(verify_ssl=False)) as session: 72 | async with session.request(request.method, request.url, **session_kwargs) as response: 73 | content = await response.read() 74 | 75 | return self.di.get("response")( 76 | str(response.url), 77 | status=response.status, 78 | headers=response.headers, 79 | body=content, 80 | cookies=response.cookies, 81 | _response=response) 82 | 83 | async def close(self): 84 | if self.session is not None: 85 | await self.session.close() 86 | 87 | # Wait 250 ms for the underlying SSL connections to close 88 | # https://docs.aiohttp.org/en/latest/client_advanced.html#graceful-shutdown 89 | await asyncio.sleep(0.250) 90 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/curl_cffi.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import random 3 | 4 | from curl_cffi.requests import AsyncSession 5 | 6 | from anti_header import Headers 7 | 8 | 9 | class CurlCffiDownloadHandler(object): 10 | 11 | def __init__(self, settings, crawler): 12 | self.settings = settings 13 | self.crawler = crawler 14 | self.context = None 15 | self.browsers = [ 16 | "chrome99", 17 | "chrome100", 18 | "chrome101", 19 | "chrome104", 20 | "chrome107", 21 | "chrome110", 22 | # "chrome116", 23 | "chrome99_android", 24 | "edge99", 25 | "edge101", 26 | # "ff91esr", 27 | # "ff95", 28 | # "ff98", 29 | # "ff100", 30 | # "ff102", 31 | # "ff109", 32 | # "ff117", 33 | "safari15_3", 34 | "safari15_5", 35 | ] 36 | 37 | @classmethod 38 | def from_settings(cls, settings, crawler): 39 | return cls(settings, crawler) 40 | 41 | @classmethod 42 | def from_crawler(cls, crawler): 43 | return cls.from_settings(crawler.settings, crawler) 44 | 45 | async def download_request(self, request, spider): 46 | headers = request.headers 47 | if isinstance(headers, Headers): 48 | headers = headers.to_unicode_dict() 49 | session_kwargs = { 50 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 51 | 'cookies': dict(request.cookies), 52 | 'headers': headers, 53 | 'allow_redirects': True, 54 | "data": request.body, 55 | "json": request.json 56 | } 57 | 58 | if request.meta.get('TLS_CIPHERS') or self.settings.get('TLS_CIPHERS'): 59 | session_kwargs['impersonate'] = random.choice(self.browsers) 60 | 61 | if request.meta.get("proxy"): 62 | session_kwargs['proxies'] = { 63 | 'http': request.meta["proxy"], 64 | 'https': request.meta["proxy"] 65 | } 66 | self.logger.debug(f"use {request.meta['proxy']} crawling: {request.url}") 67 | 68 | async with AsyncSession() as session: 69 | response = await session.request(request.method, request.url, **session_kwargs) 70 | content = response.content 71 | 72 | return self.di.get("response")( 73 
| str(response.url), 74 | status=response.status_code, 75 | headers=response.headers, 76 | body=content, 77 | cookies=response.cookies, 78 | _response=response) 79 | 80 | async def close(self): 81 | await asyncio.sleep(0.1) 82 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/httpx.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import ssl 3 | import httpx 4 | 5 | from anti_header import Headers 6 | from anti_useragent.utils.cipers import generate_cipher 7 | 8 | 9 | class HttpxDownloadHandler(object): 10 | 11 | def __init__(self, settings, crawler): 12 | self.settings = settings 13 | self.crawler = crawler 14 | self.context = None 15 | 16 | @classmethod 17 | def from_settings(cls, settings, crawler): 18 | return cls(settings, crawler) 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls.from_settings(crawler.settings, crawler) 23 | 24 | async def download_request(self, request, spider): 25 | headers = request.headers 26 | if isinstance(headers, Headers): 27 | headers = headers.to_unicode_dict() 28 | httpx_client_session = {} 29 | 30 | # Configure TLS settings if needed 31 | if request.meta.get('TLS_CIPHERS') or self.settings.get('TLS_CIPHERS'): 32 | try: 33 | self.context = ssl.create_default_context() 34 | self.context.set_ciphers(generate_cipher()) 35 | httpx_client_session['verify'] = self.context 36 | except Exception as e: 37 | self.logger.warning(f"Error configuring TLS for {request.url}: {str(e)}") 38 | 39 | # Configure proxy if specified 40 | if request.meta.get("proxy"): 41 | httpx_client_session['proxies'] = request.meta["proxy"] 42 | self.logger.debug(f"Using proxy {request.meta['proxy']} for: {request.url}") 43 | 44 | # Prepare session arguments 45 | session_kwargs = { 46 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 47 | 'cookies': dict(request.cookies), 48 | 'headers': headers, 49 | 'follow_redirects': True, 50 | "data": request.body, 51 | "json": request.json 52 | } 53 | 54 | try: 55 | async with httpx.AsyncClient(**httpx_client_session) as session: 56 | response = await session.request(request.method, request.url, **session_kwargs) 57 | content = response.read() 58 | 59 | return self.di.get("response")( 60 | str(response.url), 61 | status=response.status_code, 62 | headers=response.headers, 63 | body=content, 64 | cookies=response.cookies, 65 | _response=response) 66 | 67 | except httpx.TimeoutException as e: 68 | self.logger.warning(f"Request to {request.url} timed out: {str(e)}") 69 | raise self.di.get("exceptions").TimeoutError(f"Request to {request.url} timed out") 70 | 71 | except httpx.RequestError as e: 72 | self.logger.warning(f"Request to {request.url} failed: {str(e)}") 73 | raise self.di.get("exceptions").ConnectionError(f"Request to {request.url} failed: {str(e)}") 74 | 75 | except Exception as e: 76 | self.logger.error(f"Unexpected error when downloading {request.url}: {str(e)}") 77 | raise self.di.get("exceptions").DownloadError(f"Unexpected error: {str(e)}") 78 | 79 | async def close(self): 80 | await asyncio.sleep(0.1) 81 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/pyhttpx.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pyhttpx 3 | 4 | from anti_header import Headers 5 | 6 | 7 | class PyHttpxDownloadHandler(object): 8 | 9 | def __init__(self, settings, crawler): 10 | 
self.settings = settings 11 | self.crawler = crawler 12 | self.context = None 13 | 14 | @classmethod 15 | def from_settings(cls, settings, crawler): 16 | return cls(settings, crawler) 17 | 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | return cls.from_settings(crawler.settings, crawler) 21 | 22 | async def download_request(self, request, spider): 23 | headers = request.headers 24 | if isinstance(headers, Headers): 25 | headers = headers.to_unicode_dict() 26 | pyhttpx_client_session = { 27 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 28 | 'cookies': dict(request.cookies), 29 | 'headers': headers, 30 | 'allow_redirects': True, 31 | "data": request.body, 32 | "json": request.json 33 | } 34 | 35 | if request.meta.get("proxy"): 36 | pyhttpx_client_session['proxies'] = {'https': request.meta["proxy"]} 37 | self.logger.debug(f"use {request.meta['proxy']} crawling: {request.url}") 38 | 39 | session_args = {'http2': True} 40 | with pyhttpx.HttpSession(**session_args) as session: 41 | response = await asyncio.to_thread(session.request, request.method, request.url, **pyhttpx_client_session) 42 | 43 | return self.di.get("response")( 44 | str(request.url), 45 | status=response.status_code, 46 | headers=response.headers, 47 | body=response.content, 48 | cookies=response.cookies, 49 | _response=response) 50 | 51 | async def close(self): 52 | await asyncio.sleep(0.1) 53 | -------------------------------------------------------------------------------- /aioscpy/core/downloader/handlers/requests.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import requests 3 | 4 | from anti_header import Headers 5 | from anti_useragent.utils.cipers import generate_cipher 6 | 7 | 8 | class RequestsDownloadHandler(object): 9 | 10 | def __init__(self, settings, crawler): 11 | self.settings = settings 12 | self.crawler = crawler 13 | 14 | @classmethod 15 | def from_settings(cls, settings, crawler): 16 | return cls(settings, crawler) 17 | 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | return cls.from_settings(crawler.settings, crawler) 21 | 22 | async def download_request(self, request, spider): 23 | headers = request.headers 24 | if isinstance(headers, Headers): 25 | headers = headers.to_unicode_dict() 26 | requests_client_session = { 27 | 'timeout': self.settings.get('DOWNLOAD_TIMEOUT'), 28 | 'cookies': dict(request.cookies), 29 | 'headers': headers, 30 | 'allow_redirects': request.meta.get("allow_redirects", True), 31 | "data": request.body, 32 | "json": request.json, 33 | } 34 | 35 | if request.meta.get('TLS_CIPHERS') or self.settings.get('TLS_CIPHERS'): 36 | requests.adapters.DEFAULT_RETRIES = 10 37 | requests.packages.urllib3.disable_warnings() 38 | cipers_real = generate_cipher() 39 | self.logger.debug(cipers_real) 40 | requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = cipers_real 41 | 42 | if request.meta.get("proxy"): 43 | requests_client_session['proxies'] = { 44 | 'http': request.meta["proxy"], 45 | 'https': request.meta["proxy"] 46 | } 47 | self.logger.debug(f"use {request.meta['proxy']} crawling: {request.url}") 48 | 49 | 50 | 51 | response = await asyncio.to_thread(requests.request, request.method, request.url, **requests_client_session) 52 | 53 | return self.di.get("response")( 54 | str(response.url), 55 | status=response.status_code, 56 | headers=response.headers, 57 | body=response.content, 58 | cookies=response.cookies, 59 | _response=response) 60 | 61 | async def close(self): 62 | await asyncio.sleep(0.1) 63 | 
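# Note: each handler in this package exposes the same interface -- from_crawler(),
# an awaitable download_request(request, spider) that returns a Response built via
# self.di.get("response"), and close(). The Downloader obtains the active handler
# through call_grace_instance('downloader_handler', settings, crawler), so switching
# between the aiohttp/httpx/curl_cffi/pyhttpx/requests backends is a configuration
# choice rather than a code change (the exact setting name is not shown in this excerpt).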
-------------------------------------------------------------------------------- /aioscpy/core/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | class Scheduler(object): 5 | 6 | def __init__(self, _queue_df, spider, stats): 7 | self.queue = _queue_df 8 | self.stats = stats 9 | self.spider = spider 10 | 11 | @classmethod 12 | def from_crawler(cls, crawler): 13 | raise NotImplementedError( 14 | '{} from_crawler method must define'.format(cls.__class__.__name__)) 15 | 16 | async def enqueue_request(self, request): 17 | if self.stats: 18 | self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider) 19 | await self.queue.push(request) 20 | return True 21 | 22 | async def async_next_request(self, count=None): 23 | # Use the provided count or get from settings 24 | if count is None: 25 | count = getattr(self.spider, 'settings', {}).get('TASK_BEAT_BATCH_SIZE', 100) 26 | 27 | _results = await self.queue.pop(count=count) 28 | if self.stats and _results: 29 | self.stats.inc_value('scheduler/dequeued/redis', count=len(_results), spider=self.spider) 30 | return _results 31 | 32 | async def open(self, start_requests): 33 | if asyncio.iscoroutine(self.queue): 34 | self.queue = await self.queue 35 | async for request in start_requests: 36 | await self.enqueue_request(request) 37 | 38 | async def close(self, slot): 39 | if slot.inprogress: 40 | for request in slot.inprogress: 41 | await self.enqueue_request(request) 42 | await self.queue.close() 43 | 44 | def __len__(self): 45 | return self.queue.qsize() 46 | 47 | async def has_pending_requests(self): 48 | return len(self) > 0 49 | -------------------------------------------------------------------------------- /aioscpy/core/scheduler/memory.py: -------------------------------------------------------------------------------- 1 | from aioscpy.core.scheduler import Scheduler 2 | from aioscpy.queue.memory import memory_queue 3 | 4 | 5 | class MemoryScheduler(Scheduler): 6 | 7 | @classmethod 8 | def from_crawler(cls, crawler): 9 | return cls(_queue_df=memory_queue(crawler.spider), stats=crawler.stats, spider=crawler.spider) 10 | -------------------------------------------------------------------------------- /aioscpy/core/scheduler/redis.py: -------------------------------------------------------------------------------- 1 | from aioscpy.core.scheduler import Scheduler 2 | from aioscpy.queue.redis import aio_priority_queue 3 | 4 | 5 | class RedisScheduler(Scheduler): 6 | 7 | @classmethod 8 | def from_crawler(cls, crawler): 9 | redis_tcp = crawler.settings.get('REDIS_URI') or \ 10 | crawler.settings.get('REDIS_TCP') 11 | queue_key = crawler.settings.get('QUEUE_KEY') % {'spider': crawler.spider.name} 12 | return cls(_queue_df=aio_priority_queue(queue_key, redis_tcp, crawler.spider), spider=crawler.spider, stats=crawler.stats) 13 | 14 | async def has_pending_requests(self): 15 | return await self.queue.qsize() > 0 16 | -------------------------------------------------------------------------------- /aioscpy/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Aioscpy core exceptions 3 | 4 | These exceptions are documented in docs/topics/exceptions.rst. Please don't add 5 | new exceptions here without documenting them there. 6 | """ 7 | 8 | # Internal 9 | 10 | 11 | class StopDownload(Exception): 12 | """ 13 | Stop the download of the body for a given response. 
14 | The 'fail' boolean parameter indicates whether or not the resulting partial response 15 | should be handled by the request errback. Note that 'fail' is a keyword-only argument. 16 | """ 17 | 18 | def __init__(self, *, fail=True): 19 | super().__init__() 20 | self.fail = fail 21 | 22 | 23 | class NotConfigured(Exception): 24 | """Indicates a missing configuration situation""" 25 | pass 26 | 27 | 28 | class _InvalidOutput(TypeError): 29 | """ 30 | Indicates an invalid value has been returned by a middleware's processing method. 31 | Internal and undocumented, it should not be raised or caught by user code. 32 | """ 33 | pass 34 | 35 | 36 | # HTTP and crawling 37 | 38 | 39 | class IgnoreRequest(Exception): 40 | """Indicates a decision was made not to process a request""" 41 | 42 | 43 | class DontCloseSpider(Exception): 44 | """Request the spider not to be closed yet""" 45 | pass 46 | 47 | 48 | class CloseSpider(Exception): 49 | """Raise this from callbacks to request the spider to be closed""" 50 | 51 | def __init__(self, reason='cancelled'): 52 | super(CloseSpider, self).__init__() 53 | self.reason = reason 54 | 55 | 56 | # Items 57 | 58 | 59 | class DropItem(Exception): 60 | """Drop item from the item pipeline""" 61 | pass 62 | 63 | 64 | class NotSupported(Exception): 65 | """Indicates a feature or method is not supported""" 66 | pass 67 | 68 | 69 | # Commands 70 | 71 | 72 | class UsageError(Exception): 73 | """To indicate a command-line usage error""" 74 | 75 | def __init__(self, *a, **kw): 76 | self.print_help = kw.pop('print_help', True) 77 | super(UsageError, self).__init__(*a, **kw) 78 | 79 | 80 | class AioscpyDeprecationWarning(Warning): 81 | """Warning category for deprecated features, since the default 82 | DeprecationWarning is silenced on Python 2.7+ 83 | """ 84 | pass 85 | 86 | 87 | class ContractFail(AssertionError): 88 | """Error raised in case of a failing contract""" 89 | pass 90 | -------------------------------------------------------------------------------- /aioscpy/http/__init__.py: -------------------------------------------------------------------------------- 1 | from aioscpy.http.request import Request 2 | from aioscpy.http.request.form import FormRequest 3 | from aioscpy.http.request.json import JsonRequest 4 | 5 | from aioscpy.http.response import Response 6 | from aioscpy.http.response.text import TextResponse 7 | 8 | 9 | __all__ = [ 10 | Request, 11 | FormRequest, 12 | JsonRequest, 13 | Response, 14 | TextResponse 15 | ] 16 | -------------------------------------------------------------------------------- /aioscpy/http/request/__init__.py: -------------------------------------------------------------------------------- 1 | from w3lib.url import safe_url_string 2 | 3 | 4 | class Request(object): 5 | 6 | def __init__(self, url, 7 | callback=None, 8 | method='GET', 9 | headers=None, 10 | body=None, 11 | json=None, 12 | cookies=None, 13 | meta=None, 14 | encoding='utf-8', 15 | priority=0, 16 | dont_filter=False, 17 | errback=None, flags=None, cb_kwargs=None): 18 | self._encoding = encoding 19 | self.method = str(method).upper() 20 | self._set_url(url) 21 | self._set_body(body) 22 | self._set_json(json) 23 | 24 | assert isinstance(priority, int), "Request priority not an integer: %r" % priority 25 | self.priority = priority 26 | 27 | if callback is not None and not callable(callback): 28 | raise TypeError('callback must be a callable, got %s' % 29 | type(callback).__name__) 30 | if errback is not None and not callable(errback): 31 | raise TypeError('errback must 
be a callable, got %s' % 32 | type(errback).__name__) 33 | self.callback = callback 34 | self.errback = errback 35 | 36 | self.cookies = cookies or {} 37 | self.headers = headers or {} 38 | self.dont_filter = dont_filter 39 | 40 | self._meta = dict(meta) if meta else None 41 | self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None 42 | self.flags = [] if flags is None else list(flags) 43 | 44 | @property 45 | def cb_kwargs(self): 46 | if self._cb_kwargs is None: 47 | self._cb_kwargs = {} 48 | return self._cb_kwargs 49 | 50 | @property 51 | def meta(self): 52 | if self._meta is None: 53 | self._meta = {} 54 | return self._meta 55 | 56 | def get(self, key, default): 57 | return self.meta.get(key, default) 58 | 59 | def _get_url(self): 60 | return self._url 61 | 62 | def _set_url(self, url): 63 | if not isinstance(url, str): 64 | raise TypeError( 65 | 'Request url must be str or unicode, got %s:' % type(url).__name__) 66 | 67 | s = safe_url_string(url, self.encoding) 68 | self._url = s 69 | 70 | if ('://' not in self._url) and (not self._url.startswith('data:')): 71 | raise ValueError('Missing scheme in request url: %s' % self._url) 72 | 73 | url = property(_get_url, _set_url) 74 | 75 | def _get_body(self): 76 | return self._body 77 | 78 | def _set_body(self, body): 79 | self._body = body or None 80 | 81 | body = property(_get_body, _set_body) 82 | 83 | def _get_json(self): 84 | return self._json 85 | 86 | def _set_json(self, json): 87 | self._json = json or None 88 | 89 | json = property(_get_json, _set_json) 90 | 91 | @property 92 | def encoding(self): 93 | return self._encoding 94 | 95 | def __str__(self): 96 | return "<%s %s>" % (self.method, self.url) 97 | 98 | __repr__ = __str__ 99 | 100 | def copy(self): 101 | """Return a copy of this Request""" 102 | return self.replace() 103 | 104 | def replace(self, *args, **kwargs): 105 | """Create a new Request with the same attributes except for those 106 | given new values. 
107 | """ 108 | for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta', 'flags', 109 | 'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']: 110 | kwargs.setdefault(x, getattr(self, x)) 111 | cls = kwargs.pop('cls', self.__class__) 112 | return cls(*args, **kwargs) 113 | -------------------------------------------------------------------------------- /aioscpy/http/request/form.py: -------------------------------------------------------------------------------- 1 | from aioscpy.http.request import Request 2 | 3 | 4 | class FormRequest(Request): 5 | valid_form_methods = ['POST'] 6 | 7 | def __init__(self, *args, **kwargs): 8 | formdata = kwargs.pop('formdata', None) 9 | if formdata and kwargs.get('method') is None: 10 | kwargs['method'] = 'POST' 11 | 12 | super(FormRequest, self).__init__(*args, **kwargs) 13 | 14 | if formdata: 15 | if self.method == 'POST': 16 | self.headers.setdefault( 17 | b'Content-Type', [b'application/x-www-form-urlencoded']) 18 | self._set_body(formdata) 19 | -------------------------------------------------------------------------------- /aioscpy/http/request/json.py: -------------------------------------------------------------------------------- 1 | from aioscpy.http.request import Request 2 | 3 | 4 | class JsonRequest(Request): 5 | valid_form_methods = ['POST'] 6 | 7 | def __init__(self, *args, **kwargs): 8 | jsondata = kwargs.pop('jsondata', None) 9 | if jsondata and kwargs.get('method') is None: 10 | kwargs['method'] = 'POST' 11 | 12 | super(JsonRequest, self).__init__(*args, **kwargs) 13 | 14 | if jsondata: 15 | if self.method == 'POST': 16 | self.headers.setdefault( 17 | b'Content-Type', [b'application/json']) 18 | self._set_json(jsondata) 19 | -------------------------------------------------------------------------------- /aioscpy/http/response/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | from urllib.parse import urljoin 3 | 4 | from aioscpy.http.request import Request 5 | from aioscpy.http.request.form import FormRequest 6 | from aioscpy import call_grace_instance 7 | from aioscpy.utils.tools import obsolete_setter 8 | 9 | 10 | class Response(object): 11 | 12 | def __init__(self, url, status=200, headers=None, body=b'', flags=None, request=None, certificate=None, _response=None): 13 | self.headers = headers or {} 14 | self.status = int(status) 15 | self._set_body(body) 16 | self._set_url(url) 17 | self.request = request 18 | self.flags = [] if flags is None else list(flags) 19 | self.certificate = certificate 20 | self._response = _response 21 | 22 | @property 23 | def cb_kwargs(self): 24 | try: 25 | return self.request.cb_kwargs 26 | except AttributeError: 27 | raise AttributeError( 28 | "Response.cb_kwargs not available, this response " 29 | "is not tied to any request" 30 | ) 31 | 32 | @property 33 | def meta(self): 34 | try: 35 | return self.request.meta 36 | except AttributeError: 37 | raise AttributeError( 38 | "Response.meta not available, this response " 39 | "is not tied to any request" 40 | ) 41 | 42 | def _get_url(self): 43 | return self._url 44 | 45 | def _set_url(self, url): 46 | if isinstance(url, str): 47 | self._url = url 48 | else: 49 | raise TypeError('%s url must be str, got %s:' % 50 | (type(self).__name__, type(url).__name__)) 51 | 52 | url = property(_get_url, obsolete_setter(_set_url, 'url')) 53 | 54 | def _get_body(self): 55 | return self._body 56 | 57 | def _set_body(self, body): 58 | if body is None: 59 | self._body 
= b'' 60 | elif not isinstance(body, bytes): 61 | raise TypeError( 62 | "Response body must be bytes. " 63 | "If you want to pass unicode body use TextResponse " 64 | "or HtmlResponse.") 65 | else: 66 | self._body = body 67 | 68 | body = property(_get_body, obsolete_setter(_set_body, 'body')) 69 | 70 | def __str__(self): 71 | return "<%d %s>" % (self.status, self.url) 72 | 73 | __repr__ = __str__ 74 | 75 | def copy(self): 76 | """Return a copy of this Response""" 77 | return self.replace() 78 | 79 | def replace(self, *args, **kwargs): 80 | """Create a new Response with the same attributes except for those 81 | given new values. 82 | """ 83 | for x in ['url', 'status', 'headers', 'body', 'request', 'flags', 'certificate']: 84 | kwargs.setdefault(x, getattr(self, x)) 85 | cls = kwargs.pop('cls', self.__class__) 86 | return cls(*args, **kwargs) 87 | 88 | def urljoin(self, url: str) -> str: 89 | """Join this Response's url with a possible relative url to form an 90 | absolute interpretation of the latter.""" 91 | return urljoin(self.url, url) 92 | 93 | @property 94 | def text(self): 95 | """For subclasses of TextResponse, this will return the body 96 | as str 97 | """ 98 | raise AttributeError("Response content isn't text") 99 | 100 | def css(self, *a, **kw): 101 | """Shortcut method implemented only by responses whose content 102 | is text (subclasses of TextResponse). 103 | """ 104 | # raise NotSupported("Response content isn't text") 105 | raise NotImplementedError 106 | 107 | def xpath(self, *a, **kw): 108 | """Shortcut method implemented only by responses whose content 109 | is text (subclasses of TextResponse). 110 | """ 111 | # raise NotSupported("Response content isn't text") 112 | raise NotImplementedError 113 | 114 | def follow(self, url, callback=None, method='GET', formdata=None, headers=None, body=None, 115 | cookies=None, meta=None, encoding='utf-8', priority=0, 116 | dont_filter=False, errback=None, cb_kwargs=None, flags=None, **kwargs) -> Request: 117 | 118 | url = self.urljoin(url) 119 | method_request = Request 120 | if method == "POST": 121 | method_request = FormRequest 122 | kwargs['formdata'] = formdata 123 | 124 | return call_grace_instance( 125 | method_request, 126 | url=url, 127 | callback=callback, 128 | method=method, 129 | headers=headers, 130 | body=body, 131 | cookies=cookies, 132 | meta=meta, 133 | encoding=encoding, 134 | priority=priority, 135 | dont_filter=dont_filter, 136 | errback=errback, 137 | cb_kwargs=cb_kwargs, 138 | flags=flags, 139 | **kwargs 140 | ) 141 | 142 | def follow_all(self, urls, callback=None, method='GET', headers=None, body=None, 143 | cookies=None, meta=None, encoding='utf-8', priority=0, 144 | dont_filter=False, errback=None, cb_kwargs=None, flags=None) -> Generator: 145 | if not hasattr(urls, '__iter__'): 146 | raise TypeError("'urls' argument must be an iterable") 147 | return ( 148 | self.follow( 149 | url=url, 150 | callback=callback, 151 | method=method, 152 | headers=headers, 153 | body=body, 154 | cookies=cookies, 155 | meta=meta, 156 | encoding=encoding, 157 | priority=priority, 158 | dont_filter=dont_filter, 159 | errback=errback, 160 | cb_kwargs=cb_kwargs, 161 | flags=flags, 162 | ) 163 | for url in urls 164 | ) 165 | -------------------------------------------------------------------------------- /aioscpy/inject.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from importlib import import_module 4 | from pkgutil import iter_modules 5 | 6 | from aioscpy.settings 
import Settings 7 | from aioscpy.utils.tools import singleton, get_project_settings 8 | 9 | 10 | @singleton 11 | class CSlot: 12 | 13 | def __init__(self): 14 | self._object_slot_cls = {} 15 | 16 | def get(self, sets: str, default=None) -> object: 17 | return self._object_slot_cls.get(sets, default) 18 | 19 | def set(self, sets: str, obj: object): 20 | self._object_slot_cls.__setitem__(sets, obj) 21 | 22 | def empty(self): 23 | return not bool(len(self._object_slot_cls)) 24 | 25 | 26 | class Slot: 27 | 28 | def __init__(self, settings, crawler): 29 | self._objects_slot = { 30 | 'settings': settings, 31 | 'crawler': crawler 32 | } 33 | self._modules_slot = [] 34 | self._close = None 35 | self.live_beat = None 36 | 37 | @property 38 | def is_live(self): 39 | return bool(self._close) 40 | 41 | def get(self, sets: str, default=None) -> object: 42 | return self._objects_slot.get(sets, default) 43 | 44 | def set(self, sets: str, obj: object): 45 | self._objects_slot.__setitem__(sets, obj) 46 | 47 | def clear(self): 48 | del self._objects_slot 49 | self._modules_slot = [] 50 | self._close = True 51 | 52 | def close(self): 53 | if self.live_beat: 54 | self.live_beat.cancel() 55 | 56 | 57 | class DependencyInjection(object): 58 | def __init__(self, settings: Settings = None, crawler=None): 59 | if not settings: 60 | settings = Settings() 61 | self.settings = settings 62 | self.crawler = crawler 63 | self.slot = Slot(settings, crawler) 64 | 65 | @classmethod 66 | def from_settings(cls, settings, crawler): 67 | return cls(settings, crawler) 68 | 69 | @classmethod 70 | def from_crawler(cls, crawler): 71 | return cls.from_settings(crawler.settings, crawler) 72 | 73 | def load(self, key: str): 74 | return self.slot.get(key) 75 | 76 | @staticmethod 77 | def load_all_spider(dirname): 78 | _class_objects = {} 79 | 80 | def load_all_spider_inner(dirname): 81 | for importer, package_name, ispkg in iter_modules([dirname]): 82 | if ispkg: 83 | load_all_spider_inner(dirname + '/' + package_name) 84 | else: 85 | module = importer.find_module(package_name) 86 | module = module.load_module(package_name) 87 | for cls_name in module.__dir__(): 88 | if cls_name == "__spiders__": 89 | class_object = getattr(module, cls_name) 90 | for co in class_object: 91 | _class_objects[co.name] = co 92 | if not cls_name.startswith('__'): 93 | class_object = getattr(module, cls_name) 94 | if hasattr(class_object, "name") and getattr(class_object, "name"): 95 | _class_objects[class_object.name] = class_object 96 | 97 | 98 | load_all_spider_inner(dirname) 99 | return _class_objects 100 | 101 | @staticmethod 102 | def load_object(path: str): 103 | try: 104 | dot = path.rindex('.') 105 | except ValueError: 106 | raise ValueError("Error loading object '%s': not a full path" % path) 107 | 108 | module, name = path[:dot], path[dot + 1:] 109 | mod = import_module(module) 110 | 111 | try: 112 | obj = getattr(mod, name) 113 | except AttributeError: 114 | raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name)) 115 | else: 116 | return obj 117 | 118 | def load_object_slot(self, key: str, path: str, cls=None): 119 | obj = self.load_object(path) 120 | if cls is None: 121 | obj = self.create_instance(obj, self.settings, self.crawler) 122 | self.slot.set(key, obj) 123 | else: 124 | self.c_slot.set(key, obj) 125 | return obj 126 | 127 | def walk_modules(self, path: str): 128 | mods = [] 129 | if hasattr(self, "slot"): 130 | mods = self.slot._modules_slot 131 | mod = import_module(path) 132 | mods.append(mod) 133 | if 
hasattr(mod, '__path__'): 134 | for _, subpath, ispkg in iter_modules(mod.__path__): 135 | fullpath = path + '.' + subpath 136 | if ispkg: 137 | mods += self.walk_modules(fullpath) 138 | else: 139 | submod = import_module(fullpath) 140 | mods.append(submod) 141 | return mods 142 | 143 | def create_instance(self, objcls, settings, crawler, *args, **kwargs): 144 | if settings is None: 145 | if crawler is None: 146 | raise ValueError("Specify at least one of settings and crawler.") 147 | settings = crawler.settings 148 | if not (type(objcls) == "function"): 149 | objcls = call_grace_instance(objcls, only_instance=True) 150 | if crawler and hasattr(objcls, 'from_crawler'): 151 | return objcls.from_crawler(crawler, *args, **kwargs) 152 | elif hasattr(objcls, 'from_settings'): 153 | return objcls.from_settings(settings, *args, **kwargs) 154 | else: 155 | return objcls(*args, **kwargs) 156 | 157 | async def inject_runner(self): 158 | if any([not self.settings.get('DI_CONFIG'), not self.settings.get('DI_CONFIG_CLS')]): 159 | raise KeyError('Settings DI_CONFIG/DI_CONFIG_CLS not be None') 160 | for key, value in self.settings['DI_CONFIG'].items(): 161 | self.load_object_slot(key, value) 162 | # self.slot.live_beat = asyncio.create_task(self.live_beat()) 163 | 164 | async def live_beat(self): 165 | while 1: 166 | if not self.slot.is_live: 167 | await asyncio.sleep(20) 168 | break 169 | asyncio.create_task(self.inject_runner()) 170 | 171 | 172 | class DependencyInjectionCls(DependencyInjection): 173 | 174 | def __init__(self): 175 | self.c_slot = CSlot() 176 | self.settings = get_project_settings() 177 | 178 | def inject(self): 179 | if self.c_slot.empty(): 180 | for key, value in self.settings['DI_CONFIG_CLS'].items(): 181 | self.load_object_slot(key, value, cls=True) 182 | return self.c_slot 183 | 184 | 185 | _create_dependency = DependencyInjectionCls() 186 | load_object = _create_dependency.load_object 187 | walk_modules = _create_dependency.walk_modules 188 | settings_ins = _create_dependency.settings 189 | 190 | 191 | class object_ref(type): 192 | def __init__(msc, *args, **kwargs): 193 | msc.di = _create_dependency.inject() 194 | msc.logger = msc.di.get("log").logger 195 | super().__init__(*args, **kwargs) 196 | 197 | 198 | def call_grace_instance(obj, *args, only_instance=None, **kwargs): 199 | 200 | if isinstance(obj, str): 201 | obj = load_object(settings_ins['DI_CREATE_CLS'].get(obj)) 202 | 203 | class Inner(obj, metaclass=object_ref): 204 | pass 205 | if only_instance is None: 206 | return Inner(*args, **kwargs) 207 | else: 208 | return Inner 209 | -------------------------------------------------------------------------------- /aioscpy/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/libs/__init__.py -------------------------------------------------------------------------------- /aioscpy/libs/downloadermiddlewares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/libs/downloadermiddlewares/__init__.py -------------------------------------------------------------------------------- /aioscpy/libs/downloadermiddlewares/stats.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlunparse 2 | 3 | from aioscpy.exceptions import NotConfigured 
4 | from aioscpy.utils.tools import to_bytes 5 | from aioscpy.utils.othtypes import urlparse_cached 6 | 7 | 8 | def global_object_name(obj): 9 | return f"{obj.__module__}.{obj.__name__}" 10 | 11 | 12 | def request_httprepr(request: "Request") -> bytes: 13 | """Return the raw HTTP representation (as bytes) of the given request. 14 | This is provided only for reference since it's not the actual stream of 15 | bytes that will be sent when performing the request (that's controlled 16 | by the download handler). 17 | """ 18 | parsed = urlparse_cached(request) 19 | path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, '')) 20 | s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n" 21 | s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n" 22 | if request.headers: 23 | s += request.headers.to_string() + b"\r\n" 24 | s += b"\r\n" 25 | s += str(request.body).encode() if request.body and isinstance(request.body, dict) else b"" 26 | return s 27 | 28 | 29 | def get_header_size(headers): 30 | size = 0 31 | for key, value in headers.items(): 32 | if isinstance(value, (list, tuple)): 33 | for v in value: 34 | size += len(b": ") + len(key) + len(v) 35 | return size + len(b'\r\n') * (len(headers.keys()) - 1) 36 | 37 | 38 | class DownloaderStats: 39 | 40 | def __init__(self, stats): 41 | self.stats = stats 42 | 43 | @classmethod 44 | def from_crawler(cls, crawler): 45 | if not crawler.settings.getbool('DOWNLOADER_STATS'): 46 | raise NotConfigured 47 | return cls(crawler.stats) 48 | 49 | def process_request(self, request, spider): 50 | self.stats.inc_value('downloader/request_count', spider=spider) 51 | self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider) 52 | reqlen = len(request_httprepr(request)) 53 | self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider) 54 | 55 | def process_response(self, request, response, spider): 56 | self.stats.inc_value('downloader/response_count', spider=spider) 57 | self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider) 58 | reslen = len(response.body) + get_header_size(response.headers) + 4 59 | # response.body + b"\r\n"+ response.header + b"\r\n" + response.status 60 | self.stats.inc_value('downloader/response_bytes', reslen, spider=spider) 61 | return response 62 | 63 | def process_exception(self, request, exception, spider): 64 | ex_class = global_object_name(exception.__class__) 65 | self.stats.inc_value('downloader/exception_count', spider=spider) 66 | self.stats.inc_value(f'downloader/exception_type_count/{ex_class}', spider=spider) 67 | -------------------------------------------------------------------------------- /aioscpy/libs/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/libs/extensions/__init__.py -------------------------------------------------------------------------------- /aioscpy/libs/extensions/corestats.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from aioscpy import signals 4 | 5 | 6 | class CoreStats: 7 | 8 | def __init__(self, stats): 9 | self.stats = stats 10 | self.start_time = None 11 | 12 | @classmethod 13 | def from_crawler(cls, crawler): 14 | o = cls(crawler.stats) 15 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) 16 | crawler.signals.connect(o.spider_closed, 
signal=signals.spider_closed) 17 | crawler.signals.connect(o.item_scraped, signal=signals.item_scraped) 18 | crawler.signals.connect(o.item_dropped, signal=signals.item_dropped) 19 | crawler.signals.connect(o.response_received, signal=signals.response_received) 20 | return o 21 | 22 | def spider_opened(self, spider): 23 | self.start_time = datetime.utcnow() 24 | self.stats.set_value('start_time', self.start_time, spider=spider) 25 | 26 | def spider_closed(self, spider, reason): 27 | finish_time = datetime.utcnow() 28 | elapsed_time = finish_time - self.start_time 29 | elapsed_time_seconds = elapsed_time.total_seconds() 30 | self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider) 31 | self.stats.set_value('finish_time', finish_time, spider=spider) 32 | self.stats.set_value('finish_reason', reason, spider=spider) 33 | 34 | def item_scraped(self, item, spider): 35 | self.stats.inc_value('item_scraped_count', spider=spider) 36 | 37 | def response_received(self, spider): 38 | self.stats.inc_value('response_received_count', spider=spider) 39 | 40 | def item_dropped(self, item, spider, exception): 41 | reason = exception.__class__.__name__ 42 | self.stats.inc_value('item_dropped_count', spider=spider) 43 | self.stats.inc_value(f'item_dropped_reasons_count/{reason}', spider=spider) 44 | -------------------------------------------------------------------------------- /aioscpy/libs/extensions/logstats.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.exceptions import NotConfigured 4 | from aioscpy import signals 5 | 6 | 7 | class LogStats: 8 | """Log basic scraping stats periodically""" 9 | 10 | def __init__(self, stats, interval=60.0): 11 | self.stats = stats 12 | self.interval = interval 13 | self.multiplier = 60.0 / self.interval 14 | self.task = None 15 | self._close_stats = 0 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | interval = crawler.settings.getfloat('LOGSTATS_INTERVAL') 20 | if not interval: 21 | raise NotConfigured 22 | o = cls(crawler.stats, interval) 23 | crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) 24 | crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) 25 | return o 26 | 27 | def spider_opened(self, spider): 28 | self.pagesprev = 0 29 | self.itemsprev = 0 30 | self.task = asyncio.create_task(self.log(spider)) 31 | 32 | async def log(self, spider): 33 | await asyncio.sleep(self.interval) 34 | items = self.stats.get_value('item_scraped_count', 0) 35 | pages = self.stats.get_value('response_received_count', 0) 36 | irate = (items - self.itemsprev) * self.multiplier 37 | prate = (pages - self.pagesprev) * self.multiplier 38 | self.pagesprev, self.itemsprev = pages, items 39 | 40 | msg = ("<{spider_name}> Crawled {pages} pages (at {pagerate} pages/min), " 41 | "scraped {items} items (at {itemrate} items/min)") 42 | log_args = {'pages': pages, 'pagerate': prate, 'spider_name': spider.name, 43 | 'items': items, 'itemrate': irate} 44 | self.logger.info(msg, **log_args, extra={'spider': spider}) 45 | self.task = asyncio.create_task(self.log(spider)) 46 | 47 | def spider_closed(self, spider, reason): 48 | if self.task and not self.task.done(): 49 | self.logger.warning(f'[{spider.name}] logstats received close signal! 
reason: {reason}') 50 | self.task.cancel() 51 | -------------------------------------------------------------------------------- /aioscpy/libs/statscollectors.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | 4 | class StatsCollector: 5 | 6 | @classmethod 7 | def from_crawler(cls, crawler): 8 | return cls(crawler) 9 | 10 | def __init__(self, crawler): 11 | self._dump = crawler.settings.getbool('STATS_DUMP') 12 | self._stats = {} 13 | 14 | def get_value(self, key, default=None, spider=None): 15 | return self._stats.get(key, default) 16 | 17 | def get_stats(self, spider=None): 18 | return self._stats 19 | 20 | def set_value(self, key, value, spider=None): 21 | self._stats[key] = value 22 | 23 | def set_stats(self, stats, spider=None): 24 | self._stats = stats 25 | 26 | def inc_value(self, key, count=1, start=0, spider=None): 27 | d = self._stats 28 | d[key] = d.setdefault(key, start) + count 29 | 30 | def max_value(self, key, value, spider=None): 31 | self._stats[key] = max(self._stats.setdefault(key, value), value) 32 | 33 | def min_value(self, key, value, spider=None): 34 | self._stats[key] = min(self._stats.setdefault(key, value), value) 35 | 36 | def clear_stats(self, spider=None): 37 | self._stats.clear() 38 | 39 | def open_spider(self, spider): 40 | pass 41 | 42 | def close_spider(self, spider, reason): 43 | if self._dump: 44 | self.logger.info("Dumping Aioscpy stats:\n {stats}", **{'stats': pprint.pformat(self._stats)}, 45 | extra={'spider': spider}) 46 | self._persist_stats(self._stats, spider) 47 | 48 | def _persist_stats(self, stats, spider): 49 | pass 50 | 51 | 52 | class MemoryStatsCollector(StatsCollector): 53 | 54 | def __init__(self, crawler): 55 | super().__init__(crawler) 56 | self.spider_stats = {} 57 | 58 | def _persist_stats(self, stats, spider): 59 | self.spider_stats[spider.name] = stats 60 | 61 | 62 | class DummyStatsCollector(StatsCollector): 63 | 64 | def get_value(self, key, default=None, spider=None): 65 | return default 66 | 67 | def set_value(self, key, value, spider=None): 68 | pass 69 | 70 | def set_stats(self, stats, spider=None): 71 | pass 72 | 73 | def inc_value(self, key, count=1, start=0, spider=None): 74 | pass 75 | 76 | def max_value(self, key, value, spider=None): 77 | pass 78 | 79 | def min_value(self, key, value, spider=None): 80 | pass 81 | -------------------------------------------------------------------------------- /aioscpy/logformatter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from aioscpy.utils.tools import referer_str 4 | 5 | 6 | SCRAPEDMSG = "Scraped from {src}" + os.linesep + "{item}" 7 | DROPPEDMSG = "Dropped: {exception}" + os.linesep + "{item}" 8 | CRAWLEDMSG = "Crawled ({status}) {request}{request_flags} (referer: {referer}){response_flags}" 9 | ITEMERRORMSG = "Error processing {item}" 10 | SPIDERERRORMSG = "Spider error processing {request} (referer: {referer})" 11 | DOWNLOADERRORMSG_SHORT = "Error downloading {request}" 12 | DOWNLOADERRORMSG_LONG = "Error downloading {request}: {errmsg}" 13 | 14 | 15 | class LogFormatter: 16 | 17 | def crawled(self, request, response, spider): 18 | request_flags = f' {str(request.flags)}' if request.flags else '' 19 | response_flags = f' {str(response.flags)}' if response.flags else '' 20 | return { 21 | 'level': "DEBUG", 22 | 'msg': CRAWLEDMSG, 23 | 'args': { 24 | 'status': response.status, 25 | 'request': request, 26 | 'request_flags': request_flags, 27 | 'referer': 
referer_str(request), 28 | 'response_flags': response_flags, 29 | # backward compatibility with Aioscpy logformatter below 1.4 version 30 | 'flags': response_flags 31 | } 32 | } 33 | 34 | def scraped(self, item, response, spider): 35 | """Logs a message when an item is scraped by a spider.""" 36 | src = response 37 | return { 38 | 'level': "DEBUG", 39 | 'msg': SCRAPEDMSG, 40 | 'args': { 41 | 'src': src, 42 | 'item': item, 43 | } 44 | } 45 | 46 | def dropped(self, item, exception, response, spider): 47 | """Logs a message when an item is dropped while it is passing through the item pipeline.""" 48 | return { 49 | 'level': "WARNING", 50 | 'msg': DROPPEDMSG, 51 | 'args': { 52 | 'exception': exception, 53 | 'item': item, 54 | } 55 | } 56 | 57 | def item_error(self, item, exception, response, spider): 58 | """Logs a message when an item causes an error while it is passing 59 | through the item pipeline. 60 | 61 | .. versionadded:: 2.0 62 | """ 63 | return { 64 | 'level': "ERROR", 65 | 'msg': ITEMERRORMSG, 66 | 'args': { 67 | 'item': item, 68 | } 69 | } 70 | 71 | def spider_error(self, failure, request, response, spider): 72 | """Logs an error message from a spider. 73 | 74 | .. versionadded:: 2.0 75 | """ 76 | return { 77 | 'level': "ERROR", 78 | 'msg': SPIDERERRORMSG, 79 | 'args': { 80 | 'request': request, 81 | 'referer': referer_str(request), 82 | } 83 | } 84 | 85 | def download_error(self, failure, request, spider, errmsg=None): 86 | """Logs a download error message from a spider (typically coming from 87 | the engine). 88 | 89 | .. versionadded:: 2.0 90 | """ 91 | args = {'request': request} 92 | if errmsg: 93 | msg = DOWNLOADERRORMSG_LONG 94 | args['errmsg'] = errmsg 95 | else: 96 | msg = DOWNLOADERRORMSG_SHORT 97 | return { 98 | 'level': "ERROR", 99 | 'msg': msg, 100 | 'args': args, 101 | } 102 | 103 | @classmethod 104 | def from_crawler(cls, crawler): 105 | return cls() 106 | -------------------------------------------------------------------------------- /aioscpy/middleware/__init__.py: -------------------------------------------------------------------------------- 1 | from aioscpy.middleware.downloader import DownloaderMiddlewareManager 2 | from aioscpy.middleware.itempipeline import ItemPipelineManager 3 | from aioscpy.middleware.extension import ExtensionManager 4 | 5 | 6 | __all__ = [ 7 | "DownloaderMiddlewareManager", 8 | "ItemPipelineManager", 9 | "ExtensionManager", 10 | ] 11 | -------------------------------------------------------------------------------- /aioscpy/middleware/adaptive_concurrency.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import deque 3 | 4 | from aioscpy.middleware.manager import MiddlewareManager 5 | 6 | 7 | class AdaptiveConcurrencyMiddleware: 8 | """ 9 | Middleware that adjusts concurrency based on response times. 10 | 11 | This middleware monitors response times and adjusts the concurrency 12 | settings dynamically to maintain optimal performance. 
13 | """ 14 | 15 | def __init__(self, crawler): 16 | self.crawler = crawler 17 | self.settings = crawler.settings 18 | self.enabled = self.settings.getbool('ADAPTIVE_CONCURRENCY_ENABLED', False) 19 | 20 | if not self.enabled: 21 | return 22 | 23 | # Configuration 24 | self.target_response_time = self.settings.getfloat('ADAPTIVE_CONCURRENCY_TARGET_RESPONSE_TIME', 1.0) 25 | self.min_concurrency = self.settings.getint('ADAPTIVE_CONCURRENCY_MIN_REQUESTS', 8) 26 | self.max_concurrency = self.settings.getint('ADAPTIVE_CONCURRENCY_MAX_REQUESTS', 32) 27 | self.window_size = self.settings.getint('ADAPTIVE_CONCURRENCY_WINDOW_SIZE', 20) 28 | self.adjustment_interval = self.settings.getint('ADAPTIVE_CONCURRENCY_ADJUSTMENT_INTERVAL', 10) 29 | 30 | # State 31 | self.response_times = deque(maxlen=self.window_size) 32 | self.last_adjustment_time = time.time() 33 | self.current_concurrency = self.settings.getint('CONCURRENT_REQUESTS', 16) 34 | 35 | # Set initial concurrency 36 | self.crawler.settings.set('CONCURRENT_REQUESTS', self.current_concurrency) 37 | self.logger.info(f"Adaptive concurrency enabled. Initial concurrency: {self.current_concurrency}") 38 | 39 | @classmethod 40 | def from_crawler(cls, crawler): 41 | return cls(crawler) 42 | 43 | async def process_request(self, request, spider): 44 | if not self.enabled: 45 | return None 46 | 47 | # Store request start time 48 | request.meta['request_start_time'] = time.time() 49 | return None 50 | 51 | async def process_response(self, request, response, spider): 52 | if not self.enabled or 'request_start_time' not in request.meta: 53 | return response 54 | 55 | # Calculate response time 56 | response_time = time.time() - request.meta['request_start_time'] 57 | self.response_times.append(response_time) 58 | 59 | # Adjust concurrency if needed 60 | current_time = time.time() 61 | if (current_time - self.last_adjustment_time) >= self.adjustment_interval and len(self.response_times) >= self.window_size: 62 | self._adjust_concurrency() 63 | self.last_adjustment_time = current_time 64 | 65 | return response 66 | 67 | def _adjust_concurrency(self): 68 | """Adjust concurrency based on average response time""" 69 | avg_response_time = sum(self.response_times) / len(self.response_times) 70 | 71 | # Calculate adjustment factor 72 | adjustment_factor = self.target_response_time / avg_response_time 73 | 74 | # Apply adjustment with limits 75 | new_concurrency = int(self.current_concurrency * adjustment_factor) 76 | new_concurrency = max(self.min_concurrency, min(self.max_concurrency, new_concurrency)) 77 | 78 | # Only update if there's a significant change 79 | if new_concurrency != self.current_concurrency: 80 | self.current_concurrency = new_concurrency 81 | self.crawler.settings.set('CONCURRENT_REQUESTS', new_concurrency) 82 | self.logger.info( 83 | f"Adjusted concurrency to {new_concurrency} (avg response time: {avg_response_time:.2f}s, " 84 | f"target: {self.target_response_time:.2f}s)" 85 | ) 86 | -------------------------------------------------------------------------------- /aioscpy/middleware/downloader.py: -------------------------------------------------------------------------------- 1 | from asyncio import iscoroutinefunction 2 | 3 | from aioscpy.exceptions import _InvalidOutput 4 | from aioscpy.utils.common import build_component_list 5 | from aioscpy.middleware.manager import MiddlewareManager 6 | 7 | 8 | class DownloaderMiddlewareManager(MiddlewareManager): 9 | component_name = 'downloader middleware' 10 | 11 | @classmethod 12 | def 
_get_mwlist_from_settings(cls, settings): 13 | return build_component_list( 14 | settings.getwithbase('DOWNLOADER_MIDDLEWARES')) 15 | 16 | def _add_middleware(self, mw): 17 | if hasattr(mw, 'process_request'): 18 | self.methods['process_request'].append(mw.process_request) 19 | if hasattr(mw, 'process_response'): 20 | self.methods['process_response'].appendleft(mw.process_response) 21 | if hasattr(mw, 'process_exception'): 22 | self.methods['process_exception'].appendleft(mw.process_exception) 23 | 24 | async def process_request(self, spider, request): 25 | for method in self.methods['process_request']: 26 | if iscoroutinefunction(method): 27 | response = await method(request=request, spider=spider) 28 | else: 29 | response = method(request=request, spider=spider) 30 | if response is not None and not isinstance(response, (self.di.get("response"), self.di.get('request'))): 31 | raise _InvalidOutput( 32 | "Middleware %s.process_request must return None, Response or Request, got %s" 33 | % (method.__self__.__class__.__name__, response.__class__.__name__) 34 | ) 35 | if response: 36 | return response 37 | 38 | async def process_response(self, spider, request, response): 39 | if response is None: 40 | raise TypeError("Received None in process_response") 41 | elif isinstance(response, self.di.get('request')): 42 | return response 43 | 44 | for method in self.methods['process_response']: 45 | if iscoroutinefunction(method): 46 | response = await method(request=request, response=response, spider=spider) 47 | else: 48 | response = method(request=request, response=response, spider=spider) 49 | if not isinstance(response, (self.di.get("response"), self.di.get('request'))): 50 | raise _InvalidOutput( 51 | "Middleware %s.process_response must return Response or Request, got %s" 52 | % (method.__self__.__class__.__name__, type(response)) 53 | ) 54 | if isinstance(response, self.di.get('request')): 55 | return response 56 | return response 57 | 58 | async def process_exception(self, spider, request, exception): 59 | for method in self.methods['process_exception']: 60 | if iscoroutinefunction(method): 61 | response = await method(request=request, exception=exception, spider=spider) 62 | else: 63 | response = method(request=request, exception=exception, spider=spider) 64 | if response is not None and not isinstance(response, (self.di.get('response'), self.di.get('request'))): 65 | raise _InvalidOutput( 66 | "Middleware %s.process_exception must return None, Response or Request, got %s" 67 | % (method.__self__.__class__.__name__, type(response)) 68 | ) 69 | if response: 70 | return response 71 | return exception 72 | 73 | -------------------------------------------------------------------------------- /aioscpy/middleware/extension.py: -------------------------------------------------------------------------------- 1 | from aioscpy.middleware.manager import MiddlewareManager 2 | from aioscpy.utils.common import build_component_list 3 | 4 | 5 | class ExtensionManager(MiddlewareManager): 6 | 7 | component_name = 'extension' 8 | 9 | @classmethod 10 | def _get_mwlist_from_settings(cls, settings): 11 | return build_component_list(settings.getwithbase('EXTENSIONS')) 12 | -------------------------------------------------------------------------------- /aioscpy/middleware/itempipeline.py: -------------------------------------------------------------------------------- 1 | from aioscpy.middleware.manager import MiddlewareManager 2 | from aioscpy.utils.common import build_component_list 3 | 4 | 5 | class 
ItemPipelineManager(MiddlewareManager): 6 | component_name = 'item pipeline' 7 | 8 | @classmethod 9 | def _get_mwlist_from_settings(cls, settings): 10 | return build_component_list(settings.getwithbase('ITEM_PIPELINES')) 11 | 12 | def _add_middleware(self, pipe): 13 | super()._add_middleware(pipe) 14 | if hasattr(pipe, 'process_item'): 15 | self.methods['process_item'].append(pipe.process_item) 16 | 17 | async def process_item(self, item, spider): 18 | return await self._process_chain('process_item', item, spider) 19 | -------------------------------------------------------------------------------- /aioscpy/middleware/manager.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | from asyncio import iscoroutinefunction 4 | from collections import defaultdict, deque 5 | 6 | from aioscpy.exceptions import NotConfigured 7 | 8 | 9 | class MiddlewareManager: 10 | """Base class for implementing middleware managers""" 11 | 12 | component_name = 'foo middleware' 13 | 14 | def __init__(self, crawler=None, middlewares=None): 15 | self.crawler = crawler 16 | self.middlewares = middlewares 17 | self.methods = defaultdict(deque) 18 | for mw in middlewares: 19 | self._add_middleware(mw) 20 | 21 | @classmethod 22 | def _get_mwlist_from_settings(cls, settings): 23 | raise NotImplementedError 24 | 25 | @classmethod 26 | def from_settings(cls, settings, crawler=None): 27 | mwlist = cls._get_mwlist_from_settings(settings) 28 | middlewares = [] 29 | enabled = [] 30 | for clspath in mwlist: 31 | try: 32 | mw = crawler.DI.load_object_slot(clspath.split('.')[-2], clspath) 33 | middlewares.append(mw) 34 | enabled.append(clspath) 35 | except NotConfigured as e: 36 | if e.args: 37 | clsname = clspath.split('.')[-1] 38 | cls.logger.warning("Disabled {clsname}: {eargs}", 39 | **{'clsname': clsname, 'eargs': e.args[0]}, 40 | extra={'crawler': crawler}) 41 | if enabled: 42 | cls.logger.info("Enabled {name} {componentname}s:\n{enabledlist}", 43 | **{'componentname': cls.component_name, 44 | 'enabledlist': pprint.pformat(enabled), 45 | 'name': crawler.spider.name}, 46 | extra={'crawler': crawler}) 47 | return cls(crawler=crawler, middlewares=middlewares) 48 | 49 | @classmethod 50 | def from_crawler(cls, crawler): 51 | return cls.from_settings(crawler.settings, crawler) 52 | 53 | def _add_middleware(self, mw): 54 | if hasattr(mw, 'open_spider'): 55 | self.methods['open_spider'].append(mw.open_spider) 56 | if hasattr(mw, 'close_spider'): 57 | self.methods['close_spider'].appendleft(mw.close_spider) 58 | 59 | async def _process_parallel(self, methodname, obj, *args): 60 | return await self.process_parallel(self.methods[methodname], obj, *args) 61 | 62 | async def _process_chain(self, methodname, obj, *args): 63 | return await self.process_chain(self.methods[methodname], obj, *args) 64 | 65 | async def _process_chain_both(self, cb_methodname, eb_methodname, obj, *args): 66 | return await self.process_chain_both(self.methods[cb_methodname], 67 | self.methods[eb_methodname], obj, *args) 68 | 69 | async def open_spider(self, spider): 70 | return await self._process_parallel('open_spider', spider) 71 | 72 | async def close_spider(self, spider): 73 | return await self._process_parallel('close_spider', spider) 74 | 75 | @staticmethod 76 | async def process_parallel(callbacks, input_, *a, **kw): 77 | for callback in callbacks: 78 | if iscoroutinefunction(callback): 79 | await callback(input_, *a, **kw) 80 | else: 81 | callback(input_, *a, **kw) 82 | 83 | @staticmethod 84 | 
async def process_chain(callbacks, input_, *a, **kw): 85 | for callback in callbacks: 86 | if iscoroutinefunction(callback): 87 | input_result = await callback(input_, *a, **kw) 88 | else: 89 | input_result = callback(input_, *a, **kw) 90 | if input_result is not None: 91 | input_ = input_result 92 | return input_ 93 | 94 | @staticmethod 95 | async def process_chain_both(callbacks, errbacks, input_, *a, **kw): 96 | for cb, eb in zip(callbacks, errbacks): 97 | try: 98 | if iscoroutinefunction(cb): 99 | input_ = await cb(input_, *a, **kw) 100 | else: 101 | input_ = cb(input_, *a, **kw) 102 | except(Exception, BaseException) as e: 103 | if iscoroutinefunction(cb): 104 | input_ = await eb(input_, *a, **kw) 105 | else: 106 | input_ = eb(input_, *a, **kw) 107 | return input_ 108 | -------------------------------------------------------------------------------- /aioscpy/queue/__init__.py: -------------------------------------------------------------------------------- 1 | from aioscpy.queue.compat import COMPAT_TYPE 2 | 3 | from aioscpy.queue.convert import request_from_dict, request_to_dict 4 | 5 | 6 | class BaseQueue(object): 7 | 8 | __slots__ = ["server", "key", "serializer", "spider"] 9 | __compat__ = COMPAT_TYPE 10 | 11 | def __init__(self, server, spider=None, key=None, serializer=None): 12 | if serializer is None: 13 | serializer = self.__compat__[serializer or "json"] 14 | 15 | if not hasattr(serializer, 'loads'): 16 | raise TypeError("serializer does not implement 'loads' function: %r" 17 | % serializer) 18 | if not hasattr(serializer, 'dumps'): 19 | raise TypeError("serializer does not implement 'dumps' function: %r" 20 | % serializer) 21 | 22 | self.server = server 23 | self.key = key or 'sp:requests' 24 | self.serializer = serializer 25 | self.spider = spider 26 | 27 | def _encode_request(self, request) -> bytes: 28 | obj = request_to_dict(request, self.spider) 29 | return self.serializer.dumps(obj) 30 | 31 | def _decode_request(self, encoded_request: bytes) -> dict: 32 | obj = self.serializer.loads(encoded_request) 33 | return request_from_dict(obj, self.spider) 34 | # return obj 35 | 36 | def __len__(self): 37 | raise Exception('please use function len()') 38 | 39 | async def qsize(self): 40 | raise NotImplementedError 41 | 42 | async def push(self, request): 43 | raise NotImplementedError 44 | 45 | async def pop(self, timeout=0): 46 | raise NotImplementedError 47 | 48 | async def clear(self): 49 | await self.server.delete(self.key) 50 | 51 | async def close(self): 52 | if hasattr(self.server, "close"): 53 | await self.server.close() 54 | -------------------------------------------------------------------------------- /aioscpy/queue/compat.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | 4 | from aioscpy.utils.tools import to_unicode 5 | 6 | 7 | def _request_byte2str(obj): 8 | _encoding = obj.get('_encoding', 'utf-8') 9 | if isinstance(obj['body'], bytes): 10 | _body = obj['body'].decode(_encoding) 11 | elif isinstance(obj['body'], dict): 12 | _body = json.dumps(obj['body']) 13 | else: 14 | _body = obj['body'] 15 | _headers = {} 16 | for k, v in obj['headers'].items(): 17 | if isinstance(k, bytes) or isinstance(v, bytes): 18 | _headers.update({to_unicode(k, encoding=_encoding): to_unicode(b','.join(v), encoding=_encoding)}) 19 | else: 20 | _headers.update({k: v}) 21 | obj.update({ 22 | 'body': _body, 23 | 'headers': _headers 24 | }) 25 | return obj 26 | 27 | 28 | class PickleCompat: 29 | 30 | @staticmethod 31 
| def loads(s: bytes) -> dict: 32 | return pickle.loads(s) 33 | 34 | @staticmethod 35 | def dumps(obj) -> bytes: 36 | return pickle.dumps(obj, protocol=-1) 37 | 38 | 39 | class JsonCompat: 40 | 41 | @staticmethod 42 | def loads(s: bytes) -> dict: 43 | return json.loads(s) 44 | 45 | @staticmethod 46 | def dumps(obj) -> str: 47 | return json.dumps(_request_byte2str(obj)) 48 | 49 | 50 | COMPAT_TYPE = { 51 | "pickle": PickleCompat, 52 | "json": JsonCompat 53 | } 54 | 55 | __all__ = [ 56 | COMPAT_TYPE, 57 | ] 58 | -------------------------------------------------------------------------------- /aioscpy/queue/convert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for serializing (and deserializing) requests. 3 | """ 4 | import inspect 5 | import json 6 | 7 | from aioscpy import call_grace_instance 8 | from aioscpy.http import Request 9 | from aioscpy.utils.tools import to_unicode 10 | from aioscpy.inject import load_object 11 | from anti_header import Headers 12 | 13 | 14 | def request_to_dict(request, spider=None): 15 | """Convert Request object to a dict. 16 | 17 | If a spider is given, it will try to find out the name of the spider method 18 | used in the callback and store that as the callback. 19 | """ 20 | cb = request.callback 21 | if callable(cb): 22 | cb = _find_method(spider, cb) 23 | eb = request.errback 24 | if callable(eb): 25 | eb = _find_method(spider, eb) 26 | d = { 27 | 'url': to_unicode(request.url), # urls should be safe (safe_string_url) 28 | 'callback': cb, 29 | 'errback': eb, 30 | 'method': request.method, 31 | 'headers': dict(request.headers), 32 | 'body': request.body, 33 | 'json': request.json, 34 | 'cookies': request.cookies, 35 | 'meta': request.meta, 36 | '_encoding': request._encoding, 37 | 'priority': request.priority, 38 | 'dont_filter': request.dont_filter, 39 | 'flags': request.flags, 40 | 'cb_kwargs': request.cb_kwargs, 41 | } 42 | _body = getattr(request, "body") 43 | _json = getattr(request, "json") 44 | if _body and isinstance(_body, dict) or _json and isinstance(_json, dict): 45 | base_cls = request.__class__.__bases__[0] 46 | d['_class'] = base_cls.__module__ + '.' + base_cls.__name__ 47 | return d 48 | 49 | 50 | def request_from_dict(d, spider=None): 51 | """Create Request object from a dict. 52 | 53 | If a spider is given, it will try to resolve the callbacks looking at the 54 | spider for methods with the same name. 
55 | """ 56 | cb = d.get('callback', 'parse') 57 | if cb and spider: 58 | cb = _get_method(spider, cb) 59 | eb = d.get('errback') 60 | if eb and spider: 61 | eb = _get_method(spider, eb) 62 | request_cls = load_object(d['_class']) if '_class' in d else Request 63 | 64 | _json, _body = None, None 65 | if request_cls.__name__ in ["FormRequest"]: 66 | if d.get('body') and isinstance(d.get('body'), dict): 67 | _body = d['body'] 68 | elif d.get('body') and isinstance(d.get('body'), str): 69 | _body = json.loads(d['body']) 70 | elif request_cls.__name__ in ["JsonRequest"]: 71 | if d.get('json') and isinstance(d.get('json'), dict): 72 | _json = d['json'] 73 | elif d.get('json') and isinstance(d.get('json'), str): 74 | _json = json.loads(d['json']) 75 | 76 | return call_grace_instance( 77 | request_cls, 78 | url=to_unicode(d['url']), 79 | callback=cb, 80 | errback=eb, 81 | method=d.get('method', 'GET'), 82 | headers=Headers(d.get('headers', {})), 83 | body=_body, 84 | json=_json, 85 | cookies=d.get('cookies'), 86 | meta=d.get('meta'), 87 | encoding=d.get('_encoding', 'utf-8'), 88 | priority=d.get('priority', 0), 89 | dont_filter=d.get('dont_filter', True), 90 | flags=d.get('flags'), 91 | cb_kwargs=d.get('cb_kwargs'), 92 | ) 93 | 94 | 95 | def _find_method(obj, func): 96 | # Only instance methods contain ``__func__`` 97 | if obj and hasattr(func, '__func__'): 98 | members = inspect.getmembers(obj, predicate=inspect.ismethod) 99 | for name, obj_func in members: 100 | # We need to use __func__ to access the original 101 | # function object because instance method objects 102 | # are generated each time attribute is retrieved from 103 | # instance. 104 | # 105 | # Reference: The standard type hierarchy 106 | # https://docs.python.org/3/reference/datamodel.html 107 | if obj_func.__func__ is func.__func__: 108 | return name 109 | raise ValueError(f"Function {func} is not an instance method in: {obj}") 110 | 111 | 112 | def _get_method(obj, name): 113 | name = str(name) 114 | try: 115 | return getattr(obj, name) 116 | except AttributeError: 117 | raise ValueError(f"Method {name!r} not found in: {obj}") 118 | -------------------------------------------------------------------------------- /aioscpy/queue/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from ._queue import spider_queue, memory_queue 2 | 3 | 4 | __all__ = [ 5 | spider_queue, 6 | memory_queue 7 | ] 8 | -------------------------------------------------------------------------------- /aioscpy/queue/memory/_queue.py: -------------------------------------------------------------------------------- 1 | from asyncio import Queue 2 | 3 | from aioscpy.queue import BaseQueue 4 | 5 | 6 | class PriorityQueue(BaseQueue): 7 | 8 | def __init__(self, server, spider, serializer="pickle"): 9 | super().__init__(server, spider) 10 | self.serializer = self.__compat__[serializer] 11 | 12 | def qsize(self) -> int: 13 | """Return the length of the queue""" 14 | return self.server.qsize() 15 | 16 | async def push(self, request): 17 | data = self._encode_request(request) 18 | await self.server.put(data) 19 | 20 | async def pop(self, timeout: int = 0, count: int = 0) -> list: 21 | _item = await self.server.get() 22 | return [self._decode_request(_item)] 23 | 24 | 25 | def memory_queue(spider) -> PriorityQueue: 26 | """ 27 | async def run(): 28 | queue = memery_queue('message:queue') 29 | await queue.push({"url": "https://www.baidu.com/?kw=1", "task_id": '123'}) 30 | print(await queue.pop()) 31 | 32 | 33 | if 
__name__ == "__main__": 34 | import asyncio 35 | asyncio.run(run()) 36 | 37 | """ 38 | server = Queue() 39 | return PriorityQueue(server=server, spider=spider) 40 | 41 | 42 | spider_queue = memory_queue 43 | -------------------------------------------------------------------------------- /aioscpy/queue/rabbitmq/__init__.py: -------------------------------------------------------------------------------- 1 | from ._queue import spider_priority_queue, priority_queue 2 | 3 | 4 | __all__ = [ 5 | spider_priority_queue, 6 | priority_queue, 7 | ] 8 | -------------------------------------------------------------------------------- /aioscpy/queue/rabbitmq/_queue.py: -------------------------------------------------------------------------------- 1 | import pika 2 | 3 | from aioscpy.queue import BaseQueue 4 | 5 | 6 | class PriorityQueue(BaseQueue): 7 | def qsize(self) -> int: 8 | return self.server.get_waiting_message_count() 9 | 10 | def push(self, request: dict): 11 | data = self._encode_request(request) 12 | score = request.get('priority', 1) 13 | 14 | self.server.basic_publish( 15 | properties=pika.BasicProperties(priority=score), 16 | exchange='', 17 | routing_key=self.key, 18 | body=data 19 | ) 20 | 21 | def on_message(self, ch, method, properties, body): 22 | pass 23 | 24 | def m_pop(self, on_message_callback=None, auto_ack=False): 25 | if not on_message_callback: 26 | on_message_callback = self.on_message 27 | self.server.basic_consume( 28 | on_message_callback=on_message_callback, 29 | queue=self.key, 30 | auto_ack=auto_ack 31 | ) 32 | self.server.start_consuming() 33 | 34 | def pop(self, auto_ack=False): 35 | _method, _, _body = self.server.basic_get(queue=self.key, auto_ack=auto_ack) 36 | if all([isinstance(_body, bytes), _body is not None]): 37 | return _method, self._decode_request(_body) 38 | return None, None 39 | 40 | def finish(self, method): 41 | self.server.basic_ack(delivery_tag=method.delivery_tag) 42 | 43 | 44 | class RabbitMq: 45 | __mq_instance = None 46 | __mq_connection_instance = None 47 | 48 | def __init__(self, *args, **kwargs): 49 | self.args = args 50 | self.kwargs = self.validator(kwargs) 51 | 52 | @staticmethod 53 | def validator(params: dict) -> dict: 54 | params.setdefault('host', '127.0.0.1') 55 | params.setdefault('port', 5672) 56 | params.setdefault('username', 'admin') 57 | params.setdefault('password', 'admin') 58 | params.setdefault('max_priority', 100) 59 | params.setdefault('key', 'rabbitmq:queue') 60 | return params 61 | 62 | @property 63 | def get_channel(self): 64 | if not self.__mq_instance: 65 | connection = pika.BlockingConnection( 66 | pika.ConnectionParameters( 67 | host=self.kwargs['host'], 68 | port=self.kwargs['port'], 69 | credentials=pika.PlainCredentials( 70 | username=self.kwargs['username'], 71 | password=self.kwargs['password'] 72 | ) 73 | ) 74 | ) 75 | channel = connection.channel() 76 | channel.queue_declare( 77 | queue=self.kwargs['key'], 78 | arguments={"x-max-priority": self.kwargs['max_priority']} 79 | ) 80 | self.__mq_instance, self.__mq_connection_instance = channel, connection 81 | return self.__mq_instance 82 | 83 | def close(self): 84 | if self.__mq_instance: 85 | self.__mq_instance.close() 86 | self.__mq_connection_instance.close() 87 | 88 | 89 | def priority_queue(key: str, mq: dict) -> PriorityQueue: 90 | """ 91 | # unit test example 92 | def run(): 93 | queue = rabbitmq_client('message:queue') 94 | for i in range(5): 95 | queue.push({"url": f"https://www.baidu.com/?kw={i}", "task_id": '123'}) 96 | while 1: 97 | method, msg = 
queue.pop() 98 | print(msg) 99 | if not msg: 100 | break 101 | time.sleep(1) 102 | if method: 103 | queue.finish(method) 104 | 105 | run() 106 | 107 | """ 108 | server = RabbitMq(**mq).get_channel 109 | return PriorityQueue(server=server, key=key) 110 | 111 | 112 | spider_priority_queue = priority_queue 113 | -------------------------------------------------------------------------------- /aioscpy/queue/redis/__init__.py: -------------------------------------------------------------------------------- 1 | from ._queue import spider_priority_queue, priority_queue 2 | from ._queue_async import spider_aio_priority_queue, aio_priority_queue 3 | 4 | 5 | __all__ = [ 6 | spider_priority_queue, 7 | priority_queue, 8 | spider_aio_priority_queue, 9 | aio_priority_queue, 10 | ] 11 | -------------------------------------------------------------------------------- /aioscpy/queue/redis/_queue.py: -------------------------------------------------------------------------------- 1 | from redis import ConnectionPool, StrictRedis 2 | 3 | from aioscpy.queue import BaseQueue 4 | 5 | 6 | class PriorityQueue(BaseQueue): 7 | def qsize(self) -> int: 8 | """Return the length of the queue""" 9 | return self.server.zcard(self.key) 10 | 11 | def push(self, request: dict): 12 | data = self._encode_request(request) 13 | score = -request.get('priority', 1) 14 | self.server.zadd(self.key, {data: score}) 15 | 16 | def pop(self, timeout: int = 0) -> dict: 17 | pipe = self.server.pipeline() 18 | pipe.multi() 19 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 20 | results, count = pipe.execute() 21 | if results: 22 | return self._decode_request(results[0]) 23 | 24 | 25 | class Redis: 26 | 27 | __redis_instance = None 28 | 29 | def __init__(self, *args, **kwargs): 30 | self.args = args 31 | self.kwargs = self.validator(kwargs) 32 | 33 | @staticmethod 34 | def validator(params: dict) -> dict: 35 | params.setdefault('host', '127.0.0.1') 36 | params.setdefault('port', 6379) 37 | params.setdefault('db', 1) 38 | params.setdefault('password', 'admin') 39 | return params 40 | 41 | @property 42 | def format_url(self) -> str: 43 | """REDIS_URL = 'redis://:123456@172.16.8.147:6379/1'""" 44 | _format_url = f"redis://:{self.kwargs['password']}@{self.kwargs['host']}:{self.kwargs['port']}/{self.kwargs['db']}"\ 45 | if not self.kwargs.get('redis_url') else self.kwargs['redis_url'] 46 | return _format_url 47 | 48 | @property 49 | def get_redis_pool(self) -> StrictRedis: 50 | if not self.__redis_instance: 51 | pool = ConnectionPool(**self.kwargs) 52 | self.__redis_instance = StrictRedis(connection_pool=pool) 53 | 54 | return self.__redis_instance 55 | 56 | def close(self): 57 | if self.__redis_instance: 58 | self.__redis_instance.close() 59 | 60 | 61 | def priority_queue(key: str, redis_tcp: dict) -> PriorityQueue: 62 | """ 63 | def run(): 64 | queue = redis_client('message:queue') 65 | # queue.push({"url": "https://www.baidu.com/?kw=1", "task_id": '123'}) 66 | print(queue.pop()) 67 | 68 | run() 69 | """ 70 | server = Redis(**redis_tcp).get_redis_pool 71 | return PriorityQueue(server=server, key=key) 72 | 73 | 74 | spider_priority_queue = priority_queue 75 | -------------------------------------------------------------------------------- /aioscpy/queue/redis/_queue_async.py: -------------------------------------------------------------------------------- 1 | from redis.asyncio import Redis, BlockingConnectionPool 2 | 3 | from aioscpy.queue import BaseQueue 4 | 5 | 6 | class PriorityQueue(BaseQueue): 7 | def __init__(self, server, 
spider, key=None, serializer="pickle"): 8 | super().__init__(server, spider, key) 9 | self.serializer = self.__compat__[serializer] 10 | 11 | async def qsize(self) -> int: 12 | return await self.server.zcard(self.key) 13 | 14 | async def push(self, request): 15 | data = self._encode_request(request) 16 | score = -request.get('priority', 1) 17 | await self.server.zadd(self.key, {data: score}) 18 | 19 | async def mpush(self, requests: list): 20 | async with self.server.pipeline() as pipe: 21 | for request in requests: 22 | data = self._encode_request(request) 23 | score = -request.get('priority', 1) 24 | pipe.zadd(self.key, {data: score}) 25 | await pipe.execute() 26 | 27 | async def pop(self, timeout: int = 0, count: int = 0): 28 | async with self.server.pipeline(transaction=True) as pipe: 29 | results, _ = await ( 30 | pipe.zrange(self.key, 0, count) 31 | .zremrangebyrank(self.key, 0, count) 32 | .execute() 33 | ) 34 | _results = [] 35 | for result in results: 36 | _results.append(self._decode_request(result)) 37 | return _results 38 | 39 | 40 | class AsyncRedis: 41 | __redis_instance = None 42 | 43 | def __init__(self, *args, **kwargs): 44 | self.args = args 45 | if not kwargs: 46 | self.kwargs = self.validator(kwargs) 47 | self.kwargs = kwargs 48 | 49 | @staticmethod 50 | def validator(params: dict) -> dict: 51 | params.setdefault('host', '127.0.0.1') 52 | params.setdefault('port', 6379) 53 | params.setdefault('db', 1) 54 | params.setdefault('password', 'admin') 55 | return params 56 | 57 | @property 58 | async def get_redis_pool(self) -> Redis: 59 | if not self.__redis_instance: 60 | url = self.kwargs.pop('url', None) 61 | if url: 62 | connection_pool = BlockingConnectionPool.from_url(url, **self.kwargs) 63 | else: 64 | connection_pool = BlockingConnectionPool(**self.kwargs) 65 | self.__redis_instance = Redis(connection_pool=connection_pool) 66 | return self.__redis_instance 67 | 68 | async def close(self): 69 | if self.__redis_instance: 70 | await self.__redis_instance.close() 71 | 72 | 73 | async def aio_priority_queue(key: str, redis_tcp, spider) -> PriorityQueue: 74 | """ 75 | # unit test example 76 | async def run(): 77 | REDIS_TCP = { 78 | "host": "172.16.7.172", 79 | "port": 6379, 80 | "password": "123456", 81 | "db": 15 82 | } 83 | queue = await aio_priority_queue('message:queue', REDIS_TCP) 84 | # await queue.push({"url": "https://www.baidu.com/?kw=1", "task_id": '123'}) 85 | print(await queue.pop()) 86 | 87 | 88 | if __name__ == "__main__": 89 | import asyncio 90 | asyncio.run(run()) 91 | """ 92 | 93 | if isinstance(redis_tcp, str): 94 | redis_tcp = {'url': redis_tcp} 95 | server = await AsyncRedis(**redis_tcp).get_redis_pool 96 | return PriorityQueue(server=server, spider=spider, key=key, serializer='json') 97 | 98 | 99 | spider_aio_priority_queue = aio_priority_queue 100 | -------------------------------------------------------------------------------- /aioscpy/settings/default_settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = "aioscpy" 2 | 3 | # Concurrency settings 4 | CONCURRENT_REQUESTS = 16 5 | CONCURRENT_REQUESTS_PER_DOMAIN = 8 6 | CONCURRENT_REQUESTS_PER_IP = 0 7 | CONCURRENT_ITEMS = 16 8 | 9 | # Adaptive concurrency settings 10 | ADAPTIVE_CONCURRENCY_ENABLED = False 11 | ADAPTIVE_CONCURRENCY_TARGET_RESPONSE_TIME = 1.0 # seconds 12 | ADAPTIVE_CONCURRENCY_MIN_REQUESTS = 8 13 | ADAPTIVE_CONCURRENCY_MAX_REQUESTS = 32 14 | ADAPTIVE_CONCURRENCY_WINDOW_SIZE = 20 15 | ADAPTIVE_CONCURRENCY_ADJUSTMENT_INTERVAL = 10 # 
seconds 16 | 17 | # Download settings 18 | DOWNLOAD_DELAY = 0 19 | DOWNLOAD_TIMEOUT = 20 20 | RANDOMIZE_DOWNLOAD_DELAY = True 21 | 22 | # Memory optimization settings 23 | GC_ENABLED = True 24 | GC_FREQUENCY = 10 # Run garbage collection every 10 heartbeats 25 | 26 | # Task beat settings 27 | TASK_BEAT_ACTIVE_SLEEP = 0.2 # Sleep when active (seconds) 28 | TASK_BEAT_IDLE_SLEEP = 1.0 # Sleep when idle (seconds) 29 | TASK_BEAT_BATCH_SIZE = 100 # Max requests per batch 30 | 31 | # Handler and scheduler settings 32 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler" 33 | DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.httpx.HttpxDownloadHandler" 34 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.requests.RequestsDownloadHandler" 35 | # SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 36 | SCHEDULER = "aioscpy.core.scheduler.memory.MemoryScheduler" 37 | REQUESTS_SESSION_STATS = False 38 | 39 | SPIDER_IDLE = False 40 | 41 | # LOG CONFIG 42 | LOG_LEVEL = "DEBUG" 43 | LOG_FILE = False 44 | LOG_FILENAME = f"{BOT_NAME}.log" 45 | LOG_ENCODING = "utf-8" 46 | LOG_ROTATION = "1 week" 47 | LOG_RETENTION = "30 days" 48 | 49 | DI_CONFIG = { 50 | "scheduler": f"{SCHEDULER}", 51 | "log_formatter": "aioscpy.logformatter.LogFormatter", 52 | "extension": "aioscpy.middleware.ExtensionManager", 53 | 54 | } 55 | DI_CONFIG_CLS = { 56 | "request": "aioscpy.http.Request", 57 | "response": "aioscpy.http.TextResponse", 58 | "form_request": "aioscpy.http.FormRequest", 59 | "json_request": "aioscpy.http.JsonRequest", 60 | "logger": "aioscpy.utils.log.logger", 61 | "log": "aioscpy.utils.log", 62 | "exceptions": "aioscpy.exceptions", 63 | "tools": "aioscpy.utils.tools", 64 | 'downloader_middleware': 'aioscpy.middleware.DownloaderMiddlewareManager', 65 | "item_processor": "aioscpy.middleware.ItemPipelineManager", 66 | } 67 | DI_CREATE_CLS = { 68 | 'crawler': 'aioscpy.crawler.Crawler', 69 | 'crawler_process': 'aioscpy.crawler.CrawlerProcess', 70 | 'engine': 'aioscpy.core.engine.ExecutionEngine', 71 | 'spider': 'aioscpy.spider.Spider', 72 | 'downloader_handler': f'{DOWNLOAD_HANDLER}', 73 | 'stats': 'aioscpy.libs.statscollectors.MemoryStatsCollector', 74 | 'scraper': 'aioscpy.core.scraper.Scraper', 75 | "downloader": "aioscpy.core.downloader.Downloader", 76 | } 77 | 78 | # message config 79 | # RABBITMQ_TCP = { 80 | # "host": "172.16.8.147", 81 | # # "port": 5672, 82 | # # "username": "admin", 83 | # # "password": "admin", 84 | # # "key": "message:queue", 85 | # # "max_priority": 100 86 | # } 87 | QUEUE_KEY = '%(spider)s:requests' 88 | 89 | # REDIS_TCP = { 90 | # "host": "172.16.7.172", 91 | # "port": 6379, 92 | # "password": "123456", 93 | # "db": 15 94 | # } 95 | # REDIS_URI = "redis://:123456@172.16.7.172:6379/1" 96 | 97 | 98 | EXTENSIONS_BASE = { 99 | 'aioscpy.libs.extensions.corestats.CoreStats': 0, 100 | 'aioscpy.libs.extensions.logstats.LogStats': 0, 101 | 102 | } 103 | 104 | DOWNLOADER_MIDDLEWARES_BASE = { 105 | # Engine side 106 | 'aioscpy.middleware.adaptive_concurrency.AdaptiveConcurrencyMiddleware': 500, 107 | 'aioscpy.libs.downloadermiddlewares.stats.DownloaderStats': 850, 108 | # Downloader side 109 | } 110 | DOWNLOADER_STATS = True 111 | 112 | LOGSTATS_INTERVAL = 60.0 113 | STATS_CLASS = 'aioscpy.libs.statscollectors.MemoryStatsCollector' 114 | STATS_DUMP = True 115 | SCRAPER_SLOT_MAX_ACTIVE_SIZE = 5000000 116 | 117 | TLS_CIPHERS = False 118 | 119 | -------------------------------------------------------------------------------- 
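Not a file from this tree, but a minimal usage sketch of how the defaults in aioscpy/settings/default_settings.py above are normally overridden per spider: `Spider.update_settings` merges `custom_settings` at 'spider' priority, so a crawler can opt into the Redis scheduler and adaptive concurrency without editing the defaults. The spider name, start URL, and Redis URI below are placeholders; the setting keys are the ones defined above.

from aioscpy.spider import Spider


class TunedSpider(Spider):
    name = 'tuned_example'
    custom_settings = {
        # swap the in-memory scheduler for the Redis-backed one
        "SCHEDULER": "aioscpy.core.scheduler.redis.RedisScheduler",
        "REDIS_URI": "redis://:password@127.0.0.1:6379/1",  # placeholder credentials
        "QUEUE_KEY": "%(spider)s:requests",
        # let AdaptiveConcurrencyMiddleware tune CONCURRENT_REQUESTS between these bounds
        "ADAPTIVE_CONCURRENCY_ENABLED": True,
        "ADAPTIVE_CONCURRENCY_MIN_REQUESTS": 8,
        "ADAPTIVE_CONCURRENCY_MAX_REQUESTS": 64,
    }
    start_urls = ['https://quotes.toscrape.com/']

    async def parse(self, response):
        yield {'title': response.xpath('//title/text()').get()}


if __name__ == '__main__':
    TunedSpider().start()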
/aioscpy/signalmanager.py: -------------------------------------------------------------------------------- 1 | from pydispatch import dispatcher 2 | from aioscpy.utils import signal as _signal 3 | 4 | 5 | class SignalManager: 6 | 7 | def __init__(self, sender=dispatcher.Anonymous): 8 | self.sender = sender 9 | 10 | def connect(self, receiver, signal, **kwargs): 11 | """ 12 | Connect a receiver function to a signal. 13 | 14 | The signal can be any object, although Aioscpy comes with some 15 | predefined signals that are documented in the :ref:`topics-signals` 16 | section. 17 | 18 | :param receiver: the function to be connected 19 | :type receiver: callable 20 | 21 | :param signal: the signal to connect to 22 | :type signal: object 23 | """ 24 | kwargs.setdefault('sender', self.sender) 25 | return dispatcher.connect(receiver, signal, **kwargs) 26 | 27 | def disconnect(self, receiver, signal, **kwargs): 28 | """ 29 | Disconnect a receiver function from a signal. This has the 30 | opposite effect of the :meth:`connect` method, and the arguments 31 | are the same. 32 | """ 33 | kwargs.setdefault('sender', self.sender) 34 | return dispatcher.disconnect(receiver, signal, **kwargs) 35 | 36 | async def send_catch_log(self, signal, **kwargs): 37 | """ 38 | Send a signal, catch exceptions and log them. 39 | 40 | The keyword arguments are passed to the signal handlers (connected 41 | through the :meth:`connect` method). 42 | """ 43 | kwargs.setdefault('sender', self.sender) 44 | return await _signal.send_catch_log(signal, **kwargs) 45 | 46 | async def send_catch_log_coroutine(self, signal, **kwargs): 47 | """ 48 | Like :meth:`send_catch_log` but runs the connected handlers 49 | concurrently as asyncio tasks and awaits coroutine handlers. 50 | 51 | Returns once all signal handlers have finished. Send a signal, 52 | catch exceptions and log them. 53 | 54 | The keyword arguments are passed to the signal handlers (connected 55 | through the :meth:`connect` method). 56 | """ 57 | kwargs.setdefault('sender', self.sender) 58 | return await _signal.send_catch_log_coroutine(signal, **kwargs) 59 | 60 | def disconnect_all(self, signal, **kwargs): 61 | """ 62 | Disconnect all receivers from the given signal. 63 | 64 | :param signal: the signal to disconnect from 65 | :type signal: object 66 | """ 67 | kwargs.setdefault('sender', self.sender) 68 | return _signal.disconnect_all(signal, **kwargs) 69 | -------------------------------------------------------------------------------- /aioscpy/signals.py: -------------------------------------------------------------------------------- 1 | """ 2 | Signals used by Aioscpy 3 | 4 | These signals are documented in docs/topics/signals.rst. Please don't add new 5 | signals here without documenting them there.
6 | """ 7 | 8 | engine_started = object() 9 | engine_stopped = object() 10 | spider_opened = object() 11 | spider_idle = object() 12 | spider_closed = object() 13 | spider_error = object() 14 | request_scheduled = object() 15 | request_dropped = object() 16 | request_reached_downloader = object() 17 | request_left_downloader = object() 18 | response_received = object() 19 | response_downloaded = object() 20 | item_scraped = object() 21 | item_dropped = object() 22 | item_error = object() 23 | 24 | # for backward compatibility 25 | stats_spider_opened = spider_opened 26 | stats_spider_closing = spider_closed 27 | stats_spider_closed = spider_closed 28 | 29 | item_passed = item_scraped 30 | 31 | request_received = request_scheduled 32 | -------------------------------------------------------------------------------- /aioscpy/spider.py: -------------------------------------------------------------------------------- 1 | from aioscpy import signals 2 | from aioscpy import call_grace_instance 3 | 4 | 5 | class Spider(object): 6 | name = None 7 | custom_settings = None 8 | 9 | def __init__(self, name=None, **kwargs): 10 | if name is not None: 11 | self.name = name 12 | self.__dict__.update(kwargs) 13 | if not hasattr(self, 'start_urls'): 14 | self.start_urls = [] 15 | 16 | def log(self, message, level='DEBUG', **kw): 17 | self.logger.log(level, message, **kw) 18 | 19 | @classmethod 20 | def from_crawler(cls, crawler, *args, **kwargs): 21 | spider = cls(*args, **kwargs) 22 | spider._set_crawler(crawler) 23 | return spider 24 | 25 | def _set_crawler(self, crawler): 26 | self.crawler = crawler 27 | self.settings = crawler.settings 28 | crawler.signals.connect(self.close, signals.spider_closed) 29 | crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 30 | 31 | async def start_requests(self): 32 | for url in self.start_urls: 33 | yield self.di.get('request')(url, dont_filter=True) 34 | 35 | async def _parse(self, response, **kwargs): 36 | return self.parse(response) 37 | 38 | async def parse(self, response): 39 | raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined') 40 | 41 | @classmethod 42 | def update_settings(cls, settings): 43 | settings.setdict(cls.custom_settings or {}, priority='spider') 44 | 45 | @staticmethod 46 | def close(spider, reason): 47 | closed = getattr(spider, 'closed', None) 48 | if callable(closed): 49 | return closed(reason) 50 | 51 | @classmethod 52 | def start(cls): 53 | from aioscpy.crawler import CrawlerProcess 54 | from aioscpy.utils.tools import get_project_settings 55 | 56 | process = call_grace_instance(CrawlerProcess, get_project_settings()) 57 | process.crawl(cls) 58 | process.start() 59 | 60 | def spider_idle(self): 61 | if self.settings.get("SPIDER_IDLE", True): 62 | raise self.di.get('exceptions').DontCloseSpider 63 | 64 | def __str__(self): 65 | return "<%s %r at 0x%0x>" % (type(self).__name__, self.name, id(self)) 66 | 67 | __repr__ = __str__ 68 | 69 | 70 | Spider = call_grace_instance('spider', only_instance=True) 71 | -------------------------------------------------------------------------------- /aioscpy/templates/project/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/templates/project/__init__.py -------------------------------------------------------------------------------- /aioscpy/templates/project/aioscpy.cfg: 
-------------------------------------------------------------------------------- 1 | [package_env] 2 | path = ../ 3 | 4 | [settings] 5 | default = settings 6 | 7 | [deploy] 8 | #url = http://localhost:6800/ 9 | project = ${project_name} 10 | -------------------------------------------------------------------------------- /aioscpy/templates/project/middlewares.py.tmpl: -------------------------------------------------------------------------------- 1 | from aioscpy import signals 2 | class ${ProjectName}DownloaderMiddleware: 3 | 4 | @classmethod 5 | def from_crawler(cls, crawler): 6 | # This method is used by Aioscpy to create your middlewares. 7 | s = cls() 8 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 9 | return s 10 | 11 | def process_request(self, request, spider): 12 | # Called for each request that goes through the downloader 13 | # middleware. 14 | 15 | # Must either: 16 | # - return None: continue processing this request 17 | # - or return a Response object 18 | # - or return a Request object 19 | # - or raise IgnoreRequest: process_exception() methods of 20 | # installed downloader middleware will be called 21 | return None 22 | 23 | def process_response(self, request, response, spider): 24 | # Called with the response returned from the downloader. 25 | 26 | # Must either: 27 | # - return a Response object 28 | # - return a Request object 29 | # - or raise IgnoreRequest 30 | return response 31 | 32 | def process_exception(self, request, exception, spider): 33 | # Called when a download handler or a process_request() 34 | # (from other downloader middleware) raises an exception. 35 | 36 | # Must either: 37 | # - return None: continue processing this exception 38 | # - return a Response object: stops process_exception() chain 39 | # - return a Request object: stops process_exception() chain 40 | pass 41 | 42 | def spider_opened(self, spider): 43 | spider.logger.info('Spider opened: %s' % spider.name) 44 | -------------------------------------------------------------------------------- /aioscpy/templates/project/pipelines.py.tmpl: -------------------------------------------------------------------------------- 1 | 2 | class ${ProjectName}Pipeline: 3 | def process_item(self, item, spider): 4 | return item 5 | -------------------------------------------------------------------------------- /aioscpy/templates/project/settings.py.tmpl: -------------------------------------------------------------------------------- 1 | BOT_NAME = '$project_name' 2 | 3 | SPIDER_MODULES = ['spiders'] 4 | NEWSPIDER_MODULE = 'spiders' 5 | 6 | # CONCURRENT_ITEMS = 100 7 | # CONCURRENT_REQUESTS = 16 8 | # CONCURRENT_REQUESTS_PER_DOMAIN = 8 9 | # CONCURRENT_REQUESTS_PER_IP = 0 10 | # RANDOMIZE_DOWNLOAD_DELAY = True 11 | 12 | # DOWNLOAD_DELAY = 0 13 | # DOWNLOAD_TIMEOUT = 20 14 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler" 15 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.httpx.HttpxDownloadHandler" 16 | # SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 17 | # SCHEDULER = "aioscpy.core.scheduler.memory.MemoryScheduler" 18 | # REQUESTS_SESSION_STATS = False 19 | 20 | # SCRAPER_SLOT_MAX_ACTIVE_SIZE = 500000 21 | 22 | 23 | # SPIDER_IDLE = False 24 | 25 | # :LOG CONFIG 26 | # LOG_LEVEL = "DEBUG" 27 | # LOG_FILE = False 28 | # LOG_FILENAME = f"{BOT_NAME}.log" 29 | # LOG_ENCODING = "utf-8" 30 | # LOG_ROTATION = "1 week" 31 | # LOG_RETENTION = "30 days" 32 | 33 | # message config 34 | # RABBITMQ_TCP = { 35 | # "host": "172.16.8.147", 36 | # # "port": 5672, 37 | # #
"username": "admin", 38 | # # "password": "admin", 39 | # # "key": "message:queue", 40 | # # "max_priority": 100 41 | # } 42 | # QUEUE_KEY = '%(spider)s:requests' 43 | 44 | # REDIS_TCP = { 45 | # "host": "172.16.7.172", 46 | # "port": 6379, 47 | # "password": "123456", 48 | # "db": 15 49 | # } 50 | # REDIS_URI = "redis://:123456@172.16.7.172:6379/1" 51 | 52 | 53 | # DOWNLOADER_STATS = True 54 | 55 | # LOGSTATS_INTERVAL = 60.0 56 | # STATS_CLASS = 'aioscpy.libs.statscollectors.MemoryStatsCollector' 57 | # STATS_DUMP = True 58 | 59 | # DOWNLOADER_MIDDLEWARES = { 60 | # '$project_name.middlewares.${ProjectName}DownloaderMiddleware': 543, 61 | # } 62 | 63 | # EXTENSIONS = { 64 | # } 65 | 66 | # ITEM_PIPELINES = { 67 | # '$project_name.pipelines.${ProjectName}Pipeline': 300, 68 | # } 69 | -------------------------------------------------------------------------------- /aioscpy/templates/project/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Aioscpy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /aioscpy/templates/project/start.py.tmpl: -------------------------------------------------------------------------------- 1 | from aioscpy.crawler import call_grace_instance 2 | from aioscpy.utils.tools import get_project_settings 3 | 4 | """start spider method one: 5 | from cegex.baidu import BaiduSpider 6 | from cegex.httpbin import HttpBinSpider 7 | 8 | process = CrawlerProcess() 9 | process.crawl(HttpBinSpider) 10 | process.crawl(BaiduSpider) 11 | process.start() 12 | """ 13 | 14 | 15 | def load_file_to_execute(): 16 | process = call_grace_instance("crawler_process", get_project_settings()) 17 | process.load_spider(path='[spiders path]]', spider_like='[spider name]') 18 | process.start() 19 | 20 | 21 | def load_name_to_execute(): 22 | process = call_grace_instance("crawler_process", get_project_settings()) 23 | process.crawl('[spider name]', path="[spiders path]") 24 | process.start() 25 | -------------------------------------------------------------------------------- /aioscpy/templates/spiders/basic.tmpl: -------------------------------------------------------------------------------- 1 | from aioscpy.spider import Spider 2 | 3 | 4 | class $classname(Spider): 5 | name = '$name' 6 | custom_settings = { 7 | "SPIDER_IDLE": False 8 | } 9 | start_urls = [] 10 | 11 | async def parse(self, response): 12 | item = { 13 | 'hot': '\n'.join(response.xpath('//span[@class="title-content-title"]/text()').extract()), 14 | } 15 | yield item 16 | -------------------------------------------------------------------------------- /aioscpy/templates/spiders/crawl.tmpl: -------------------------------------------------------------------------------- 1 | from aioscpy.spider import Spider 2 | from anti_header import Header 3 | from pprint import pprint, pformat 4 | 5 | 6 | class $classname(Spider): 7 | name = '$name' 8 | custom_settings = { 9 | "SPIDER_IDLE": False 10 | } 11 | start_urls = [] 12 | 13 | async def process_request(self, request): 14 | request.headers = Header(url=request.url, platform='windows', connection=True).random 15 | return request 16 | 17 | async def process_response(self, request, response): 18 | if response.status in [404, 503]: 19 | return request 20 | return response 21 | 22 | async def parse(self, response): 23 | item = { 24 | # 'hot': 
'\n'.join(response.xpath('//span[@class="title-content-title"]/text()').extract()), 25 | } 26 | yield item 27 | 28 | async def process_item(self, item): 29 | pass 30 | # self.logger.info("{item}", **{'item': pformat(item)}) 31 | 32 | 33 | if __name__ == '__main__': 34 | sp = $classname() 35 | sp.start() 36 | -------------------------------------------------------------------------------- /aioscpy/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/aioscpy/utils/__init__.py -------------------------------------------------------------------------------- /aioscpy/utils/common.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import warnings 3 | import os 4 | 5 | from operator import itemgetter 6 | from importlib import import_module 7 | 8 | from aioscpy.settings import BaseSettings 9 | 10 | 11 | def without_none_values(iterable): 12 | """Return a copy of ``iterable`` with all ``None`` entries removed. 13 | 14 | If ``iterable`` is a mapping, return a dictionary where all pairs that have 15 | value ``None`` have been removed. 16 | """ 17 | try: 18 | return {k: v for k, v in iterable.items() if v is not None} 19 | except AttributeError: 20 | return type(iterable)((v for v in iterable if v is not None)) 21 | 22 | 23 | def build_component_list(compdict, custom=None): 24 | """Compose a component list from a { class: order } dictionary.""" 25 | 26 | def _check_components(complist): 27 | if len({c for c in complist}) != len(complist): 28 | raise ValueError(f'Some paths in {complist!r} convert to the same object, ' 29 | 'please update your settings') 30 | 31 | def _map_keys(compdict): 32 | if isinstance(compdict, BaseSettings): 33 | compbs = BaseSettings() 34 | for k, v in compdict.items(): 35 | prio = compdict.getpriority(k) 36 | if compbs.getpriority(k) == prio: 37 | raise ValueError(f'Some paths in {list(compdict.keys())!r} ' 38 | 'convert to the same ' 39 | 'object, please update your settings' 40 | ) 41 | else: 42 | compbs.set(k, v, priority=prio) 43 | return compbs 44 | else: 45 | _check_components(compdict) 46 | return {k: v for k, v in compdict.items()} 47 | 48 | def _validate_values(compdict): 49 | """Fail if a value in the components dict is not a real number or None.""" 50 | for name, value in compdict.items(): 51 | if value is not None and not isinstance(value, numbers.Real): 52 | raise ValueError(f'Invalid value {value} for component {name}, ' 53 | 'please provide a real number or None instead') 54 | 55 | # BEGIN Backward compatibility for old (base, custom) call signature 56 | if isinstance(custom, (list, tuple)): 57 | _check_components(custom) 58 | return type(custom)(c for c in custom) 59 | 60 | if custom is not None: 61 | compdict.update(custom) 62 | # END Backward compatibility 63 | 64 | _validate_values(compdict) 65 | compdict = without_none_values(_map_keys(compdict)) 66 | return [k for k, v in sorted(compdict.items(), key=itemgetter(1))] 67 | 68 | 69 | def arglist_to_dict(arglist): 70 | """Convert a list of arguments like ['arg1=val1', 'arg2=val2', ...] 
to a 71 | dict 72 | """ 73 | return dict(x.split('=', 1) for x in arglist) 74 | 75 | 76 | def inside_project(): 77 | aioscpy_module = os.environ.get('AIOSCPY_SETTINGS_MODULE') 78 | if aioscpy_module is not None: 79 | try: 80 | import_module(aioscpy_module) 81 | except ImportError as exc: 82 | warnings.warn(f"Cannot import aioscpy settings module {aioscpy_module}: {exc}") 83 | else: 84 | return True 85 | return bool(closest_aioscpy_cfg()) 86 | 87 | 88 | def closest_aioscpy_cfg(path='.', prevpath=None): 89 | """Return the path to the closest aioscpy.cfg file by traversing the current 90 | directory and its parents 91 | """ 92 | if path == prevpath: 93 | return '' 94 | path = os.path.abspath(path) 95 | cfgfile = os.path.join(path, 'aioscpy.cfg') 96 | if os.path.exists(cfgfile): 97 | return cfgfile 98 | return closest_aioscpy_cfg(os.path.dirname(path), path) 99 | -------------------------------------------------------------------------------- /aioscpy/utils/curl.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import warnings 3 | from shlex import split 4 | from http.cookies import SimpleCookie 5 | from urllib.parse import urlparse 6 | 7 | from w3lib.http import basic_auth_header 8 | 9 | 10 | class CurlParser(argparse.ArgumentParser): 11 | def error(self, message): 12 | error_msg = f'There was an error parsing the curl command: {message}' 13 | raise ValueError(error_msg) 14 | 15 | 16 | curl_parser = CurlParser() 17 | curl_parser.add_argument('url') 18 | curl_parser.add_argument('-H', '--header', dest='headers', action='append') 19 | curl_parser.add_argument('-X', '--request', dest='method') 20 | curl_parser.add_argument('-d', '--data', '--data-raw', dest='data') 21 | curl_parser.add_argument('-u', '--user', dest='auth') 22 | 23 | 24 | safe_to_ignore_arguments = [ 25 | ['--compressed'], 26 | # `--compressed` argument is not safe to ignore, but it's included here 27 | # because the `HttpCompressionMiddleware` is enabled by default 28 | ['-s', '--silent'], 29 | ['-v', '--verbose'], 30 | ['-#', '--progress-bar'] 31 | ] 32 | 33 | for argument in safe_to_ignore_arguments: 34 | curl_parser.add_argument(*argument, action='store_true') 35 | 36 | 37 | def _parse_headers_and_cookies(parsed_args): 38 | headers = [] 39 | cookies = {} 40 | for header in parsed_args.headers or (): 41 | name, val = header.split(':', 1) 42 | name = name.strip() 43 | val = val.strip() 44 | if name.title() == 'Cookie': 45 | for name, morsel in SimpleCookie(val).items(): 46 | cookies[name] = morsel.value 47 | else: 48 | headers.append((name, val)) 49 | 50 | if parsed_args.auth: 51 | user, password = parsed_args.auth.split(':', 1) 52 | headers.append(('Authorization', basic_auth_header(user, password))) 53 | 54 | return headers, cookies 55 | 56 | 57 | def curl_to_request_kwargs(curl_command: str, ignore_unknown_options: bool = True) -> dict: 58 | """Convert a cURL command syntax to Request kwargs. 59 | 60 | :param str curl_command: string containing the curl command 61 | :param bool ignore_unknown_options: If true, only a warning is emitted when 62 | cURL options are unknown. Otherwise 63 | raises an error. 
(default: True) 64 | :return: dictionary of Request kwargs 65 | """ 66 | 67 | curl_args = split(curl_command) 68 | 69 | if curl_args[0] != 'curl': 70 | raise ValueError('A curl command must start with "curl"') 71 | 72 | parsed_args, argv = curl_parser.parse_known_args(curl_args[1:]) 73 | 74 | if argv: 75 | msg = f'Unrecognized options: {", ".join(argv)}' 76 | if ignore_unknown_options: 77 | warnings.warn(msg) 78 | else: 79 | raise ValueError(msg) 80 | 81 | url = parsed_args.url 82 | 83 | # curl automatically prepends 'http' if the scheme is missing, but Request 84 | # needs the scheme to work 85 | parsed_url = urlparse(url) 86 | if not parsed_url.scheme: 87 | url = 'http://' + url 88 | 89 | method = parsed_args.method or 'GET' 90 | 91 | result = {'method': method.upper(), 'url': url} 92 | 93 | headers, cookies = _parse_headers_and_cookies(parsed_args) 94 | 95 | if headers: 96 | result['headers'] = headers 97 | if cookies: 98 | result['cookies'] = cookies 99 | if parsed_args.data: 100 | result['body'] = parsed_args.data 101 | if not parsed_args.method: 102 | # if the "data" is specified but the "method" is not specified, 103 | # the default method is 'POST' 104 | result['method'] = 'POST' 105 | 106 | return result 107 | 108 | 109 | if __name__ == '__main__': 110 | curl_str = """ 111 | curl 'https://quotes.toscrape.com/api/quotes?page=10' \ 112 | -H 'authority: quotes.toscrape.com' \ 113 | -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \ 114 | -H 'accept-language: zh-CN,zh;q=0.9,en;q=0.8' \ 115 | -H 'cache-control: no-cache' \ 116 | -H 'pragma: no-cache' \ 117 | -H 'referer: https://docs.scrapy.org/en/latest/topics/developer-tools.html?highlight=curl' \ 118 | -H 'sec-ch-ua: " Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"' \ 119 | -H 'sec-ch-ua-mobile: ?0' \ 120 | -H 'sec-ch-ua-platform: "Windows"' \ 121 | -H 'sec-fetch-dest: document' \ 122 | -H 'sec-fetch-mode: navigate' \ 123 | -H 'sec-fetch-site: cross-site' \ 124 | -H 'sec-fetch-user: ?1' \ 125 | -H 'upgrade-insecure-requests: 1' \ 126 | -H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36' \ 127 | --compressed 128 | """ 129 | 130 | res = curl_to_request_kwargs(curl_str) 131 | print(res) 132 | -------------------------------------------------------------------------------- /aioscpy/utils/log.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | 3 | import sys 4 | import socket 5 | import warnings 6 | import aioscpy 7 | 8 | from loguru import logger 9 | 10 | from aioscpy.exceptions import AioscpyDeprecationWarning 11 | # from aioscpy.settings import Settings 12 | from aioscpy.utils.tools import get_project_settings 13 | 14 | 15 | def set_log_config(formatter: str, settings): 16 | _log_config = { 17 | "default": { 18 | "handlers": [ 19 | { 20 | "sink": sys.stdout, 21 | "format": formatter, 22 | "level": settings.get('LOG_LEVEL', "TRACE") 23 | } 24 | ], 25 | "extra": { 26 | "host": socket.gethostbyname(socket.gethostname()), 27 | 'log_name': settings.get("BOT_NAME", 'default'), 28 | 'type': 'None' 29 | }, 30 | "levels": [ 31 | dict(name="TRACE", icon="✏️", color=""), 32 | dict(name="DEBUG", icon="❄️", color=""), 33 | dict(name="INFO", icon="♻️", color=""), 34 | dict(name="SUCCESS", icon="✔️", color=""), 35 | dict(name="WARNING", icon="⚠️", color=""), 36 
| dict(name="ERROR", icon="❌️", color=""), 37 | dict(name="CRITICAL", icon="☠️", color=""), 38 | ] 39 | } 40 | } 41 | if settings.get('LOG_FILE', False): 42 | _log_config['default']['handlers'].append({ 43 | "sink": settings.get('LOG_FILENAME', __file__), 44 | "format": formatter, 45 | "level": settings.get('LOG_LEVEL', "DEBUG"), 46 | "rotation": settings.get("LOG_ROTATION", '1 week'), 47 | "retention": settings.get("LOG_RETENTION", '30 days'), 48 | 'encoding': settings.get("LOG_ENCODING", "utf-8") 49 | }) 50 | return _log_config 51 | 52 | 53 | class LogFormatter(object): 54 | simple_formatter = '{time:YYYY-MM-DD HH:mm:ss} ' \ 55 | '[{name}] ' \ 56 | '{level.icon}{level}: ' \ 57 | '{message} ' 58 | 59 | default_formatter = '{time:YYYY-MM-DD HH:mm:ss,SSS} | ' \ 60 | '[{extra[log_name]}] {module}:{name}:{function}:{line} | ' \ 61 | '{extra[host]} | ' \ 62 | '{level.icon}{level: <5} | ' \ 63 | '{level.no} | ' \ 64 | '{extra[type]} | ' \ 65 | '{message} ' 66 | 67 | kafka_formatter = '{time:YYYY-MM-DD HH:mm:ss,SSS}| ' \ 68 | '[{extra[log_name]}] {module}:{name}:{function}:{line} | ' \ 69 | '{extra[host]} | ' \ 70 | '{process} | ' \ 71 | '{thread} | ' \ 72 | '{level: <5} | ' \ 73 | '{level.no} | ' \ 74 | '{extra[type]}| ' \ 75 | '{message} ' 76 | 77 | @classmethod 78 | def setter_log_handler(cls, log, callback=None): 79 | assert callable(callback), 'callback must be a callable object' 80 | log.add(callback, format=cls.kafka_formatter) 81 | 82 | @classmethod 83 | def get_logger(cls, log, name=None): 84 | settings = get_project_settings() 85 | log_config = set_log_config(cls.simple_formatter, settings) 86 | config = log_config.pop('default', {}) 87 | if name: 88 | config['extra']['log_name'] = name 89 | log.configure(**config) 90 | return log 91 | 92 | @staticmethod 93 | def format(spider, meta): 94 | if hasattr(spider, 'logging_keys'): 95 | logging_txt = [] 96 | for key in spider.logging_keys: 97 | if meta.get(key, None) is not None: 98 | logging_txt.append(u'{0}:{1} '.format(key, meta[key])) 99 | logging_txt.append('successfully') 100 | return ' '.join(logging_txt) 101 | 102 | 103 | def logformatter_adapter(logkws): 104 | if not {'level', 'msg', 'args'} <= set(logkws): 105 | warnings.warn('Missing keys in LogFormatter method', 106 | AioscpyDeprecationWarning) 107 | 108 | if 'format' in logkws: 109 | warnings.warn('`format` key in LogFormatter methods has been ' 110 | 'deprecated, use `msg` instead', 111 | AioscpyDeprecationWarning) 112 | 113 | level = logkws.get('level', 'INFO') 114 | message = logkws.get('format', logkws.get('msg')) 115 | args = logkws if not logkws.get('args') else logkws['args'] 116 | 117 | return level, message, args 118 | 119 | 120 | def std_log_aioscpy_info(settings): 121 | from pprint import pprint, pformat 122 | 123 | icon = """ 124 | (_) 125 | __ _ _ ___ ___ ___ _ __ _ _ 126 | / _` | |/ _ \/ __|/ __| '_ \| | | | 127 | | (_| | | (_) \__ \ (__| |_) | |_| | 128 | \__,_|_|\___/|___/\___| .__/ \__, | 129 | | | __/ | 130 | |_| |___/ 131 | """ 132 | logger.info("{item}", **{'item': icon}) 133 | logger.info("aioscpy {version} started (bot: {bot})", 134 | **{'version': aioscpy.__version__, 'bot': settings['BOT_NAME']}) 135 | 136 | 137 | lof = LogFormatter 138 | 139 | logger = lof.get_logger(logger) 140 | -------------------------------------------------------------------------------- /aioscpy/utils/ossignal.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | 4 | signal_names = {} 5 | for signame in dir(signal): 6 | if 
signame.startswith('SIG') and not signame.startswith('SIG_'): 7 | signum = getattr(signal, signame) 8 | if isinstance(signum, int): 9 | signal_names[signum] = signame 10 | 11 | 12 | def install_shutdown_handlers(function, override_sigint=True): 13 | """Install the given function as a signal handler for all common shutdown 14 | signals (such as SIGINT, SIGTERM, etc). If override_sigint is ``False`` the 15 | SIGINT handler won't be install if there is already a handler in place 16 | (e.g. Pdb) 17 | """ 18 | signal.signal(signal.SIGTERM, function) 19 | if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint: 20 | signal.signal(signal.SIGINT, function) 21 | # Catch Ctrl-Break in windows 22 | if hasattr(signal, 'SIGBREAK'): 23 | signal.signal(signal.SIGBREAK, function) 24 | -------------------------------------------------------------------------------- /aioscpy/utils/othtypes.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import weakref 3 | 4 | from collections.abc import Mapping 5 | from typing import Union 6 | from urllib.parse import urlparse, ParseResult 7 | from weakref import WeakKeyDictionary 8 | 9 | from aioscpy.http import Request, Response 10 | 11 | 12 | _urlparse_cache: "WeakKeyDictionary[Union[Request, Response], ParseResult]" = WeakKeyDictionary() 13 | 14 | 15 | def urlparse_cached(request_or_response: Union[Request, Response]) -> ParseResult: 16 | """Return urlparse.urlparse caching the result, where the argument can be a 17 | Request or Response object 18 | """ 19 | if request_or_response not in _urlparse_cache: 20 | _urlparse_cache[request_or_response] = urlparse(request_or_response.url) 21 | return _urlparse_cache[request_or_response] 22 | 23 | 24 | class CaselessDict(dict): 25 | 26 | __slots__ = () 27 | 28 | def __init__(self, seq=None): 29 | super().__init__() 30 | if seq: 31 | self.update(seq) 32 | 33 | def __getitem__(self, key): 34 | return dict.__getitem__(self, self.normkey(key)) 35 | 36 | def __setitem__(self, key, value): 37 | dict.__setitem__(self, self.normkey(key), self.normvalue(value)) 38 | 39 | def __delitem__(self, key): 40 | dict.__delitem__(self, self.normkey(key)) 41 | 42 | def __contains__(self, key): 43 | return dict.__contains__(self, self.normkey(key)) 44 | has_key = __contains__ 45 | 46 | def __copy__(self): 47 | return self.__class__(self) 48 | copy = __copy__ 49 | 50 | def normkey(self, key): 51 | """Method to normalize dictionary key access""" 52 | return key.lower() 53 | 54 | def normvalue(self, value): 55 | """Method to normalize values prior to be set""" 56 | return value 57 | 58 | def get(self, key, def_val=None): 59 | return dict.get(self, self.normkey(key), self.normvalue(def_val)) 60 | 61 | def setdefault(self, key, def_val=None): 62 | return dict.setdefault(self, self.normkey(key), self.normvalue(def_val)) 63 | 64 | def update(self, seq): 65 | seq = seq.items() if isinstance(seq, Mapping) else seq 66 | iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq) 67 | super().update(iseq) 68 | 69 | @classmethod 70 | def fromkeys(cls, keys, value=None): 71 | return cls((k, value) for k in keys) 72 | 73 | def pop(self, key, *args): 74 | return dict.pop(self, self.normkey(key), *args) 75 | 76 | 77 | class LocalCache(collections.OrderedDict): 78 | """Dictionary with a finite number of keys. 79 | 80 | Older items expires first. 
81 | """ 82 | 83 | def __init__(self, limit=None): 84 | super().__init__() 85 | self.limit = limit 86 | 87 | def __setitem__(self, key, value): 88 | if self.limit: 89 | while len(self) >= self.limit: 90 | self.popitem(last=False) 91 | super().__setitem__(key, value) 92 | 93 | 94 | class LocalWeakReferencedCache(weakref.WeakKeyDictionary): 95 | """ 96 | A weakref.WeakKeyDictionary implementation that uses LocalCache as its 97 | underlying data structure, making it ordered and capable of being size-limited. 98 | 99 | Useful for memoization, while avoiding keeping received 100 | arguments in memory only because of the cached references. 101 | 102 | Note: like LocalCache and unlike weakref.WeakKeyDictionary, 103 | it cannot be instantiated with an initial dictionary. 104 | """ 105 | 106 | def __init__(self, limit=None): 107 | super().__init__() 108 | self.data = LocalCache(limit=limit) 109 | 110 | def __setitem__(self, key, value): 111 | try: 112 | super().__setitem__(key, value) 113 | except TypeError: 114 | pass # key is not weak-referenceable, skip caching 115 | 116 | def __getitem__(self, key): 117 | try: 118 | return super().__getitem__(key) 119 | except (TypeError, KeyError): 120 | return None # key is either not weak-referenceable or not cached 121 | 122 | 123 | class SequenceExclude: 124 | """Object to test if an item is NOT within some sequence.""" 125 | 126 | def __init__(self, seq): 127 | self.seq = seq 128 | 129 | def __contains__(self, item): 130 | return item not in self.seq 131 | 132 | 133 | dnscache = LocalCache(10000) 134 | -------------------------------------------------------------------------------- /aioscpy/utils/signal.py: -------------------------------------------------------------------------------- 1 | """Helper functions for working with signals""" 2 | import asyncio 3 | 4 | from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers 5 | from pydispatch.robustapply import robustApply 6 | from aioscpy.exceptions import StopDownload 7 | from aioscpy.utils.log import logger 8 | 9 | 10 | class _IgnoredException(Exception): 11 | pass 12 | 13 | 14 | async def robustApplyWrap(f, recv, *args, **kw): 15 | dont_log = kw.pop('dont_log', None) 16 | spider = kw.get('spider', None) 17 | try: 18 | result = f(recv, *args, **kw) 19 | if asyncio.iscoroutine(result): 20 | return await result 21 | except (Exception, BaseException) as exc: # noqa: E722 22 | if dont_log is None or not isinstance(exc, dont_log): 23 | logger.error("Error caught on signal handler: {receiver}", 24 | **{'receiver': recv}, 25 | exc_info=exc, 26 | extra={'spider': spider}) 27 | return exc 28 | 29 | 30 | async def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named): 31 | """Like pydispatcher.robust.sendRobust but it also logs errors and returns 32 | Failures instead of exceptions. 33 | """ 34 | named['dont_log'] = (named.pop('dont_log', _IgnoredException), StopDownload) 35 | responses = [] 36 | for receiver in liveReceivers(getAllReceivers(sender, signal)): 37 | result = await robustApplyWrap(robustApply, receiver, signal=signal, sender=sender, *arguments, **named) 38 | responses.append((receiver, result)) 39 | return responses 40 | 41 | 42 | async def send_catch_log_coroutine(signal=Any, sender=Anonymous, *arguments, **named): 43 | """Like send_catch_log but supports returning deferreds on signal handlers. 44 | Returns a deferred that gets fired once all signal handlers deferreds were 45 | fired. 
46 | """ 47 | dfds = [] 48 | for receiver in liveReceivers(getAllReceivers(sender, signal)): 49 | dfds.append(asyncio.create_task( 50 | robustApplyWrap(robustApply, receiver, signal=signal, sender=sender, *arguments, **named))) 51 | res = await asyncio.gather(*dfds) 52 | return res 53 | 54 | 55 | def disconnect_all(signal=Any, sender=Any): 56 | """Disconnect all signal handlers. Useful for cleaning up after running 57 | tests 58 | """ 59 | for receiver in liveReceivers(getAllReceivers(sender, signal)): 60 | disconnect(receiver, signal=signal, sender=sender) 61 | 62 | -------------------------------------------------------------------------------- /aioscpy/utils/template.py: -------------------------------------------------------------------------------- 1 | """Helper functions for working with templates_bak1""" 2 | 3 | import os 4 | import re 5 | import string 6 | 7 | 8 | def render_templatefile(path, **kwargs): 9 | with open(path, 'rb') as fp: 10 | raw = fp.read().decode('utf8') 11 | 12 | content = string.Template(raw).substitute(**kwargs) 13 | 14 | render_path = path[:-len('.tmpl')] if path.endswith('.tmpl') else path 15 | 16 | if path.endswith('.tmpl'): 17 | os.rename(path, render_path) 18 | 19 | with open(render_path, 'wb') as fp: 20 | fp.write(content.encode('utf8')) 21 | 22 | 23 | CAMELCASE_INVALID_CHARS = re.compile(r'[^a-zA-Z\d]') 24 | 25 | 26 | def string_camelcase(string): 27 | """ Convert a word to its CamelCase version and remove invalid chars 28 | 29 | >>> string_camelcase('lost-pound') 30 | 'LostPound' 31 | 32 | >>> string_camelcase('missing_images') 33 | 'MissingImages' 34 | 35 | """ 36 | return CAMELCASE_INVALID_CHARS.sub('', string.title()) 37 | -------------------------------------------------------------------------------- /cegex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/cegex/__init__.py -------------------------------------------------------------------------------- /cegex/baidu.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.spider import Spider 4 | from anti_header import Header 5 | from pprint import pprint, pformat 6 | 7 | 8 | class BaiduSpider(Spider): 9 | name = 'baidu' 10 | custom_settings = { 11 | "SPIDER_IDLE": False, 12 | 'TLS_CIPHERS': True, 13 | "DOWNLOAD_HANDLER": "aioscpy.core.downloader.handlers.requests.RequestsDownloadHandler" 14 | } 15 | start_urls = [f'https://www.baidu.com/?a{i}' for i in range(10)] 16 | 17 | async def process_request(self, request): 18 | request.headers = Header(url=request.url, platform='windows', connection=True).random 19 | return request 20 | 21 | async def process_response(self, request, response): 22 | return response 23 | 24 | async def process_exception(self, request, exc): 25 | raise exc 26 | 27 | async def parse(self, response): 28 | item = { 29 | 'hot': '\n'.join(response.xpath('//span[@class="title-content-title"]/text()').extract()), 30 | } 31 | yield item 32 | 33 | async def process_item(self, item): 34 | pass 35 | # self.logger.info("{item}", **{'item': pformat(item)}) 36 | 37 | 38 | if __name__ == '__main__': 39 | baidu = BaiduSpider() 40 | baidu.start() 41 | -------------------------------------------------------------------------------- /cegex/httpbin.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.spider import Spider 4 
| 5 | 6 | class HttpBinSpider(Spider): 7 | name = 'httpbin' 8 | custom_settings = { 9 | 'CONCURRENT_REQUESTS': 10 10 | } 11 | start_urls = [f'http://httpbin.org/get?a{i}' for i in range(20)] 12 | 13 | async def parse(self, response): 14 | item = await response.json 15 | await asyncio.sleep(2) 16 | yield item 17 | 18 | async def process_item(self, item): 19 | pass 20 | # self.logger.info(item) 21 | 22 | 23 | if __name__ == '__main__': 24 | q = HttpBinSpider() 25 | q.start() 26 | -------------------------------------------------------------------------------- /cegex/httpbin_post.py: -------------------------------------------------------------------------------- 1 | from aioscpy.spider import Spider 2 | from aioscpy import call_grace_instance 3 | from aioscpy.http import FormRequest 4 | 5 | 6 | class HttpBinPostSpider(Spider): 7 | name = 'httpbin_post' 8 | custom_settings = { 9 | 'CONCURRENT_REQUESTS': 10 10 | } 11 | 12 | start_urls = ['http://httpbin.org/post' for _ in range(20)] 13 | 14 | async def start_requests(self): 15 | """ 16 | : request usage description: 17 | : data = body 18 | [header]: Content-Type: application/x-www-form-urlencoded 19 | [method]: POST 20 | [body]: 21 | { 22 | 'a': 1, 23 | 'b': 2 24 | } 25 | # supported special scenarios about json request 26 | : json = body 27 | [header]: Content-Type: application/json 28 | [method]: POST 29 | [body]: { 30 | 'a': 1, 31 | 'b': 2 32 | } 33 | """ 34 | for url in self.start_urls: 35 | yield call_grace_instance( 36 | FormRequest, 37 | # self.di.get('form_request'), 38 | # self.di.get('json_request'), 39 | url, 40 | method='POST', 41 | formdata={"b": '11'} 42 | ) 43 | 44 | async def parse(self, response): 45 | item = await response.json 46 | yield item 47 | 48 | async def process_item(self, item): 49 | self.logger.info(item) 50 | 51 | 52 | if __name__ == '__main__': 53 | q = HttpBinPostSpider() 54 | q.start() 55 | -------------------------------------------------------------------------------- /cegex/ja3.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.spider import Spider 4 | from anti_header import Header 5 | from pprint import pprint, pformat 6 | 7 | 8 | class Ja3Spider(Spider): 9 | name = 'ja3' 10 | custom_settings = { 11 | "SPIDER_IDLE": False, 12 | 'TLS_CIPHERS': True, 13 | "DOWNLOAD_HANDLER": "aioscpy.core.downloader.handlers.requests.AiohttpDownloadHandler" 14 | } 15 | start_urls = [f'https://tls.browserleaks.com/json?a{i}' for i in range(10)] 16 | 17 | async def process_request(self, request): 18 | request.headers = Header(url=request.url, platform='windows', connection=True).random 19 | return request 20 | 21 | async def process_response(self, request, response): 22 | return response 23 | 24 | async def process_exception(self, request, exc): 25 | raise exc 26 | 27 | async def parse(self, response): 28 | _ja = await response.json 29 | item = { 30 | 'ja3': _ja['ja3_hash'], 31 | } 32 | yield item 33 | 34 | async def process_item(self, item): 35 | pass 36 | # self.logger.info("{item}", **{'item': pformat(item)}) 37 | 38 | 39 | if __name__ == '__main__': 40 | ja3 = Ja3Spider() 41 | ja3.start() 42 | -------------------------------------------------------------------------------- /doc/README_ZH.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ![aioscpy](./images/aioscpy.png) 4 | 5 | # Aioscpy 6 | 7 | 一个强大的、高性能的异步Web爬取和抓取框架,基于Python的asyncio生态系统构建。 8 | 9 | [英文](../README.md) | 中文 10 | 11 | ## 概述 12 | 13 | 
Aioscpy是一个快速的高级web爬行和web抓取框架,用于抓取网站并从其页面提取结构化数据。它受到Scrapy和scrapy_redis的启发,但从头开始设计,充分利用异步编程的全部功能。 14 | 15 | ### 主要特点 16 | 17 | - **完全异步**:基于Python的asyncio,实现高性能并发操作 18 | - **Scrapy风格的API**:为来自Scrapy的用户提供熟悉的API 19 | - **分布式爬取**:支持使用Redis进行分布式爬取 20 | - **多种HTTP后端**:支持aiohttp、httpx和requests 21 | - **动态变量注入**:强大的依赖注入系统 22 | - **灵活的中间件系统**:可定制的请求/响应处理管道 23 | - **强大的数据处理**:用于处理爬取数据的管道 24 | 25 | ## 系统要求 26 | 27 | - Python 3.8+ 28 | - 支持Linux、Windows、macOS、BSD 29 | 30 | ## 安装 31 | 32 | ### 基本安装 33 | 34 | ```shell 35 | pip install aioscpy 36 | ``` 37 | 38 | ### 安装所有依赖 39 | 40 | ```shell 41 | pip install aioscpy[all] 42 | ``` 43 | 44 | ### 安装特定HTTP后端 45 | 46 | ```shell 47 | pip install aioscpy[aiohttp,httpx] 48 | ``` 49 | 50 | ### 从最新版本安装 51 | 52 | ```shell 53 | pip install git+https://github.com/ihandmine/aioscpy 54 | ``` 55 | 56 | ## 快速开始 57 | 58 | ### 创建新项目 59 | 60 | ```shell 61 | aioscpy startproject myproject 62 | cd myproject 63 | ``` 64 | 65 | ### 创建爬虫 66 | 67 | ```shell 68 | aioscpy genspider myspider 69 | ``` 70 | 71 | 这将在`spiders`目录中创建一个基本爬虫。 72 | 73 | ![tree](./images/tree.png) 74 | 75 | ### 示例爬虫 76 | 77 | ```python 78 | from aioscpy.spider import Spider 79 | 80 | 81 | class QuotesSpider(Spider): 82 | name = 'quotes' 83 | custom_settings = { 84 | "SPIDER_IDLE": False 85 | } 86 | start_urls = [ 87 | 'https://quotes.toscrape.com/tag/humor/', 88 | ] 89 | 90 | async def parse(self, response): 91 | for quote in response.css('div.quote'): 92 | yield { 93 | 'author': quote.xpath('span/small/text()').get(), 94 | 'text': quote.css('span.text::text').get(), 95 | } 96 | 97 | next_page = response.css('li.next a::attr("href")').get() 98 | if next_page is not None: 99 | yield response.follow(next_page, self.parse) 100 | ``` 101 | 102 | ### 创建单个爬虫脚本 103 | 104 | ```shell 105 | aioscpy onespider single_quotes 106 | ``` 107 | 108 | ### 高级爬虫示例 109 | 110 | ```python 111 | from aioscpy.spider import Spider 112 | from anti_header import Header 113 | from pprint import pprint, pformat 114 | 115 | 116 | class SingleQuotesSpider(Spider): 117 | name = 'single_quotes' 118 | custom_settings = { 119 | "SPIDER_IDLE": False 120 | } 121 | start_urls = [ 122 | 'https://quotes.toscrape.com/', 123 | ] 124 | 125 | async def process_request(self, request): 126 | request.headers = Header(url=request.url, platform='windows', connection=True).random 127 | return request 128 | 129 | async def process_response(self, request, response): 130 | if response.status in [404, 503]: 131 | return request 132 | return response 133 | 134 | async def process_exception(self, request, exc): 135 | raise exc 136 | 137 | async def parse(self, response): 138 | for quote in response.css('div.quote'): 139 | yield { 140 | 'author': quote.xpath('span/small/text()').get(), 141 | 'text': quote.css('span.text::text').get(), 142 | } 143 | 144 | next_page = response.css('li.next a::attr("href")').get() 145 | if next_page is not None: 146 | yield response.follow(next_page, callback=self.parse) 147 | 148 | async def process_item(self, item): 149 | self.logger.info("{item}", **{'item': pformat(item)}) 150 | 151 | 152 | if __name__ == '__main__': 153 | quotes = SingleQuotesSpider() 154 | quotes.start() 155 | ``` 156 | 157 | ### 运行爬虫 158 | 159 | ```shell 160 | # 从项目中运行爬虫 161 | aioscpy crawl quotes 162 | 163 | # 运行单个爬虫脚本 164 | aioscpy runspider quotes.py 165 | ``` 166 | 167 | ![run](./images/run.png) 168 | 169 | ### 从代码中运行 170 | 171 | ```python 172 | from aioscpy.crawler import call_grace_instance 173 | from aioscpy.utils.tools import get_project_settings 174 | 
175 | # 方法1:从目录中加载所有爬虫 176 | def load_spiders_from_directory(): 177 | process = call_grace_instance("crawler_process", get_project_settings()) 178 | process.load_spider(path='./spiders') 179 | process.start() 180 | 181 | # 方法2:按名称运行特定爬虫 182 | def run_specific_spider(): 183 | process = call_grace_instance("crawler_process", get_project_settings()) 184 | process.crawl('myspider') 185 | process.start() 186 | 187 | if __name__ == '__main__': 188 | run_specific_spider() 189 | ``` 190 | 191 | ## 配置 192 | 193 | Aioscpy可以通过项目中的`settings.py`文件进行配置。以下是最重要的设置: 194 | 195 | ### 并发设置 196 | 197 | ```python 198 | # 最大并发处理项目数 199 | CONCURRENT_ITEMS = 100 200 | 201 | # 最大并发请求数 202 | CONCURRENT_REQUESTS = 16 203 | 204 | # 每个域名的最大并发请求数 205 | CONCURRENT_REQUESTS_PER_DOMAIN = 8 206 | 207 | # 每个IP的最大并发请求数 208 | CONCURRENT_REQUESTS_PER_IP = 0 209 | ``` 210 | 211 | ### 下载设置 212 | 213 | ```python 214 | # 请求间的延迟(秒) 215 | DOWNLOAD_DELAY = 0 216 | 217 | # 请求超时时间(秒) 218 | DOWNLOAD_TIMEOUT = 20 219 | 220 | # 是否随机化下载延迟 221 | RANDOMIZE_DOWNLOAD_DELAY = True 222 | 223 | # 使用的HTTP后端 224 | DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.httpx.HttpxDownloadHandler" 225 | # 其他选项: 226 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.aiohttp.AioHttpDownloadHandler" 227 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.handlers.requests.RequestsDownloadHandler" 228 | ``` 229 | 230 | ### 调度器设置 231 | 232 | ```python 233 | # 使用的调度器(基于内存或Redis) 234 | SCHEDULER = "aioscpy.core.scheduler.memory.MemoryScheduler" 235 | # 分布式爬取: 236 | # SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 237 | 238 | # Redis连接设置(用于Redis调度器) 239 | REDIS_URI = "redis://localhost:6379" 240 | QUEUE_KEY = "%(spider)s:queue" 241 | ``` 242 | 243 | ## 响应API 244 | 245 | Aioscpy提供了丰富的API来处理响应: 246 | 247 | ### 提取数据 248 | 249 | ```python 250 | # 使用CSS选择器 251 | title = response.css('title::text').get() 252 | all_links = response.css('a::attr(href)').getall() 253 | 254 | # 使用XPath 255 | title = response.xpath('//title/text()').get() 256 | all_links = response.xpath('//a/@href').getall() 257 | ``` 258 | 259 | ### 跟踪链接 260 | 261 | ```python 262 | # 跟踪链接 263 | yield response.follow('next-page.html', self.parse) 264 | 265 | # 使用回调跟踪链接 266 | yield response.follow('details.html', self.parse_details) 267 | 268 | # 跟踪所有匹配的CSS选择器的链接 269 | yield from response.follow_all(css='a.product::attr(href)', callback=self.parse_product) 270 | ``` 271 | 272 | ## 更多命令 273 | 274 | ```shell 275 | aioscpy -h 276 | ``` 277 | 278 | ## 分布式爬取 279 | 280 | 要启用基于Redis的分布式爬取: 281 | 282 | 1. 在设置中配置Redis: 283 | 284 | ```python 285 | SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 286 | REDIS_URI = "redis://localhost:6379" 287 | QUEUE_KEY = "%(spider)s:queue" 288 | ``` 289 | 290 | 2. 
在不同的机器上运行多个爬虫实例,全部连接到同一个Redis服务器。 291 | 292 | ## 贡献 293 | 294 | 请通过创建issue向项目所有者提交您的建议。 295 | 296 | ## 感谢 297 | 298 | [aiohttp](https://github.com/aio-libs/aiohttp/) 299 | 300 | [scrapy](https://github.com/scrapy/scrapy) 301 | 302 | [loguru](https://github.com/Delgan/loguru) 303 | 304 | [httpx](https://github.com/encode/httpx) 305 | -------------------------------------------------------------------------------- /doc/images/aioscpy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/doc/images/aioscpy.png -------------------------------------------------------------------------------- /doc/images/run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/doc/images/run.png -------------------------------------------------------------------------------- /doc/images/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/doc/images/tree.png -------------------------------------------------------------------------------- /example/project_quotes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ihandmine/aioscpy/018c78c809f292766e77f43dc59123711dd88566/example/project_quotes/__init__.py -------------------------------------------------------------------------------- /example/project_quotes/aioscpy.cfg: -------------------------------------------------------------------------------- 1 | 2 | [settings] 3 | default = settings 4 | 5 | [deploy] 6 | #url = http://localhost:6800/ 7 | project = project_quotes 8 | -------------------------------------------------------------------------------- /example/project_quotes/middlewares.py: -------------------------------------------------------------------------------- 1 | 2 | class ProjectQuotesDownloaderMiddleware: 3 | 4 | @classmethod 5 | def from_crawler(cls, crawler): 6 | # This method is used by Aioscpy to create your spiders. 7 | s = cls() 8 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 9 | return s 10 | 11 | def process_request(self, request, spider): 12 | # Called for each request that goes through the downloader 13 | # middleware. 14 | 15 | # Must either: 16 | # - return None: continue processing this request 17 | # - or return a Response object 18 | # - or return a Request object 19 | # - or raise IgnoreRequest: process_exception() methods of 20 | # installed downloader middleware will be called 21 | return None 22 | 23 | def process_response(self, request, response, spider): 24 | # Called with the response returned from the downloader. 25 | 26 | # Must either; 27 | # - return a Response object 28 | # - return a Request object 29 | # - or raise IgnoreRequest 30 | return response 31 | 32 | def process_exception(self, request, exception, spider): 33 | # Called when a download handler or a process_request() 34 | # (from other downloader middleware) raises an exception. 
35 | 36 | # Must either: 37 | # - return None: continue processing this exception 38 | # - return a Response object: stops process_exception() chain 39 | # - return a Request object: stops process_exception() chain 40 | pass 41 | 42 | def spider_opened(self, spider): 43 | spider.logger.info('Spider opened: %s' % spider.name) 44 | -------------------------------------------------------------------------------- /example/project_quotes/pipelines.py: -------------------------------------------------------------------------------- 1 | 2 | class ProjectQuotesPipeline: 3 | def process_item(self, item, spider): 4 | return item 5 | -------------------------------------------------------------------------------- /example/project_quotes/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = 'project_quotes' 2 | 3 | SPIDER_MODULES = ['spiders'] 4 | NEWSPIDER_MODULE = 'spiders' 5 | 6 | # CONCURRENT_REQUESTS = 16 7 | # CONCURRENT_REQUESTS_PER_DOMAIN = 8 8 | # CONCURRENT_REQUESTS_PER_IP = 0 9 | # RANDOMIZE_DOWNLOAD_DELAY = True 10 | 11 | # DOWNLOAD_DELAY = 0 12 | # DOWNLOAD_TIMEOUT = 20 13 | # DOWNLOAD_HANDLER = "aioscpy.core.downloader.http.AioHttpDownloadHandler" 14 | # SCHEDULER = "aioscpy.core.scheduler.redis.RedisScheduler" 15 | # SCHEDULER = "aioscpy.core.scheduler.memory.MemoryScheduler" 16 | 17 | 18 | # SPIDER_IDLE = False 19 | 20 | # :LOG CONFIG 21 | # LOG_LEVEL = "DEBUG" 22 | # LOG_FILE = False 23 | # LOG_FILENAME = f"{BOT_NAME}.log" 24 | # LOG_ENCODING = "utf-8" 25 | # LOG_ROTATION = "1 week" 26 | # LOG_RETENTION = "30 days" 27 | 28 | # message config 29 | # RABBITMQ_TCP = { 30 | # "host": "172.16.8.147", 31 | # # "port": 5672, 32 | # # "username": "admin", 33 | # # "password": "admin", 34 | # # "key": "message:queue", 35 | # # "max_priority": 100 36 | # } 37 | # QUEUE_KEY = '%(spider)s:requests' 38 | 39 | # REDIS_TCP = { 40 | # "host": "172.16.7.172", 41 | # "port": 6379, 42 | # "password": "123456", 43 | # "db": 15 44 | # } 45 | # REDIS_URI = "redis://:123456@172.16.7.172:6379/1" 46 | 47 | 48 | # DOWNLOADER_STATS = True 49 | 50 | # LOGSTATS_INTERVAL = 60.0 51 | # STATS_CLASS = 'aioscpy.libs.statscollectors.MemoryStatsCollector' 52 | # STATS_DUMP = True 53 | 54 | # DOWNLOADER_MIDDLEWARES = { 55 | # 'project_quotes.middlewares.ProjectQuotesDownloaderMiddleware': 543, 56 | # } 57 | 58 | # EXTENSIONS = { 59 | # } 60 | 61 | # ITEM_PIPELINES = { 62 | # 'project_quotes.pipelines.ProjectQuotesPipeline': 300, 63 | # } 64 | -------------------------------------------------------------------------------- /example/project_quotes/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Aioscpy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /example/project_quotes/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from aioscpy.spider import Spider 4 | from aioscpy import call_grace_instance 5 | 6 | 7 | class QuotesSpider(Spider): 8 | name = 'quotes' 9 | custom_settings = { 10 | "SPIDER_IDLE": False 11 | } 12 | start_urls = [ 13 | 'https://quotes.toscrape.com/', 14 | ] 15 | 16 | async def parse(self, response): 17 | 18 | for quote in response.css('div.quote'): 19 | yield { 20 | 'author': quote.xpath('span/small/text()').get(), 21 | 'text': quote.css('span.text::text').get(), 22 | } 23 | 24 | next_page = response.css('li.next a::attr("href")').get() 25 | if next_page is not None: 26 | # first next_page method: 27 | yield response.follow(next_page, callback=self.parse) 28 | 29 | # second next_page method: 30 | # next_page_url = 'https://quotes.toscrape.com' + next_page 31 | # yield call_grace_instance(self.di.get("request"), next_page_url, callback=self.parse) 32 | 33 | 34 | if __name__ == '__main__': 35 | q = QuotesSpider() 36 | q.start() 37 | -------------------------------------------------------------------------------- /example/project_quotes/start.py: -------------------------------------------------------------------------------- 1 | from aioscpy import call_grace_instance 2 | from aioscpy.utils.tools import get_project_settings 3 | 4 | 5 | def load_file_to_execute(): 6 | process = call_grace_instance("crawler_process", get_project_settings()) 7 | process.load_spider(path='./spiders') 8 | process.start() 9 | 10 | 11 | def load_name_to_execute(): 12 | process = call_grace_instance("crawler_process", get_project_settings()) 13 | process.crawl('quotes') 14 | process.start() 15 | 16 | 17 | if __name__ == '__main__': 18 | load_name_to_execute() 19 | -------------------------------------------------------------------------------- /example/single_quotes.py: -------------------------------------------------------------------------------- 1 | from aioscpy.spider import Spider 2 | from anti_header import Header 3 | from pprint import pprint, pformat 4 | 5 | 6 | class SingleQuotesSpider(Spider): 7 | name = 'single_quotes' 8 | custom_settings = { 9 | "SPIDER_IDLE": False 10 | } 11 | start_urls = [ 12 | 'https://quotes.toscrape.com/', 13 | ] 14 | 15 | async def process_request(self, request): 16 | request.headers = Header(url=request.url, platform='windows', connection=True).random 17 | return request 18 | 19 | async def process_response(self, request, response): 20 | if response.status in [404, 503]: 21 | return request 22 | return response 23 | 24 | async def process_exception(self, request, exc): 25 | raise exc 26 | 27 | async def parse(self, response): 28 | 29 | for quote in response.css('div.quote'): 30 | yield { 31 | 'author': quote.xpath('span/small/text()').get(), 32 | 'text': quote.css('span.text::text').get(), 33 | } 34 | 35 | next_page = response.css('li.next a::attr("href")').get() 36 | if next_page is not None: 37 | # first next_page method: 38 | yield response.follow(next_page, callback=self.parse) 39 | 40 | # second next_page method: 41 | # next_page_url = 'https://quotes.toscrape.com' + next_page 42 | # yield call_grace_instance(self.di.get("request"), next_page_url, callback=self.parse) 43 | 44 | async def process_item(self, item): 45 | pass 46 | # self.logger.info("{item}", **{'item': pformat(item)}) 47 | 48 | 49 | if __name__ == '__main__': 50 | q = 
SingleQuotesSpider() 51 | q.start() 52 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp~=3.8.1 2 | w3lib~=1.22.0 3 | parsel~=1.6.0 4 | redis~=4.3.1 5 | pika~=1.2.0 6 | loguru~=0.5.3 7 | PyDispatcher~=2.0.5 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from shutil import rmtree 4 | from os.path import dirname, join 5 | 6 | from setuptools import setup, Command, find_packages 7 | 8 | # Package meta-data. 9 | NAME = "aioscpy" 10 | DESCRIPTION = "An asyncio + aiolibs crawler imitate scrapy framework" 11 | URL = "https://github.com/ihandmine/aioscpy" 12 | EMAIL = "handmine@outlook.com" 13 | AUTHOR = "handmine" 14 | REQUIRES_PYTHON = ">=3.8.0" 15 | 16 | here = os.path.abspath(os.path.dirname(__file__)) 17 | with open(f"{here}/README.md", encoding='utf-8') as f: 18 | long_description = f.read() 19 | 20 | with open(join(dirname(__file__), 'aioscpy/VERSION'), 'rb') as f: 21 | old_version = f.read().decode('ascii').strip() 22 | maxv, midv, minv = [int(v) for v in old_version.split('.')] 23 | if minv <= 24: 24 | minv += 1 25 | else: 26 | midv += 1 27 | minv = 0 28 | VERSION = '.'.join([str(v) for v in [maxv, midv, minv]]) 29 | print(f'old version: {old_version}, new version: {VERSION}') 30 | 31 | 32 | class UploadCommand(Command): 33 | """Support setup_bak.py upload.""" 34 | 35 | description = "Build and publish the package." 36 | user_options = [] 37 | 38 | @staticmethod 39 | def status(s): 40 | """Prints things in bold.""" 41 | print("\033[1m{0}\033[0m".format(s)) 42 | 43 | def initialize_options(self): 44 | pass 45 | 46 | def finalize_options(self): 47 | pass 48 | 49 | def run(self): 50 | try: 51 | self.status("Removing previous builds...") 52 | rmtree(os.path.join(here, "dist")) 53 | except OSError: 54 | pass 55 | 56 | self.status("Building Source and Wheel distribution...") 57 | os.system("{0} setup.py sdist bdist_wheel".format(sys.executable)) 58 | 59 | self.status("Uploading the package to PyPI via Twine...") 60 | os.system("twine upload dist/*") 61 | 62 | with open(join(dirname(__file__), 'aioscpy/VERSION'), 'w') as f: 63 | f.write(VERSION + '\n') 64 | 65 | self.status("git option [add]") 66 | os.system("git add aioscpy/VERSION") 67 | 68 | self.status("git option [commit][push]") 69 | os.system(f'git commit -m "{VERSION}"') 70 | os.system("git push") 71 | sys.exit() 72 | 73 | 74 | extras_require = { 75 | "all": [ 76 | "aiohttp", 77 | "httpx", 78 | "anti-header", 79 | "w3lib", 80 | "parsel", 81 | "PyDispatcher", 82 | "redis", 83 | "anyio", 84 | "ujson" 85 | ], 86 | "aiohttp": ["aiohttp", "cryptography"], 87 | "httpx": ["httpx[http2]>=0.23.0"], 88 | } 89 | 90 | 91 | setup( 92 | name=NAME, 93 | version=VERSION, 94 | author=AUTHOR, 95 | packages=find_packages(), 96 | include_package_data=True, 97 | package_data={"": ["*.py", "*.tmpl", '*.cfg']}, 98 | install_requires=[ 99 | "aiohttp", 100 | "httpx", 101 | "anti-header", 102 | "w3lib", 103 | "parsel", 104 | "PyDispatcher", 105 | "redis", 106 | "anyio", 107 | "ujson" 108 | ], 109 | extras_require=extras_require, 110 | description=DESCRIPTION, 111 | long_description=long_description, 112 | long_description_content_type='text/markdown', 113 | url=URL, 114 | author_email=EMAIL, 115 | license="MIT", 116 | keywords=""" 117 | crawler 118 | scrapy 119 
| asyncio 120 | aiohttp 121 | anti-header 122 | anti-useragent 123 | python3 124 | """, 125 | python_requires=REQUIRES_PYTHON, 126 | zip_safe=False, 127 | entry_points={ 128 | 'console_scripts': ['aioscpy = aioscpy.cmdline:execute'] 129 | }, 130 | classifiers=[ 131 | "License :: OSI Approved :: MIT License", 132 | "Programming Language :: Python", 133 | "Programming Language :: Python :: 3.7", 134 | "Development Status :: 3 - Alpha", 135 | "Framework :: AsyncIO", 136 | "Operating System :: Unix", 137 | "Operating System :: Microsoft :: Windows", 138 | "Operating System :: MacOS", 139 | ], 140 | # Build and upload package: python3 setup_bak.py upload 141 | cmdclass={"upload": UploadCommand}, 142 | ) 143 | -------------------------------------------------------------------------------- /start.py: -------------------------------------------------------------------------------- 1 | from aioscpy.crawler import call_grace_instance 2 | from aioscpy.utils.tools import get_project_settings 3 | 4 | """start spider method one: 5 | from cegex.baidu import BaiduSpider 6 | from cegex.httpbin import HttpBinSpider 7 | 8 | process = CrawlerProcess() 9 | process.crawl(HttpBinSpider) 10 | process.crawl(BaiduSpider) 11 | process.start() 12 | """ 13 | 14 | 15 | def load_file_to_execute(): 16 | process = call_grace_instance("crawler_process", get_project_settings()) 17 | process.load_spider(path='./cegex', spider_like='httpbin') 18 | process.start() 19 | 20 | 21 | def load_name_to_execute(): 22 | process = call_grace_instance("crawler_process", get_project_settings()) 23 | process.crawl('ja3', path="./cegex") 24 | process.start() 25 | 26 | 27 | if __name__ == '__main__': 28 | load_name_to_execute() 29 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Aioscpy Tests 2 | 3 | This directory contains unit tests for the Aioscpy framework. 4 | 5 | ## Running the Tests 6 | 7 | To run all tests, use the following command from the project root: 8 | 9 | ```bash 10 | python -m tests.run_tests 11 | ``` 12 | 13 | To run a specific test file: 14 | 15 | ```bash 16 | python -m tests.test_engine_memory_management 17 | ``` 18 | 19 | ## Test Files 20 | 21 | - `test_engine_memory_management.py`: Tests for the memory management optimizations in the ExecutionEngine. 22 | - `test_engine_task_beat.py`: Tests for the task beat optimizations in the ExecutionEngine. 23 | - `test_httpx_handler.py`: Tests for the improved error handling in the HttpxDownloadHandler. 24 | - `test_adaptive_concurrency.py`: Tests for the AdaptiveConcurrencyMiddleware. 25 | 26 | ## Writing New Tests 27 | 28 | When writing new tests, follow these guidelines: 29 | 30 | 1. Create a new test file with a name that clearly indicates what is being tested. 31 | 2. Use the `unittest` framework. 32 | 3. Use mocks to isolate the code being tested. 33 | 4. Test both success and failure cases. 34 | 5. Add the new test to `run_tests.py`. 
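
For example, a new test module following these guidelines might look like the sketch below (the class under test and the setting name are placeholders for illustration, not real Aioscpy APIs):

```python
# tests/test_example.py - a minimal template for a new Aioscpy test module.
import unittest
from unittest.mock import MagicMock


class TestExample(unittest.TestCase):
    """Template for a new Aioscpy test case."""

    def setUp(self):
        # Isolate the code under test behind mocks instead of real components.
        self.crawler = MagicMock()
        self.crawler.settings.getint.return_value = 16

    def test_success_case(self):
        # Exercise the behaviour you expect to succeed.
        self.assertEqual(self.crawler.settings.getint('CONCURRENT_REQUESTS'), 16)

    def test_failure_case(self):
        # Cover the failure path as well.
        self.crawler.settings.getint.side_effect = ValueError("bad setting")
        with self.assertRaises(ValueError):
            self.crawler.settings.getint('CONCURRENT_REQUESTS')


if __name__ == '__main__':
    unittest.main()
```

Once the module exists, import its test case in `run_tests.py` and add it to the suite there, as described in step 5.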
35 | 36 | ## Test Coverage 37 | 38 | To generate a test coverage report, install the `coverage` package: 39 | 40 | ```bash 41 | pip install coverage 42 | ``` 43 | 44 | Then run the tests with coverage: 45 | 46 | ```bash 47 | coverage run -m tests.run_tests 48 | ``` 49 | 50 | And generate a report: 51 | 52 | ```bash 53 | coverage report 54 | ``` 55 | 56 | Or an HTML report: 57 | 58 | ```bash 59 | coverage html 60 | ``` 61 | -------------------------------------------------------------------------------- /tests/run_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | import os 4 | 5 | # Add the parent directory to the path so we can import the modules 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 7 | 8 | # Import the test modules 9 | from test_engine_memory_management import TestEngineMemoryManagement 10 | from test_engine_task_beat import TestEngineTaskBeat 11 | from test_httpx_handler import TestHttpxHandler 12 | from test_adaptive_concurrency import TestAdaptiveConcurrencyMiddleware 13 | 14 | 15 | def run_tests(): 16 | """Run all the tests.""" 17 | # Create a test suite 18 | test_suite = unittest.TestSuite() 19 | 20 | # Add the test cases 21 | test_suite.addTest(unittest.makeSuite(TestEngineMemoryManagement)) 22 | test_suite.addTest(unittest.makeSuite(TestEngineTaskBeat)) 23 | test_suite.addTest(unittest.makeSuite(TestHttpxHandler)) 24 | test_suite.addTest(unittest.makeSuite(TestAdaptiveConcurrencyMiddleware)) 25 | 26 | # Run the tests 27 | runner = unittest.TextTestRunner(verbosity=2) 28 | result = runner.run(test_suite) 29 | 30 | # Return the result 31 | return result.wasSuccessful() 32 | 33 | 34 | if __name__ == '__main__': 35 | success = run_tests() 36 | sys.exit(0 if success else 1) 37 | -------------------------------------------------------------------------------- /tests/test_adaptive_concurrency.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import time 3 | from unittest.mock import MagicMock, patch 4 | 5 | from aioscpy.middleware.adaptive_concurrency import AdaptiveConcurrencyMiddleware 6 | 7 | 8 | class TestAdaptiveConcurrencyMiddleware(unittest.TestCase): 9 | """Test the AdaptiveConcurrencyMiddleware.""" 10 | 11 | def setUp(self): 12 | # Create mocks 13 | self.crawler = MagicMock() 14 | self.crawler.settings = { 15 | 'ADAPTIVE_CONCURRENCY_ENABLED': True, 16 | 'ADAPTIVE_CONCURRENCY_TARGET_RESPONSE_TIME': 0.5, 17 | 'ADAPTIVE_CONCURRENCY_MIN_REQUESTS': 5, 18 | 'ADAPTIVE_CONCURRENCY_MAX_REQUESTS': 20, 19 | 'ADAPTIVE_CONCURRENCY_WINDOW_SIZE': 10, 20 | 'ADAPTIVE_CONCURRENCY_ADJUSTMENT_INTERVAL': 1, 21 | 'CONCURRENT_REQUESTS': 10, 22 | } 23 | self.crawler.settings.getbool = lambda key, default: self.crawler.settings.get(key, default) 24 | self.crawler.settings.getfloat = lambda key, default: self.crawler.settings.get(key, default) 25 | self.crawler.settings.getint = lambda key, default: self.crawler.settings.get(key, default) 26 | 27 | self.spider = MagicMock() 28 | self.spider.name = 'test_spider' 29 | 30 | # Create middleware 31 | self.middleware = AdaptiveConcurrencyMiddleware(self.crawler) 32 | self.middleware.logger = MagicMock() 33 | 34 | # Create request and response mocks 35 | self.request = MagicMock() 36 | self.request.meta = {} 37 | self.response = MagicMock() 38 | 39 | async def test_process_request_adds_start_time(self): 40 | """Test that process_request adds a start time to the request 
meta.""" 41 | result = await self.middleware.process_request(self.request, self.spider) 42 | 43 | # Verify that the result is None (middleware continues) 44 | self.assertIsNone(result) 45 | 46 | # Verify that a start time was added to the request meta 47 | self.assertIn('request_start_time', self.request.meta) 48 | self.assertIsInstance(self.request.meta['request_start_time'], float) 49 | 50 | async def test_process_response_calculates_time(self): 51 | """Test that process_response calculates the response time.""" 52 | # Set up a request with a start time 53 | start_time = time.time() - 0.3 # 300ms ago 54 | self.request.meta['request_start_time'] = start_time 55 | 56 | result = await self.middleware.process_response(self.request, self.response, self.spider) 57 | 58 | # Verify that the result is the response 59 | self.assertEqual(result, self.response) 60 | 61 | # Verify that a response time was added to the deque 62 | self.assertEqual(len(self.middleware.response_times), 1) 63 | self.assertGreaterEqual(self.middleware.response_times[0], 0.3) 64 | 65 | async def test_adjust_concurrency_faster_responses(self): 66 | """Test that concurrency is increased when responses are faster than target.""" 67 | # Fill the response times deque with fast responses (0.2s) 68 | self.middleware.response_times.extend([0.2] * self.middleware.window_size) 69 | self.middleware.current_concurrency = 10 70 | 71 | # Adjust concurrency 72 | self.middleware._adjust_concurrency() 73 | 74 | # Verify that concurrency was increased 75 | self.assertGreater(self.middleware.current_concurrency, 10) 76 | 77 | # Verify that the setting was updated 78 | self.crawler.settings.set.assert_called_with('CONCURRENT_REQUESTS', self.middleware.current_concurrency) 79 | 80 | # Verify that the change was logged 81 | self.middleware.logger.info.assert_called_once() 82 | 83 | async def test_adjust_concurrency_slower_responses(self): 84 | """Test that concurrency is decreased when responses are slower than target.""" 85 | # Fill the response times deque with slow responses (1.0s) 86 | self.middleware.response_times.extend([1.0] * self.middleware.window_size) 87 | self.middleware.current_concurrency = 10 88 | 89 | # Adjust concurrency 90 | self.middleware._adjust_concurrency() 91 | 92 | # Verify that concurrency was decreased 93 | self.assertLess(self.middleware.current_concurrency, 10) 94 | 95 | # Verify that the setting was updated 96 | self.crawler.settings.set.assert_called_with('CONCURRENT_REQUESTS', self.middleware.current_concurrency) 97 | 98 | # Verify that the change was logged 99 | self.middleware.logger.info.assert_called_once() 100 | 101 | async def test_adjust_concurrency_respects_min_max(self): 102 | """Test that concurrency adjustments respect the min and max limits.""" 103 | # Test minimum limit 104 | self.middleware.response_times.extend([2.0] * self.middleware.window_size) # Very slow responses 105 | self.middleware.current_concurrency = 6 106 | 107 | self.middleware._adjust_concurrency() 108 | 109 | # Verify that concurrency was not decreased below the minimum 110 | self.assertEqual(self.middleware.current_concurrency, 5) 111 | 112 | # Test maximum limit 113 | self.middleware.response_times.clear() 114 | self.middleware.response_times.extend([0.1] * self.middleware.window_size) # Very fast responses 115 | self.middleware.current_concurrency = 19 116 | 117 | self.middleware._adjust_concurrency() 118 | 119 | # Verify that concurrency was not increased above the maximum 120 | 
self.assertEqual(self.middleware.current_concurrency, 20) 121 | 122 | async def test_disabled_middleware(self): 123 | """Test that the middleware does nothing when disabled.""" 124 | # Disable the middleware 125 | self.middleware.enabled = False 126 | 127 | # Process a request 128 | result = await self.middleware.process_request(self.request, self.spider) 129 | 130 | # Verify that the result is None 131 | self.assertIsNone(result) 132 | 133 | # Verify that no start time was added 134 | self.assertNotIn('request_start_time', self.request.meta) 135 | 136 | # Process a response 137 | result = await self.middleware.process_response(self.request, self.response, self.spider) 138 | 139 | # Verify that the result is the response 140 | self.assertEqual(result, self.response) 141 | 142 | # Verify that no response times were recorded 143 | self.assertEqual(len(self.middleware.response_times), 0) 144 | 145 | 146 | if __name__ == '__main__': 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /tests/test_engine_memory_management.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import asyncio 3 | from unittest.mock import MagicMock, patch, AsyncMock 4 | 5 | from aioscpy.core.engine import ExecutionEngine 6 | 7 | 8 | class TestEngineMemoryManagement(unittest.TestCase): 9 | """Test the memory management optimizations in the ExecutionEngine.""" 10 | 11 | def setUp(self): 12 | # Create mocks 13 | self.crawler = MagicMock() 14 | self.crawler.settings = { 15 | 'GC_ENABLED': True, 16 | 'GC_FREQUENCY': 3, # Set to a small value for testing 17 | } 18 | self.crawler.settings.getint = lambda key, default: self.crawler.settings.get(key, default) 19 | self.crawler.settings.getbool = lambda key, default: self.crawler.settings.get(key, default) 20 | self.crawler.settings.getfloat = lambda key, default: self.crawler.settings.get(key, default) 21 | 22 | self.spider = MagicMock() 23 | self.spider.name = 'test_spider' 24 | 25 | self.slot = MagicMock() 26 | self.slot.close_if_idle = True 27 | 28 | # Create engine 29 | self.engine = ExecutionEngine(self.crawler, lambda: None) 30 | self.engine.logger = MagicMock() 31 | self.engine.spider_is_idle = AsyncMock(return_value=False) 32 | 33 | # Patch asyncio.sleep to avoid actual sleeping 34 | self.sleep_patch = patch('asyncio.sleep', new=AsyncMock()) 35 | self.mock_sleep = self.sleep_patch.start() 36 | 37 | # Patch gc.collect 38 | self.gc_patch = patch('gc.collect') 39 | self.mock_gc = self.gc_patch.start() 40 | 41 | def tearDown(self): 42 | self.sleep_patch.stop() 43 | self.gc_patch.stop() 44 | 45 | async def _run_heart_beat(self, iterations): 46 | """Helper to run the heart_beat method for a specific number of iterations.""" 47 | # Create a task for heart_beat 48 | task = asyncio.create_task(self.engine.heart_beat(0.1, self.spider, self.slot)) 49 | 50 | # Let it run for a few iterations 51 | for _ in range(iterations): 52 | await asyncio.sleep(0) 53 | 54 | # Cancel the task 55 | task.cancel() 56 | try: 57 | await task 58 | except asyncio.CancelledError: 59 | pass 60 | 61 | def test_gc_enabled(self): 62 | """Test that garbage collection runs when enabled.""" 63 | asyncio.run(self._run_heart_beat(10)) 64 | 65 | # With GC_FREQUENCY=3, we should have called gc.collect about 3 times in 10 iterations 66 | # (not exactly 3 because of the counter initialization and async nature) 67 | self.assertGreaterEqual(self.mock_gc.call_count, 2) 68 | 
self.assertLessEqual(self.mock_gc.call_count, 4) 69 | 70 | def test_gc_disabled(self): 71 | """Test that garbage collection doesn't run when disabled.""" 72 | self.crawler.settings['GC_ENABLED'] = False 73 | 74 | asyncio.run(self._run_heart_beat(10)) 75 | 76 | # With GC_ENABLED=False, gc.collect should never be called 77 | self.mock_gc.assert_not_called() 78 | 79 | def test_gc_frequency(self): 80 | """Test that garbage collection respects the frequency setting.""" 81 | # Set frequency to 5 82 | self.crawler.settings['GC_FREQUENCY'] = 5 83 | 84 | asyncio.run(self._run_heart_beat(15)) 85 | 86 | # With GC_FREQUENCY=5, we should have called gc.collect about 3 times in 15 iterations 87 | self.assertGreaterEqual(self.mock_gc.call_count, 2) 88 | self.assertLessEqual(self.mock_gc.call_count, 4) 89 | 90 | def test_gc_exception_handling(self): 91 | """Test that exceptions in garbage collection are handled properly.""" 92 | # Make gc.collect raise an exception 93 | self.mock_gc.side_effect = Exception("Test exception") 94 | 95 | asyncio.run(self._run_heart_beat(5)) 96 | 97 | # The exception should be caught and logged 98 | self.engine.logger.warning.assert_called() 99 | 100 | # The heart_beat should continue running despite the exception 101 | self.assertGreater(self.mock_sleep.call_count, 3) 102 | 103 | 104 | if __name__ == '__main__': 105 | unittest.main() 106 | -------------------------------------------------------------------------------- /tests/test_engine_task_beat.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import asyncio 3 | from unittest.mock import MagicMock, patch, AsyncMock 4 | 5 | from aioscpy.core.engine import ExecutionEngine 6 | 7 | 8 | class TestEngineTaskBeat(unittest.TestCase): 9 | """Test the task beat optimizations in the ExecutionEngine.""" 10 | 11 | def setUp(self): 12 | # Create mocks 13 | self.crawler = MagicMock() 14 | self.crawler.settings = { 15 | 'TASK_BEAT_ACTIVE_SLEEP': 0.1, 16 | 'TASK_BEAT_IDLE_SLEEP': 0.5, 17 | 'TASK_BEAT_BATCH_SIZE': 10, 18 | } 19 | self.crawler.settings.getint = lambda key, default: self.crawler.settings.get(key, default) 20 | self.crawler.settings.getbool = lambda key, default: self.crawler.settings.get(key, default) 21 | self.crawler.settings.getfloat = lambda key, default: self.crawler.settings.get(key, default) 22 | 23 | self.slot = MagicMock() 24 | self.slot.scheduler = MagicMock() 25 | self.slot.scheduler.async_next_request = AsyncMock() 26 | self.slot.add_request = MagicMock() 27 | 28 | # Create engine 29 | self.engine = ExecutionEngine(self.crawler, lambda: None) 30 | self.engine.logger = MagicMock() 31 | self.engine._needs_backout = MagicMock(return_value=False) 32 | self.engine.slot = self.slot 33 | self.engine.downloader = MagicMock() 34 | self.engine.downloader.fetch = AsyncMock() 35 | 36 | # Patch asyncio.sleep to avoid actual sleeping 37 | self.sleep_patch = patch('asyncio.sleep', new=AsyncMock()) 38 | self.mock_sleep = self.sleep_patch.start() 39 | 40 | def tearDown(self): 41 | self.sleep_patch.stop() 42 | 43 | async def _run_task_beat(self, iterations): 44 | """Helper to run the task_beat method for a specific number of iterations.""" 45 | # Create a task for task_beat 46 | task = asyncio.create_task(self.engine.task_beat()) 47 | 48 | # Let it run for a few iterations 49 | for _ in range(iterations): 50 | await asyncio.sleep(0) 51 | 52 | # Cancel the task 53 | task.cancel() 54 | try: 55 | await task 56 | except asyncio.CancelledError: 57 | pass 58 | 59 | def 
test_task_beat_with_requests(self): 60 | """Test that task_beat processes requests when available.""" 61 | # Set up mock to return some requests 62 | mock_requests = [MagicMock() for _ in range(3)] 63 | self.slot.scheduler.async_next_request.return_value = mock_requests 64 | 65 | asyncio.run(self._run_task_beat(2)) 66 | 67 | # Verify that scheduler.async_next_request was called with the batch size 68 | self.slot.scheduler.async_next_request.assert_called_with(count=10) 69 | 70 | # Verify that add_request and fetch were called for each request 71 | self.assertEqual(self.slot.add_request.call_count, 3) 72 | self.assertEqual(self.engine.downloader.fetch.call_count, 3) 73 | 74 | # Verify that we used the active sleep time 75 | self.mock_sleep.assert_called_with(0.1) 76 | 77 | def test_task_beat_no_requests(self): 78 | """Test that task_beat handles the case when no requests are available.""" 79 | # Set up mock to return no requests 80 | self.slot.scheduler.async_next_request.return_value = [] 81 | 82 | asyncio.run(self._run_task_beat(2)) 83 | 84 | # Verify that scheduler.async_next_request was called 85 | self.slot.scheduler.async_next_request.assert_called() 86 | 87 | # Verify that add_request and fetch were not called 88 | self.slot.add_request.assert_not_called() 89 | self.engine.downloader.fetch.assert_not_called() 90 | 91 | # Verify that we used the idle sleep time 92 | self.mock_sleep.assert_called_with(0.5) 93 | 94 | def test_task_beat_with_backout(self): 95 | """Test that task_beat respects the backout condition.""" 96 | # Set up mock to indicate backout is needed 97 | self.engine._needs_backout.return_value = True 98 | 99 | asyncio.run(self._run_task_beat(2)) 100 | 101 | # Verify that scheduler.async_next_request was not called 102 | self.slot.scheduler.async_next_request.assert_not_called() 103 | 104 | # Verify that we used the idle sleep time 105 | self.mock_sleep.assert_called_with(0.5) 106 | 107 | 108 | if __name__ == '__main__': 109 | unittest.main() 110 | -------------------------------------------------------------------------------- /tests/test_httpx_handler.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import asyncio 3 | from unittest.mock import MagicMock, patch, AsyncMock 4 | 5 | import httpx 6 | 7 | from aioscpy.core.downloader.handlers.httpx import HttpxDownloadHandler 8 | 9 | 10 | class TestHttpxHandler(unittest.TestCase): 11 | """Test the improved error handling in the HttpxDownloadHandler.""" 12 | 13 | def setUp(self): 14 | # Create mocks 15 | self.settings = { 16 | 'DOWNLOAD_TIMEOUT': 10, 17 | } 18 | self.settings.get = lambda key, default=None: self.settings.get(key, default) 19 | 20 | self.crawler = MagicMock() 21 | self.crawler.settings = self.settings 22 | 23 | self.spider = MagicMock() 24 | self.spider.name = 'test_spider' 25 | 26 | # Create request mock 27 | self.request = MagicMock() 28 | self.request.url = 'https://example.com' 29 | self.request.method = 'GET' 30 | self.request.headers = {} 31 | self.request.cookies = {} 32 | self.request.body = None 33 | self.request.json = None 34 | self.request.meta = {} 35 | 36 | # Create handler 37 | self.handler = HttpxDownloadHandler(self.settings, self.crawler) 38 | self.handler.logger = MagicMock() 39 | 40 | # Mock the dependency injection 41 | self.mock_response_cls = MagicMock() 42 | self.handler.di = MagicMock() 43 | self.handler.di.get.return_value = self.mock_response_cls 44 | 45 | # Patch httpx.AsyncClient 46 | self.client_patch = 
patch('httpx.AsyncClient') 47 | self.mock_client_cls = self.client_patch.start() 48 | self.mock_client = AsyncMock() 49 | self.mock_client_cls.return_value.__aenter__.return_value = self.mock_client 50 | 51 | # Create a mock response 52 | self.mock_http_response = MagicMock() 53 | self.mock_http_response.url = 'https://example.com' 54 | self.mock_http_response.status_code = 200 55 | self.mock_http_response.headers = {} 56 | self.mock_http_response.cookies = {} 57 | self.mock_http_response.read.return_value = b'response content' 58 | 59 | # Set up the client to return the mock response 60 | self.mock_client.request.return_value = self.mock_http_response 61 | 62 | def tearDown(self): 63 | self.client_patch.stop() 64 | 65 | async def test_successful_request(self): 66 | """Test that a successful request returns a response object.""" 67 | response = await self.handler.download_request(self.request, self.spider) 68 | 69 | # Verify that the client was called with the correct arguments 70 | self.mock_client.request.assert_called_once() 71 | args, kwargs = self.mock_client.request.call_args 72 | self.assertEqual(args[0], 'GET') 73 | self.assertEqual(args[1], 'https://example.com') 74 | 75 | # Verify that the response was created correctly 76 | self.mock_response_cls.assert_called_once() 77 | self.assertEqual(response, self.mock_response_cls.return_value) 78 | 79 | async def test_timeout_exception(self): 80 | """Test that a timeout exception is handled properly.""" 81 | # Make the client raise a timeout exception 82 | self.mock_client.request.side_effect = httpx.TimeoutException('Timeout') 83 | 84 | # Mock the exceptions 85 | mock_timeout_error = MagicMock() 86 | self.handler.di.get.side_effect = lambda x: mock_timeout_error if x == 'exceptions' else self.mock_response_cls 87 | 88 | with self.assertRaises(Exception): 89 | await self.handler.download_request(self.request, self.spider) 90 | 91 | # Verify that the error was logged 92 | self.handler.logger.warning.assert_called_once() 93 | 94 | # Verify that the correct exception was raised 95 | mock_timeout_error.TimeoutError.assert_called_once() 96 | 97 | async def test_request_error(self): 98 | """Test that a request error is handled properly.""" 99 | # Make the client raise a request error 100 | self.mock_client.request.side_effect = httpx.RequestError('Connection error') 101 | 102 | # Mock the exceptions 103 | mock_connection_error = MagicMock() 104 | self.handler.di.get.side_effect = lambda x: mock_connection_error if x == 'exceptions' else self.mock_response_cls 105 | 106 | with self.assertRaises(Exception): 107 | await self.handler.download_request(self.request, self.spider) 108 | 109 | # Verify that the error was logged 110 | self.handler.logger.warning.assert_called_once() 111 | 112 | # Verify that the correct exception was raised 113 | mock_connection_error.ConnectionError.assert_called_once() 114 | 115 | async def test_unexpected_error(self): 116 | """Test that an unexpected error is handled properly.""" 117 | # Make the client raise an unexpected error 118 | self.mock_client.request.side_effect = ValueError('Unexpected error') 119 | 120 | # Mock the exceptions 121 | mock_download_error = MagicMock() 122 | self.handler.di.get.side_effect = lambda x: mock_download_error if x == 'exceptions' else self.mock_response_cls 123 | 124 | with self.assertRaises(Exception): 125 | await self.handler.download_request(self.request, self.spider) 126 | 127 | # Verify that the error was logged 128 | self.handler.logger.error.assert_called_once() 129 | 130 | 
# Verify that the correct exception was raised 131 | mock_download_error.DownloadError.assert_called_once() 132 | 133 | async def test_proxy_configuration(self): 134 | """Test that proxy configuration is handled properly.""" 135 | # Set up a request with a proxy 136 | self.request.meta['proxy'] = 'http://proxy.example.com:8080' 137 | 138 | await self.handler.download_request(self.request, self.spider) 139 | 140 | # Verify that the client was created with the proxy 141 | args, kwargs = self.mock_client_cls.call_args 142 | self.assertEqual(kwargs['proxies'], 'http://proxy.example.com:8080') 143 | 144 | # Verify that the proxy usage was logged 145 | self.handler.logger.debug.assert_called_once() 146 | 147 | 148 | if __name__ == '__main__': 149 | unittest.main() 150 | --------------------------------------------------------------------------------