├── .gitignore ├── LICENSE ├── README.md ├── pyproject.toml ├── scrapeops_scrapy ├── __init__.py ├── core │ ├── __init__.py │ ├── api.py │ ├── controllers.py │ ├── core.py │ ├── error_logger.py │ ├── model.py │ └── setup.py ├── exceptions.py ├── extension.py ├── middleware │ ├── __init__.py │ ├── retry.py │ └── stats.py ├── normalizer │ ├── __init__.py │ ├── domains.py │ ├── exceptions.py │ ├── middleware.py │ ├── proxies.py │ ├── proxy_port_normalizer.py │ └── request_response.py ├── signals │ ├── __init__.py │ ├── scrapeops_signals.py │ └── triggers.py ├── stats │ ├── __init__.py │ ├── failed_urls.py │ ├── logger.py │ └── model.py ├── tests │ ├── __init__.py │ └── core.py ├── utils │ ├── __init__.py │ ├── error_handling.py │ └── utils.py └── validators │ ├── __init__.py │ ├── item_validator.py │ └── response_validator.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | venv/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, ScrapeOps 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ScrapeOps Scrapy SDK: Scrapy Extension For Spider Monitoring, Alerts and Scheduling. 
2 | The ScrapeOps Scrapy SDK is an extension for your Scrapy spiders that gives you all the scraping monitoring, statistics, alerting, scheduling and data validation you will need straight out of the box. 3 | 4 | Just enable it in your `settings.py` file and the SDK will automatically monitor your scrapers and send your logs to your scraping dashboard. When connected to a ScrapyD server, you can schedule and manage all your jobs from one easy to use interface. 5 | 6 | **Full documentation can be found here:** [ScrapeOps Documentation](https://scrapeops.io/docs/intro) 7 | 8 | 9 | ![ScrapeOps Dashboard Demo](https://github.com/ScrapeOps/scrapeops-docs/blob/main/assets/scrapeops-hero-demo.jpg) 10 | 11 | 12 | ## :computer: Demo 13 | [:link: ScrapeOps Dashboard Demo](https://scrapeops.io/app/login/demo) 14 | 15 | ## :star: Features 16 |
17 | View features 18 | 19 | - **Scrapy Job Stats & Visualisation** 20 | - :chart_with_upwards_trend: Individual Job Progress Stats 21 | - :bar_chart: Compare Jobs versus Historical Jobs 22 | - :100: Job Stats Tracked 23 | - :white_check_mark: Pages Scraped & Missed 24 | - :white_check_mark: Items Parsed & Missed 25 | - :white_check_mark: Item Field Coverage 26 | - :white_check_mark: Runtimes 27 | - :white_check_mark: Response Status Codes 28 | - :white_check_mark: Success Rates & Average Latencies 29 | - :white_check_mark: Errors & Warnings 30 | - :white_check_mark: Bandwidth 31 | 32 | - **Health Checks & Alerts** 33 | - :male_detective: Custom Spider & Job Health Checks 34 | - :package: Out of the Box Alerts - Slack (More coming soon!) 35 | - :bookmark_tabs: Daily Scraping Reports 36 | 37 | - **ScrapyD Cluster Management** 38 | - :link: Integrate With ScrapyD Servers 39 | - :alarm_clock: Schedule Periodic Jobs 40 | - :100: All Scrapyd JSON API Endpoints Supported 41 | - :closed_lock_with_key: Secure Your ScrapyD with BasicAuth, HTTPS or Whitelisted IPs 42 | - **Proxy Monitoring (Coming Soon)** 43 | - :chart_with_upwards_trend: Monitor Your Proxy Account Usage 44 | - :chart_with_downwards_trend: Track Your Proxy Providers' Performance 45 | - :bar_chart: Compare Proxy Performance Versus Other Providers 46 | 47 |
48 | 49 | ## :rocket: Getting Started 50 | You can get the ScrapeOps monitoring suite up and running in **4 easy steps** (plus one optional step). 51 | 52 | #### #1 - Install the ScrapeOps SDK: 53 | 54 | ``` 55 | pip install scrapeops-scrapy 56 | ``` 57 | 58 | #### #2 - Get Your ScrapeOps API Key: 59 | Create a [free ScrapeOps account here](https://scrapeops.io/app/register) and get your API key from the dashboard. 60 | 61 | When you have your API key, open your Scrapy project's `settings.py` file and insert your API key into it. 62 | 63 | ```python 64 | SCRAPEOPS_API_KEY = 'YOUR_API_KEY' 65 | ``` 66 | 67 | #### #3 - Add in the ScrapeOps Extension: 68 | In the `settings.py` file, add the ScrapeOps extension by simply adding it to the `EXTENSIONS` dictionary. 69 | 70 | ```python 71 | EXTENSIONS = { 72 | 'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500, 73 | } 74 | ``` 75 | 76 | #### #4 - Enable the ScrapeOps Retry Middleware: 77 | To get the most accurate stats, you need to add the ScrapeOps retry middleware to the `DOWNLOADER_MIDDLEWARES` dictionary and disable the default Scrapy Retry middleware in your Scrapy project's `settings.py` file. 78 | 79 | You can do this by setting the default Scrapy RetryMiddleware to `None` and enabling the ScrapeOps retry middleware in its place. 80 | 81 | ```python 82 | DOWNLOADER_MIDDLEWARES = { 83 | 'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550, 84 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, 85 | } 86 | ``` 87 | 88 | The retry logic will operate exactly as before; however, the ScrapeOps retry middleware will also log every request, response and exception your spiders generate. 89 | 90 | #### #5 - (Optional) Exclude Settings From Being Logged By ScrapeOps SDK: 91 | By default, the ScrapeOps SDK logs the settings used for each scrape so you can keep track of them. To ensure it doesn't record sensitive information like API keys, it won't log any setting whose name contains the following substrings: 92 | 93 | - `API_KEY` 94 | - `APIKEY` 95 | - `SECRET_KEY` 96 | - `SECRETKEY` 97 | 98 | The SDK also skips any setting whose name contains `PASSWORD` or `CONNECTION_STRING`. Any other settings that don't match these patterns can still be logged, so you can specify which settings not to log by adding them to the `SCRAPEOPS_SETTINGS_EXCLUSION_LIST`. 99 | 100 | ```python 101 | SCRAPEOPS_SETTINGS_EXCLUSION_LIST = [ 102 | 'NAME_OF_SETTING_NOT_TO_LOG' 103 | ] 104 | ``` 105 | 106 | #### Done! 107 | That's all. From here, the ScrapeOps SDK will automatically monitor and collect statistics from your scraping jobs and display them in your [ScrapeOps dashboard](https://scrapeops.io/app/dashboard). 
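As a recap, here is a sketch of what a complete `settings.py` might look like with all of the above enabled. The values shown are just the examples from this guide, not defaults:

```python
## settings.py (example recap of the steps above)

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'

# Enable the ScrapeOps monitoring extension
EXTENSIONS = {
    'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,
}

# Swap the default Scrapy retry middleware for the ScrapeOps one
DOWNLOADER_MIDDLEWARES = {
    'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}

# Optional: keep specific settings out of the logged job settings
SCRAPEOPS_SETTINGS_EXCLUSION_LIST = [
    'NAME_OF_SETTING_NOT_TO_LOG',
]
```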
108 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /scrapeops_scrapy/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.5.6" -------------------------------------------------------------------------------- /scrapeops_scrapy/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/core/__init__.py -------------------------------------------------------------------------------- /scrapeops_scrapy/core/api.py: -------------------------------------------------------------------------------- 1 | from scrapeops_scrapy.exceptions import ScrapeOpsAPIResponseError 2 | from scrapeops_scrapy.utils.utils import merge_dicts 3 | from scrapeops_scrapy.normalizer.proxies import ProxyNormalizer 4 | import requests 5 | import time 6 | 7 | 8 | class SOPSRequest(object): 9 | 10 | TIMEOUT = 30 11 | RETRY_LIMIT = 3 12 | API_KEY = None 13 | JOB_GROUP_ID = None 14 | SCRAPEOPS_ENDPOINT = 'https://api.scrapeops.io/' 15 | SCRAPEOPS_API_VERSION = 'api/v1/' 16 | SCRAPEOPS_LOGGING_DATA = None 17 | HIGH_FREQ_ACC = True 18 | 19 | def __init__(self): 20 | self.data = None 21 | self.valid = None 22 | self.action = None 23 | self.error = None 24 | 25 | 26 | def setup_request(self, body=None): 27 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'setup/' 28 | data, error = SOPSRequest.post(url, body=body) 29 | data, self.valid, self.action, self.error = SOPSRequest.setup_stats_validation(data, error) 30 | return data, self 31 | 32 | 33 | def stats_request(self, body=None, log_body=None, files=None): 34 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, body) 35 | if files is not None: 36 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'logs/?log_type=scrapy' 37 | _, _ = SOPSRequest.post_file(url, body=log_body, files=files) 38 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'stats/' 39 | data, error = SOPSRequest.post(url, body=post_body) 40 | data, self.valid, self.action, self.error = SOPSRequest.setup_stats_validation(data, error) 41 | return data, self 42 | 43 | 44 | def error_report_request(self, error_type=None, body=None, files=None): 45 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, body) 46 | if files is None: 47 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'errors/?error_type={error_type}' 48 | data, error = SOPSRequest.post(url, body=post_body, files=files) 49 | else: 50 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'errors/logs/?error_type={error_type}' 51 | data, error = SOPSRequest.post_file(url, body=post_body, files=files) 52 | data, self.valid, self.action, self.error = SOPSRequest.error_report_validation(data, error) 53 | return data, self 54 | 55 | 56 | def proxy_normalisation_request(self, request_response_object): 57 | proxy_name = request_response_object.get_proxy_port_name() 58 | proxy_string = request_response_object.get_raw_proxy() 59 | post_body = 
merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, {'proxy_string': proxy_string}) 60 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'normalizer/proxy/?proxy_name={proxy_name}' 61 | data, error = SOPSRequest.post(url, body=post_body) 62 | data, self.valid, self.action, self.error = SOPSRequest.normaliser_validation(data, error, request_type='proxy') 63 | return data, self 64 | 65 | 66 | def proxy_api_normalisation_request(self, request_response_object): 67 | proxy_name = request_response_object.get_proxy_api_name() 68 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'normalizer/proxy_api/?proxy_name={proxy_name}' 69 | data, error = SOPSRequest.post(url, body=SOPSRequest.SCRAPEOPS_LOGGING_DATA) 70 | data, self.valid, self.action, self.error = SOPSRequest.normaliser_validation(data, error, request_type='proxy_api') 71 | return data, self 72 | 73 | def proxy_port_normalisation_request(self, request_response_object, test_data=None): 74 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, { 75 | 'proxy_string': request_response_object.get_complete_proxy_string(), 76 | 'proxy_headers': request_response_object.get_proxy_port_headers(), 77 | 'domain': request_response_object.get_proxy_port_name()}) 78 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'normalizer/proxy_port/?job_id={SOPSRequest.JOB_GROUP_ID}' 79 | if test_data is not None: 80 | post_body['test_data'] = test_data 81 | data, error = SOPSRequest.post(url, body=post_body) 82 | data, self.valid, self.action, self.error = SOPSRequest.normaliser_validation(data, error, request_type='proxy_port') 83 | return data, self 84 | 85 | 86 | def domain_normalisation_request(self, request_response_object): 87 | domain = request_response_object.get_domain() 88 | real_url = request_response_object.get_real_url() 89 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, {'url': real_url}) 90 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'normalizer/domain/?domain={domain}' 91 | data, error = SOPSRequest.post(url, body=post_body) 92 | data, self.valid, self.action, self.error = SOPSRequest.normaliser_validation(data, error, request_type='domain') 93 | return data, self 94 | 95 | def proxy_alert_request(self, request_response_object, job_group_id, error_response, alerts_sent): 96 | data = error_response 97 | data['domain'] = request_response_object.get_domain() 98 | data['proxy_provider'] = request_response_object.get_proxy_name() 99 | data['alerts_sent'] = alerts_sent 100 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, data) 101 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'alerts/proxy/?job_group_id={job_group_id}' 102 | data, error = SOPSRequest.post(url, body=post_body) 103 | data, self.valid, self.error = SOPSRequest.generic_validation(data, error) 104 | return data, self 105 | 106 | def proxy_test_request(self, url, request_response_object): 107 | data, _ = SOPSRequest.get(url, proxy=request_response_object.get_complete_proxy_string()) 108 | return data 109 | 110 | @staticmethod 111 | def generic_validation(data, error): 112 | if data is None: 113 | return data, False, str(error) 114 | elif data.get('api_key') == 'invalid': 115 | return data, False, 'invalid_api_key' 116 | elif data.get('job_id') == 'invalid': 117 | return data, False, 'invalid_job' 118 | return data, True, None 119 | 120 | 121 | @staticmethod 122 | def setup_stats_validation(data, error): 123 | if data 
is None: 124 | return data, False, 'retry', str(error) 125 | elif data.get('api_key') == 'invalid': 126 | return data, False, 'close', 'invalid_api_key' 127 | elif data.get('job_valid') is not True and data.get('job_id') is None: 128 | return data, False, 'retry', 'invalid_job' 129 | return data, True, 'valid', None 130 | 131 | 132 | @staticmethod 133 | def normaliser_validation(data, error, request_type=None): 134 | if data is None: 135 | return data, False, 'fallback', str(error) 136 | elif data.get('api_key') == 'invalid': 137 | return data, False, 'close', 'invalid_api_key' 138 | 139 | ## proxy port 140 | elif request_type=='proxy_port' and data.get('proxy_port_details') is None: 141 | return data, False, 'fallback', 'no_proxy_port_details' 142 | 143 | ## proxy api 144 | elif request_type=='proxy_api' and data.get('proxy_parsing_data') is None: 145 | return data, False, 'fallback', 'no_proxy_parsing_data' 146 | elif request_type=='proxy_api' and data.get('proxy_parsing_data') is not None: 147 | proxy_parsing_data = data.get('proxy_parsing_data') 148 | if proxy_parsing_data.get('known_proxy') is False: 149 | return data, False, 'fallback', 'unknown_proxy' 150 | 151 | ## domain specific 152 | elif request_type=='domain' and data.get('domain_parsing_data') is None: 153 | return data, False, 'fallback', 'no_domain_parsing_data' 154 | return data, True, 'valid', None 155 | 156 | 157 | @staticmethod 158 | def error_report_validation(data, error): 159 | if data is None: 160 | return data, False, 'retry', str(error) 161 | elif data.get('error_logged') is False: 162 | return data, False, 'close', 'error_not_logged' 163 | return data, True, 'valid', None 164 | 165 | @staticmethod 166 | def condense_stats_body(body): 167 | return { 168 | 'job_id': body.get('job_id'), 169 | 'job_group_id': body.get('job_group_id'), 170 | } 171 | 172 | @staticmethod 173 | def get(url, proxy=None, check=True): 174 | proxies = None 175 | if ProxyNormalizer.unknown_proxy_scheme(proxy) is not True: 176 | proxies = {ProxyNormalizer.get_proxy_scheme(proxy): proxy} 177 | for _ in range(SOPSRequest.RETRY_LIMIT): 178 | try: 179 | response = requests.get(url, timeout=SOPSRequest.TIMEOUT, proxies=proxies, headers={'api_key': SOPSRequest.API_KEY}) 180 | if check and response.status_code == 401: 181 | return None, 'invalid_api_key' 182 | if response.status_code == 200: 183 | data = response.json() 184 | return data, None 185 | else: 186 | raise ScrapeOpsAPIResponseError 187 | except requests.exceptions.ConnectionError as e: 188 | error = e 189 | continue 190 | except ScrapeOpsAPIResponseError as e: 191 | error = e 192 | continue 193 | except Exception as e: 194 | error = e 195 | continue 196 | return None, str(error) 197 | 198 | 199 | @staticmethod 200 | def post(url, body=None, files=None, proxy=None): 201 | proxies = None 202 | if ProxyNormalizer.unknown_proxy_scheme(proxy) is not True: 203 | proxies = {ProxyNormalizer.get_proxy_scheme(proxy): proxy} 204 | for _ in range(SOPSRequest.RETRY_LIMIT): 205 | try: 206 | response = requests.post(url, json=body, timeout=SOPSRequest.TIMEOUT, files=files, proxies=proxies, headers={'api_key': SOPSRequest.API_KEY}) 207 | if response.status_code == 401: 208 | return None, 'invalid_api_key' 209 | if response.status_code == 200: 210 | data = response.json() 211 | return data, None 212 | else: 213 | time.sleep(3) 214 | raise ScrapeOpsAPIResponseError 215 | except requests.exceptions.ConnectionError as e: 216 | error = e 217 | continue 218 | except ScrapeOpsAPIResponseError as e: 219 | error = 
e 220 | continue 221 | except Exception as e: 222 | error = e 223 | continue 224 | return None, str(error) 225 | 226 | 227 | @staticmethod 228 | def post_file(url, body=None, files=None): 229 | for _ in range(SOPSRequest.RETRY_LIMIT): 230 | try: 231 | response = requests.post(url, data=body, timeout=SOPSRequest.TIMEOUT, files=files, headers={'api_key': SOPSRequest.API_KEY}) 232 | if response.status_code == 401: 233 | return None, 'invalid_api_key' 234 | if response.status_code == 200: 235 | data = response.json() 236 | return data, None 237 | else: 238 | raise ScrapeOpsAPIResponseError 239 | except requests.exceptions.ConnectionError as e: 240 | error = e 241 | continue 242 | except ScrapeOpsAPIResponseError as e: 243 | error = e 244 | continue 245 | except Exception as e: 246 | error = e 247 | continue 248 | return None, str(error) 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /scrapeops_scrapy/core/controllers.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from scrapeops_scrapy.utils import utils 4 | from scrapeops_scrapy.core.setup import SDKSetup 5 | from scrapeops_scrapy.core.api import SOPSRequest 6 | import sys 7 | 8 | class SDKControllers(SDKSetup): 9 | 10 | SETUP_ATTEMPT_LIMIT = 3 11 | 12 | def __init__(self): 13 | SDKSetup.__init__(self) 14 | 15 | def send_setup_request(self): 16 | data, status = SOPSRequest().setup_request(body=self.setup_data()) 17 | if status.valid: 18 | self.initialize_job_details(data) 19 | elif status.action == 'retry' and self._setup_attempts < SDKControllers.SETUP_ATTEMPT_LIMIT: 20 | self._setup_attempts += 1 21 | self._error_logger.log_error(reason='setup_failed', 22 | error=status.error, 23 | data={'setup_attempts': self._setup_attempts}) 24 | elif status.action == 'retry' and self._setup_attempts >= SDKControllers.SETUP_ATTEMPT_LIMIT: 25 | self.deactivate_sdk(reason='exceeded_max_setup_attempts', 26 | error=status.error, 27 | data={'setup_attempts': self._setup_attempts}, 28 | request_type='setup') 29 | else: 30 | self.deactivate_sdk(reason=status.error, data=data, request_type='setup') 31 | 32 | 33 | def send_stats(self, periodic_stats=None, overall_stats=None, reason=None, stats_type=None): 34 | self._sdk_run_time = self._sdk_run_time + self._period_frequency 35 | post_body = self.stats_data(periodic_stats=periodic_stats, overall_stats=overall_stats, stats_type=stats_type, reason=reason) 36 | 37 | if self.job_active() is False: 38 | self.send_setup_request() 39 | 40 | ## retest if job is inactive 41 | if self.job_active() is False: 42 | self.cache_failed_stats(post_body) 43 | self._error_logger.log_error(reason=f'sending_{stats_type}_stats_failure', 44 | data={'failed_periods': self.failed_periods}) 45 | 46 | if self.job_active(): 47 | if stats_type == 'finished' and self.export_logs(): 48 | log_body = self.log_data() 49 | with open(self.log_file, 'rb') as f: 50 | data, status = SOPSRequest().stats_request(body=post_body, log_body=log_body, files={'file': f}) 51 | else: 52 | data, status = SOPSRequest().stats_request(body=post_body) 53 | 54 | if status.valid: 55 | self.update_sdk_settings(data) 56 | self.reset_failed_stats() 57 | elif status.action == 'retry': 58 | self.cache_failed_stats(post_body) 59 | self._error_logger.log_error(reason=f'sending_{stats_type}_stats_failure', 60 | 
error=status.error, 61 | data={'failed_periods': self.failed_periods}) 62 | 63 | 64 | def sdk_enabled(self): 65 | if self._sdk_active: 66 | if self.request_response_middleware is None: 67 | self.initialize_normalizer_middleware() 68 | return True 69 | return False 70 | 71 | 72 | def check_api_key_present(self): 73 | if self._scrapeops_api_key == None: 74 | self._sdk_active = False 75 | return False 76 | self._sdk_active = True 77 | return True 78 | 79 | def deactivate_sdk(self, reason=None, error=None, request_type=None, data=None): 80 | self._sdk_active = False 81 | if reason != 'scrapy_shell': 82 | self._error_logger.sdk_error_close(reason=reason, error=error, request_type=request_type) 83 | 84 | def job_active(self): 85 | if self.job_id is None and self._sdk_active: 86 | return False 87 | return True 88 | 89 | def cache_failed_stats(self, post_body): 90 | self.cached_failed_stats.append(post_body) 91 | self.failed_periods = len(self.cached_failed_stats) 92 | 93 | def reset_failed_stats(self): 94 | self.cached_failed_stats = [] 95 | self.failed_periods = 0 96 | 97 | def get_runtime(self, time=None): 98 | if time is None: 99 | return utils.current_time() - self._scrapeops_job_start 100 | return time - self._scrapeops_job_start 101 | 102 | def scrapeops_middleware_enabled(self): 103 | if self._scrapeops_middleware is True: 104 | return True 105 | return False 106 | 107 | def export_logs(self): 108 | if self._scrapeops_export_scrapy_logs and self.log_file is not None: 109 | return True 110 | return False 111 | 112 | def not_scrapy_shell(self): 113 | if sys.argv[0] == 'shell': 114 | self.deactivate_sdk(reason='scrapy_shell') 115 | return False 116 | return True 117 | 118 | 119 | -------------------------------------------------------------------------------- /scrapeops_scrapy/core/core.py: -------------------------------------------------------------------------------- 1 | 2 | from twisted.internet import task 3 | 4 | from scrapeops_scrapy.exceptions import ScrapeOpsMissingAPIKey 5 | from scrapeops_scrapy.utils import utils 6 | from scrapeops_scrapy.core.controllers import SDKControllers 7 | from scrapeops_scrapy.stats.logger import StatsLogger 8 | from scrapeops_scrapy.normalizer.request_response import RequestResponse 9 | 10 | 11 | 12 | class ScrapeopsCore(SDKControllers, StatsLogger): 13 | """ 14 | Where the core ScrapeOps Functionality Goes 15 | """ 16 | 17 | def __init__(self): 18 | SDKControllers.__init__(self) 19 | StatsLogger.__init__(self) 20 | 21 | def start_sdk(self, spider=None, crawler=None): 22 | if self.not_scrapy_shell(): 23 | self.start_time = self.period_start_time = utils.current_time() 24 | self.initialize_SDK(spider, crawler=crawler) 25 | if self.check_api_key_present(): 26 | self.send_setup_request() 27 | self.spider_open_stats() 28 | self.start_periodic_monitor() 29 | else: 30 | err = ScrapeOpsMissingAPIKey() 31 | self.deactivate_sdk(reason='no_api_key', error=err) 32 | raise err 33 | 34 | 35 | def close_sdk(self, spider=None, reason=None): 36 | if self.sdk_enabled(): 37 | self.period_finish_time = utils.current_time() 38 | self.spider_close_stats(reason=reason, crawler=self.crawler) 39 | self.send_stats(periodic_stats=self._periodic_stats, overall_stats=self._overall_stats, stats_type='finished', reason=reason) 40 | self.close_periodic_monitor() 41 | 42 | 43 | def request_stats(self, request=None): 44 | if self.sdk_enabled(): 45 | request.meta['sops_time'] = utils.current_time() 46 | request_response_object = RequestResponse(request=request) 47 | 
self.request_response_middleware.normalise_domain_proxy_data(request_response_object) 48 | self.add_missed_urls_callback(request) 49 | self.generate_request_stats(request_response_object, request=request) 50 | 51 | 52 | def response_stats(self, request=None, response=None): 53 | if self.sdk_enabled(): 54 | request_response_object = RequestResponse(request=request, response=response) 55 | self.request_response_middleware.process(request_response_object, response) 56 | self.generate_response_stats(request_response_object, request=request, response=response) 57 | 58 | 59 | def exception_stats(self, request=None, exception_class=None): 60 | if self.sdk_enabled(): 61 | request_response_object = RequestResponse(request=request) 62 | self.request_response_middleware.normalise_domain_proxy_data(request_response_object) 63 | self.generate_exception_stats(request_response_object, request=request, exception_class=exception_class) 64 | 65 | 66 | def item_stats(self, signal_type=None, item=None, response=None, spider=None): 67 | if self.sdk_enabled(): 68 | request_response_object = RequestResponse(response=response) 69 | if response is not None: 70 | self.request_response_middleware.normalise_domain_proxy_data(request_response_object) 71 | if signal_type == 'item_scraped': 72 | self.item_validation_middleware.validate(request_response_object, item) 73 | self.generate_item_stats(request_response_object, signal=signal_type, response=response) 74 | 75 | 76 | def add_missed_urls_callback(self, request): 77 | if request.errback is None: 78 | request.errback = self.failed_url_middleware.log_failure 79 | 80 | 81 | 82 | """ 83 | PERIODIC MONITOR 84 | """ 85 | def start_periodic_monitor(self): 86 | if self.sdk_enabled(): 87 | self.loop = task.LoopingCall(self.periodic_monitor) 88 | self.periodic_loop = self.loop.start(1, now=False) # Start looping every 1 second (1.0). 
89 | 90 | def periodic_monitor(self): 91 | period_time = utils.current_time() 92 | if self.get_runtime(time=period_time) % self.get_periodic_frequency() == 0: 93 | self.period_finish_time = period_time 94 | if self.sdk_enabled(): 95 | self.aggregate_stats(crawler=self.crawler, middleware=self.scrapeops_middleware_enabled()) 96 | self.send_stats(periodic_stats=self._periodic_stats, overall_stats=self._overall_stats, stats_type='periodic') 97 | self.reset_periodic_stats() 98 | self.period_start_time = utils.current_time() 99 | self.inc_value(self._overall_stats, 'periodic_runs') 100 | elif self.periodic_monitor_active(): 101 | self.close_periodic_monitor() 102 | 103 | def close_periodic_monitor(self): 104 | if self.periodic_monitor_active(): 105 | self.loop.stop() 106 | 107 | def periodic_monitor_active(self): 108 | if self.loop is not None: 109 | if self.loop.running: 110 | return True 111 | return False 112 | 113 | def get_periodic_frequency(self): 114 | self.period_count = 0 115 | runtime = self.get_runtime() 116 | if self._period_freq_list is None: 117 | self.period_count = int(runtime//self._period_frequency) 118 | return self._period_frequency 119 | for index, row in enumerate(self._period_freq_list): 120 | if runtime > int(row.get('total_time')): 121 | if index == 0: 122 | period_time = row.get('total_time') 123 | else: 124 | period_time = row.get('total_time') - self._period_freq_list[index - 1].get('total_time') 125 | self.period_count += int(period_time/row.get('periodic_frequency')) 126 | if runtime <= int(row.get('total_time')): 127 | self._period_frequency = row.get('periodic_frequency') 128 | if index == 0: 129 | diff = runtime 130 | else: 131 | diff = runtime - int(self._period_freq_list[index - 1].get('total_time')) 132 | self.period_count += int(diff//self._period_frequency) 133 | return self._period_frequency 134 | return self._period_frequency 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /scrapeops_scrapy/core/error_logger.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import re 4 | import time 5 | 6 | from scrapeops_scrapy.core.api import SOPSRequest 7 | from scrapeops_scrapy.normalizer.domains import DomainNormalizer 8 | from scrapeops_scrapy.utils import utils 9 | 10 | 11 | class ErrorLogger(object): 12 | ERROR_LOGGER_ACTIVE = True 13 | 14 | def __init__(self, spider, crawler, spider_settings, server_hostname, server_ip, start_time, log_file): 15 | self.spider = spider 16 | self.crawler = crawler 17 | self.bot_name = crawler.settings.get("BOT_NAME", "None") 18 | self.spider_settings = spider_settings 19 | self.server_hostname = server_hostname 20 | self.server_ip = server_ip 21 | self.start_time = start_time 22 | self.log_file = log_file 23 | self._error_history = [] 24 | self.job_group_name = None 25 | self.job_group_id = None 26 | 27 | def update_error_logger(self, job_name, job_id): 28 | self.job_group_name = job_name 29 | self.job_group_id = job_id 30 | 31 | def log_error(self, reason=None, error=None, data=None, request_type=None): 32 | if ErrorLogger.ERROR_LOGGER_ACTIVE: 33 | self._error_history.append( 34 | { 35 | "time": utils.current_time(), 36 | "reason": reason, 37 | "error": str(error), 38 | "data": data, 39 | 
"request_type": request_type, 40 | } 41 | ) 42 | 43 | def send_error_report(self, error_type=None, body=None, log_data=False): 44 | if ErrorLogger.ERROR_LOGGER_ACTIVE: 45 | try: 46 | data, status = SOPSRequest().error_report_request(error_type=error_type, body=body) 47 | if status.valid: 48 | if log_data and self.log_file is not None and data.get("sdk_error_id") is not None: 49 | with open(self.log_file, "rb") as f: 50 | post_body = { 51 | "sops_sdk": "scrapy", 52 | "spider_name": self.spider.name, 53 | "job_group_id": self.job_group_id, 54 | "job_group_name": self.job_group_name, 55 | "sdk_error_id": data.get("sdk_error_id"), 56 | } 57 | _, status = SOPSRequest().error_report_request( 58 | error_type=error_type, body=post_body, files={"file": f} 59 | ) 60 | if status.valid is False: 61 | self.log_error(reason="send_error_logs_failed", error=status.error) 62 | 63 | if status.valid is False: 64 | self.log_error(reason="send_error_report_failed", error=status.error) 65 | except Exception: 66 | pass 67 | 68 | def sdk_error_close(self, reason=None, error=None, request_type=None, data=None): 69 | if ErrorLogger.ERROR_LOGGER_ACTIVE: 70 | self.log_error(reason=reason, error=error, data=data, request_type=request_type) 71 | error_data = { 72 | "final_reason": reason, 73 | "sops_sdk": "scrapy", 74 | "spider_name": self.spider.name, 75 | "bot_name": self.bot_name, 76 | "server_ip": self.server_ip, 77 | "server_hostname": self.server_hostname, 78 | "job_group_id": self.job_group_id, 79 | "job_group_name": self.job_group_name, 80 | "job_args": utils.get_args(), 81 | "job_start_time": self.start_time, 82 | "sops_scrapeops_version": utils.get_scrapeops_version(), 83 | "sops_scrapy_version": utils.get_scrapy_version(), 84 | "sops_python_version": utils.get_python_version(), 85 | "sops_system_version": utils.get_system_version(), 86 | "sops_middleware_enabled": utils.scrapeops_middleware_installed(self.spider_settings), 87 | "error_history": self._error_history, 88 | } 89 | 90 | self.send_error_report(error_type="sdk_close", body=error_data, log_data=True) 91 | 92 | 93 | class TailLogHandler(logging.Handler): 94 | retryErrors = [ 95 | "Couldn't bind", 96 | "Hostname couldn't be looked up'" "No route to host", 97 | "Connection was refused by other side", 98 | "TCP connection timed out", 99 | "File used for UNIX socket is no good", 100 | "Service name given as port is unknown", 101 | "User aborted connection", 102 | "User timeout caused connection failure", 103 | "An SSL error occurred", 104 | "Could not verify something that was supposed to be signed.", 105 | "The peer rejected our verify error.", 106 | "We did not find a certificate where we expected to find one.", 107 | "Bad Request", 108 | "Unauthorized", 109 | "Payment Required", 110 | "Forbidden", 111 | "Not Found", 112 | "Method Not Allowed", 113 | "Request Time-out", 114 | "Internal Server Error", 115 | "Bad Gateway", 116 | "Service Unavailable", 117 | "HTTP Version not supported", 118 | "Gateway Time-out", 119 | "Unknown Status", 120 | ] 121 | 122 | def __init__(self, log_dict, log_dict_cumulative): 123 | logging.Handler.__init__(self) 124 | self.log_dict = log_dict 125 | self.log_dict_cumulative = log_dict_cumulative 126 | 127 | def flush(self): 128 | self.log_dict.clear() 129 | 130 | def emit(self, record): 131 | try: 132 | if record.levelname == "ERROR" or record.levelname == "WARNING" or record.levelname == "CRITICAL": 133 | if hasattr(record, "message"): 134 | errorMessage = record.message 135 | fileAndLine = record.pathname + ", line: " + 
str(record.lineno) 136 | dateTime = self.format_time(record) 137 | type = record.levelname 138 | engine = record.name 139 | 140 | # covering warnings/probableCause/traceback missing 141 | traceback = "No traceback available" 142 | probableCause = "" 143 | 144 | if record.exc_text is not None: 145 | traceback = record.exc_text 146 | splitTraceback = traceback.split("\n") 147 | probableCause = splitTraceback[len(splitTraceback) - 1] 148 | 149 | # covering retrys 150 | if "Gave up retrying <" in record.message: 151 | for retryError in self.retryErrors: 152 | if retryError in record.message: 153 | method = record.message.split("<")[1].split(" ")[0] 154 | errorMessage = "Error: Gave up retrying " + method + " request - " + retryError 155 | fileAndLine = "" 156 | probableCause = retryError 157 | break 158 | 159 | # Deprecation Warnings 160 | if "ScrapyDeprecationWarning:" in record.message and record.message[0] == "/": 161 | splitString = record.message.split("ScrapyDeprecationWarning:") 162 | errorMessage = "ScrapyDeprecationWarning: " + splitString[1] 163 | probableCause = splitString[0] 164 | 165 | # "Some Other Error Occurred" 166 | if "Some other error occurred: " in record.message: 167 | splitError = record.message.split(" /") 168 | cleanError = splitError[0].split(">: ")[1] 169 | errorMessage = "Some other error occurred: " + cleanError 170 | probableCause = cleanError 171 | traceback = record.message 172 | 173 | # Convert Urls To Domains in Error Messages 174 | urls = re.findall(r"(https?://[^\s]+)", errorMessage) 175 | for url in urls: 176 | domain = DomainNormalizer.get_domain(url) 177 | errorMessage = errorMessage.replace(url, domain) 178 | 179 | if errorMessage in self.log_dict: 180 | self.log_dict[errorMessage]["count"] = self.log_dict[errorMessage]["count"] + 1 181 | else: 182 | self.log_dict[errorMessage] = { 183 | "type": type, 184 | "engine": engine, 185 | "name": errorMessage, 186 | "count": 1, 187 | "traceback": traceback, 188 | "message": probableCause, 189 | "filepath": fileAndLine, 190 | "dateTime": dateTime, 191 | } 192 | 193 | if SOPSRequest.HIGH_FREQ_ACC == True: 194 | if errorMessage in self.log_dict_cumulative: 195 | self.log_dict_cumulative[errorMessage]["count"] = ( 196 | self.log_dict_cumulative[errorMessage]["count"] + 1 197 | ) 198 | else: 199 | self.log_dict_cumulative[errorMessage] = { 200 | "type": type, 201 | "engine": engine, 202 | "name": errorMessage, 203 | "count": 1, 204 | "traceback": traceback, 205 | "message": probableCause, 206 | "filepath": fileAndLine, 207 | "dateTime": dateTime, 208 | } 209 | 210 | except Exception as e: 211 | logging.info("Error: Error in error logger") 212 | logging.info(e, exc_info=True) 213 | 214 | def format_time(self, record): 215 | if self.formatter: 216 | return self.formatter.formatTime(record) 217 | else: 218 | # Fallback to a basic time format if no formatter is set 219 | return time.strftime("%Y-%m-%d %H:%M:%S") 220 | 221 | 222 | class TailLogger(object): 223 | def __init__(self): 224 | self._log_dict = {} 225 | self._log_dict_cumulative = {} 226 | self._log_handler = TailLogHandler(self._log_dict, self._log_dict_cumulative) 227 | 228 | def contents(self, type="diff"): 229 | if type == "cumulative": 230 | jsonLogsCumulative = json.dumps(self._log_dict_cumulative, indent=2) 231 | return jsonLogsCumulative 232 | 233 | else: 234 | jsonLogs = json.dumps(self._log_dict, indent=2) 235 | self._log_handler.flush() 236 | return jsonLogs 237 | 238 | @property 239 | def log_handler(self): 240 | return self._log_handler 241 | 
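For reference, a minimal sketch (under the assumption the package is installed) of how this tail logger can be wired up outside of Scrapy; the SDK does the equivalent in `extension.py` by adding `TailLogger().log_handler` to the root logger. Note that `TailLogHandler.emit()` only inspects records that another handler has already formatted (it checks `hasattr(record, "message")`), which is why `basicConfig()` is called first here:

```python
import logging

from scrapeops_scrapy.core.error_logger import TailLogger

logging.basicConfig()  # ensures records get formatted by a prior handler (sets record.message)
tail = TailLogger()
logging.getLogger().addHandler(tail.log_handler)

# An ERROR record like the retry middleware's "Gave up retrying" message gets parsed by emit()
logging.getLogger("demo").error(
    "Gave up retrying <GET https://example.com> (failed 3 times): Gateway Time-out"
)

print(tail.contents())               # JSON of errors captured since the last flush
print(tail.contents("cumulative"))   # JSON of all errors captured during the run
```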
-------------------------------------------------------------------------------- /scrapeops_scrapy/core/model.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import scrapy.settings.default_settings as default_settings 3 | from scrapeops_scrapy.core.api import SOPSRequest 4 | 5 | 6 | class BaseSDKModel(object): 7 | 8 | """ 9 | SDK Model: 10 | The core data types used to control the SDK's operation. 11 | """ 12 | 13 | def __init__(self): 14 | ## User Data 15 | self._scrapeops_api_key = None 16 | 17 | ## SDK Data 18 | self._sdk_active = None 19 | self._scrapeops_endpoint = None 20 | self._scrapeops_middleware = None 21 | self._scrapeops_settings_exclusion_list = [] 22 | self._scrapeops_export_scrapy_logs = False 23 | self._period_frequency = 60 24 | self._period_freq_list = None 25 | self._sdk_run_time = 0 26 | self._setup_attempts = 0 27 | self._scrapeops_test_id = None 28 | self._error_logger = None 29 | self._scrapeops_sdk_version = None 30 | self._scrapeops_scrapy_version = None 31 | self._scrapeops_python_version = None 32 | self._scrapeops_system_version = None 33 | self._scrapeops_job_start = None 34 | 35 | ## Spider Data 36 | self.crawler = None 37 | self.spider = None 38 | self.spider_name = None 39 | self.spider_id= None 40 | self.spider_settings = None 41 | self.server_id= None 42 | self.project_id = None 43 | self.project_name = None 44 | self.bot_name = None 45 | self.retry_enabled = None 46 | self.retry_times = None 47 | self.log_file = None 48 | 49 | ## Overall Job Data 50 | self.job_args = None 51 | self.job_id = None 52 | self.job_group_id = None 53 | self.job_group_uuid = None 54 | self.job_group_name = None 55 | self.job_group_version = None 56 | self.job_custom_groups = None 57 | self.start_time = None 58 | self.finish_time = None 59 | self.server_hostname = None 60 | self.server_ip = None 61 | self._proxy_apis = {} 62 | self._generic_validators = {} 63 | self.multi_server = False 64 | self.failed_urls = [] 65 | 66 | ## Period Data 67 | self.period_start_time = None 68 | self.period_finish_time = None 69 | self.period_run_time = 0 70 | self.period_concurrency = 0 71 | self.period_count = 0 72 | 73 | ## ScrapeOps Triggered Jobs 74 | self._scrapeops_server_id = None 75 | self.job_group_type = None 76 | 77 | ## Periodic Monitor 78 | self.loop = None 79 | self.periodic_loop = None 80 | 81 | ## Validation/Normalisation Data 82 | self.proxy_domains = [] 83 | 84 | ## Failure 85 | self.failed_periods = 0 86 | self.cached_failed_stats = [] 87 | 88 | ## Middleware 89 | self.request_response_middleware = None 90 | self.item_validation_middleware = None 91 | self.failed_url_middleware = None 92 | 93 | self.allowed_response_codes = [] 94 | 95 | 96 | class SDKData(BaseSDKModel): 97 | 98 | def __init__(self): 99 | BaseSDKModel.__init__(self) 100 | 101 | 102 | def setup_data(self): 103 | return { 104 | 'sops_api_key': self._scrapeops_api_key, 105 | 'job_group_name': self.job_group_name, 106 | 'job_group_version': self.job_group_version, 107 | 'job_group_identifier': self.job_group_uuid, 108 | 'job_group_type': self.job_group_type, 109 | 'job_settings': self.spider_settings, 110 | 'job_args': self.job_args, 111 | 'job_start_time': self.start_time, 112 | 'sops_sdk': 'scrapy', 113 | 'sops_scrapeops_version': self._scrapeops_sdk_version, 114 | 'sops_scrapy_version': self._scrapeops_scrapy_version, 115 | 'sops_python_version': self._scrapeops_python_version, 116 | 'sops_system_version': self._scrapeops_system_version, 117 | 
'sops_middleware_enabled': self._scrapeops_middleware, 118 | 'sops_test_id': self._scrapeops_test_id, 119 | 'sops_server_id': self._scrapeops_server_id, 120 | 'scrapeops_job_start': self._scrapeops_job_start, 121 | 'spider_name': self.spider_name, 122 | 'job_custom_groups': self.job_custom_groups, 123 | 'server_ip': self.server_ip, 124 | 'server_hostname': self.server_hostname, 125 | 'project_name': self.project_name, 126 | 'bot_name': self.bot_name, 127 | 'multi_server': self.multi_server, 128 | 'retry_enabled': self.retry_enabled, 129 | 'retry_times': self.retry_times, 130 | } 131 | 132 | 133 | def stats_data(self, periodic_stats=None, overall_stats=None, stats_type=None, reason=None): 134 | data = { 135 | 'job_id': self.job_id, 136 | 'job_group_id': self.job_group_id, 137 | 'type': stats_type, 138 | 'period_start_time': self.period_start_time, 139 | 'period_finish_time': self.period_finish_time, 140 | 'period_run_time': self._period_frequency, 141 | 'sdk_run_time': self._sdk_run_time, 142 | 'periodic': periodic_stats, 143 | 'overall': overall_stats, 144 | 'cached_failed_stats': self.cached_failed_stats, 145 | 'periodic_warnings': periodic_stats.get('log_count/WARNING', 0), 146 | 'periodic_errors': periodic_stats.get('log_count/ERROR', 0), 147 | 'periodic_criticals': periodic_stats.get('log_count/CRITICAL', 0), 148 | 'multi_server': self.multi_server, 149 | 'period_count': self.period_count, 150 | 'data_coverage': self.item_validation_middleware.get_item_coverage_data(), 151 | 'invalid_items_count': self.item_validation_middleware.get_num_invalid_items(), 152 | 'field_coverage': self.item_validation_middleware.get_field_coverage(), 153 | 'failed_urls_count': self.failed_url_middleware.get_url_count(), 154 | 'failed_urls_enabled': self.failed_url_middleware.enabled(), 155 | 'scrapy_stats': self.get_scrapy_stats(), 156 | 'job_custom_groups': self.job_custom_groups, 157 | 'error_details': self.tail.contents(), 158 | 'error_details_cumulative': self.tail.contents('cumulative'), 159 | 'high_freq': SOPSRequest.HIGH_FREQ_ACC 160 | } 161 | 162 | if stats_type == 'finished': 163 | data['job_finish_time'] = self.period_finish_time 164 | data['job_status'] = stats_type 165 | data['job_finish_reason'] = reason 166 | data['failed_urls_list'] = self.failed_url_middleware.get_url_list() 167 | data['invalid_items_urls_list'] = self.item_validation_middleware.get_invalid_items_urls() 168 | return data 169 | 170 | 171 | def log_data(self): 172 | return { 173 | 'job_group_id': self.job_group_id, 174 | 'job_group_name': self.job_group_name, 175 | 'job_group_identifier': self.job_group_uuid, 176 | 'spider_name': self.spider_name, 177 | 'sops_sdk': 'scrapy', 178 | } 179 | 180 | 181 | 182 | def logging_data(self): 183 | return { 184 | 'sops_api_key': self._scrapeops_api_key, 185 | 'job_id': self.job_id, 186 | 'job_group_id': self.job_group_id, 187 | 'job_group_identifier': self.job_group_uuid, 188 | 'job_group_name': self.job_group_name, 189 | 'spider_name': self.spider_name, 190 | 'spider_id': self.spider_id, 191 | 'server_id': self.server_id, 192 | 'project_id': self.project_id, 193 | 'project_name': self.project_name, 194 | 'bot_name': self.bot_name, 195 | 'server_ip': self.server_ip, 196 | 'server_hostname': self.server_hostname, 197 | 'sops_scrapeops_version': self._scrapeops_sdk_version, 198 | 'sops_scrapy_version': self._scrapeops_scrapy_version, 199 | 'sops_python_version': self._scrapeops_python_version, 200 | 'sops_system_version': self._scrapeops_system_version, 201 | 'sops_middleware_enabled': 
self._scrapeops_middleware, 202 | 'sops_sdk': 'scrapy', 203 | } 204 | 205 | def check_spider_attributes(self, spider): 206 | if hasattr(spider, 'sops_test'): 207 | if spider.sops_test.test_active(): 208 | self._scrapeops_test_id = spider.sops_test.generate_test_id() 209 | 210 | if hasattr(spider, 'sops_custom_groups'): 211 | if isinstance(spider.sops_custom_groups, dict): 212 | clean_dict = {} 213 | for k, v in spider.sops_custom_groups.items(): 214 | clean_dict[str(k)] = str(v) 215 | self.job_custom_groups = clean_dict 216 | 217 | 218 | def get_settings(self, spider): 219 | default_scrapy_settings = default_settings.__dict__ 220 | full_settings = spider.settings.copy_to_dict() 221 | self.spider_settings = {} 222 | for key, value in full_settings.items(): 223 | if key not in default_scrapy_settings and self.include_setting(key): 224 | self.spider_settings[key] = value 225 | elif default_scrapy_settings.get(key) != value and self.include_setting(key): 226 | self.spider_settings[key] = value 227 | 228 | def include_setting(self, key): 229 | exclusion_terms = ['API_KEY', 'APIKEY', 'SECRET_KEY', 'SECRETKEY', 'PASSWORD', 'CONNECTION_STRING'] 230 | if key in self._scrapeops_settings_exclusion_list: 231 | return False 232 | for term in exclusion_terms: 233 | if term in key.upper(): return False 234 | return True 235 | 236 | 237 | def get_job_name(self): 238 | ## check args 239 | for arg in self.job_args.get('args'): 240 | if 'SCRAPEOPS_JOB_NAME' in arg: 241 | return arg.split('=')[1] 242 | 243 | ## check spider defined 244 | if hasattr(self.spider, 'sops_job_name'): 245 | return self.spider.sops_job_name 246 | if hasattr(self.spider, 'name'): 247 | return self.spider.name 248 | return 'no_spider_name' 249 | 250 | 251 | def get_job_version(self): 252 | ## check args 253 | for arg in self.job_args.get('args'): 254 | if 'SCRAPEOPS_JOB_VERSION' in arg: 255 | return arg.split('=')[1] 256 | 257 | ## check spider defined 258 | if hasattr(self.spider, 'sops_job_version'): 259 | return self.spider.sops_job_version 260 | return 'default' 261 | 262 | 263 | def get_server_id(self, crawler): 264 | for arg in self.job_args.get('args'): 265 | if 'SCRAPEOPS_SERVER_ID' in arg: 266 | return arg.split('=')[1] 267 | if crawler.settings.get('SCRAPEOPS_SERVER_ID') is not None: 268 | return crawler.settings.get('SCRAPEOPS_SERVER_ID') 269 | return '-1' 270 | 271 | 272 | def check_scrapeops_triggered_job(self, crawler): 273 | self._scrapeops_server_id = self.get_server_id(crawler) 274 | if isinstance(self._scrapeops_server_id, str) is False: self._scrapeops_server_id = str(self._scrapeops_server_id) 275 | if self._scrapeops_server_id != '-1': 276 | self.job_group_type = 'scrapeops_triggered' 277 | else: 278 | self.job_group_type = 'user_triggered' 279 | 280 | def get_server_details(self): 281 | try: 282 | self.server_hostname = socket.gethostname() 283 | self.server_ip = socket.gethostbyname(self.server_hostname) 284 | except Exception: 285 | self.server_hostname = 'unknown' 286 | self.server_ip = 'unknown' 287 | 288 | 289 | def get_uuid(self): 290 | for arg in self.job_args.get('args'): 291 | if 'SCRAPEOPS_JOB_GROUP_IDENTIFIER' in arg: 292 | return arg.split('=')[1] 293 | if hasattr(self.spider, 'sops_job_group_identifier'): 294 | return self.spider.sops_job_group_identifier 295 | self.multi_server = False 296 | return '' 297 | 298 | 299 | def get_export_logs(self, crawler): 300 | for arg in self.job_args.get('args'): 301 | if 'SCRAPEOPS_EXPORT_SCRAPY_LOGS' in arg: 302 | try: 303 | if arg.split('=')[1] == 'True': 304 | 
return True 305 | except Exception: 306 | pass 307 | if crawler.settings.get('SCRAPEOPS_EXPORT_SCRAPY_LOGS') is not None: 308 | return True 309 | return False 310 | 311 | def get_scrapy_stats(self): 312 | scrapy_stats = self.crawler.stats.get_stats() 313 | return {k:str(v) for (k,v) in scrapy_stats.items()} 314 | 315 | -------------------------------------------------------------------------------- /scrapeops_scrapy/core/setup.py: -------------------------------------------------------------------------------- 1 | from scrapeops_scrapy.utils import utils 2 | from scrapeops_scrapy.core.error_logger import ErrorLogger 3 | from scrapeops_scrapy.core.api import SOPSRequest 4 | from scrapeops_scrapy.normalizer.middleware import RequestResponseMiddleware 5 | from scrapeops_scrapy.validators.item_validator import ItemValidator 6 | from scrapeops_scrapy.stats.failed_urls import FailedUrlsHandler 7 | from scrapeops_scrapy.core.model import SDKData 8 | 9 | 10 | 11 | class SDKSetup(SDKData): 12 | 13 | def __init__(self): 14 | SDKData.__init__(self) 15 | 16 | 17 | def initialize_SDK(self, spider, crawler=None): 18 | 19 | ## Spider Data 20 | self.spider = spider 21 | self.crawler = crawler 22 | self.spider_name = spider.name 23 | self.project_name = crawler.settings.get('PROJECT', None) 24 | self.bot_name = crawler.settings.get('BOT_NAME', None) 25 | self.retry_enabled = crawler.settings.get('RETRY_ENABLED', None) 26 | self.retry_times = crawler.settings.get('RETRY_TIMES', None) 27 | self.log_file = crawler.settings.get('LOG_FILE', None) 28 | self.allowed_response_codes = crawler.settings.get('HTTPERROR_ALLOWED_CODES', []) 29 | self._scrapeops_settings_exclusion_list = crawler.settings.get('SCRAPEOPS_SETTINGS_EXCLUSION_LIST', []) 30 | self.check_spider_attributes(spider) 31 | self.get_settings(spider) 32 | 33 | ## Job Data 34 | self.job_args = utils.get_args() 35 | self.job_group_name = crawler.settings.get('SCRAPEOPS_JOB_NAME', self.get_job_name()) 36 | self.job_group_uuid = crawler.settings.get('SCRAPEOPS_JOB_GROUP_IDENTIFIER', self.get_uuid()) ## Multi-server 37 | self.job_group_version = crawler.settings.get('SCRAPEOPS_JOB_VERSION', self.get_job_version()) 38 | self.check_scrapeops_triggered_job(crawler) 39 | 40 | ## System Settings 41 | self._scrapeops_sdk_version = utils.get_scrapeops_version() 42 | self._scrapeops_scrapy_version = utils.get_scrapy_version() 43 | self._scrapeops_python_version = utils.get_python_version() 44 | self._scrapeops_system_version = utils.get_system_version() 45 | self.get_server_details() 46 | 47 | ## SDK Setup Data 48 | self._scrapeops_middleware = utils.scrapeops_middleware_installed(self.spider_settings) 49 | self._scrapeops_job_start = crawler.settings.get('SCRAPEOPS_JOB_START', utils.current_time()) ## Multi-server 50 | self._scrapeops_server_id = crawler.settings.get('SCRAPEOPS_SERVER_ID', "-1") 51 | self._scrapeops_debug_mode = crawler.settings.get('SCRAPEOPS_DEBUG_MODE', False) 52 | self._scrapeops_export_scrapy_logs = self.get_export_logs(crawler) 53 | 54 | ## SOPS API 55 | SOPSRequest.SCRAPEOPS_ENDPOINT = crawler.settings.get('SCRAPEOPS_ENDPOINT', 'https://api.scrapeops.io/') 56 | SOPSRequest.API_KEY = self._scrapeops_api_key = crawler.settings.get('SCRAPEOPS_API_KEY', None) 57 | SOPSRequest.SCRAPEOPS_LOGGING_DATA = {'logging_data': self.logging_data()} 58 | 59 | ## Middlewares 60 | self.initialize_middlewares() 61 | self.initialize_error_logger() 62 | 63 | 64 | 65 | def initialize_middlewares(self): 66 | if self.item_validation_middleware is None: 67 | 
self.item_validation_middleware = ItemValidator() 68 | 69 | if self.failed_url_middleware is None: 70 | self.failed_url_middleware = FailedUrlsHandler() 71 | 72 | 73 | def initialize_error_logger(self): 74 | self._error_logger = ErrorLogger( 75 | self.spider, 76 | self.crawler, 77 | self.spider_settings, 78 | self.server_hostname, 79 | self.server_ip, 80 | self.start_time, 81 | self.log_file) 82 | 83 | 84 | def initialize_job_details(self, data): 85 | self.job_id = data.get('job_id') 86 | self.job_group_name = data.get('job_group_name', self.job_group_name) 87 | self.job_group_id = SOPSRequest.JOB_GROUP_ID = data.get('job_group_id') 88 | self.spider_id= data.get('spider_id') 89 | self.server_id= data.get('server_id') 90 | self.project_id= data.get('project_id') 91 | self.multi_server = data.get('multi_server', False) 92 | SOPSRequest.HIGH_FREQ_ACC = data.get('high_freq', False) 93 | self._period_frequency = data.get('stats_period_frequency') 94 | self._period_freq_list = data.get('stats_period_freq_list') 95 | self._error_logger.update_error_logger(self.job_group_name, self.job_group_id) 96 | self.update_sdk_settings(data) 97 | self.initialize_normalizer_middleware(data) 98 | SOPSRequest.SCRAPEOPS_LOGGING_DATA = {'logging_data': self.logging_data()} 99 | 100 | 101 | def initialize_normalizer_middleware(self, data=None): 102 | if data is not None: 103 | self._proxy_apis = data.get('proxy_apis', {}) 104 | self._generic_validators = data.get('generic_validators', []) 105 | if self.request_response_middleware is None: 106 | self.request_response_middleware = RequestResponseMiddleware(self.job_group_id, 107 | self._proxy_apis, 108 | self._generic_validators, 109 | self._error_logger, 110 | self.allowed_response_codes) 111 | 112 | 113 | def update_sdk_settings(self, data): 114 | self._sdk_active = data.get('sdk_active', self._sdk_active) 115 | self.multi_server = data.get('multi_server', self.multi_server) 116 | self.job_group_name = data.get('job_group_name', self.job_group_name) 117 | self._scrapeops_export_scrapy_logs = data.get('scrapeops_export_scrapy_logs', self._scrapeops_export_scrapy_logs) 118 | 119 | ## SOPS API Endpoints 120 | SOPSRequest.SCRAPEOPS_ENDPOINT = data.get('scrapeops_endpoint', SOPSRequest.SCRAPEOPS_ENDPOINT) 121 | SOPSRequest.SCRAPEOPS_API_VERSION = data.get('scrapeops_api_version', SOPSRequest.SCRAPEOPS_API_VERSION) 122 | 123 | ## Normalisation Middleware 124 | RequestResponseMiddleware.PROXY_DOMAIN_NORMALIZATION = data.get('proxy_domain_normalization', RequestResponseMiddleware.PROXY_DOMAIN_NORMALIZATION) 125 | RequestResponseMiddleware.PROXY_ALERTS = data.get('proxy_alerts', RequestResponseMiddleware.PROXY_ALERTS) 126 | RequestResponseMiddleware.RESPONSE_VALIDATION = data.get('response_validation', RequestResponseMiddleware.RESPONSE_VALIDATION) 127 | 128 | ## Item Validation Middleware 129 | ItemValidator.ITEM_COVERAGE_ENABLED = data.get('item_coverage_enabled', ItemValidator.ITEM_COVERAGE_ENABLED) 130 | ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED = data.get('ivalid_item_coverage_url_logging_enabled', ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED) 131 | ItemValidator.MAX_ITEM_URLS = data.get('max_item_urls', ItemValidator.MAX_ITEM_URLS) 132 | 133 | ## Failed URL Middleware 134 | FailedUrlsHandler.FAILED_URL_LOGGER_ENABLED = data.get('FAILED_URL_LOGGER_ENABLED', FailedUrlsHandler.FAILED_URL_LOGGER_ENABLED) 135 | FailedUrlsHandler.LOG_MISSED_URLS = data.get('log_missed_urls', FailedUrlsHandler.LOG_MISSED_URLS) 136 | FailedUrlsHandler.MAX_LOGGED_URLS = 
data.get('max_failed_urls', FailedUrlsHandler.MAX_LOGGED_URLS) 137 | 138 | ## Error Logger 139 | ErrorLogger.ERROR_LOGGER_ACTIVE = data.get('error_logger', ErrorLogger.ERROR_LOGGER_ACTIVE) 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /scrapeops_scrapy/exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class ScrapeOpsMissingAPIKey(Exception): 4 | """Indicates that no ScrapeOps API key added""" 5 | def __init__(self): 6 | self.message = 'No ScrapeOps API key defined.' 7 | super().__init__(self.message) 8 | 9 | def __str__(self): 10 | return f'ScrapeOpsMissingAPIKey: {self.message}' 11 | 12 | 13 | class ScrapeOpsAPIResponseError(Exception): 14 | 15 | def __init__(self): 16 | super().__init__() 17 | 18 | 19 | class DecodeError(Exception): 20 | pass -------------------------------------------------------------------------------- /scrapeops_scrapy/extension.py: -------------------------------------------------------------------------------- 1 | from scrapy import signals 2 | import logging 3 | 4 | from scrapeops_scrapy.core.core import ScrapeopsCore 5 | from scrapeops_scrapy.signals import scrapeops_signals 6 | from scrapeops_scrapy.core.error_logger import TailLogger 7 | 8 | 9 | class ScrapeOpsMonitor(ScrapeopsCore): 10 | 11 | def __init__(self, crawler): 12 | ScrapeopsCore.__init__(self) 13 | self.crawler = crawler 14 | 15 | self.tail = TailLogger() 16 | log_handler = self.tail.log_handler 17 | logging.getLogger().addHandler(log_handler) 18 | 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | ext = cls(crawler) 23 | 24 | # connect the extension object to signals 25 | crawler.signals.connect(ext.spider_opened, 26 | signal=signals.spider_opened) 27 | 28 | crawler.signals.connect(ext.spider_closed, 29 | signal=signals.spider_closed) 30 | 31 | crawler.signals.connect(ext.log_request, 32 | signal=signals.request_reached_downloader) 33 | 34 | crawler.signals.connect(ext.log_response, 35 | signal=signals.response_downloaded) 36 | 37 | crawler.signals.connect(ext.log_response_middleware, 38 | signal=scrapeops_signals.scrapeops_response_recieved) 39 | 40 | crawler.signals.connect(ext.log_exception, 41 | signal=scrapeops_signals.scrapeops_exception_recieved) 42 | 43 | crawler.signals.connect(ext.item_scraped, 44 | signal=signals.item_scraped) 45 | 46 | crawler.signals.connect(ext.item_dropped, 47 | signal=signals.item_dropped) 48 | 49 | crawler.signals.connect(ext.item_error, 50 | signal=signals.item_error) 51 | 52 | return ext 53 | 54 | def spider_opened(self, spider): 55 | self.start_sdk(spider=spider, crawler=self.crawler) 56 | 57 | def spider_closed(self, spider, reason): 58 | self.close_sdk(spider=spider, reason=reason) 59 | 60 | def log_request(self, request, spider): 61 | if self.sdk_enabled(): 62 | self.request_stats(request=request) 63 | 64 | def log_response(self, response, request, spider): 65 | if self.scrapeops_middleware_enabled() is False and self.sdk_enabled(): 66 | self.response_stats(request=request, response=response) 67 | 68 | def log_response_middleware(self, request=None, response=None, spider=None): 69 | if self.scrapeops_middleware_enabled() and self.sdk_enabled(): 70 | self.response_stats(request=request, response=response) 71 | 72 | def log_exception(self, request=None, spider=None, exception_class=None): 73 | if self.scrapeops_middleware_enabled() and 
self.sdk_enabled(): 74 | self.exception_stats(request=request, exception_class=exception_class) 75 | 76 | def item_scraped(self, item, response, spider): 77 | if self.sdk_enabled(): 78 | self.item_stats(signal_type='item_scraped', item=item, response=response, spider=spider) 79 | 80 | def item_dropped(self, item, response, spider): 81 | if self.sdk_enabled(): 82 | self.item_stats(signal_type='item_dropped', item=item, response=response, spider=spider) 83 | 84 | def item_error(self, item, response, spider): 85 | if self.sdk_enabled(): 86 | self.item_stats(signal_type='item_error', item=item, response=response, spider=spider) 87 | 88 | -------------------------------------------------------------------------------- /scrapeops_scrapy/middleware/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/middleware/__init__.py -------------------------------------------------------------------------------- /scrapeops_scrapy/middleware/retry.py: -------------------------------------------------------------------------------- 1 | """ 2 | An extension to retry failed requests that are potentially caused by temporary 3 | problems such as a connection timeout or HTTP 500 error. 4 | 5 | You can change the behaviour of this middleware by modifing the scraping settings: 6 | RETRY_TIMES - how many times to retry a failed page 7 | RETRY_HTTP_CODES - which HTTP response codes to retry 8 | 9 | Failed pages are collected on the scraping process and rescheduled at the end, 10 | once the spider has finished crawling all regular (non failed) pages. 11 | """ 12 | from logging import getLogger, Logger 13 | from typing import Optional, Union 14 | 15 | from twisted.internet import defer 16 | from twisted.internet.error import ( 17 | ConnectError, 18 | ConnectionDone, 19 | ConnectionLost, 20 | ConnectionRefusedError, 21 | DNSLookupError, 22 | TCPTimedOutError, 23 | TimeoutError, 24 | ) 25 | from twisted.web.client import ResponseFailed 26 | 27 | from scrapy.core.downloader.handlers.http11 import TunnelError 28 | from scrapy.exceptions import NotConfigured 29 | from scrapy.http.request import Request 30 | from scrapy.spiders import Spider 31 | from scrapy.utils.python import global_object_name 32 | from scrapy.utils.response import response_status_message 33 | 34 | from scrapeops_scrapy.signals import scrapeops_signals 35 | 36 | 37 | retry_logger = getLogger(__name__) 38 | 39 | 40 | def get_retry_request( 41 | request: Request, 42 | *, 43 | spider: Spider, 44 | #response: Response, 45 | reason: Union[str, Exception] = 'unspecified', 46 | max_retry_times: Optional[int] = None, 47 | priority_adjust: Optional[int] = None, 48 | logger: Logger = retry_logger, 49 | stats_base_key: str = 'retry', 50 | ): 51 | """ 52 | Returns a new :class:`~scrapy.Request` object to retry the specified 53 | request, or ``None`` if retries of the specified request have been 54 | exhausted. 55 | 56 | For example, in a :class:`~scrapy.Spider` callback, you could use it as 57 | follows:: 58 | 59 | def parse(self, response): 60 | if not response.text: 61 | new_request_or_none = get_retry_request( 62 | response.request, 63 | spider=self, 64 | reason='empty', 65 | ) 66 | return new_request_or_none 67 | 68 | *spider* is the :class:`~scrapy.Spider` instance which is asking for the 69 | retry request. 
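
The retry module's docstring above refers to Scrapy's standard retry settings, and `extension.py` shows the monitor hooking into crawler signals. The sketch below shows how both pieces are typically wired into a project's `settings.py`. The API-key setting name follows the ScrapeOps documentation; the priority numbers and the retry values are illustrative assumptions, not values taken from this repository.

```python
# settings.py -- a minimal sketch, assuming default ScrapeOps wiring.

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'

# Register the monitoring extension (scrapeops_scrapy/extension.py).
EXTENSIONS = {
    'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,
}

# Swap Scrapy's stock RetryMiddleware for the ScrapeOps version so the SDK
# receives response/exception signals before retries are scheduled.
DOWNLOADER_MIDDLEWARES = {
    'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}

# Standard Scrapy retry settings honoured by get_retry_request().
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
```
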
It is used to access the :ref:`settings ` 70 | and :ref:`stats `, and to provide extra logging context (see 71 | :func:`logging.debug`). 72 | 73 | *reason* is a string or an :class:`Exception` object that indicates the 74 | reason why the request needs to be retried. It is used to name retry stats. 75 | 76 | *max_retry_times* is a number that determines the maximum number of times 77 | that *request* can be retried. If not specified or ``None``, the number is 78 | read from the :reqmeta:`max_retry_times` meta key of the request. If the 79 | :reqmeta:`max_retry_times` meta key is not defined or ``None``, the number 80 | is read from the :setting:`RETRY_TIMES` setting. 81 | 82 | *priority_adjust* is a number that determines how the priority of the new 83 | request changes in relation to *request*. If not specified, the number is 84 | read from the :setting:`RETRY_PRIORITY_ADJUST` setting. 85 | 86 | *logger* is the logging.Logger object to be used when logging messages 87 | 88 | *stats_base_key* is a string to be used as the base key for the 89 | retry-related job stats 90 | """ 91 | settings = spider.crawler.settings 92 | stats = spider.crawler.stats 93 | retry_times = request.meta.get('retry_times', 0) + 1 94 | if max_retry_times is None: 95 | max_retry_times = request.meta.get('max_retry_times') 96 | if max_retry_times is None: 97 | max_retry_times = settings.getint('RETRY_TIMES') 98 | if retry_times <= max_retry_times: 99 | logger.debug( 100 | "Retrying %(request)s (failed %(retry_times)d times): %(reason)s", 101 | {'request': request, 'retry_times': retry_times, 'reason': reason}, 102 | extra={'spider': spider} 103 | ) 104 | new_request = request.copy() 105 | new_request.meta['retry_times'] = retry_times 106 | 107 | 108 | new_request.dont_filter = True 109 | if priority_adjust is None: 110 | priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') 111 | new_request.priority = request.priority + priority_adjust 112 | 113 | if callable(reason): 114 | reason = reason() 115 | if isinstance(reason, Exception): 116 | reason = global_object_name(reason.__class__) 117 | 118 | stats.inc_value(f'{stats_base_key}/count') 119 | stats.inc_value(f'{stats_base_key}/reason_count/{reason}') 120 | return new_request 121 | else: 122 | stats.inc_value(f'{stats_base_key}/max_reached') 123 | logger.error( 124 | "Gave up retrying %(request)s (failed %(retry_times)d times): " 125 | "%(reason)s", 126 | {'request': request, 'retry_times': retry_times, 'reason': reason}, 127 | extra={'spider': spider}, 128 | ) 129 | return None 130 | 131 | 132 | 133 | class RetryMiddleware: 134 | 135 | # IOError is raised by the HttpCompression middleware when trying to 136 | # decompress an empty response 137 | EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError, 138 | ConnectionRefusedError, ConnectionDone, ConnectError, 139 | ConnectionLost, TCPTimedOutError, ResponseFailed, 140 | IOError, TunnelError) 141 | 142 | def __init__(self, settings): 143 | if not settings.getbool('RETRY_ENABLED'): 144 | raise NotConfigured 145 | self.max_retry_times = settings.getint('RETRY_TIMES') 146 | self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES')) 147 | self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST') 148 | 149 | @classmethod 150 | def from_crawler(cls, crawler): 151 | return cls(crawler.settings) 152 | 153 | def process_response(self, request, response, spider): 154 | spider.crawler.signals.send_catch_log( 155 | signal=scrapeops_signals.scrapeops_response_recieved, 156 | 
request=request, 157 | response=response, 158 | spider=spider) 159 | 160 | if request.meta.get('dont_retry', False): 161 | return response 162 | if response.status in self.retry_http_codes: 163 | reason = response_status_message(response.status) 164 | return self._retry(request, reason, spider) or response 165 | return response 166 | 167 | def process_exception(self, request, exception, spider): 168 | ex_class = global_object_name(exception.__class__) 169 | spider.crawler.signals.send_catch_log( 170 | signal=scrapeops_signals.scrapeops_exception_recieved, 171 | request=request, 172 | spider=spider, 173 | exception_class=ex_class) 174 | 175 | if ( 176 | isinstance(exception, self.EXCEPTIONS_TO_RETRY) 177 | and not request.meta.get('dont_retry', False) 178 | ): 179 | return self._retry(request, exception, spider) 180 | 181 | def _retry(self, request, reason, spider): 182 | max_retry_times = request.meta.get('max_retry_times', self.max_retry_times) 183 | priority_adjust = request.meta.get('priority_adjust', self.priority_adjust) 184 | return get_retry_request( 185 | request, 186 | reason=reason, 187 | spider=spider, 188 | max_retry_times=max_retry_times, 189 | priority_adjust=priority_adjust, 190 | ) -------------------------------------------------------------------------------- /scrapeops_scrapy/middleware/stats.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.python import global_object_name 2 | 3 | from scrapeops_scrapy.signals import scrapeops_signals 4 | 5 | 6 | class ScrapeOpsStats: 7 | 8 | def __init__(self): 9 | pass 10 | 11 | def process_response(self, request, response, spider): 12 | spider.crawler.signals.send_catch_log( 13 | signal=scrapeops_signals.scrapeops_response_recieved, 14 | request=request, 15 | response=response, 16 | spider=spider) 17 | return response 18 | 19 | def process_exception(self, request, exception, spider): 20 | ex_class = global_object_name(exception.__class__) 21 | spider.crawler.signals.send_catch_log( 22 | signal=scrapeops_signals.scrapeops_exception_recieved, 23 | request=request, 24 | spider=spider, 25 | exception_class=ex_class) -------------------------------------------------------------------------------- /scrapeops_scrapy/normalizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/normalizer/__init__.py -------------------------------------------------------------------------------- /scrapeops_scrapy/normalizer/domains.py: -------------------------------------------------------------------------------- 1 | from tld import get_tld 2 | from urllib.parse import urlparse, parse_qs 3 | 4 | class DomainNormalizer(object): 5 | 6 | def __init__(self): 7 | pass 8 | 9 | @staticmethod 10 | def get_domain(url): 11 | #if 'http://' not in url or 'http://' not in url or 'socks5://' not in url 12 | try: 13 | if DomainNormalizer.if_localhost(url): 14 | return 'localhost' 15 | res = get_tld(url, as_object=True) 16 | return res.fld 17 | except Exception: 18 | return 'unknown' 19 | 20 | @staticmethod 21 | def get_full_domain(url): 22 | try: 23 | if DomainNormalizer.if_localhost(url): 24 | return 'localhost' 25 | res = get_tld(url, as_object=True) 26 | if res.subdomain != '': 27 | return res.subdomain + '.' 
+ res.fld 28 | return res.fld 29 | except Exception: 30 | return 'unknown' 31 | 32 | 33 | @staticmethod 34 | def if_localhost(url): 35 | if 'http://localhost:' in url or 'http://127.0.0.1:' in url: 36 | return True 37 | return False 38 | 39 | 40 | @staticmethod 41 | def parse_url(url): 42 | parsed_url = urlparse(url) 43 | query_params = parse_qs(parsed_url.query) 44 | query_dict = {} 45 | for key, value in query_params.items(): 46 | query_dict[key] = value[0] 47 | return query_dict 48 | 49 | 50 | @staticmethod 51 | def get_url_proxy_api(url=None, proxy_settings=None): 52 | url_identifier = proxy_settings.get('url_identifier') 53 | query_params = DomainNormalizer.parse_url(url) 54 | url = query_params.get(url_identifier) 55 | return url 56 | 57 | 58 | @staticmethod 59 | def get_page_type(url, domain_data): 60 | if domain_data.get('url_classification'): 61 | url_classifiers = domain_data.get('url_contains_page_types', {}) 62 | for k, v in url_classifiers.items(): 63 | if k in url: 64 | return v 65 | query_param_page_types = domain_data.get('query_param_page_types', {}) 66 | query_params = DomainNormalizer.parse_url(url) 67 | for k, v in query_params.items(): 68 | key_mapping = query_param_page_types.get(k, None) 69 | if key_mapping is not None: 70 | return v 71 | return 'none' 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /scrapeops_scrapy/normalizer/exceptions.py: -------------------------------------------------------------------------------- 1 | class ExceptionNormalizer(object): 2 | 3 | def __init__(self): 4 | pass 5 | 6 | @staticmethod 7 | def normalise_exception(exception_class): 8 | 9 | if 'ResponseNeverReceived' in exception_class: 10 | return 'ResponseNeverReceived' 11 | 12 | if 'Timeout' in exception_class: 13 | return 'Timeout' 14 | 15 | if 'TimedOut' in exception_class: 16 | return 'Timeout' 17 | 18 | if 'PotentialDataLoss' in exception_class: 19 | return 'PotentialDataLoss' 20 | 21 | if 'ConnectionLost' in exception_class: 22 | return 'ConnectionLost' 23 | 24 | return exception_class 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /scrapeops_scrapy/normalizer/middleware.py: -------------------------------------------------------------------------------- 1 | from scrapeops_scrapy.core.api import SOPSRequest 2 | from scrapeops_scrapy.validators.response_validator import ResponseValidator 3 | from scrapeops_scrapy.normalizer.proxies import ProxyNormalizer 4 | from scrapeops_scrapy.normalizer.proxy_port_normalizer import ProxyPortStringNormalizer 5 | 6 | 7 | class RequestResponseMiddleware(object): 8 | 9 | PROXY_DOMAIN_NORMALIZATION = True 10 | RESPONSE_VALIDATION = True 11 | PROXY_ALERTS = False 12 | FAILED_URL_LOGGER_ENABLED = True 13 | LOG_MISSED_URLS = False 14 | 15 | def __init__(self, job_group_id, proxy_apis, generic_validators, error_logger, allowed_response_codes): 16 | self.job_group_id = job_group_id 17 | self._proxy_apis = proxy_apis 18 | self._data_coverage_validation = False 19 | self._domains = {} 20 | self._proxies = {} 21 | self._proxy_port_setups = {} 22 | self._generic_validators = generic_validators 23 | self._allowed_response_codes = allowed_response_codes 24 | self._error_logger = error_logger 25 | self._error_count = 0 26 | self._error_alerts_sent = {} 27 | self._missed_urls = {} 28 | 29 | 30 
| def process(self, request_response_object, response): 31 | self.normalise_domain_proxy_data(request_response_object) 32 | self.check_proxy_responses(request_response_object, response) 33 | self.validate_response_data(request_response_object, response) 34 | 35 | 36 | def normalise_domain_proxy_data(self, request_response_object): 37 | if RequestResponseMiddleware.PROXY_DOMAIN_NORMALIZATION: 38 | 39 | proxy_api = self.normalise_proxy_api(request_response_object) 40 | if proxy_api is False: 41 | self.normalise_proxy_port(request_response_object) 42 | self.normalise_domain_data(request_response_object) 43 | 44 | if RequestResponseMiddleware.PROXY_DOMAIN_NORMALIZATION is False: 45 | request_response_object.fallback_domain_proxy_details(reason='disabled') 46 | 47 | 48 | def normalise_proxy_api(self, request_response_object): 49 | try: 50 | proxy_api, update = request_response_object.check_proxy_api(self._proxy_apis) 51 | if proxy_api and update: 52 | data, status = SOPSRequest().proxy_api_normalisation_request(request_response_object) 53 | if status.valid: 54 | self._proxy_apis[request_response_object.get_proxy_api_name()] = data.get('proxy_parsing_data') 55 | request_response_object.update_proxy_api(data.get('proxy_parsing_data')) 56 | else: 57 | if self._proxy_apis.get(request_response_object.get_proxy_api_name()) is None: 58 | self._proxy_apis[request_response_object.get_proxy_api_name()] = {} 59 | self._proxy_apis[request_response_object.get_proxy_api_name()]['proxy_setup'] = {} 60 | self._error_logger.log_error(reason='get_proxy_api_details_failed', 61 | error=status.error, 62 | data={'proxy_api': request_response_object.get_proxy_api_name()}) 63 | request_response_object.fallback_proxy_details(proxy_type='proxy_api', proxy_apis=self._proxy_apis) 64 | 65 | except Exception: 66 | request_response_object.fallback_proxy_details(proxy_type='proxy_api', proxy_apis=self._proxy_apis) 67 | 68 | return proxy_api or False 69 | 70 | 71 | def normalise_proxy_port(self, request_response_object): 72 | try: 73 | if request_response_object.active_proxy_port(): 74 | named_proxy, update = request_response_object.check_proxy_port_type(self._proxies) 75 | if named_proxy and update: 76 | data, status = SOPSRequest().proxy_port_normalisation_request(request_response_object) 77 | if status.valid: 78 | ProxyNormalizer.update_proxy_details(self._proxies, request_response_object, data, valid=True) 79 | ProxyPortStringNormalizer.proxy_port_test(self._proxies, request_response_object, data, valid=True) 80 | else: 81 | ProxyNormalizer.update_proxy_details(self._proxies, request_response_object, data, valid=False) 82 | self._error_logger.log_error(reason='get_proxy_port_details_failed', 83 | error=status.error, 84 | data={'proxy_port': request_response_object.get_raw_proxy()}) 85 | 86 | ## Using No Proxy 87 | if request_response_object.active_proxy() is False: 88 | request_response_object.update_no_proxy() 89 | 90 | except Exception: 91 | request_response_object.fallback_proxy_details(proxy_type='proxy_port') 92 | 93 | 94 | def normalise_domain_data(self, request_response_object): 95 | try: 96 | ## Normalise domain/page type data 97 | unknown = request_response_object.check_domain(self._domains) 98 | if unknown: 99 | data, status = SOPSRequest().domain_normalisation_request(request_response_object) 100 | if status.valid: 101 | self._domains[request_response_object.get_domain()] = data.get('domain_parsing_data') 102 | request_response_object.update_page_type(data.get('domain_parsing_data')) 103 | else: 104 | if 
self._domains.get(request_response_object.get_domain()) is None: 105 | self._domains[request_response_object.get_domain()] = {} 106 | self._domains[request_response_object.get_domain()]['url_contains_page_types'] = {} 107 | self._domains[request_response_object.get_domain()]['query_param_page_types'] = {} 108 | self._domains[request_response_object.get_domain()]['validation_details'] = [] 109 | self._error_logger.log_error(reason='get_domain_details_failed', 110 | error=status.error, 111 | data={'real_url': request_response_object.get_real_url()}) 112 | request_response_object.fallback_domain_data() 113 | 114 | except Exception: 115 | request_response_object.fallback_domain_data() 116 | 117 | 118 | def check_proxy_responses(self, request_response_object, response): 119 | if RequestResponseMiddleware.PROXY_ALERTS: 120 | if request_response_object.active_proxy_api(): 121 | proxy_details = self._proxy_apis.get(request_response_object.get_proxy_api_name()) 122 | if proxy_details is not None: 123 | self.check_proxy_error_codes(request_response_object, proxy_details, response) 124 | 125 | if request_response_object.active_named_proxy(): 126 | proxy_details = self._proxies.get(request_response_object.get_proxy_port_name()) 127 | if proxy_details is not None: 128 | self.check_proxy_error_codes(request_response_object, proxy_details, response) 129 | 130 | 131 | def check_proxy_error_codes(self, request_response_object, proxy_details, response): 132 | error_codes = proxy_details.get('error_codes') 133 | if error_codes is not None: 134 | status_code = str(response.status) 135 | error_response = error_codes.get(status_code) 136 | if error_response is not None: 137 | if error_response.get('action') == 'alert' and self.should_alert(error_response, status_code): 138 | _, status = SOPSRequest().proxy_alert_request(request_response_object, self.job_group_id, error_response, self._error_alerts_sent.get(status_code)) 139 | if status.valid: 140 | self._error_alerts_sent[status_code] += 1 141 | elif error_response.get('action') == 'monitor': 142 | self._error_count += 1 143 | if self._error_count > error_response.get('error_limit', 0) and self.should_alert(error_response, status_code): 144 | _, status = SOPSRequest().proxy_alert_request(request_response_object, self.job_group_id, error_response, self._error_alerts_sent.get(status_code)) 145 | if status.valid: 146 | self._error_alerts_sent[status_code] += 1 147 | 148 | 149 | def should_alert(self, error_response, status_code): 150 | if self._error_alerts_sent.get(status_code) is None: 151 | self._error_alerts_sent[status_code] = 0 152 | return True 153 | if self._error_alerts_sent.get(status_code) is not None: 154 | if self._error_alerts_sent[status_code] < error_response.get('alert_limit'): 155 | return True 156 | return False 157 | 158 | 159 | def validate_response_data(self, request_response_object, response=None): 160 | if RequestResponseMiddleware.RESPONSE_VALIDATION and response is not None: 161 | if response.status == 200: 162 | domain_tests = ResponseValidator.get_domain_tests(request_response_object, self._domains) 163 | ResponseValidator.validate(request_response_object, response, domain_tests=domain_tests, generic_tests=self._generic_validators) 164 | 165 | if response.status != 200 and ResponseValidator.failed_scan(request_response_object, self._domains): 166 | ResponseValidator.validate(request_response_object, response, generic_tests=self._generic_validators) 167 | 168 | 169 | def failed_url(self, request_response_object, response=None): 170 | if 
RequestResponseMiddleware.FAILED_URL_LOGGER_ENABLED: 171 | if (response.status < 200 and response.status > 300) and (response.status not in self._allowed_response_codes): 172 | if self._missed_urls.get('count') is None: 173 | self._missed_urls['count'] = 0 174 | self._missed_urls['count'] += 1 175 | 176 | if RequestResponseMiddleware.LOG_MISSED_URLS: 177 | if self._missed_urls.get(response.status) is None: 178 | self._missed_urls[response.status] = [] 179 | self._missed_urls[response.status].append(request_response_object.get_real_url()) 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /scrapeops_scrapy/normalizer/proxies.py: -------------------------------------------------------------------------------- 1 | import re 2 | import socket 3 | 4 | from base64 import b64decode 5 | from urllib.parse import unquote, urlparse 6 | 7 | from scrapeops_scrapy.normalizer.domains import DomainNormalizer 8 | from scrapeops_scrapy.exceptions import DecodeError 9 | 10 | 11 | class ProxyNormalizer(object): 12 | 13 | def __init__(self): 14 | pass 15 | 16 | @staticmethod 17 | def check_named_proxy(proxy_string): 18 | try: 19 | proxy_address = DomainNormalizer.get_full_domain(proxy_string) 20 | proxy_domain = DomainNormalizer.get_domain(proxy_string) 21 | return True, proxy_address, proxy_domain 22 | except Exception: 23 | return False, 'ip_list_proxy' 24 | 25 | @staticmethod 26 | def remove_brackets(string): 27 | characters_to_remove = ['[',']'] 28 | new_string = string 29 | for character in characters_to_remove: 30 | new_string = new_string.replace(character, "") 31 | return new_string 32 | 33 | 34 | @staticmethod 35 | def check_ip_address(proxy_string): 36 | s = ProxyNormalizer.remove_brackets(proxy_string) 37 | ipv6_split_string = re.split('://|@|/', s) 38 | for el in ipv6_split_string: 39 | if ProxyNormalizer.is_valid_ipv6_address(el): return True 40 | 41 | ipv4_split_string = re.split('://|:|@', proxy_string) 42 | for el in ipv4_split_string: 43 | if ProxyNormalizer.is_valid_ipv4_address(el): return True 44 | return False 45 | 46 | 47 | @staticmethod 48 | def is_valid_ipv4_address(address): 49 | try: 50 | socket.inet_pton(socket.AF_INET, address) 51 | except AttributeError: # no inet_pton here, sorry 52 | try: 53 | socket.inet_aton(address) 54 | except socket.error: 55 | return False 56 | return address.count('.') == 3 57 | except socket.error: # not a valid address 58 | return False 59 | 60 | return True 61 | 62 | @staticmethod 63 | def is_valid_ipv6_address(address): 64 | try: 65 | socket.inet_pton(socket.AF_INET6, address) 66 | except socket.error: # not a valid address 67 | return False 68 | return True 69 | 70 | @staticmethod 71 | def get_proxy_port(proxy_string): 72 | try: 73 | return urlparse(proxy_string).port 74 | except Exception: 75 | return '80' 76 | 77 | @staticmethod 78 | def get_proxy_host(proxy_string): 79 | try: 80 | return DomainNormalizer.get_full_domain(proxy_string) 81 | except Exception: 82 | return 'ip_list_proxy' 83 | 84 | @staticmethod 85 | def get_proxy_scheme(proxy_string): 86 | try: 87 | return urlparse(proxy_string).scheme 88 | except Exception: 89 | return '' 90 | 91 | @staticmethod 92 | def unknown_proxy_scheme(proxy_string): 93 | if ProxyNormalizer.get_proxy_scheme(proxy_string) == '': 94 | return True 95 | return False 96 | 97 | @staticmethod 98 | def convert_bytes_to_string(inputValue): 99 | if isinstance(inputValue, (str, 
int)): 100 | return inputValue 101 | if isinstance(inputValue, (bytes, bytearray)): 102 | return inputValue.decode('utf-8') 103 | if isinstance(inputValue, list): 104 | tempList = [] 105 | for el in inputValue: 106 | if isinstance(el, (bytes, bytearray)): 107 | tempList.append(el.decode('utf-8')) 108 | elif isinstance(el, list): 109 | tempList.append(['']) 110 | elif isinstance(el, dict): 111 | tempList.append({'': ''}) 112 | else: 113 | tempList.append(el) 114 | return tempList 115 | return inputValue 116 | 117 | @staticmethod 118 | def convert_headers(raw_headers): 119 | header_dict = {} 120 | try: 121 | for key, value in raw_headers.items(): 122 | k = ProxyNormalizer.convert_bytes_to_string(key) 123 | v = ProxyNormalizer.convert_bytes_to_string(value) 124 | header_dict[k] = v 125 | return header_dict 126 | except Exception: 127 | return header_dict 128 | 129 | @staticmethod 130 | def decode_basic_auth(auth_string): 131 | """Decode an encrypted HTTP basic authentication string. Returns a tuple of 132 | the form (username, password), and raises a DecodeError exception if 133 | nothing could be decoded. 134 | """ 135 | split = auth_string.strip().split(' ') 136 | 137 | # If split is only one element, try to decode the username and password 138 | # directly. 139 | if len(split) == 1: 140 | try: 141 | username, password = b64decode(split[0]).decode().split(':', 1) 142 | except Exception: 143 | raise DecodeError 144 | 145 | # If there are only two elements, check the first and ensure it says 146 | # 'basic' so that we know we're about to decode the right thing. If not, 147 | # bail out. 148 | elif len(split) == 2: 149 | if split[0].strip().lower() == 'basic': 150 | try: 151 | username, password = b64decode(split[1]).decode().split(':', 1) 152 | except Exception: 153 | raise DecodeError 154 | else: 155 | raise DecodeError 156 | 157 | # If there are more than 2 elements, something crazy must be happening. 158 | # Bail. 
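
`ProxyNormalizer.decode_basic_auth()` unpacks the base64-encoded `Proxy-Authorization` header that proxy-port integrations attach to each request, returning a `(username, password)` tuple. A minimal round trip, with credentials invented purely for illustration, might look like this:

```python
from base64 import b64encode

from scrapeops_scrapy.normalizer.proxies import ProxyNormalizer

# A typical header value: "Basic " followed by base64("username:password").
auth_header = 'Basic ' + b64encode(b'user.country-us:secret-pass').decode('utf-8')

username, password = ProxyNormalizer.decode_basic_auth(auth_header)
assert (username, password) == ('user.country-us', 'secret-pass')
```
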
159 | else: 160 | raise DecodeError 161 | 162 | return unquote(username), unquote(password) 163 | 164 | @staticmethod 165 | def create_dict_if_none_exists(dict, key): 166 | if dict.get(key) is None: 167 | dict[key] = {} 168 | 169 | @staticmethod 170 | def update_proxy_details(proxy_dict, request_response_object, data, valid=False): 171 | proxy_name = request_response_object.get_proxy_port_name() 172 | if proxy_dict.get(proxy_name) is None: 173 | proxy_dict[proxy_name] = {} 174 | 175 | ## Update counter 176 | proxy_port_details = data.get('proxy_port_details') 177 | count = proxy_dict[proxy_name].get('count', 0) 178 | proxy_dict[proxy_name]['count'] = count + 1 179 | proxy_dict[proxy_name]['max_count'] = proxy_port_details.get('max_count', 3) 180 | 181 | if valid: 182 | proxy_dict[proxy_name]['normalization_actions'] = data.get('normalization_actions') 183 | proxy_dict[proxy_name]['fallback'] = data.get('fallback', 'port') 184 | 185 | 186 | proxy_setup_key = proxy_port_details.get('proxy_setup_key') 187 | proxy_setup_value = proxy_port_details.get('proxy_setup_value') 188 | if proxy_setup_value is None: 189 | proxy_setup_value = data.get('fallback', 'port_type=unknown') 190 | proxy_dict[proxy_name][proxy_setup_key] = proxy_setup_value 191 | proxy_dict[proxy_name]['known'] = proxy_port_details.get('proxy_known_domain', False) 192 | request_response_object.update_proxy_port(proxy_name, proxy_setup_value) 193 | 194 | 195 | else: 196 | proxy_dict[proxy_name]['fallback'] = 'port' 197 | request_response_object.fallback_proxy_details(proxy_type='proxy_port') 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | -------------------------------------------------------------------------------- /scrapeops_scrapy/normalizer/proxy_port_normalizer.py: -------------------------------------------------------------------------------- 1 | from scrapeops_scrapy.core.api import SOPSRequest 2 | 3 | 4 | class ProxyPortStringNormalizer(object): 5 | 6 | def __init__(self): 7 | pass 8 | 9 | 10 | @staticmethod 11 | def run_proxy_string_normalization(request_response_object, normalization_actions): 12 | 13 | if normalization_actions is not None: 14 | for action_type, actions in normalization_actions.items(): 15 | if actions is not None: 16 | if action_type == 'username': 17 | for action_block in actions: 18 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_normalized_proxy_port_username(), action_block) 19 | request_response_object.set_normalized_proxy_port_username(updated) 20 | 21 | if action_type == 'password': 22 | for action_block in actions: 23 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_normalized_proxy_port_password(), action_block) 24 | request_response_object.set_normalized_proxy_port_password(updated) 25 | 26 | if action_type == 'host': 27 | for action_block in actions: 28 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_normalized_proxy_port_host(), action_block) 29 | request_response_object.set_normalized_proxy_port_host(updated) 30 | 31 | if action_type == 'port': 32 | for action_block in actions: 33 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_normalized_proxy_port_port(), action_block) 34 | request_response_object.set_normalized_proxy_port_port(updated) 35 | 36 | if action_type == 'headers': 37 | for action_block in actions: 38 | updated = 
ProxyPortStringNormalizer.process_action(request_response_object.get_proxy_port_headers(), action_block) 39 | if updated is not None: 40 | request_response_object.update_normalized_proxy_port_header_string(updated) 41 | 42 | 43 | @staticmethod 44 | def process_action(inputValue, action_block): 45 | 46 | if action_block.get('action') == 'contains_replace': 47 | return ProxyPortStringNormalizer.contains_replace(inputValue, action_block) 48 | 49 | if action_block.get('action') == 'contains_replace_all': 50 | return ProxyPortStringNormalizer.contains_replace_all(inputValue, action_block) 51 | 52 | if action_block.get('action') == 'not_contains_replace_all': 53 | return ProxyPortStringNormalizer.not_contains_replace_all(inputValue, action_block) 54 | 55 | if action_block.get('action') == 'replace_key_value': 56 | return ProxyPortStringNormalizer.replace_key_value(inputValue, action_block) 57 | 58 | if action_block.get('action') == 'replace_key_seperator_value': 59 | return ProxyPortStringNormalizer.replace_key_seperator_value(inputValue, action_block) 60 | 61 | if action_block.get('action') == 'check_headers_contains': 62 | return ProxyPortStringNormalizer.check_headers_contains(inputValue, action_block) 63 | 64 | if action_block.get('action') == 'not_ends_in_replace': 65 | return ProxyPortStringNormalizer.not_ends_in_replace(inputValue, action_block) 66 | 67 | if action_block.get('action') == 'ends_in_replace': 68 | return ProxyPortStringNormalizer.ends_in_replace(inputValue, action_block) 69 | 70 | if action_block.get('action') == 'equals_replace': 71 | return ProxyPortStringNormalizer.equals_replace(inputValue, action_block) 72 | 73 | if action_block.get('action') == 'not_equals_replace': 74 | return ProxyPortStringNormalizer.not_equals_replace(inputValue, action_block) 75 | 76 | if action_block.get('action') == 'is_none_replace': 77 | return ProxyPortStringNormalizer.is_none_replace(inputValue, action_block) 78 | 79 | if action_block.get('action') == 'in_list_replace': 80 | return ProxyPortStringNormalizer.in_list_replace(inputValue, action_block) 81 | 82 | if action_block.get('action') == 'not_in_list_replace': 83 | return ProxyPortStringNormalizer.not_in_list_replace(inputValue, action_block) 84 | 85 | 86 | """ 87 | Conditional Checks 88 | """ 89 | @staticmethod 90 | def conditional_checks(inputString, condition=None): 91 | if condition is not None and condition.get('type') is not None: 92 | 93 | ## If substring in string 94 | if condition.get('type') == "contains": 95 | if condition.get('value') in inputString: 96 | return True 97 | return False 98 | 99 | if condition.get('type') == "not_contains": 100 | if condition.get('value') not in inputString: 101 | return True 102 | return False 103 | 104 | if condition.get('type') == "equals": 105 | if condition.get('value') == inputString: 106 | return True 107 | return False 108 | 109 | if condition.get('type') == "not_equal": 110 | if condition.get('value') != inputString: 111 | return True 112 | return False 113 | 114 | if condition.get('type') == "not_none": 115 | if inputString is not None: 116 | return True 117 | return False 118 | 119 | 120 | ## If all tests fail 121 | return False 122 | 123 | return True 124 | 125 | @staticmethod 126 | def get_condition_arguements(action_block): 127 | return action_block.get('condition'), action_block.get('arguements') 128 | 129 | 130 | """ 131 | Actions 132 | """ 133 | 134 | @staticmethod 135 | def replace_key_value(inputString, action_block): 136 | condition, arguements = 
ProxyPortStringNormalizer.get_condition_arguements(action_block) 137 | 138 | substring = arguements.get('substring') 139 | string_seperator = arguements.get('seperator') 140 | replacement = arguements.get('replacement') 141 | 142 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 143 | outputString = inputString 144 | splitString = inputString.split(string_seperator) 145 | for el in splitString: 146 | if substring.startswith('**'): 147 | if el.split('=')[0] == substring[2:]: 148 | outputString = outputString.replace(el, replacement) 149 | elif substring in el: 150 | outputString = outputString.replace(el, replacement) 151 | return outputString 152 | 153 | return inputString 154 | 155 | 156 | @staticmethod 157 | def replace_key_seperator_value(inputString, action_block): 158 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 159 | 160 | substring = arguements.get('substring') 161 | string_seperator = arguements.get('seperator') 162 | replacement = arguements.get('replacement') 163 | next_value = arguements.get('next_value') 164 | 165 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 166 | outputString = inputString 167 | splitString = inputString.split(string_seperator) 168 | for i, el in enumerate(splitString): 169 | if substring == el: 170 | if i + int(next_value) <= len(splitString): 171 | outputString = outputString.replace(string_seperator + splitString[i + int(next_value)], replacement) 172 | return outputString 173 | 174 | return inputString 175 | 176 | 177 | @staticmethod 178 | def check_headers_contains(inputheaders, action_block): 179 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 180 | value = condition.get('value') 181 | if inputheaders.get(value) is not None: 182 | header_value = inputheaders.get(value) 183 | if type(header_value) is list: 184 | header_value = header_value[0] 185 | value_check = arguements.get('check_type') 186 | if value_check == 'equals': 187 | if header_value == arguements.get('value'): 188 | return arguements.get('addition') 189 | if value_check == 'not_equal': 190 | if header_value != arguements.get('value'): 191 | return arguements.get('addition') 192 | if value_check is None: 193 | return arguements.get('addition') 194 | return None 195 | 196 | @staticmethod 197 | def not_ends_in_replace(inputString, action_block): 198 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 199 | 200 | if isinstance(inputString, str) is False: 201 | inputString = str(inputString) 202 | 203 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 204 | substring = arguements.get('substring') 205 | if inputString.endswith(substring) is False: 206 | return arguements.get('replacement') 207 | 208 | return inputString 209 | 210 | @staticmethod 211 | def ends_in_replace(inputString, action_block): 212 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 213 | 214 | if isinstance(inputString, str) is False: 215 | inputString = str(inputString) 216 | 217 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 218 | substring = arguements.get('substring') 219 | if inputString.endswith(substring): 220 | return arguements.get('replacement') 221 | 222 | return inputString 223 | 224 | 225 | @staticmethod 226 | def not_equals_replace(inputString, action_block): 227 | condition, arguements = 
ProxyPortStringNormalizer.get_condition_arguements(action_block) 228 | 229 | if isinstance(inputString, str) is False: 230 | inputString = str(inputString) 231 | 232 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 233 | substring = arguements.get('substring') 234 | if inputString != substring: 235 | return arguements.get('replacement') 236 | 237 | return inputString 238 | 239 | 240 | @staticmethod 241 | def equals_replace(inputString, action_block): 242 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 243 | 244 | if isinstance(inputString, str) is False: 245 | inputString = str(inputString) 246 | 247 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 248 | substring = arguements.get('substring') 249 | if inputString == substring: 250 | return arguements.get('replacement') 251 | 252 | return inputString 253 | 254 | 255 | 256 | @staticmethod 257 | def contains_replace(inputString, action_block): 258 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 259 | 260 | if isinstance(inputString, str) is False: 261 | inputString = str(inputString) 262 | 263 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 264 | substring = arguements.get('substring') 265 | replacement = arguements.get('replacement') 266 | return inputString.replace(substring, replacement) 267 | 268 | return inputString 269 | 270 | 271 | @staticmethod 272 | def not_contains_replace_all(inputString, action_block): 273 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 274 | 275 | if isinstance(inputString, str) is False: 276 | inputString = str(inputString) 277 | 278 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 279 | substring = arguements.get('substring') 280 | if substring not in inputString: 281 | return arguements.get('replacement') 282 | 283 | return inputString 284 | 285 | @staticmethod 286 | def contains_replace_all(inputString, action_block): 287 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 288 | 289 | if isinstance(inputString, str) is False: 290 | inputString = str(inputString) 291 | 292 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 293 | substring = arguements.get('substring') 294 | if substring in inputString: 295 | return arguements.get('replacement') 296 | 297 | return inputString 298 | 299 | 300 | @staticmethod 301 | def is_none_replace(inputString, action_block): 302 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 303 | 304 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 305 | if inputString is None: 306 | return arguements.get('replacement') 307 | 308 | return inputString 309 | 310 | @staticmethod 311 | def in_list_replace(inputString, action_block): 312 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 313 | 314 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 315 | list = arguements.get('list', '').split(',') 316 | if inputString in list: 317 | return arguements.get('replacement') 318 | 319 | return inputString 320 | 321 | 322 | @staticmethod 323 | def not_in_list_replace(inputString, action_block): 324 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block) 325 | 326 | if 
ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition): 327 | list = arguements.get('list', '').split(',') 328 | if inputString not in list: 329 | return arguements.get('replacement') 330 | 331 | return inputString 332 | 333 | 334 | @staticmethod 335 | def proxy_port_test(proxy_dict, request_response_object, data, valid=False): 336 | if valid: 337 | proxy_name = request_response_object.get_proxy_port_name() 338 | test_request = data.get('test_request') 339 | proxy_port_details = data.get('proxy_port_details') 340 | proxy_setup_key = proxy_port_details.get('proxy_setup_key') 341 | if proxy_dict[proxy_name].get('sops_test_request') is None: 342 | proxy_dict[proxy_name]['sops_test_request'] = test_request 343 | test_request_count = proxy_dict[proxy_name]['sops_test_request'].get('count', 0) 344 | if test_request.get('send') and test_request_count < test_request.get('max_count', 1): 345 | proxy_dict[proxy_name]['sops_test_request']['count'] = test_request_count + 1 346 | json = SOPSRequest().proxy_test_request(test_request.get('url'), request_response_object) 347 | json['test_id'] = test_request.get('test_id') 348 | updated_data, status = SOPSRequest().proxy_port_normalisation_request(request_response_object, test_data=json) 349 | if status.valid: 350 | proxy_port_details = updated_data.get('proxy_port_details') 351 | proxy_setup_value = proxy_port_details.get('proxy_setup_value') 352 | proxy_dict[proxy_name][proxy_setup_key] = proxy_setup_value 353 | 354 | 355 | 356 | 357 | 358 | -------------------------------------------------------------------------------- /scrapeops_scrapy/normalizer/request_response.py: -------------------------------------------------------------------------------- 1 | from scrapeops_scrapy.normalizer.domains import DomainNormalizer 2 | from scrapeops_scrapy.normalizer.proxies import ProxyNormalizer 3 | from scrapeops_scrapy.normalizer.proxy_port_normalizer import ProxyPortStringNormalizer 4 | 5 | 6 | class BaseRequestResponse(object): 7 | """ 8 | Normalised request/response data structure. 
9 | """ 10 | 11 | def __init__(self): 12 | self.signal_type = None 13 | self.request = None 14 | self.raw_url = None 15 | self.raw_proxy_port = None 16 | self.raw_domain = None 17 | self.raw_headers = None 18 | 19 | ## Proxy Checks 20 | self._active_proxy = None 21 | self._active_proxy_port=None 22 | self._real_url = None 23 | self._ip_proxy_list = False 24 | self._named_proxy = False 25 | 26 | ## Proxy Port 27 | self._proxy_port_name = None 28 | self._complete_proxy_port_string = None 29 | self._proxy_setup_key = None 30 | 31 | self._proxy_port_scheme = '' 32 | self._proxy_port_username = '' 33 | self._proxy_port_password = '' 34 | self._proxy_port_host = '' 35 | self._proxy_port_port = '' 36 | self._proxy_port_headers = {} 37 | 38 | self._normalized_proxy_port_username = None 39 | self._normalized_proxy_port_password = None 40 | self._normalized_proxy_port_host = None 41 | self._normalized_proxy_port_port = None 42 | self._normalized_proxy_port_header_string = None 43 | 44 | 45 | ## Proxy API 46 | self._proxy_api = False 47 | self._proxy_api_name = None 48 | 49 | ## Validation 50 | self._validation_test = None 51 | self._geo = None 52 | self._custom_tag = None 53 | self.json_response_keys = [] 54 | 55 | ## Final 56 | self._domain = None 57 | self._page_type = None 58 | self._proxy_type = None 59 | self._proxy_name = None 60 | self._proxy_setup = None 61 | 62 | 63 | """ 64 | Getters 65 | """ 66 | 67 | def get_proxy_name(self): 68 | return self._proxy_name or 'unknown' 69 | 70 | def get_proxy_setup(self): 71 | return self._proxy_setup or 'unknown' 72 | 73 | def get_domain(self): 74 | return self._domain or 'unknown' 75 | 76 | def get_page_type(self): 77 | return self._page_type or 'unknown' 78 | 79 | def get_proxy_api_name(self): 80 | return self._proxy_api_name 81 | 82 | def get_proxy_port_name(self): 83 | return self._proxy_port_name 84 | 85 | def get_raw_proxy(self): 86 | return self.raw_proxy_port 87 | 88 | def get_real_url(self): 89 | return self._real_url or 'unknown' 90 | 91 | def get_validation_test(self): 92 | return self._validation_test or 'pass' 93 | 94 | def get_geo(self): 95 | return self._geo or 'none' 96 | 97 | def get_custom_tag(self): 98 | return self._custom_tag or 'none' 99 | 100 | def get_proxy_port_username(self): 101 | return self._proxy_port_username 102 | 103 | def get_proxy_port_password(self): 104 | return self._proxy_port_password 105 | 106 | def get_proxy_port_host(self): 107 | return self._proxy_port_host 108 | 109 | def get_proxy_port_port(self): 110 | return self._proxy_port_port 111 | 112 | def get_proxy_port_headers(self): 113 | if self._proxy_port_headers == {}: 114 | self._proxy_port_headers = ProxyNormalizer.convert_headers(self.raw_headers) 115 | return self._proxy_port_headers 116 | 117 | def get_complete_proxy_string(self): 118 | if self._complete_proxy_port_string is None: 119 | self._complete_proxy_port_string = "{}://{}:{}@{}:{}".format(self._proxy_port_scheme, self._proxy_port_username, self._proxy_port_password, 120 | self._proxy_port_host, self._proxy_port_port) 121 | return self._complete_proxy_port_string 122 | 123 | def get_normalized_proxy_port_username(self): 124 | if self._normalized_proxy_port_username is None: 125 | return self._proxy_port_username 126 | return self._normalized_proxy_port_username 127 | 128 | def get_normalized_proxy_port_password(self): 129 | if self._normalized_proxy_port_password is None: 130 | return self._proxy_port_password 131 | return self._normalized_proxy_port_password 132 | 133 | def 
get_normalized_proxy_port_host(self): 134 | if self._normalized_proxy_port_host is None: 135 | return self._proxy_port_host 136 | return self._normalized_proxy_port_host 137 | 138 | def get_normalized_proxy_port_port(self): 139 | if self._normalized_proxy_port_port is None: 140 | return self._proxy_port_port 141 | return self._normalized_proxy_port_port 142 | 143 | def get_normalized_proxy_port_header_string(self): 144 | if self._normalized_proxy_port_header_string is not None: 145 | return f' -H {self._normalized_proxy_port_header_string}' 146 | return '' 147 | 148 | def is_json_response(self): 149 | if len(self.json_response_keys) > 0: 150 | return True 151 | return False 152 | 153 | def get_json_response_keys(self): 154 | return self.json_response_keys 155 | 156 | 157 | """ 158 | SETTERS 159 | """ 160 | 161 | def set_normalized_proxy_port_username(self, username): 162 | self._normalized_proxy_port_username = username 163 | 164 | def set_normalized_proxy_port_password(self, password): 165 | self._normalized_proxy_port_password = password 166 | 167 | def set_normalized_proxy_port_host(self, host): 168 | self._normalized_proxy_port_host = host 169 | 170 | def set_normalized_proxy_port_port(self, port): 171 | self._normalized_proxy_port_port = port 172 | 173 | def update_normalized_proxy_port_header_string(self, header_string): 174 | if self._normalized_proxy_port_header_string is None: 175 | self._normalized_proxy_port_header_string = header_string 176 | else: 177 | self._normalized_proxy_port_header_string = f'{self._normalized_proxy_port_header_string} {header_string}' 178 | 179 | 180 | """ 181 | Proxy Type Methods 182 | """ 183 | 184 | def active_proxy(self): 185 | return True if self._active_proxy else False 186 | 187 | def active_proxy_port(self): 188 | return True if self._active_proxy_port else False 189 | 190 | def active_proxy_api(self): 191 | return self._proxy_api 192 | 193 | def active_named_proxy(self): 194 | return self._named_proxy 195 | 196 | 197 | 198 | 199 | 200 | 201 | class RequestResponse(BaseRequestResponse): 202 | 203 | def __init__(self, signal_type=None, request=None, response=None): 204 | BaseRequestResponse.__init__(self) 205 | self.signal_type = signal_type 206 | if request is not None or response is not None: 207 | self.request = response.request if request is None else request 208 | self.raw_url = request.url if response is None else response.url 209 | self.raw_proxy_port = self.request.meta.get('proxy') 210 | self.raw_domain = DomainNormalizer.get_domain(self.raw_url) 211 | self._active_proxy = self._active_proxy_port = False if self.raw_proxy_port is None else True 212 | self.raw_headers = self.request.headers 213 | 214 | """ 215 | Domain Normalization 216 | """ 217 | 218 | def check_domain(self, domain_obj): 219 | domain_details = domain_obj.get(self._domain) 220 | if domain_details is not None: 221 | self._page_type = DomainNormalizer.get_page_type(self._real_url, domain_data=domain_details) 222 | return False 223 | return True 224 | 225 | 226 | def update_page_type(self, domain_details): 227 | if domain_details is not None: 228 | self._page_type = DomainNormalizer.get_page_type(self._real_url, domain_data=domain_details) 229 | 230 | 231 | def fallback_domain_data(self): 232 | if self._domain is None: 233 | self._domain = DomainNormalizer.get_domain(self.raw_url) 234 | self._page_type = 'none' 235 | 236 | 237 | """ 238 | Proxy Port Normalization 239 | """ 240 | 241 | def check_proxy_port_type(self, proxy_ports): 242 | if 
ProxyNormalizer.check_ip_address(self.raw_proxy_port): 243 | self._proxy_type = 'proxy_ip_list' 244 | self._real_url = self.raw_url 245 | self._domain = self.raw_domain 246 | self._proxy_name = 'unknown_ip' 247 | self._proxy_setup = 'ip_address' 248 | return False, False 249 | 250 | self._named_proxy, self._proxy_port_host, self._proxy_port_name = ProxyNormalizer.check_named_proxy(self.raw_proxy_port) 251 | 252 | if self._named_proxy: 253 | self._proxy_type = 'named_proxy_port' 254 | self._real_url = self.raw_url 255 | self._domain = self.raw_domain 256 | self.get_proxy_port_details() 257 | 258 | proxy_details = proxy_ports.get(self._proxy_port_name) 259 | 260 | if proxy_details is not None: 261 | 262 | if proxy_details.get(self._complete_proxy_port_string) is not None: 263 | self._proxy_setup = proxy_details.get(self._complete_proxy_port_string) 264 | elif proxy_details.get(self._complete_proxy_port_string) is None and proxy_details.get('known', False): 265 | ProxyPortStringNormalizer.run_proxy_string_normalization(self, proxy_ports[self._proxy_port_name].get('normalization_actions')) 266 | self.create_normalized_proxy_port_string() 267 | self._proxy_setup = proxy_details.get(self._normalized_proxy_port_string) 268 | 269 | if self._proxy_setup is None: 270 | self._proxy_setup = proxy_details.get('fallback') 271 | if proxy_details.get('count') > proxy_details.get('max_count') or proxy_details.get('known') is False: 272 | return True, False 273 | ## Get details 274 | return True, True 275 | return True, False 276 | 277 | ## get proxy details 278 | return True, True 279 | 280 | 281 | def get_proxy_port_details(self): 282 | self._proxy_name = self._proxy_port_name 283 | self._proxy_port_port = ProxyNormalizer.get_proxy_port(self.raw_proxy_port) 284 | self._proxy_port_scheme = ProxyNormalizer.get_proxy_scheme(self.raw_proxy_port) 285 | if self.raw_headers.get('Proxy-Authorization') is not None: 286 | auth_string = self.raw_headers.get('Proxy-Authorization').decode('utf-8') 287 | self._proxy_port_username, self._proxy_port_password = ProxyNormalizer.decode_basic_auth(auth_string) 288 | self._complete_proxy_port_string = "{}://{}:{}@{}:{}".format(self._proxy_port_scheme, self._proxy_port_username, self._proxy_port_password, 289 | self._proxy_port_host, self._proxy_port_port) 290 | 291 | def create_normalized_proxy_port_string(self): 292 | username = self.get_normalized_proxy_port_username() 293 | password = self.get_normalized_proxy_port_password() 294 | host = self.get_normalized_proxy_port_host() 295 | port = self.get_normalized_proxy_port_port() 296 | header_string = self.get_normalized_proxy_port_header_string() 297 | self._normalized_proxy_port_string = "{}://{}:{}@{}:{}".format(self._proxy_port_scheme, username, password, host, port) 298 | if header_string != '': 299 | self._normalized_proxy_port_string = self._normalized_proxy_port_string + header_string 300 | 301 | def proxy_port_setup(self, proxy_details): 302 | proxy_setup = proxy_details.get('proxy_setup') 303 | if proxy_setup is None: 304 | return 'none' 305 | proxy_string = 'port' 306 | ## Generate settings string 307 | return proxy_string 308 | 309 | def update_proxy_port(self, proxy_name, proxy_setup_value): 310 | self._active_proxy = True 311 | self._proxy_api = False 312 | self._proxy_type = 'named_proxy_port' 313 | self._proxy_name = proxy_name 314 | self._proxy_setup = proxy_setup_value 315 | 316 | 317 | 318 | """ 319 | Proxy API Normalization 320 | """ 321 | 322 | def check_proxy_api(self, proxy_apis): 323 | proxy_details = 
proxy_apis.get(self.raw_domain) 324 | if proxy_details is not None: 325 | if proxy_details.get('proxy_setup') is None: 326 | self._proxy_api_name = proxy_details.get('proxy_name') 327 | return True, True 328 | self.update_proxy_api(proxy_details) 329 | return True, False 330 | return False, False 331 | 332 | 333 | def update_proxy_api(self, proxy_details): 334 | self._real_url = DomainNormalizer.get_url_proxy_api(url=self.raw_url, proxy_settings=proxy_details) 335 | self._domain = DomainNormalizer.get_domain(self._real_url) 336 | self._active_proxy = True 337 | self._proxy_api = True 338 | self._proxy_type = 'proxy_api' 339 | self._proxy_name = self._proxy_api_name = proxy_details.get('proxy_name') 340 | self._proxy_setup = self.proxy_api_setup(proxy_details) ## into new file 341 | self.json_response_keys = proxy_details.get('json_response_keys', []) 342 | 343 | 344 | def proxy_api_setup(self, proxy_details): 345 | proxy_string = 'api' 346 | proxy_setup = proxy_details.get('proxy_setup') 347 | if proxy_setup is None: 348 | return proxy_string 349 | query_params = DomainNormalizer.parse_url(self.raw_url) 350 | for key, value in query_params.items(): 351 | key_mapping = proxy_setup.get(key) 352 | if key_mapping is not None: 353 | if key_mapping.startswith('**'): 354 | proxy_string = f'{proxy_string}_{key_mapping[2:]}' 355 | elif key_mapping.startswith('--'): 356 | proxy_string = f'{proxy_string}_{key_mapping[2:]}={value.lower()}' 357 | elif key_mapping.startswith('^^'): 358 | proxy_string = f'{proxy_string}_{key_mapping[2:]}=false' 359 | else: 360 | proxy_string = f'{proxy_string}_{key_mapping}=true' 361 | return proxy_string 362 | 363 | 364 | 365 | """ 366 | Fallback Proxy Details 367 | """ 368 | 369 | def update_no_proxy(self): 370 | self._proxy_type = self._proxy_name = 'no_proxy' 371 | self._proxy_setup = 'none' 372 | self._real_url = self.raw_url 373 | self._domain = self.raw_domain 374 | 375 | def fallback_proxy_details(self, proxy_type=None, proxy_apis=None): 376 | if proxy_type == 'proxy_api': 377 | proxy_details = proxy_apis.get(self.raw_domain) 378 | if proxy_details is not None: 379 | self.update_proxy_api(proxy_details) 380 | else: 381 | self._proxy_name = 'unknown_proxy_api' if self._proxy_api_name is None else self._proxy_api_name 382 | self._proxy_setup = 'fallback' if self._proxy_setup is None else self._proxy_setup 383 | else: 384 | self._proxy_name = 'unknown_proxy_port' if self._proxy_name is None else self._proxy_name 385 | self._proxy_setup = 'fallback' if self._proxy_setup is None else self._proxy_setup 386 | 387 | 388 | """ 389 | Fallback Proxy + Domain Details 390 | """ 391 | 392 | def fallback_domain_proxy_details(self, reason='fallback'): 393 | """ 394 | Fallback -> if issue with domain/proxy normalising 395 | """ 396 | self._domain = DomainNormalizer.get_domain(self.raw_url) 397 | self._page_type = 'none' 398 | self._proxy_name = reason 399 | self._proxy_setup = 'none' 400 | 401 | 402 | """ 403 | Response Validation Tests 404 | """ 405 | 406 | def failed_validation_test(self, test): 407 | if self._validation_test is None: 408 | self._validation_test = test.get('validation_msg', 'failed') 409 | else: 410 | self._validation_test = f'{self._validation_test}&&{test.get("validation_msg", "failed")}' 411 | if test.get('validation_test_id', -1) != -1: 412 | self._validation_test = f'{self._validation_test}_{test.get("validation_test_id")}' 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 
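
When a request goes through a proxy API, the raw request URL points at the proxy endpoint and carries the real target URL in a query parameter; `update_proxy_api()` recovers it via `DomainNormalizer.get_url_proxy_api()` using the `url_identifier` returned by the ScrapeOps backend. The sketch below walks that step in isolation. The endpoint, query parameters and `url_identifier` value are invented for illustration; in the SDK the `proxy_settings` mapping normally comes back from the proxy-API normalisation request.

```python
from scrapeops_scrapy.normalizer.domains import DomainNormalizer

# Hypothetical proxy-API request: the real target URL travels in the "url"
# query parameter, which "url_identifier" points at.
api_request_url = (
    'https://proxy.example.com/v1/'
    '?api_key=XYZ&render_js=true&country=us'
    '&url=https%3A%2F%2Fbooks.toscrape.com%2Fcatalogue%2Fpage-2.html'
)

# parse_url() flattens the query string into a dict (percent-decoding values).
query_params = DomainNormalizer.parse_url(api_request_url)
# {'api_key': 'XYZ', 'render_js': 'true', 'country': 'us', 'url': 'https://books.toscrape.com/...'}

real_url = DomainNormalizer.get_url_proxy_api(
    url=api_request_url,
    proxy_settings={'url_identifier': 'url'},   # assumed backend response shape
)
print(real_url)                               # https://books.toscrape.com/catalogue/page-2.html
print(DomainNormalizer.get_domain(real_url))  # toscrape.com
```

The recovered `real_url` is what then drives domain normalisation and page-type classification, so stats are attributed to the scraped site rather than to the proxy endpoint.
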
435 | 436 | 437 | 438 | 439 | -------------------------------------------------------------------------------- /scrapeops_scrapy/signals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/signals/__init__.py -------------------------------------------------------------------------------- /scrapeops_scrapy/signals/scrapeops_signals.py: -------------------------------------------------------------------------------- 1 | scrapeops_response_recieved = object() 2 | scrapeops_exception_recieved = object() 3 | scrapeops_response_rejected = object() 4 | scrapeops_item_rejected = object() -------------------------------------------------------------------------------- /scrapeops_scrapy/signals/triggers.py: -------------------------------------------------------------------------------- 1 | from scrapeops_scrapy.signals import scrapeops_signals 2 | 3 | class ScrapeOpsTrigger(object): 4 | 5 | def __init__(self): 6 | pass 7 | 8 | @staticmethod 9 | def reject_response(crawler=None, response=None, reason=None): 10 | crawler.signals.send_catch_log(signal=scrapeops_signals.scrapeops_response_rejected, 11 | spider=crawler.spider, 12 | response=response, 13 | reason=reason, 14 | ) 15 | 16 | @staticmethod 17 | def reject_item(crawler=None, response=None, item=None, reason=None): 18 | crawler.signals.send_catch_log(signal=scrapeops_signals.scrapeops_item_rejected, 19 | spider=crawler.spider, 20 | response=response, 21 | item=item, 22 | reason=reason, 23 | ) 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /scrapeops_scrapy/stats/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/stats/__init__.py -------------------------------------------------------------------------------- /scrapeops_scrapy/stats/failed_urls.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class FailedUrlsHandler(object): 4 | 5 | FAILED_URL_LOGGER_ENABLED = True 6 | LOG_MISSED_URLS = False 7 | MAX_LOGGED_URLS = 100 8 | 9 | def __init__(self): 10 | self.failed_urls_count = 0 11 | self.failed_urls_list = [] 12 | self.errback_free = True 13 | 14 | def log_failure(self, failure): 15 | if FailedUrlsHandler.FAILED_URL_LOGGER_ENABLED: 16 | self.failed_urls_count += 1 17 | if FailedUrlsHandler.LOG_MISSED_URLS and len(self.failed_urls_list) < FailedUrlsHandler.MAX_LOGGED_URLS: 18 | request = failure.request 19 | self.failed_urls_list.append(request.url) 20 | 21 | def get_url_count(self): 22 | return self.failed_urls_count 23 | 24 | def get_url_list(self): 25 | return self.failed_urls_list 26 | 27 | def disable_errback(self): 28 | self.errback_free = False 29 | 30 | def enabled(self): 31 | return self.errback_free 32 | 33 | 34 | -------------------------------------------------------------------------------- /scrapeops_scrapy/stats/logger.py: -------------------------------------------------------------------------------- 1 | ## scrapy 2 | from scrapy.utils.request import request_httprepr 3 | 4 | ## scrapeops 5 | from scrapeops_scrapy.stats.model import OverallStatsModel, PeriodicStatsModel 6 | from scrapeops_scrapy.utils import utils 7 | 
from scrapeops_scrapy.normalizer.exceptions import ExceptionNormalizer 8 | from scrapeops_scrapy.utils.utils import get_header_size, get_status_size 9 | 10 | import copy 11 | 12 | 13 | class StatsLogger(OverallStatsModel, PeriodicStatsModel): 14 | 15 | def __init__(self): 16 | OverallStatsModel.__init__(self) 17 | PeriodicStatsModel.__init__(self) 18 | 19 | 20 | def display_stats(self): 21 | self.display_periodic_stats() 22 | self.display_overall_stats() 23 | 24 | 25 | def check_periodic_stats(self): 26 | if self._periodic_stats == {}: 27 | self.set_value(self._periodic_stats, 'job_id', self.job_id) 28 | 29 | 30 | def spider_open_stats(self): 31 | self.set_value(self._overall_stats, 'job_id', self.job_id) 32 | self.set_value(self._overall_stats, 'job_name', self.job_group_name) 33 | self.set_value(self._overall_stats, 'job_start_time', self.start_time) 34 | self.set_value(self._overall_stats, 'job_finish_time', 0) 35 | self.set_value(self._overall_stats, 'job_run_time', 0) 36 | self.set_value(self._overall_stats, 'status', 'Live') 37 | self.set_value(self._overall_stats, 'middleware_enabled', self._scrapeops_middleware) 38 | 39 | 40 | 41 | def spider_close_stats(self, reason=None, crawler=None): 42 | finish_time = utils.current_time() 43 | self.aggregate_stats(crawler) 44 | self.set_value(self._overall_stats, 'job_finish_time', finish_time) 45 | self.set_value(self._overall_stats, 'job_run_time', finish_time - self.start_time) 46 | self.set_value(self._overall_stats, 'status', 'Finished') 47 | self.set_value(self._overall_stats, 'reason', reason) 48 | self.set_value(self._overall_stats, 'period_frequency', self._period_frequency) 49 | 50 | 51 | def generate_request_stats(self, request_response_object, request=None): 52 | proxy_name = request_response_object.get_proxy_name() 53 | proxy_setup = request_response_object.get_proxy_setup() 54 | domain_name = request_response_object.get_domain() 55 | page_type = request_response_object.get_page_type() 56 | custom_tag = request_response_object.get_custom_tag() 57 | reqlen = len(request_httprepr(request)) 58 | 59 | ## periodic stats 60 | self.check_periodic_stats() 61 | self.inc_value(self._periodic_stats, f'requests|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{custom_tag}|count') 62 | self.inc_value(self._periodic_stats, f'requests|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{custom_tag}|bytes', count=reqlen) 63 | 64 | ## overall stats 65 | self.inc_value(self._overall_stats, f'requests|{request.method}|count') 66 | self.inc_value(self._overall_stats, f'requests|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{custom_tag}|count') 67 | self.inc_value(self._overall_stats, f'requests|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{custom_tag}|bytes', count=reqlen) 68 | 69 | 70 | def generate_response_stats(self, request_response_object, request=None, response=None): 71 | proxy_name = request_response_object.get_proxy_name() 72 | proxy_setup = request_response_object.get_proxy_setup() 73 | domain_name = request_response_object.get_domain() 74 | page_type = request_response_object.get_page_type() 75 | validation = request_response_object.get_validation_test() 76 | geo = request_response_object.get_geo() 77 | custom_tag = request_response_object.get_custom_tag() 78 | custom_signal = 'none' 79 | reslen = len(response.body) + get_header_size(response.headers) + get_status_size(response.status) + 4 80 | total_latency = request.meta.get('download_latency', 0) 81 
| 82 | ## periodic stats 83 | self.check_periodic_stats() 84 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|count') 85 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|bytes', count=reslen) 86 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|retries', count=request.meta.get('retry_times', 0)) 87 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|total_latency', count=total_latency) 88 | self.min_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|min_latency', total_latency) 89 | self.max_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|max_latency', total_latency) 90 | 91 | ## overall stats 92 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|count') 93 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|bytes', count=reslen) 94 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|retries', count=request.meta.get('retry_times', 0)) 95 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|total_latency', count=total_latency) 96 | self.min_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|min_latency', total_latency) 97 | self.max_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|max_latency', total_latency) 98 | 99 | 100 | def generate_item_stats(self, request_response_object, signal=None, response=None): 101 | if response is not None: 102 | request = response.request 103 | request_method = request.method 104 | status = response.status 105 | else: 106 | request_method = status = 'unknown' 107 | proxy_name = request_response_object.get_proxy_name() 108 | proxy_setup = request_response_object.get_proxy_setup() 109 | domain_name = request_response_object.get_domain() 110 | page_type = request_response_object.get_page_type() 111 | validation = request_response_object.get_validation_test() 112 | geo = request_response_object.get_geo() 113 | custom_tag = request_response_object.get_custom_tag() 114 | custom_signal = 'none' 115 | self.check_periodic_stats() 116 | 117 | if signal == 'item_scraped': 118 | self.inc_value(self._periodic_stats, 
f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items') 119 | self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items') 120 | 121 | elif signal == 'item_dropped': 122 | self.inc_value(self._periodic_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items_dropped') 123 | self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items_dropped') 124 | 125 | elif signal == 'item_error': 126 | self.inc_value(self._periodic_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|item_errors') 127 | self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|item_errors') 128 | 129 | 130 | 131 | def generate_exception_stats(self, request_response_object, request=None, exception_class=None): 132 | proxy_name = request_response_object.get_proxy_name() 133 | proxy_setup = request_response_object.get_proxy_setup() 134 | domain_name = request_response_object.get_domain() 135 | page_type = request_response_object.get_page_type() 136 | validation = request_response_object.get_validation_test() 137 | geo = request_response_object.get_geo() 138 | custom_tag = request_response_object.get_custom_tag() 139 | custom_signal = 'none' 140 | exception_type = ExceptionNormalizer.normalise_exception(exception_class) 141 | download_latency = request.meta.get('download_latency', 0) 142 | if download_latency is None: 143 | start_time = request.meta.get('sops_time', 0) 144 | if start_time != 0: download_latency = utils.current_time() - start_time 145 | else: download_latency = 0 146 | 147 | self.check_periodic_stats() 148 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{exception_type}|{validation}|{geo}|{custom_tag}|{custom_signal}|count') 149 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{exception_type}|{validation}|{geo}|{custom_tag}|{custom_signal}|count') 150 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{exception_type}|{validation}|{geo}|{custom_tag}|{custom_signal}|total_latency', count=download_latency) 151 | 152 | def aggregate_stats(self, crawler=None, middleware=False): 153 | self.avg_latency() 154 | self.log_levels(crawler) 155 | if middleware is False: 156 | self.get_exception_stats(crawler) 157 | 158 | 159 | def avg_latency(self): 160 | for stat_type in [self._periodic_stats, self._overall_stats]: 161 | stats_copy = copy.deepcopy(stat_type) 162 | for key, value in stats_copy.items(): 163 | if 'responses' in key and 'total_latency' in key: 164 | count_key = key.replace('total_latency', 'count') 165 | avg_latency = value / stats_copy.get(count_key) 166 | self.set_value(stat_type, key.replace('total_latency', 'avg_latency'), avg_latency) 167 | 168 | 169 | def log_levels(self, crawler): 170 | scrapy_stats = crawler.stats.get_stats() 171 | for log_level in ['WARNING', 
'ERROR', 'CRITICAL']: 172 | log_key = 'log_count/' + log_level 173 | log_value = scrapy_stats.get(log_key, 0) 174 | previous_value = self._overall_stats.get(log_key, 0) 175 | self.set_value(self._periodic_stats, log_key, log_value - previous_value) 176 | self.set_value(self._overall_stats, log_key, log_value) 177 | 178 | 179 | def exception_type_check(self, key): 180 | if isinstance(key, str): 181 | return key.startswith('downloader/exception_type_count/') 182 | return False 183 | 184 | def get_exception_stats(self, crawler): 185 | scrapy_stats = crawler.stats.get_stats() 186 | if scrapy_stats.get('downloader/exception_count') is not None: 187 | exception_values = [ {k:v} for k,v in scrapy_stats.items() if self.exception_type_check(k)] 188 | for exception in exception_values: 189 | for key, value in exception.items(): 190 | key_type = key.replace('downloader/exception_type_count/', '') 191 | try: 192 | exception_type = key_type.split('.')[-1] 193 | except Exception: 194 | exception_type = key_type 195 | self.set_value(self._overall_stats, f'responses|unknown|unknown|unknown|unknown|unknown|{exception_type}|unknown|unknown|unknown|unknown|count', value) 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /scrapeops_scrapy/stats/model.py: -------------------------------------------------------------------------------- 1 | 2 | class BaseStatsModel(object): 3 | 4 | def __init__(self): 5 | pass 6 | 7 | def get_value(self, stats, key, default=None): 8 | return stats.get(key, default) 9 | 10 | def set_value(self, stats, key, value): 11 | stats[key] = value 12 | 13 | def inc_value(self, stats, key, count=1, start=0, spider=None): 14 | d = stats 15 | d[key] = d.setdefault(key, start) + count 16 | 17 | def max_value(self, stats, key, value, spider=None): 18 | stats[key] = max(stats.setdefault(key, value), value) 19 | 20 | def min_value(self, stats, key, value, spider=None): 21 | stats[key] = min(stats.setdefault(key, value), value) 22 | 23 | def print_stats(self, statsType, stats): 24 | print(f'#### SCRAPEOPS {statsType.upper()} STATS ####') 25 | print('{') 26 | for key, value in stats.items(): 27 | if key[0] != '_': 28 | print(f" '{key}': {value},") 29 | print('}') 30 | 31 | 32 | class PeriodicStatsModel(BaseStatsModel): 33 | 34 | def __init__(self): 35 | self._periodic_stats = {} 36 | self._periodic_errors = 0 37 | self._periodic_warnings = 0 38 | self._periodic_criticals = 0 39 | 40 | def get_periodic_stats(self): 41 | return self._periodic_stats 42 | 43 | def reset_periodic_stats(self): 44 | self._periodic_stats = {} 45 | 46 | def display_periodic_stats(self): 47 | stats = self.get_periodic_stats() 48 | self.print_stats('periodic', stats) 49 | 50 | 51 | class OverallStatsModel(BaseStatsModel): 52 | 53 | def __init__(self): 54 | self._overall_stats = {} 55 | self._overall_errors = 0 56 | self._overall_warnings = 0 57 | self._overall_criticals = 0 58 | 59 | def get_overall_stats(self): 60 | return self._overall_stats 61 | 62 | def display_overall_stats(self): 63 | stats = self.get_overall_stats() 64 | self.print_stats('overall', stats) 65 | 66 | -------------------------------------------------------------------------------- /scrapeops_scrapy/tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/tests/__init__.py -------------------------------------------------------------------------------- /scrapeops_scrapy/tests/core.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | class ScrapeOpsTest: 4 | 5 | def __init__(self): 6 | self.active = True 7 | self.test_id = None 8 | 9 | def test_active(self): 10 | if self.active is True: 11 | return True 12 | return False 13 | 14 | def get_test_id(self): 15 | return self.test_id 16 | 17 | def generate_test_id(self): 18 | response = requests.post('https://api.scrapeops.io/api/v1/start_test?api_key=1234&sdk_type=scrapy') 19 | data = response.json() 20 | self.test_id = data.get('test_id', None) 21 | return self.test_id 22 | 23 | @staticmethod 24 | def generate_test_settings(): 25 | return { 26 | 'RETRY_TIMES': 0, 27 | 'RETRY_ENABLED': False, 28 | } 29 | 30 | -------------------------------------------------------------------------------- /scrapeops_scrapy/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/utils/__init__.py -------------------------------------------------------------------------------- /scrapeops_scrapy/utils/error_handling.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from scrapeops_scrapy.exceptions import ScrapeOpsAPIResponseError 4 | 5 | def exception_handler(func): 6 | @functools.wraps(func) 7 | def wrapper(*args, **kwargs): 8 | try: 9 | return func(*args, **kwargs) 10 | except ScrapeOpsAPIResponseError as e: 11 | pass 12 | except Exception as e: 13 | pass 14 | return wrapper 15 | 16 | 17 | -------------------------------------------------------------------------------- /scrapeops_scrapy/utils/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import sys 4 | 5 | import scrapy 6 | import scrapeops_scrapy 7 | import platform 8 | 9 | from scrapeops_scrapy.utils.error_handling import exception_handler 10 | from scrapy.utils.python import to_bytes 11 | from twisted.web import http 12 | 13 | 14 | def current_time(): 15 | t = time.time() 16 | return int(round(t, 0)) 17 | 18 | @exception_handler 19 | def get_args(): 20 | arg_dict = {'raw_string': '', 'args': [], 'options': []} 21 | if sys.argv[0] == 'crawl' or sys.argv[0] == 'runspider': 22 | args = sys.argv[2:] 23 | else: 24 | args = sys.argv[1:] 25 | for index, arg in enumerate(args): 26 | arg_dict['raw_string'] += append_raw_string(arg) 27 | if arg.startswith('--'): 28 | arg_dict['options'].append(arg) 29 | if arg.startswith('-a'): 30 | try: 31 | if args[index + 1].startswith('-') is False and args[index + 1].startswith('--') is False: arg_dict['args'].append(args[index + 1]) 32 | except Exception: 33 | arg_dict['args'].append(arg) 34 | return arg_dict 35 | 36 | 37 | def scrapeops_middleware_installed(spider_settings): 38 | downloader_middlewares = spider_settings.get('DOWNLOADER_MIDDLEWARES', {}) 39 | if downloader_middlewares.get('scrapeops_scrapy.middleware.stats.ScrapeOpsStats') is not None: 40 | return True 41 | if downloader_middlewares.get('scrapeops_scrapy.middleware.retry.RetryMiddleware') is not None: 42 | return True 43 | return False 44 | 45 | @exception_handler 46 
| def get_python_version(): 47 | version_string = sys.version 48 | split_string = version_string.split(' ') 49 | return split_string[0] 50 | 51 | @exception_handler 52 | def get_scrapy_version(): 53 | return scrapy.__version__ 54 | 55 | @exception_handler 56 | def get_scrapeops_version(): 57 | return scrapeops_scrapy.__version__ 58 | 59 | @exception_handler 60 | def get_system_version(): 61 | return platform.platform() 62 | 63 | def append_raw_string(arg): 64 | if ' ' in arg: 65 | return '"{}" '.format(arg) 66 | return "{} ".format(arg) 67 | 68 | def merge_dicts(x, y): 69 | z = x.copy() 70 | z.update(y) 71 | return z 72 | 73 | # from scrapy 74 | def get_header_size(headers): 75 | size = 0 76 | for key, value in headers.items(): 77 | if isinstance(value, (list, tuple)): 78 | for v in value: 79 | size += len(b": ") + len(key) + len(v) 80 | return size + len(b'\r\n') * (len(headers.keys()) - 1) 81 | 82 | 83 | def get_status_size(response_status): 84 | return len(to_bytes(http.RESPONSES.get(response_status, b''))) + 15 85 | # resp.status + b"\r\n" + b"HTTP/1.1 <100-599> " 86 | 87 | 88 | def remove_url(string, replacement=""): 89 | return re.sub(r'http\S+', replacement, string) 90 | 91 | 92 | -------------------------------------------------------------------------------- /scrapeops_scrapy/validators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/validators/__init__.py -------------------------------------------------------------------------------- /scrapeops_scrapy/validators/item_validator.py: -------------------------------------------------------------------------------- 1 | from itemadapter import ItemAdapter, is_item 2 | 3 | class ItemValidator(object): 4 | 5 | ITEM_COVERAGE_ENABLED = True 6 | INVALID_ITEM_URLS_LOGGING_ENABLED = False 7 | MAX_ITEM_URLS = 1000 8 | 9 | def __init__(self): 10 | self.item_coverage = { 11 | '_SOP_OVERAL_STATS': { 12 | 'num_items': 0, 13 | 'num_invalid_items': 0, 14 | 'num_total_fields': 0, 15 | 'num_invalid_fields': 0, 16 | } 17 | } 18 | self.items = 0 19 | self.invalid_items = 0 20 | self.invalid_items_urls = {} 21 | 22 | 23 | def extract_name_fields_item(item): 24 | return 25 | 26 | def validate(self, request_response_object, item): 27 | if ItemValidator.ITEM_COVERAGE_ENABLED and is_item(item): 28 | try: 29 | self.increment_items() 30 | adapter = ItemAdapter(item) 31 | item_name = ItemValidator.get_item_name(item) 32 | dict_item = adapter.asdict() 33 | field_keys = dict_item.keys() 34 | if item_name is not None and field_keys is not None: 35 | domain = request_response_object.get_domain() 36 | invalid_fields = [] 37 | valid_item = True 38 | self.check_item_exists(domain, item_name, field_keys) 39 | self.item_coverage[domain][item_name]['num_items'] += 1 40 | self.increment_total_fields(field_keys) 41 | for k in field_keys: 42 | if(dict_item.get(k) is not None and dict_item.get(k) != ''): 43 | self.item_coverage[domain][item_name]['coverage'][k] += 1 44 | else: 45 | valid_item = False 46 | self.increment_invalid_fields() 47 | invalid_fields.append(k) 48 | 49 | if valid_item is False: 50 | self.item_coverage[domain][item_name]['num_invalid_items'] += 1 51 | self.increment_invalid_items() 52 | if ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED and len(invalid_fields) > 0: 53 | self.log_invalid_item_url(request_response_object.get_real_url(), item_name, invalid_fields) 54 | except Exception: 55 
| pass 56 | 57 | 58 | def check_item_exists(self, domain, item_name, field_keys): 59 | if self.item_coverage.get(domain) is None: 60 | self.item_coverage[domain] = {} 61 | if self.item_coverage[domain].get(item_name) is None: 62 | self.item_coverage[domain][item_name] = { 63 | 'coverage': {}, 64 | 'num_fields': 0, 65 | 'num_items': 0, 66 | 'num_invalid_items': 0, 67 | } 68 | self.item_coverage[domain][item_name]['num_fields'] = len(field_keys) 69 | for k in field_keys: 70 | self.item_coverage[domain][item_name]['coverage'][k] = 0 71 | 72 | 73 | def log_invalid_item_url(self, url, item_name, invalid_fields): 74 | if self.invalid_items_urls.get(item_name) is None: 75 | self.invalid_items_urls[item_name] = {} 76 | missing_fields_string = ItemValidator.generate_fields_key(invalid_fields) 77 | if self.invalid_items_urls[item_name].get(missing_fields_string) is None: 78 | self.invalid_items_urls[item_name][missing_fields_string] = [] 79 | if url not in self.invalid_items_urls[item_name][missing_fields_string] and len(self.invalid_items_urls[item_name][missing_fields_string]) < ItemValidator.MAX_ITEM_URLS: 80 | self.invalid_items_urls[item_name][missing_fields_string].append(url) 81 | 82 | 83 | def increment_total_fields(self, fields): 84 | self.item_coverage['_SOP_OVERAL_STATS']['num_total_fields'] += len(fields) 85 | 86 | def increment_invalid_fields(self): 87 | self.item_coverage['_SOP_OVERAL_STATS']['num_invalid_fields'] += 1 88 | 89 | def increment_items(self): 90 | self.items += 1 91 | self.item_coverage['_SOP_OVERAL_STATS']['num_items'] += 1 92 | 93 | def increment_invalid_items(self): 94 | self.invalid_items += 1 95 | self.item_coverage['_SOP_OVERAL_STATS']['num_invalid_items'] += 1 96 | 97 | def get_item_coverage_data(self): 98 | return self.item_coverage 99 | 100 | def get_num_items(self): 101 | return self.items 102 | 103 | def get_num_invalid_items(self): 104 | return self.invalid_items 105 | 106 | def get_invalid_items_urls(self): 107 | return self.invalid_items_urls 108 | 109 | def get_field_coverage(self): 110 | overall_stats = self.item_coverage.get('_SOP_OVERAL_STATS') 111 | if overall_stats is None: return 0 112 | if overall_stats.get('num_total_fields', 0) == 0: return 0 113 | valid_fields = overall_stats.get('num_total_fields') - overall_stats.get('num_invalid_fields') 114 | return round((valid_fields / overall_stats.get('num_total_fields'))*100) 115 | 116 | 117 | @staticmethod 118 | def get_item_fields(item): 119 | return item.fields 120 | 121 | @staticmethod 122 | def get_item_name(item): 123 | return item.__class__.__name__ 124 | 125 | @staticmethod 126 | def generate_fields_key(fields): 127 | missing_fields_string = '' 128 | for field in fields: 129 | if len(missing_fields_string) > 0: 130 | missing_fields_string += '&&' 131 | missing_fields_string += field 132 | return missing_fields_string 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /scrapeops_scrapy/validators/response_validator.py: -------------------------------------------------------------------------------- 1 | from scrapy.http import Response 2 | from scrapeops_scrapy.utils.utils import get_header_size, get_status_size 3 | from random import randint 4 | import json 5 | 6 | class ResponseValidator(object): 7 | 8 | def __init__(self): 9 | pass 10 | 11 | @staticmethod 12 | def validate(request_response_object, response, domain_tests=None, generic_tests=None, 
geotargeting_tests=None): 13 | if domain_tests is not None: 14 | for test in domain_tests: 15 | if ResponseValidator.run_validation_test(request_response_object, response, test.get('validation_tests', [])) is False: 16 | request_response_object.failed_validation_test(test) 17 | break 18 | 19 | if generic_tests is not None: 20 | for test in generic_tests: 21 | if ResponseValidator.run_validation_test(request_response_object, response, test.get('validation_tests', [])) is False: 22 | request_response_object.failed_validation_test(test) 23 | break 24 | 25 | 26 | @staticmethod 27 | def run_validation_test(request_response_object, response, test_array): 28 | """ 29 | Returns True if test is passed, False if test is failed. 30 | """ 31 | fail_counter = 0 32 | for test in test_array: 33 | 34 | if test.get('test_type') == 'bytes_check': 35 | if ResponseValidator.bytes_check(response, test.get('threshold', 0), test.get('comparison_type')): 36 | fail_counter += 1 37 | else: return True 38 | 39 | if test.get('test_type') == 'response_length_check': 40 | if ResponseValidator.response_length_check(ResponseValidator.get_response_text(request_response_object, response), test.get('threshold', 0), test.get('comparison_type')): 41 | fail_counter += 1 42 | else: return True 43 | 44 | if test.get('test_type') == 'string_check' and test.get('test_location') == 'body': 45 | if ResponseValidator.string_check(ResponseValidator.get_response_text(request_response_object, response), test.get('text_check', ''), test.get('comparison_type'), text_slice=test.get('text_slice')): 46 | fail_counter += 1 47 | else: return True 48 | 49 | if test.get('test_type') == 'string_check' and test.get('test_location') == 'user_agent': 50 | pass 51 | 52 | if test.get('test_type') == 'string_check' and test.get('test_location') == 'url': 53 | if ResponseValidator.string_check(request_response_object.get_real_url(), test.get('text_check', ''), test.get('comparison_type'), text_slice=test.get('text_slice')): 54 | fail_counter += 1 55 | else: return True 56 | 57 | 58 | if fail_counter == len(test_array): 59 | return False 60 | return True 61 | 62 | 63 | @staticmethod 64 | def get_domain_tests(request_response_object, domains): 65 | domain_details = domains.get(request_response_object.get_domain()) 66 | if domain_details is not None: 67 | return domain_details.get('validation_details') 68 | return None 69 | 70 | 71 | @staticmethod 72 | def failed_scan(request_response_object, domains): 73 | domain_details = domains.get(request_response_object.get_domain()) 74 | if domain_details is not None: 75 | failed_scan_ratio = domain_details.get('failed_generic_scan', 0) 76 | if failed_scan_ratio == 0: return False 77 | if failed_scan_ratio == 1: return True 78 | if randint(1, failed_scan_ratio) == 1: return True 79 | return False 80 | 81 | 82 | @staticmethod 83 | def get_response_text(request_response_object, response): 84 | try: 85 | if isinstance(response, Response): 86 | if request_response_object.is_json_response(): 87 | json_response = json.loads(response.text) 88 | json_response_keys = request_response_object.get_json_response_keys() 89 | for key in json_response_keys: 90 | json_response = json_response.get(key) 91 | return json_response or '' 92 | return response.text 93 | else: return '' 94 | except AttributeError: 95 | return '' 96 | 97 | 98 | @staticmethod 99 | def string_check(text, text_check, comparison, text_slice=None): 100 | if isinstance(text, str): 101 | if text_slice is not None: 102 | text = ResponseValidator.string_slice(text, 
text_slice) 103 | if comparison == 'contains' and text_check in text: 104 | return True 105 | elif comparison == 'not_contain' and text_check not in text: 106 | return True 107 | return False 108 | 109 | 110 | @staticmethod 111 | def string_slice(text, text_slice): 112 | if text_slice.get('active'): 113 | if (text_slice.get('slice_type') == 'first') and (len(text) > 0): 114 | return text[:text_slice.get('slice_upper_threshold', len(text))] 115 | if (text_slice.get('slice_type') == 'last') and (len(text) > 0): 116 | return text[-text_slice.get('slice_lower_threshold', 0):] ## keep the last N characters, not a single character 117 | if text_slice.get('slice_type') == 'range': 118 | return text[text_slice.get('slice_lower_threshold', 0):text_slice.get('slice_upper_threshold', len(text))] 119 | return text 120 | 121 | 122 | @staticmethod 123 | def bytes_check(response, threshold, comparison): 124 | if threshold == 0: return False 125 | reslen = len(response.body) + get_header_size(response.headers) + get_status_size(response.status) + 4 126 | return ResponseValidator.comparison_operators(reslen, threshold, comparison) 127 | 128 | 129 | @staticmethod 130 | def response_length_check(text, threshold, comparison): 131 | if threshold == 0: return False 132 | response_text_length = len(text) 133 | return ResponseValidator.comparison_operators(response_text_length, threshold, comparison) 134 | 135 | 136 | @staticmethod 137 | def comparison_operators(value, threshold, comparison): 138 | if comparison == 'less_than': 139 | return value < threshold 140 | if comparison == 'less_than_equal': 141 | return value <= threshold 142 | if comparison == 'greater_than': 143 | return value > threshold 144 | if comparison == 'greater_than_equal': 145 | return value >= threshold 146 | if comparison == 'equals': 147 | return value == threshold 148 | if comparison == 'not_equal': 149 | return value != threshold 150 | return False 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | VERSION = '0.5.6' 5 | DESCRIPTION = 'The ScrapeOps Scrapy SDK is a monitoring tool for your Scrapy spiders.' 6 | 7 | setup(name='scrapeops_scrapy', 8 | description=DESCRIPTION, 9 | long_description=DESCRIPTION, 10 | author="ScrapeOps", 11 | author_email="info@scrapeops.io", 12 | version=VERSION, 13 | license="BSD", 14 | url="https://github.com/ScrapeOps/scrapeops-scrapy-sdk", 15 | packages=find_packages(), 16 | install_requires=[ 17 | "tld>=0.13", 18 | "requests>=2.32.0", 19 | "json5>=0.9.13", 20 | # The latest version of requests (2.29.0) does not support urllib3 2.0.0 #6432 - https://github.com/psf/requests/issues/6432 21 | "urllib3>=1.26.14", 22 | "itemadapter>=0.8.0", 23 | ], 24 | classifiers=[ 25 | "Programming Language :: Python", 26 | "Programming Language :: Python :: 3", 27 | "Programming Language :: Python :: 3.8", 28 | "Programming Language :: Python :: 3.9", 29 | "Programming Language :: Python :: 3.10", 30 | "Programming Language :: Python :: 3.11", 31 | "License :: OSI Approved :: BSD License", 32 | "Operating System :: OS Independent", 33 | "Intended Audience :: Developers", 34 | ], 35 | python_requires=">=3.8", 36 | ) --------------------------------------------------------------------------------
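For reference, a minimal usage sketch of the static validation helpers defined in /scrapeops_scrapy/validators/response_validator.py above. This snippet is illustrative only and is not part of the repository source; it assumes the package is installed and importable as scrapeops_scrapy.

from scrapeops_scrapy.validators.response_validator import ResponseValidator

## string_check returns True when the comparison matches the (optionally sliced) text.
assert ResponseValidator.string_check('Access denied - please solve the captcha', 'captcha', 'contains') is True
assert ResponseValidator.string_check('<html>product page</html>', 'captcha', 'not_contain') is True

## comparison_operators implements the numeric thresholds used by bytes_check and response_length_check.
assert ResponseValidator.comparison_operators(512, 1024, 'less_than') is True
assert ResponseValidator.comparison_operators(2048, 1024, 'greater_than_equal') is True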