├── .gitignore
├── LICENSE
├── README.md
├── pyproject.toml
├── scrapeops_scrapy
│   ├── __init__.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── api.py
│   │   ├── controllers.py
│   │   ├── core.py
│   │   ├── error_logger.py
│   │   ├── model.py
│   │   └── setup.py
│   ├── exceptions.py
│   ├── extension.py
│   ├── middleware
│   │   ├── __init__.py
│   │   ├── retry.py
│   │   └── stats.py
│   ├── normalizer
│   │   ├── __init__.py
│   │   ├── domains.py
│   │   ├── exceptions.py
│   │   ├── middleware.py
│   │   ├── proxies.py
│   │   ├── proxy_port_normalizer.py
│   │   └── request_response.py
│   ├── signals
│   │   ├── __init__.py
│   │   ├── scrapeops_signals.py
│   │   └── triggers.py
│   ├── stats
│   │   ├── __init__.py
│   │   ├── failed_urls.py
│   │   ├── logger.py
│   │   └── model.py
│   ├── tests
│   │   ├── __init__.py
│   │   └── core.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── error_handling.py
│   │   └── utils.py
│   └── validators
│       ├── __init__.py
│       ├── item_validator.py
│       └── response_validator.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | venv/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2023, ScrapeOps
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ScrapeOps Scrapy SDK: Scrapy Extension For Spider Monitoring, Alerts and Scheduling.
2 | The ScrapeOps Scrapy SDK is an extension for your Scrapy spiders that gives you all the scraping monitoring, statistics, alerting, scheduling and data validation you will need straight out of the box.
3 |
4 | Just enable it in your `settings.py` file and the SDK will automatically monitor your scrapers and send your logs to your scraping dashboard. When connected to a ScrapyD server, you can schedule and manage all your jobs from one easy-to-use interface.
5 |
6 | **Full documentation can be found here:** [ScrapeOps Documentation](https://scrapeops.io/docs/intro)
7 |
8 |
9 | 
10 |
11 |
12 | ## :computer: Demo
13 | [:link: ScrapeOps Dashboard Demo](https://scrapeops.io/app/login/demo)
14 |
15 | ## :star: Features
16 |
17 |
18 |
19 | - **Scrapy Job Stats & Visualisation**
20 | - :chart_with_upwards_trend: Individual Job Progress Stats
21 | - :bar_chart: Compare Jobs versus Historical Jobs
22 | - :100: Job Stats Tracked
23 | - :white_check_mark: Pages Scraped & Missed
24 | - :white_check_mark: Items Parsed & Missed
25 | - :white_check_mark: Item Field Coverage
26 | - :white_check_mark: Runtimes
27 | - :white_check_mark: Response Status Codes
28 | - :white_check_mark: Success Rates & Average Latencies
29 | - :white_check_mark: Errors & Warnings
30 | - :white_check_mark: Bandwidth
31 |
32 | - **Health Checks & Alerts**
33 | - :male_detective: Custom Spider & Job Health Checks
34 | - :package: Out of the Box Alerts - Slack (More coming soon!)
35 | - :bookmark_tabs: Daily Scraping Reports
36 |
37 | - **ScrapyD Cluster Management**
38 | - :link: Integrate With ScrapyD Servers
39 | - :alarm_clock: Schedule Periodic Jobs
40 | - :100: All Scrapyd JSON API Endpoints Supported
41 | - :closed_lock_with_key: Secure Your ScrapyD with BasicAuth, HTTPS or Whitelisted IPs
42 | - **Proxy Monitoring (Coming Soon)**
43 | - :chart_with_upwards_trend: Monitor Your Proxy Account Usage
44 | - :chart_with_downwards_trend: Track Your Proxy Providers' Performance
45 | - :bar_chart: Compare Proxy Performance Versus Other Providers
46 |
47 |
48 |
49 | ## :rocket: Getting Started
50 | You can get the ScrapeOps monitoring suite up and running in **4 easy steps**.
51 |
52 | #### #1 - Install the ScrapeOps SDK:
53 |
54 | ```
55 | pip install scrapeops-scrapy
56 | ```
57 |
58 | #### #2 - Get Your ScrapeOps API Key:
59 | Create a [free ScrapeOps account here](https://scrapeops.io/app/register) and get your API key from the dashboard.
60 |
61 | When you have your API key, open your Scrapy project's `settings.py` file and add your API key to it.
62 |
63 | ```python
64 | SCRAPEOPS_API_KEY = 'YOUR_API_KEY'
65 | ```
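
If you prefer not to hard-code the key in `settings.py`, a common pattern is to read it from an environment variable instead. Below is a minimal sketch, assuming you export a `SCRAPEOPS_API_KEY` environment variable in your shell; the SDK only needs the `SCRAPEOPS_API_KEY` setting to be populated, so how you populate it is up to you.

```python
import os

# Read the ScrapeOps API key from the environment instead of hard-coding it
# in settings.py. Falls back to None if the variable isn't set, in which case
# the SDK deactivates itself and raises a missing-API-key error.
SCRAPEOPS_API_KEY = os.environ.get('SCRAPEOPS_API_KEY')
```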
66 |
67 | #### #3 - Add in the ScrapeOps Extension:
68 | In the `settings.py` file, enable the ScrapeOps extension by adding it to the `EXTENSIONS` dictionary.
69 |
70 | ```python
71 | EXTENSIONS = {
72 | 'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,
73 | }
74 | ```
75 |
76 | #### #4 - Enable the ScrapeOps Retry Middleware:
77 | To get the most accurate stats, you need to add the ScrapeOps retry middleware to the `DOWNLOADER_MIDDLEWARES` dictionary and disable the default Scrapy retry middleware in your Scrapy project's `settings.py` file.
78 |
79 | You can do this by setting the default Scrapy `RetryMiddleware` to `None` and enabling the ScrapeOps retry middleware in its place.
80 |
81 | ```python
82 | DOWNLOADER_MIDDLEWARES = {
83 | 'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550,
84 | 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
85 | }
86 | ```
87 |
88 | The retry middleware will operate exactly as it did before; however, the ScrapeOps retry middleware will also log every request, response and exception your spiders generate.
89 |
90 | #### #5 - (Optional) Exclude Settings From Being Logged By ScrapeOps SDK:
91 | By default, the ScrapeOps SDK logs the settings used for each scrape so you can keep track of them. However, to ensure it doesn't record sensitive information like API keys, it won't log any settings whose names contain the following substrings:
92 |
93 | - `API_KEY`
94 | - `APIKEY`
95 | - `SECRET_KEY`
96 | - `SECRETKEY`
97 |
98 | However, it can still log other settings that don't match these patterns (settings whose names contain `PASSWORD` or `CONNECTION_STRING` are also excluded). You can specify additional settings not to log by adding their names to the `SCRAPEOPS_SETTINGS_EXCLUSION_LIST`.
99 |
100 | ```python
101 | SCRAPEOPS_SETTINGS_EXCLUSION_LIST = [
102 | 'NAME_OF_SETTING_NOT_TO_LOG'
103 | ]
104 | ```
105 |
106 | #### Done!
107 | That's all. From here, the ScrapeOps SDK will automatically monitor and collect statistics from your scraping jobs and display them in your [ScrapeOps dashboard](https://scrapeops.io/app/dashboard).
108 |
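For reference, a minimal `settings.py` combining the snippets above might look like this (the API key value is a placeholder):

```python
# settings.py -- minimal ScrapeOps SDK configuration, combining the steps above.

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'

EXTENSIONS = {
    'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}

# Optional: settings you don't want logged to your ScrapeOps dashboard.
SCRAPEOPS_SETTINGS_EXCLUSION_LIST = [
    'NAME_OF_SETTING_NOT_TO_LOG',
]
```
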
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/scrapeops_scrapy/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.5.6"
--------------------------------------------------------------------------------
/scrapeops_scrapy/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/core/__init__.py
--------------------------------------------------------------------------------
/scrapeops_scrapy/core/api.py:
--------------------------------------------------------------------------------
1 | from scrapeops_scrapy.exceptions import ScrapeOpsAPIResponseError
2 | from scrapeops_scrapy.utils.utils import merge_dicts
3 | from scrapeops_scrapy.normalizer.proxies import ProxyNormalizer
4 | import requests
5 | import time
6 |
7 |
8 | class SOPSRequest(object):
9 |
10 | TIMEOUT = 30
11 | RETRY_LIMIT = 3
12 | API_KEY = None
13 | JOB_GROUP_ID = None
14 | SCRAPEOPS_ENDPOINT = 'https://api.scrapeops.io/'
15 | SCRAPEOPS_API_VERSION = 'api/v1/'
16 | SCRAPEOPS_LOGGING_DATA = None
17 | HIGH_FREQ_ACC = True
18 |
19 | def __init__(self):
20 | self.data = None
21 | self.valid = None
22 | self.action = None
23 | self.error = None
24 |
25 |
26 | def setup_request(self, body=None):
27 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'setup/'
28 | data, error = SOPSRequest.post(url, body=body)
29 | data, self.valid, self.action, self.error = SOPSRequest.setup_stats_validation(data, error)
30 | return data, self
31 |
32 |
33 | def stats_request(self, body=None, log_body=None, files=None):
34 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, body)
35 | if files is not None:
36 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'logs/?log_type=scrapy'
37 | _, _ = SOPSRequest.post_file(url, body=log_body, files=files)
38 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'stats/'
39 | data, error = SOPSRequest.post(url, body=post_body)
40 | data, self.valid, self.action, self.error = SOPSRequest.setup_stats_validation(data, error)
41 | return data, self
42 |
43 |
44 | def error_report_request(self, error_type=None, body=None, files=None):
45 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, body)
46 | if files is None:
47 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'errors/?error_type={error_type}'
48 | data, error = SOPSRequest.post(url, body=post_body, files=files)
49 | else:
50 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'errors/logs/?error_type={error_type}'
51 | data, error = SOPSRequest.post_file(url, body=post_body, files=files)
52 | data, self.valid, self.action, self.error = SOPSRequest.error_report_validation(data, error)
53 | return data, self
54 |
55 |
56 | def proxy_normalisation_request(self, request_response_object):
57 | proxy_name = request_response_object.get_proxy_port_name()
58 | proxy_string = request_response_object.get_raw_proxy()
59 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, {'proxy_string': proxy_string})
60 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'normalizer/proxy/?proxy_name={proxy_name}'
61 | data, error = SOPSRequest.post(url, body=post_body)
62 | data, self.valid, self.action, self.error = SOPSRequest.normaliser_validation(data, error, request_type='proxy')
63 | return data, self
64 |
65 |
66 | def proxy_api_normalisation_request(self, request_response_object):
67 | proxy_name = request_response_object.get_proxy_api_name()
68 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'normalizer/proxy_api/?proxy_name={proxy_name}'
69 | data, error = SOPSRequest.post(url, body=SOPSRequest.SCRAPEOPS_LOGGING_DATA)
70 | data, self.valid, self.action, self.error = SOPSRequest.normaliser_validation(data, error, request_type='proxy_api')
71 | return data, self
72 |
73 | def proxy_port_normalisation_request(self, request_response_object, test_data=None):
74 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, {
75 | 'proxy_string': request_response_object.get_complete_proxy_string(),
76 | 'proxy_headers': request_response_object.get_proxy_port_headers(),
77 | 'domain': request_response_object.get_proxy_port_name()})
78 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'normalizer/proxy_port/?job_id={SOPSRequest.JOB_GROUP_ID}'
79 | if test_data is not None:
80 | post_body['test_data'] = test_data
81 | data, error = SOPSRequest.post(url, body=post_body)
82 | data, self.valid, self.action, self.error = SOPSRequest.normaliser_validation(data, error, request_type='proxy_port')
83 | return data, self
84 |
85 |
86 | def domain_normalisation_request(self, request_response_object):
87 | domain = request_response_object.get_domain()
88 | real_url = request_response_object.get_real_url()
89 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, {'url': real_url})
90 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'normalizer/domain/?domain={domain}'
91 | data, error = SOPSRequest.post(url, body=post_body)
92 | data, self.valid, self.action, self.error = SOPSRequest.normaliser_validation(data, error, request_type='domain')
93 | return data, self
94 |
95 | def proxy_alert_request(self, request_response_object, job_group_id, error_response, alerts_sent):
96 | data = error_response
97 | data['domain'] = request_response_object.get_domain()
98 | data['proxy_provider'] = request_response_object.get_proxy_name()
99 | data['alerts_sent'] = alerts_sent
100 | post_body = merge_dicts(SOPSRequest.SCRAPEOPS_LOGGING_DATA, data)
101 | url = SOPSRequest.SCRAPEOPS_ENDPOINT + SOPSRequest.SCRAPEOPS_API_VERSION + f'alerts/proxy/?job_group_id={job_group_id}'
102 | data, error = SOPSRequest.post(url, body=post_body)
103 | data, self.valid, self.error = SOPSRequest.generic_validation(data, error)
104 | return data, self
105 |
106 | def proxy_test_request(self, url, request_response_object):
107 | data, _ = SOPSRequest.get(url, proxy=request_response_object.get_complete_proxy_string())
108 | return data
109 |
110 | @staticmethod
111 | def generic_validation(data, error):
112 | if data is None:
113 | return data, False, str(error)
114 | elif data.get('api_key') == 'invalid':
115 | return data, False, 'invalid_api_key'
116 | elif data.get('job_id') == 'invalid':
117 | return data, False, 'invalid_job'
118 | return data, True, None
119 |
120 |
121 | @staticmethod
122 | def setup_stats_validation(data, error):
123 | if data is None:
124 | return data, False, 'retry', str(error)
125 | elif data.get('api_key') == 'invalid':
126 | return data, False, 'close', 'invalid_api_key'
127 | elif data.get('job_valid') is not True and data.get('job_id') is None:
128 | return data, False, 'retry', 'invalid_job'
129 | return data, True, 'valid', None
130 |
131 |
132 | @staticmethod
133 | def normaliser_validation(data, error, request_type=None):
134 | if data is None:
135 | return data, False, 'fallback', str(error)
136 | elif data.get('api_key') == 'invalid':
137 | return data, False, 'close', 'invalid_api_key'
138 |
139 | ## proxy port
140 | elif request_type=='proxy_port' and data.get('proxy_port_details') is None:
141 | return data, False, 'fallback', 'no_proxy_port_details'
142 |
143 | ## proxy api
144 | elif request_type=='proxy_api' and data.get('proxy_parsing_data') is None:
145 | return data, False, 'fallback', 'no_proxy_parsing_data'
146 | elif request_type=='proxy_api' and data.get('proxy_parsing_data') is not None:
147 | proxy_parsing_data = data.get('proxy_parsing_data')
148 | if proxy_parsing_data.get('known_proxy') is False:
149 | return data, False, 'fallback', 'unknown_proxy'
150 |
151 | ## domain specific
152 | elif request_type=='domain' and data.get('domain_parsing_data') is None:
153 | return data, False, 'fallback', 'no_domain_parsing_data'
154 | return data, True, 'valid', None
155 |
156 |
157 | @staticmethod
158 | def error_report_validation(data, error):
159 | if data is None:
160 | return data, False, 'retry', str(error)
161 | elif data.get('error_logged') is False:
162 | return data, False, 'close', 'error_not_logged'
163 | return data, True, 'valid', None
164 |
165 | @staticmethod
166 | def condense_stats_body(body):
167 | return {
168 | 'job_id': body.get('job_id'),
169 | 'job_group_id': body.get('job_group_id'),
170 | }
171 |
172 | @staticmethod
173 | def get(url, proxy=None, check=True):
174 | proxies = None
175 | if ProxyNormalizer.unknown_proxy_scheme(proxy) is not True:
176 | proxies = {ProxyNormalizer.get_proxy_scheme(proxy): proxy}
177 | for _ in range(SOPSRequest.RETRY_LIMIT):
178 | try:
179 | response = requests.get(url, timeout=SOPSRequest.TIMEOUT, proxies=proxies, headers={'api_key': SOPSRequest.API_KEY})
180 | if check and response.status_code == 401:
181 | return None, 'invalid_api_key'
182 | if response.status_code == 200:
183 | data = response.json()
184 | return data, None
185 | else:
186 | raise ScrapeOpsAPIResponseError
187 | except requests.exceptions.ConnectionError as e:
188 | error = e
189 | continue
190 | except ScrapeOpsAPIResponseError as e:
191 | error = e
192 | continue
193 | except Exception as e:
194 | error = e
195 | continue
196 | return None, str(error)
197 |
198 |
199 | @staticmethod
200 | def post(url, body=None, files=None, proxy=None):
201 | proxies = None
202 | if ProxyNormalizer.unknown_proxy_scheme(proxy) is not True:
203 | proxies = {ProxyNormalizer.get_proxy_scheme(proxy): proxy}
204 | for _ in range(SOPSRequest.RETRY_LIMIT):
205 | try:
206 | response = requests.post(url, json=body, timeout=SOPSRequest.TIMEOUT, files=files, proxies=proxies, headers={'api_key': SOPSRequest.API_KEY})
207 | if response.status_code == 401:
208 | return None, 'invalid_api_key'
209 | if response.status_code == 200:
210 | data = response.json()
211 | return data, None
212 | else:
213 | time.sleep(3)
214 | raise ScrapeOpsAPIResponseError
215 | except requests.exceptions.ConnectionError as e:
216 | error = e
217 | continue
218 | except ScrapeOpsAPIResponseError as e:
219 | error = e
220 | continue
221 | except Exception as e:
222 | error = e
223 | continue
224 | return None, str(error)
225 |
226 |
227 | @staticmethod
228 | def post_file(url, body=None, files=None):
229 | for _ in range(SOPSRequest.RETRY_LIMIT):
230 | try:
231 | response = requests.post(url, data=body, timeout=SOPSRequest.TIMEOUT, files=files, headers={'api_key': SOPSRequest.API_KEY})
232 | if response.status_code == 401:
233 | return None, 'invalid_api_key'
234 | if response.status_code == 200:
235 | data = response.json()
236 | return data, None
237 | else:
238 | raise ScrapeOpsAPIResponseError
239 | except requests.exceptions.ConnectionError as e:
240 | error = e
241 | continue
242 | except ScrapeOpsAPIResponseError as e:
243 | error = e
244 | continue
245 | except Exception as e:
246 | error = e
247 | continue
248 | return None, str(error)
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
--------------------------------------------------------------------------------
/scrapeops_scrapy/core/controllers.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from scrapeops_scrapy.utils import utils
4 | from scrapeops_scrapy.core.setup import SDKSetup
5 | from scrapeops_scrapy.core.api import SOPSRequest
6 | import sys
7 |
8 | class SDKControllers(SDKSetup):
9 |
10 | SETUP_ATTEMPT_LIMIT = 3
11 |
12 | def __init__(self):
13 | SDKSetup.__init__(self)
14 |
15 | def send_setup_request(self):
16 | data, status = SOPSRequest().setup_request(body=self.setup_data())
17 | if status.valid:
18 | self.initialize_job_details(data)
19 | elif status.action == 'retry' and self._setup_attempts < SDKControllers.SETUP_ATTEMPT_LIMIT:
20 | self._setup_attempts += 1
21 | self._error_logger.log_error(reason='setup_failed',
22 | error=status.error,
23 | data={'setup_attempts': self._setup_attempts})
24 | elif status.action == 'retry' and self._setup_attempts >= SDKControllers.SETUP_ATTEMPT_LIMIT:
25 | self.deactivate_sdk(reason='exceeded_max_setup_attempts',
26 | error=status.error,
27 | data={'setup_attempts': self._setup_attempts},
28 | request_type='setup')
29 | else:
30 | self.deactivate_sdk(reason=status.error, data=data, request_type='setup')
31 |
32 |
33 | def send_stats(self, periodic_stats=None, overall_stats=None, reason=None, stats_type=None):
34 | self._sdk_run_time = self._sdk_run_time + self._period_frequency
35 | post_body = self.stats_data(periodic_stats=periodic_stats, overall_stats=overall_stats, stats_type=stats_type, reason=reason)
36 |
37 | if self.job_active() is False:
38 | self.send_setup_request()
39 |
40 | ## retest if job is inactive
41 | if self.job_active() is False:
42 | self.cache_failed_stats(post_body)
43 | self._error_logger.log_error(reason=f'sending_{stats_type}_stats_failure',
44 | data={'failed_periods': self.failed_periods})
45 |
46 | if self.job_active():
47 | if stats_type == 'finished' and self.export_logs():
48 | log_body = self.log_data()
49 | with open(self.log_file, 'rb') as f:
50 | data, status = SOPSRequest().stats_request(body=post_body, log_body=log_body, files={'file': f})
51 | else:
52 | data, status = SOPSRequest().stats_request(body=post_body)
53 |
54 | if status.valid:
55 | self.update_sdk_settings(data)
56 | self.reset_failed_stats()
57 | elif status.action == 'retry':
58 | self.cache_failed_stats(post_body)
59 | self._error_logger.log_error(reason=f'sending_{stats_type}_stats_failure',
60 | error=status.error,
61 | data={'failed_periods': self.failed_periods})
62 |
63 |
64 | def sdk_enabled(self):
65 | if self._sdk_active:
66 | if self.request_response_middleware is None:
67 | self.initialize_normalizer_middleware()
68 | return True
69 | return False
70 |
71 |
72 | def check_api_key_present(self):
73 | if self._scrapeops_api_key == None:
74 | self._sdk_active = False
75 | return False
76 | self._sdk_active = True
77 | return True
78 |
79 | def deactivate_sdk(self, reason=None, error=None, request_type=None, data=None):
80 | self._sdk_active = False
81 | if reason != 'scrapy_shell':
82 | self._error_logger.sdk_error_close(reason=reason, error=error, request_type=request_type)
83 |
84 | def job_active(self):
85 | if self.job_id is None and self._sdk_active:
86 | return False
87 | return True
88 |
89 | def cache_failed_stats(self, post_body):
90 | self.cached_failed_stats.append(post_body)
91 | self.failed_periods = len(self.cached_failed_stats)
92 |
93 | def reset_failed_stats(self):
94 | self.cached_failed_stats = []
95 | self.failed_periods = 0
96 |
97 | def get_runtime(self, time=None):
98 | if time is None:
99 | return utils.current_time() - self._scrapeops_job_start
100 | return time - self._scrapeops_job_start
101 |
102 | def scrapeops_middleware_enabled(self):
103 | if self._scrapeops_middleware is True:
104 | return True
105 | return False
106 |
107 | def export_logs(self):
108 | if self._scrapeops_export_scrapy_logs and self.log_file is not None:
109 | return True
110 | return False
111 |
112 | def not_scrapy_shell(self):
113 | if sys.argv[0] == 'shell':
114 | self.deactivate_sdk(reason='scrapy_shell')
115 | return False
116 | return True
117 |
118 |
119 |
--------------------------------------------------------------------------------
/scrapeops_scrapy/core/core.py:
--------------------------------------------------------------------------------
1 |
2 | from twisted.internet import task
3 |
4 | from scrapeops_scrapy.exceptions import ScrapeOpsMissingAPIKey
5 | from scrapeops_scrapy.utils import utils
6 | from scrapeops_scrapy.core.controllers import SDKControllers
7 | from scrapeops_scrapy.stats.logger import StatsLogger
8 | from scrapeops_scrapy.normalizer.request_response import RequestResponse
9 |
10 |
11 |
12 | class ScrapeopsCore(SDKControllers, StatsLogger):
13 | """
14 | Where the core ScrapeOps Functionality Goes
15 | """
16 |
17 | def __init__(self):
18 | SDKControllers.__init__(self)
19 | StatsLogger.__init__(self)
20 |
21 | def start_sdk(self, spider=None, crawler=None):
22 | if self.not_scrapy_shell():
23 | self.start_time = self.period_start_time = utils.current_time()
24 | self.initialize_SDK(spider, crawler=crawler)
25 | if self.check_api_key_present():
26 | self.send_setup_request()
27 | self.spider_open_stats()
28 | self.start_periodic_monitor()
29 | else:
30 | err = ScrapeOpsMissingAPIKey()
31 | self.deactivate_sdk(reason='no_api_key', error=err)
32 | raise err
33 |
34 |
35 | def close_sdk(self, spider=None, reason=None):
36 | if self.sdk_enabled():
37 | self.period_finish_time = utils.current_time()
38 | self.spider_close_stats(reason=reason, crawler=self.crawler)
39 | self.send_stats(periodic_stats=self._periodic_stats, overall_stats=self._overall_stats, stats_type='finished', reason=reason)
40 | self.close_periodic_monitor()
41 |
42 |
43 | def request_stats(self, request=None):
44 | if self.sdk_enabled():
45 | request.meta['sops_time'] = utils.current_time()
46 | request_response_object = RequestResponse(request=request)
47 | self.request_response_middleware.normalise_domain_proxy_data(request_response_object)
48 | self.add_missed_urls_callback(request)
49 | self.generate_request_stats(request_response_object, request=request)
50 |
51 |
52 | def response_stats(self, request=None, response=None):
53 | if self.sdk_enabled():
54 | request_response_object = RequestResponse(request=request, response=response)
55 | self.request_response_middleware.process(request_response_object, response)
56 | self.generate_response_stats(request_response_object, request=request, response=response)
57 |
58 |
59 | def exception_stats(self, request=None, exception_class=None):
60 | if self.sdk_enabled():
61 | request_response_object = RequestResponse(request=request)
62 | self.request_response_middleware.normalise_domain_proxy_data(request_response_object)
63 | self.generate_exception_stats(request_response_object, request=request, exception_class=exception_class)
64 |
65 |
66 | def item_stats(self, signal_type=None, item=None, response=None, spider=None):
67 | if self.sdk_enabled():
68 | request_response_object = RequestResponse(response=response)
69 | if response is not None:
70 | self.request_response_middleware.normalise_domain_proxy_data(request_response_object)
71 | if signal_type == 'item_scraped':
72 | self.item_validation_middleware.validate(request_response_object, item)
73 | self.generate_item_stats(request_response_object, signal=signal_type, response=response)
74 |
75 |
76 | def add_missed_urls_callback(self, request):
77 | if request.errback is None:
78 | request.errback = self.failed_url_middleware.log_failure
79 |
80 |
81 |
82 | """
83 | PERIODIC MONITOR
84 | """
85 | def start_periodic_monitor(self):
86 | if self.sdk_enabled():
87 | self.loop = task.LoopingCall(self.periodic_monitor)
88 | self.periodic_loop = self.loop.start(1, now=False) # Start looping every 1 second (1.0).
89 |
90 | def periodic_monitor(self):
91 | period_time = utils.current_time()
92 | if self.get_runtime(time=period_time) % self.get_periodic_frequency() == 0:
93 | self.period_finish_time = period_time
94 | if self.sdk_enabled():
95 | self.aggregate_stats(crawler=self.crawler, middleware=self.scrapeops_middleware_enabled())
96 | self.send_stats(periodic_stats=self._periodic_stats, overall_stats=self._overall_stats, stats_type='periodic')
97 | self.reset_periodic_stats()
98 | self.period_start_time = utils.current_time()
99 | self.inc_value(self._overall_stats, 'periodic_runs')
100 | elif self.periodic_monitor_active():
101 | self.close_periodic_monitor()
102 |
103 | def close_periodic_monitor(self):
104 | if self.periodic_monitor_active():
105 | self.loop.stop()
106 |
107 | def periodic_monitor_active(self):
108 | if self.loop is not None:
109 | if self.loop.running:
110 | return True
111 | return False
112 |
113 | def get_periodic_frequency(self):
114 | self.period_count = 0
115 | runtime = self.get_runtime()
116 | if self._period_freq_list is None:
117 | self.period_count = int(runtime//self._period_frequency)
118 | return self._period_frequency
119 | for index, row in enumerate(self._period_freq_list):
120 | if runtime > int(row.get('total_time')):
121 | if index == 0:
122 | period_time = row.get('total_time')
123 | else:
124 | period_time = row.get('total_time') - self._period_freq_list[index - 1].get('total_time')
125 | self.period_count += int(period_time/row.get('periodic_frequency'))
126 | if runtime <= int(row.get('total_time')):
127 | self._period_frequency = row.get('periodic_frequency')
128 | if index == 0:
129 | diff = runtime
130 | else:
131 | diff = runtime - int(self._period_freq_list[index - 1].get('total_time'))
132 | self.period_count += int(diff//self._period_frequency)
133 | return self._period_frequency
134 | return self._period_frequency
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
--------------------------------------------------------------------------------
/scrapeops_scrapy/core/error_logger.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import time
5 |
6 | from scrapeops_scrapy.core.api import SOPSRequest
7 | from scrapeops_scrapy.normalizer.domains import DomainNormalizer
8 | from scrapeops_scrapy.utils import utils
9 |
10 |
11 | class ErrorLogger(object):
12 | ERROR_LOGGER_ACTIVE = True
13 |
14 | def __init__(self, spider, crawler, spider_settings, server_hostname, server_ip, start_time, log_file):
15 | self.spider = spider
16 | self.crawler = crawler
17 | self.bot_name = crawler.settings.get("BOT_NAME", "None")
18 | self.spider_settings = spider_settings
19 | self.server_hostname = server_hostname
20 | self.server_ip = server_ip
21 | self.start_time = start_time
22 | self.log_file = log_file
23 | self._error_history = []
24 | self.job_group_name = None
25 | self.job_group_id = None
26 |
27 | def update_error_logger(self, job_name, job_id):
28 | self.job_group_name = job_name
29 | self.job_group_id = job_id
30 |
31 | def log_error(self, reason=None, error=None, data=None, request_type=None):
32 | if ErrorLogger.ERROR_LOGGER_ACTIVE:
33 | self._error_history.append(
34 | {
35 | "time": utils.current_time(),
36 | "reason": reason,
37 | "error": str(error),
38 | "data": data,
39 | "request_type": request_type,
40 | }
41 | )
42 |
43 | def send_error_report(self, error_type=None, body=None, log_data=False):
44 | if ErrorLogger.ERROR_LOGGER_ACTIVE:
45 | try:
46 | data, status = SOPSRequest().error_report_request(error_type=error_type, body=body)
47 | if status.valid:
48 | if log_data and self.log_file is not None and data.get("sdk_error_id") is not None:
49 | with open(self.log_file, "rb") as f:
50 | post_body = {
51 | "sops_sdk": "scrapy",
52 | "spider_name": self.spider.name,
53 | "job_group_id": self.job_group_id,
54 | "job_group_name": self.job_group_name,
55 | "sdk_error_id": data.get("sdk_error_id"),
56 | }
57 | _, status = SOPSRequest().error_report_request(
58 | error_type=error_type, body=post_body, files={"file": f}
59 | )
60 | if status.valid is False:
61 | self.log_error(reason="send_error_logs_failed", error=status.error)
62 |
63 | if status.valid is False:
64 | self.log_error(reason="send_error_report_failed", error=status.error)
65 | except Exception:
66 | pass
67 |
68 | def sdk_error_close(self, reason=None, error=None, request_type=None, data=None):
69 | if ErrorLogger.ERROR_LOGGER_ACTIVE:
70 | self.log_error(reason=reason, error=error, data=data, request_type=request_type)
71 | error_data = {
72 | "final_reason": reason,
73 | "sops_sdk": "scrapy",
74 | "spider_name": self.spider.name,
75 | "bot_name": self.bot_name,
76 | "server_ip": self.server_ip,
77 | "server_hostname": self.server_hostname,
78 | "job_group_id": self.job_group_id,
79 | "job_group_name": self.job_group_name,
80 | "job_args": utils.get_args(),
81 | "job_start_time": self.start_time,
82 | "sops_scrapeops_version": utils.get_scrapeops_version(),
83 | "sops_scrapy_version": utils.get_scrapy_version(),
84 | "sops_python_version": utils.get_python_version(),
85 | "sops_system_version": utils.get_system_version(),
86 | "sops_middleware_enabled": utils.scrapeops_middleware_installed(self.spider_settings),
87 | "error_history": self._error_history,
88 | }
89 |
90 | self.send_error_report(error_type="sdk_close", body=error_data, log_data=True)
91 |
92 |
93 | class TailLogHandler(logging.Handler):
94 | retryErrors = [
95 | "Couldn't bind",
96 | "Hostname couldn't be looked up", "No route to host",
97 | "Connection was refused by other side",
98 | "TCP connection timed out",
99 | "File used for UNIX socket is no good",
100 | "Service name given as port is unknown",
101 | "User aborted connection",
102 | "User timeout caused connection failure",
103 | "An SSL error occurred",
104 | "Could not verify something that was supposed to be signed.",
105 | "The peer rejected our verify error.",
106 | "We did not find a certificate where we expected to find one.",
107 | "Bad Request",
108 | "Unauthorized",
109 | "Payment Required",
110 | "Forbidden",
111 | "Not Found",
112 | "Method Not Allowed",
113 | "Request Time-out",
114 | "Internal Server Error",
115 | "Bad Gateway",
116 | "Service Unavailable",
117 | "HTTP Version not supported",
118 | "Gateway Time-out",
119 | "Unknown Status",
120 | ]
121 |
122 | def __init__(self, log_dict, log_dict_cumulative):
123 | logging.Handler.__init__(self)
124 | self.log_dict = log_dict
125 | self.log_dict_cumulative = log_dict_cumulative
126 |
127 | def flush(self):
128 | self.log_dict.clear()
129 |
130 | def emit(self, record):
131 | try:
132 | if record.levelname == "ERROR" or record.levelname == "WARNING" or record.levelname == "CRITICAL":
133 | if hasattr(record, "message"):
134 | errorMessage = record.message
135 | fileAndLine = record.pathname + ", line: " + str(record.lineno)
136 | dateTime = self.format_time(record)
137 | type = record.levelname
138 | engine = record.name
139 |
140 | # covering warnings/probableCause/traceback missing
141 | traceback = "No traceback available"
142 | probableCause = ""
143 |
144 | if record.exc_text is not None:
145 | traceback = record.exc_text
146 | splitTraceback = traceback.split("\n")
147 | probableCause = splitTraceback[len(splitTraceback) - 1]
148 |
149 | # covering retries
150 | if "Gave up retrying <" in record.message:
151 | for retryError in self.retryErrors:
152 | if retryError in record.message:
153 | method = record.message.split("<")[1].split(" ")[0]
154 | errorMessage = "Error: Gave up retrying " + method + " request - " + retryError
155 | fileAndLine = ""
156 | probableCause = retryError
157 | break
158 |
159 | # Deprecation Warnings
160 | if "ScrapyDeprecationWarning:" in record.message and record.message[0] == "/":
161 | splitString = record.message.split("ScrapyDeprecationWarning:")
162 | errorMessage = "ScrapyDeprecationWarning: " + splitString[1]
163 | probableCause = splitString[0]
164 |
165 | # "Some Other Error Occurred"
166 | if "Some other error occurred: " in record.message:
167 | splitError = record.message.split(" /")
168 | cleanError = splitError[0].split(">: ")[1]
169 | errorMessage = "Some other error occurred: " + cleanError
170 | probableCause = cleanError
171 | traceback = record.message
172 |
173 | # Convert Urls To Domains in Error Messages
174 | urls = re.findall(r"(https?://[^\s]+)", errorMessage)
175 | for url in urls:
176 | domain = DomainNormalizer.get_domain(url)
177 | errorMessage = errorMessage.replace(url, domain)
178 |
179 | if errorMessage in self.log_dict:
180 | self.log_dict[errorMessage]["count"] = self.log_dict[errorMessage]["count"] + 1
181 | else:
182 | self.log_dict[errorMessage] = {
183 | "type": type,
184 | "engine": engine,
185 | "name": errorMessage,
186 | "count": 1,
187 | "traceback": traceback,
188 | "message": probableCause,
189 | "filepath": fileAndLine,
190 | "dateTime": dateTime,
191 | }
192 |
193 | if SOPSRequest.HIGH_FREQ_ACC == True:
194 | if errorMessage in self.log_dict_cumulative:
195 | self.log_dict_cumulative[errorMessage]["count"] = (
196 | self.log_dict_cumulative[errorMessage]["count"] + 1
197 | )
198 | else:
199 | self.log_dict_cumulative[errorMessage] = {
200 | "type": type,
201 | "engine": engine,
202 | "name": errorMessage,
203 | "count": 1,
204 | "traceback": traceback,
205 | "message": probableCause,
206 | "filepath": fileAndLine,
207 | "dateTime": dateTime,
208 | }
209 |
210 | except Exception as e:
211 | logging.info("Error: Error in error logger")
212 | logging.info(e, exc_info=True)
213 |
214 | def format_time(self, record):
215 | if self.formatter:
216 | return self.formatter.formatTime(record)
217 | else:
218 | # Fallback to a basic time format if no formatter is set
219 | return time.strftime("%Y-%m-%d %H:%M:%S")
220 |
221 |
222 | class TailLogger(object):
223 | def __init__(self):
224 | self._log_dict = {}
225 | self._log_dict_cumulative = {}
226 | self._log_handler = TailLogHandler(self._log_dict, self._log_dict_cumulative)
227 |
228 | def contents(self, type="diff"):
229 | if type == "cumulative":
230 | jsonLogsCumulative = json.dumps(self._log_dict_cumulative, indent=2)
231 | return jsonLogsCumulative
232 |
233 | else:
234 | jsonLogs = json.dumps(self._log_dict, indent=2)
235 | self._log_handler.flush()
236 | return jsonLogs
237 |
238 | @property
239 | def log_handler(self):
240 | return self._log_handler
241 |
--------------------------------------------------------------------------------
/scrapeops_scrapy/core/model.py:
--------------------------------------------------------------------------------
1 | import socket
2 | import scrapy.settings.default_settings as default_settings
3 | from scrapeops_scrapy.core.api import SOPSRequest
4 |
5 |
6 | class BaseSDKModel(object):
7 |
8 | """
9 | SDK Model:
10 | The core data types used to control the SDK's operation.
11 | """
12 |
13 | def __init__(self):
14 | ## User Data
15 | self._scrapeops_api_key = None
16 |
17 | ## SDK Data
18 | self._sdk_active = None
19 | self._scrapeops_endpoint = None
20 | self._scrapeops_middleware = None
21 | self._scrapeops_settings_exclusion_list = []
22 | self._scrapeops_export_scrapy_logs = False
23 | self._period_frequency = 60
24 | self._period_freq_list = None
25 | self._sdk_run_time = 0
26 | self._setup_attempts = 0
27 | self._scrapeops_test_id = None
28 | self._error_logger = None
29 | self._scrapeops_sdk_version = None
30 | self._scrapeops_scrapy_version = None
31 | self._scrapeops_python_version = None
32 | self._scrapeops_system_version = None
33 | self._scrapeops_job_start = None
34 |
35 | ## Spider Data
36 | self.crawler = None
37 | self.spider = None
38 | self.spider_name = None
39 | self.spider_id= None
40 | self.spider_settings = None
41 | self.server_id= None
42 | self.project_id = None
43 | self.project_name = None
44 | self.bot_name = None
45 | self.retry_enabled = None
46 | self.retry_times = None
47 | self.log_file = None
48 |
49 | ## Overall Job Data
50 | self.job_args = None
51 | self.job_id = None
52 | self.job_group_id = None
53 | self.job_group_uuid = None
54 | self.job_group_name = None
55 | self.job_group_version = None
56 | self.job_custom_groups = None
57 | self.start_time = None
58 | self.finish_time = None
59 | self.server_hostname = None
60 | self.server_ip = None
61 | self._proxy_apis = {}
62 | self._generic_validators = {}
63 | self.multi_server = False
64 | self.failed_urls = []
65 |
66 | ## Period Data
67 | self.period_start_time = None
68 | self.period_finish_time = None
69 | self.period_run_time = 0
70 | self.period_concurrency = 0
71 | self.period_count = 0
72 |
73 | ## ScrapeOps Triggered Jobs
74 | self._scrapeops_server_id = None
75 | self.job_group_type = None
76 |
77 | ## Periodic Monitor
78 | self.loop = None
79 | self.periodic_loop = None
80 |
81 | ## Validation/Normalisation Data
82 | self.proxy_domains = []
83 |
84 | ## Failure
85 | self.failed_periods = 0
86 | self.cached_failed_stats = []
87 |
88 | ## Middleware
89 | self.request_response_middleware = None
90 | self.item_validation_middleware = None
91 | self.failed_url_middleware = None
92 |
93 | self.allowed_response_codes = []
94 |
95 |
96 | class SDKData(BaseSDKModel):
97 |
98 | def __init__(self):
99 | BaseSDKModel.__init__(self)
100 |
101 |
102 | def setup_data(self):
103 | return {
104 | 'sops_api_key': self._scrapeops_api_key,
105 | 'job_group_name': self.job_group_name,
106 | 'job_group_version': self.job_group_version,
107 | 'job_group_identifier': self.job_group_uuid,
108 | 'job_group_type': self.job_group_type,
109 | 'job_settings': self.spider_settings,
110 | 'job_args': self.job_args,
111 | 'job_start_time': self.start_time,
112 | 'sops_sdk': 'scrapy',
113 | 'sops_scrapeops_version': self._scrapeops_sdk_version,
114 | 'sops_scrapy_version': self._scrapeops_scrapy_version,
115 | 'sops_python_version': self._scrapeops_python_version,
116 | 'sops_system_version': self._scrapeops_system_version,
117 | 'sops_middleware_enabled': self._scrapeops_middleware,
118 | 'sops_test_id': self._scrapeops_test_id,
119 | 'sops_server_id': self._scrapeops_server_id,
120 | 'scrapeops_job_start': self._scrapeops_job_start,
121 | 'spider_name': self.spider_name,
122 | 'job_custom_groups': self.job_custom_groups,
123 | 'server_ip': self.server_ip,
124 | 'server_hostname': self.server_hostname,
125 | 'project_name': self.project_name,
126 | 'bot_name': self.bot_name,
127 | 'multi_server': self.multi_server,
128 | 'retry_enabled': self.retry_enabled,
129 | 'retry_times': self.retry_times,
130 | }
131 |
132 |
133 | def stats_data(self, periodic_stats=None, overall_stats=None, stats_type=None, reason=None):
134 | data = {
135 | 'job_id': self.job_id,
136 | 'job_group_id': self.job_group_id,
137 | 'type': stats_type,
138 | 'period_start_time': self.period_start_time,
139 | 'period_finish_time': self.period_finish_time,
140 | 'period_run_time': self._period_frequency,
141 | 'sdk_run_time': self._sdk_run_time,
142 | 'periodic': periodic_stats,
143 | 'overall': overall_stats,
144 | 'cached_failed_stats': self.cached_failed_stats,
145 | 'periodic_warnings': periodic_stats.get('log_count/WARNING', 0),
146 | 'periodic_errors': periodic_stats.get('log_count/ERROR', 0),
147 | 'periodic_criticals': periodic_stats.get('log_count/CRITICAL', 0),
148 | 'multi_server': self.multi_server,
149 | 'period_count': self.period_count,
150 | 'data_coverage': self.item_validation_middleware.get_item_coverage_data(),
151 | 'invalid_items_count': self.item_validation_middleware.get_num_invalid_items(),
152 | 'field_coverage': self.item_validation_middleware.get_field_coverage(),
153 | 'failed_urls_count': self.failed_url_middleware.get_url_count(),
154 | 'failed_urls_enabled': self.failed_url_middleware.enabled(),
155 | 'scrapy_stats': self.get_scrapy_stats(),
156 | 'job_custom_groups': self.job_custom_groups,
157 | 'error_details': self.tail.contents(),
158 | 'error_details_cumulative': self.tail.contents('cumulative'),
159 | 'high_freq': SOPSRequest.HIGH_FREQ_ACC
160 | }
161 |
162 | if stats_type == 'finished':
163 | data['job_finish_time'] = self.period_finish_time
164 | data['job_status'] = stats_type
165 | data['job_finish_reason'] = reason
166 | data['failed_urls_list'] = self.failed_url_middleware.get_url_list()
167 | data['invalid_items_urls_list'] = self.item_validation_middleware.get_invalid_items_urls()
168 | return data
169 |
170 |
171 | def log_data(self):
172 | return {
173 | 'job_group_id': self.job_group_id,
174 | 'job_group_name': self.job_group_name,
175 | 'job_group_identifier': self.job_group_uuid,
176 | 'spider_name': self.spider_name,
177 | 'sops_sdk': 'scrapy',
178 | }
179 |
180 |
181 |
182 | def logging_data(self):
183 | return {
184 | 'sops_api_key': self._scrapeops_api_key,
185 | 'job_id': self.job_id,
186 | 'job_group_id': self.job_group_id,
187 | 'job_group_identifier': self.job_group_uuid,
188 | 'job_group_name': self.job_group_name,
189 | 'spider_name': self.spider_name,
190 | 'spider_id': self.spider_id,
191 | 'server_id': self.server_id,
192 | 'project_id': self.project_id,
193 | 'project_name': self.project_name,
194 | 'bot_name': self.bot_name,
195 | 'server_ip': self.server_ip,
196 | 'server_hostname': self.server_hostname,
197 | 'sops_scrapeops_version': self._scrapeops_sdk_version,
198 | 'sops_scrapy_version': self._scrapeops_scrapy_version,
199 | 'sops_python_version': self._scrapeops_python_version,
200 | 'sops_system_version': self._scrapeops_system_version,
201 | 'sops_middleware_enabled': self._scrapeops_middleware,
202 | 'sops_sdk': 'scrapy',
203 | }
204 |
205 | def check_spider_attributes(self, spider):
206 | if hasattr(spider, 'sops_test'):
207 | if spider.sops_test.test_active():
208 | self._scrapeops_test_id = spider.sops_test.generate_test_id()
209 |
210 | if hasattr(spider, 'sops_custom_groups'):
211 | if isinstance(spider.sops_custom_groups, dict):
212 | clean_dict = {}
213 | for k, v in spider.sops_custom_groups.items():
214 | clean_dict[str(k)] = str(v)
215 | self.job_custom_groups = clean_dict
216 |
217 |
218 | def get_settings(self, spider):
219 | default_scrapy_settings = default_settings.__dict__
220 | full_settings = spider.settings.copy_to_dict()
221 | self.spider_settings = {}
222 | for key, value in full_settings.items():
223 | if key not in default_scrapy_settings and self.include_setting(key):
224 | self.spider_settings[key] = value
225 | elif default_scrapy_settings.get(key) != value and self.include_setting(key):
226 | self.spider_settings[key] = value
227 |
228 | def include_setting(self, key):
229 | exclusion_terms = ['API_KEY', 'APIKEY', 'SECRET_KEY', 'SECRETKEY', 'PASSWORD', 'CONNECTION_STRING']
230 | if key in self._scrapeops_settings_exclusion_list:
231 | return False
232 | for term in exclusion_terms:
233 | if term in key.upper(): return False
234 | return True
235 |
236 |
237 | def get_job_name(self):
238 | ## check args
239 | for arg in self.job_args.get('args'):
240 | if 'SCRAPEOPS_JOB_NAME' in arg:
241 | return arg.split('=')[1]
242 |
243 | ## check spider defined
244 | if hasattr(self.spider, 'sops_job_name'):
245 | return self.spider.sops_job_name
246 | if hasattr(self.spider, 'name'):
247 | return self.spider.name
248 | return 'no_spider_name'
249 |
250 |
251 | def get_job_version(self):
252 | ## check args
253 | for arg in self.job_args.get('args'):
254 | if 'SCRAPEOPS_JOB_VERSION' in arg:
255 | return arg.split('=')[1]
256 |
257 | ## check spider defined
258 | if hasattr(self.spider, 'sops_job_version'):
259 | return self.spider.sops_job_version
260 | return 'default'
261 |
262 |
263 | def get_server_id(self, crawler):
264 | for arg in self.job_args.get('args'):
265 | if 'SCRAPEOPS_SERVER_ID' in arg:
266 | return arg.split('=')[1]
267 | if crawler.settings.get('SCRAPEOPS_SERVER_ID') is not None:
268 | return crawler.settings.get('SCRAPEOPS_SERVER_ID')
269 | return '-1'
270 |
271 |
272 | def check_scrapeops_triggered_job(self, crawler):
273 | self._scrapeops_server_id = self.get_server_id(crawler)
274 | if isinstance(self._scrapeops_server_id, str) is False: self._scrapeops_server_id = str(self._scrapeops_server_id)
275 | if self._scrapeops_server_id != '-1':
276 | self.job_group_type = 'scrapeops_triggered'
277 | else:
278 | self.job_group_type = 'user_triggered'
279 |
280 | def get_server_details(self):
281 | try:
282 | self.server_hostname = socket.gethostname()
283 | self.server_ip = socket.gethostbyname(self.server_hostname)
284 | except Exception:
285 | self.server_hostname = 'unknown'
286 | self.server_ip = 'unknown'
287 |
288 |
289 | def get_uuid(self):
290 | for arg in self.job_args.get('args'):
291 | if 'SCRAPEOPS_JOB_GROUP_IDENTIFIER' in arg:
292 | return arg.split('=')[1]
293 | if hasattr(self.spider, 'sops_job_group_identifier'):
294 | return self.spider.sops_job_group_identifier
295 | self.multi_server = False
296 | return ''
297 |
298 |
299 | def get_export_logs(self, crawler):
300 | for arg in self.job_args.get('args'):
301 | if 'SCRAPEOPS_EXPORT_SCRAPY_LOGS' in arg:
302 | try:
303 | if arg.split('=')[1] == 'True':
304 | return True
305 | except Exception:
306 | pass
307 | if crawler.settings.get('SCRAPEOPS_EXPORT_SCRAPY_LOGS') is not None:
308 | return True
309 | return False
310 |
311 | def get_scrapy_stats(self):
312 | scrapy_stats = self.crawler.stats.get_stats()
313 | return {k:str(v) for (k,v) in scrapy_stats.items()}
314 |
315 |
--------------------------------------------------------------------------------
/scrapeops_scrapy/core/setup.py:
--------------------------------------------------------------------------------
1 | from scrapeops_scrapy.utils import utils
2 | from scrapeops_scrapy.core.error_logger import ErrorLogger
3 | from scrapeops_scrapy.core.api import SOPSRequest
4 | from scrapeops_scrapy.normalizer.middleware import RequestResponseMiddleware
5 | from scrapeops_scrapy.validators.item_validator import ItemValidator
6 | from scrapeops_scrapy.stats.failed_urls import FailedUrlsHandler
7 | from scrapeops_scrapy.core.model import SDKData
8 |
9 |
10 |
11 | class SDKSetup(SDKData):
12 |
13 | def __init__(self):
14 | SDKData.__init__(self)
15 |
16 |
17 | def initialize_SDK(self, spider, crawler=None):
18 |
19 | ## Spider Data
20 | self.spider = spider
21 | self.crawler = crawler
22 | self.spider_name = spider.name
23 | self.project_name = crawler.settings.get('PROJECT', None)
24 | self.bot_name = crawler.settings.get('BOT_NAME', None)
25 | self.retry_enabled = crawler.settings.get('RETRY_ENABLED', None)
26 | self.retry_times = crawler.settings.get('RETRY_TIMES', None)
27 | self.log_file = crawler.settings.get('LOG_FILE', None)
28 | self.allowed_response_codes = crawler.settings.get('HTTPERROR_ALLOWED_CODES', [])
29 | self._scrapeops_settings_exclusion_list = crawler.settings.get('SCRAPEOPS_SETTINGS_EXCLUSION_LIST', [])
30 | self.check_spider_attributes(spider)
31 | self.get_settings(spider)
32 |
33 | ## Job Data
34 | self.job_args = utils.get_args()
35 | self.job_group_name = crawler.settings.get('SCRAPEOPS_JOB_NAME', self.get_job_name())
36 | self.job_group_uuid = crawler.settings.get('SCRAPEOPS_JOB_GROUP_IDENTIFIER', self.get_uuid()) ## Multi-server
37 | self.job_group_version = crawler.settings.get('SCRAPEOPS_JOB_VERSION', self.get_job_version())
38 | self.check_scrapeops_triggered_job(crawler)
39 |
40 | ## System Settings
41 | self._scrapeops_sdk_version = utils.get_scrapeops_version()
42 | self._scrapeops_scrapy_version = utils.get_scrapy_version()
43 | self._scrapeops_python_version = utils.get_python_version()
44 | self._scrapeops_system_version = utils.get_system_version()
45 | self.get_server_details()
46 |
47 | ## SDK Setup Data
48 | self._scrapeops_middleware = utils.scrapeops_middleware_installed(self.spider_settings)
49 | self._scrapeops_job_start = crawler.settings.get('SCRAPEOPS_JOB_START', utils.current_time()) ## Multi-server
50 | self._scrapeops_server_id = crawler.settings.get('SCRAPEOPS_SERVER_ID', "-1")
51 | self._scrapeops_debug_mode = crawler.settings.get('SCRAPEOPS_DEBUG_MODE', False)
52 | self._scrapeops_export_scrapy_logs = self.get_export_logs(crawler)
53 |
54 | ## SOPS API
55 | SOPSRequest.SCRAPEOPS_ENDPOINT = crawler.settings.get('SCRAPEOPS_ENDPOINT', 'https://api.scrapeops.io/')
56 | SOPSRequest.API_KEY = self._scrapeops_api_key = crawler.settings.get('SCRAPEOPS_API_KEY', None)
57 | SOPSRequest.SCRAPEOPS_LOGGING_DATA = {'logging_data': self.logging_data()}
58 |
59 | ## Middlewares
60 | self.initialize_middlewares()
61 | self.initialize_error_logger()
62 |
63 |
64 |
65 | def initialize_middlewares(self):
66 | if self.item_validation_middleware is None:
67 | self.item_validation_middleware = ItemValidator()
68 |
69 | if self.failed_url_middleware is None:
70 | self.failed_url_middleware = FailedUrlsHandler()
71 |
72 |
73 | def initialize_error_logger(self):
74 | self._error_logger = ErrorLogger(
75 | self.spider,
76 | self.crawler,
77 | self.spider_settings,
78 | self.server_hostname,
79 | self.server_ip,
80 | self.start_time,
81 | self.log_file)
82 |
83 |
84 | def initialize_job_details(self, data):
85 | self.job_id = data.get('job_id')
86 | self.job_group_name = data.get('job_group_name', self.job_group_name)
87 | self.job_group_id = SOPSRequest.JOB_GROUP_ID = data.get('job_group_id')
88 | self.spider_id= data.get('spider_id')
89 | self.server_id= data.get('server_id')
90 | self.project_id= data.get('project_id')
91 | self.multi_server = data.get('multi_server', False)
92 | SOPSRequest.HIGH_FREQ_ACC = data.get('high_freq', False)
93 | self._period_frequency = data.get('stats_period_frequency')
94 | self._period_freq_list = data.get('stats_period_freq_list')
95 | self._error_logger.update_error_logger(self.job_group_name, self.job_group_id)
96 | self.update_sdk_settings(data)
97 | self.initialize_normalizer_middleware(data)
98 | SOPSRequest.SCRAPEOPS_LOGGING_DATA = {'logging_data': self.logging_data()}
99 |
100 |
101 | def initialize_normalizer_middleware(self, data=None):
102 | if data is not None:
103 | self._proxy_apis = data.get('proxy_apis', {})
104 | self._generic_validators = data.get('generic_validators', [])
105 | if self.request_response_middleware is None:
106 | self.request_response_middleware = RequestResponseMiddleware(self.job_group_id,
107 | self._proxy_apis,
108 | self._generic_validators,
109 | self._error_logger,
110 | self.allowed_response_codes)
111 |
112 |
113 | def update_sdk_settings(self, data):
114 | self._sdk_active = data.get('sdk_active', self._sdk_active)
115 | self.multi_server = data.get('multi_server', self.multi_server)
116 | self.job_group_name = data.get('job_group_name', self.job_group_name)
117 | self._scrapeops_export_scrapy_logs = data.get('scrapeops_export_scrapy_logs', self._scrapeops_export_scrapy_logs)
118 |
119 | ## SOPS API Endpoints
120 | SOPSRequest.SCRAPEOPS_ENDPOINT = data.get('scrapeops_endpoint', SOPSRequest.SCRAPEOPS_ENDPOINT)
121 | SOPSRequest.SCRAPEOPS_API_VERSION = data.get('scrapeops_api_version', SOPSRequest.SCRAPEOPS_API_VERSION)
122 |
123 | ## Normalisation Middleware
124 | RequestResponseMiddleware.PROXY_DOMAIN_NORMALIZATION = data.get('proxy_domain_normalization', RequestResponseMiddleware.PROXY_DOMAIN_NORMALIZATION)
125 | RequestResponseMiddleware.PROXY_ALERTS = data.get('proxy_alerts', RequestResponseMiddleware.PROXY_ALERTS)
126 | RequestResponseMiddleware.RESPONSE_VALIDATION = data.get('response_validation', RequestResponseMiddleware.RESPONSE_VALIDATION)
127 |
128 | ## Item Validation Middleware
129 | ItemValidator.ITEM_COVERAGE_ENABLED = data.get('item_coverage_enabled', ItemValidator.ITEM_COVERAGE_ENABLED)
130 | ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED = data.get('ivalid_item_coverage_url_logging_enabled', ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED)
131 | ItemValidator.MAX_ITEM_URLS = data.get('max_item_urls', ItemValidator.MAX_ITEM_URLS)
132 |
133 | ## Failed URL Middleware
134 | FailedUrlsHandler.FAILED_URL_LOGGER_ENABLED = data.get('FAILED_URL_LOGGER_ENABLED', FailedUrlsHandler.FAILED_URL_LOGGER_ENABLED)
135 | FailedUrlsHandler.LOG_MISSED_URLS = data.get('log_missed_urls', FailedUrlsHandler.LOG_MISSED_URLS)
136 | FailedUrlsHandler.MAX_LOGGED_URLS = data.get('max_failed_urls', FailedUrlsHandler.MAX_LOGGED_URLS)
137 |
138 | ## Error Logger
139 | ErrorLogger.ERROR_LOGGER_ACTIVE = data.get('error_logger', ErrorLogger.ERROR_LOGGER_ACTIVE)
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
--------------------------------------------------------------------------------
/scrapeops_scrapy/exceptions.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | class ScrapeOpsMissingAPIKey(Exception):
4 | """Indicates that no ScrapeOps API key was added."""
5 | def __init__(self):
6 | self.message = 'No ScrapeOps API key defined.'
7 | super().__init__(self.message)
8 |
9 | def __str__(self):
10 | return f'ScrapeOpsMissingAPIKey: {self.message}'
11 |
12 |
13 | class ScrapeOpsAPIResponseError(Exception):
14 |
15 | def __init__(self):
16 | super().__init__()
17 |
18 |
19 | class DecodeError(Exception):
20 | pass
--------------------------------------------------------------------------------
/scrapeops_scrapy/extension.py:
--------------------------------------------------------------------------------
1 | from scrapy import signals
2 | import logging
3 |
4 | from scrapeops_scrapy.core.core import ScrapeopsCore
5 | from scrapeops_scrapy.signals import scrapeops_signals
6 | from scrapeops_scrapy.core.error_logger import TailLogger
7 |
8 |
9 | class ScrapeOpsMonitor(ScrapeopsCore):
10 |
11 | def __init__(self, crawler):
12 | ScrapeopsCore.__init__(self)
13 | self.crawler = crawler
14 |
15 | self.tail = TailLogger()
16 | log_handler = self.tail.log_handler
17 | logging.getLogger().addHandler(log_handler)
18 |
19 |
20 | @classmethod
21 | def from_crawler(cls, crawler):
22 | ext = cls(crawler)
23 |
24 | # connect the extension object to signals
25 | crawler.signals.connect(ext.spider_opened,
26 | signal=signals.spider_opened)
27 |
28 | crawler.signals.connect(ext.spider_closed,
29 | signal=signals.spider_closed)
30 |
31 | crawler.signals.connect(ext.log_request,
32 | signal=signals.request_reached_downloader)
33 |
34 | crawler.signals.connect(ext.log_response,
35 | signal=signals.response_downloaded)
36 |
37 | crawler.signals.connect(ext.log_response_middleware,
38 | signal=scrapeops_signals.scrapeops_response_recieved)
39 |
40 | crawler.signals.connect(ext.log_exception,
41 | signal=scrapeops_signals.scrapeops_exception_recieved)
42 |
43 | crawler.signals.connect(ext.item_scraped,
44 | signal=signals.item_scraped)
45 |
46 | crawler.signals.connect(ext.item_dropped,
47 | signal=signals.item_dropped)
48 |
49 | crawler.signals.connect(ext.item_error,
50 | signal=signals.item_error)
51 |
52 | return ext
53 |
54 | def spider_opened(self, spider):
55 | self.start_sdk(spider=spider, crawler=self.crawler)
56 |
57 | def spider_closed(self, spider, reason):
58 | self.close_sdk(spider=spider, reason=reason)
59 |
60 | def log_request(self, request, spider):
61 | if self.sdk_enabled():
62 | self.request_stats(request=request)
63 |
64 | def log_response(self, response, request, spider):
65 | if self.scrapeops_middleware_enabled() is False and self.sdk_enabled():
66 | self.response_stats(request=request, response=response)
67 |
68 | def log_response_middleware(self, request=None, response=None, spider=None):
69 | if self.scrapeops_middleware_enabled() and self.sdk_enabled():
70 | self.response_stats(request=request, response=response)
71 |
72 | def log_exception(self, request=None, spider=None, exception_class=None):
73 | if self.scrapeops_middleware_enabled() and self.sdk_enabled():
74 | self.exception_stats(request=request, exception_class=exception_class)
75 |
76 | def item_scraped(self, item, response, spider):
77 | if self.sdk_enabled():
78 | self.item_stats(signal_type='item_scraped', item=item, response=response, spider=spider)
79 |
80 | def item_dropped(self, item, response, spider):
81 | if self.sdk_enabled():
82 | self.item_stats(signal_type='item_dropped', item=item, response=response, spider=spider)
83 |
84 | def item_error(self, item, response, spider):
85 | if self.sdk_enabled():
86 | self.item_stats(signal_type='item_error', item=item, response=response, spider=spider)
87 |
88 |
--------------------------------------------------------------------------------
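For context, `ScrapeOpsMonitor.from_crawler()` above only runs once the extension is registered in a project's Scrapy settings, at which point the signal handlers are connected. A minimal settings sketch, assuming the `SCRAPEOPS_API_KEY` setting name read during SDK setup and an illustrative extension priority:

    # settings.py sketch: register the monitor extension so Scrapy calls
    # ScrapeOpsMonitor.from_crawler() and connects the signal handlers above.
    SCRAPEOPS_API_KEY = 'YOUR_API_KEY'   # assumed setting name read during SDK setup
    EXTENSIONS = {
        'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,   # priority value is illustrative
    }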
/scrapeops_scrapy/middleware/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/middleware/__init__.py
--------------------------------------------------------------------------------
/scrapeops_scrapy/middleware/retry.py:
--------------------------------------------------------------------------------
1 | """
2 | An extension to retry failed requests that are potentially caused by temporary
3 | problems such as a connection timeout or HTTP 500 error.
4 |
5 | You can change the behaviour of this middleware by modifying the scraping settings:
6 | RETRY_TIMES - how many times to retry a failed page
7 | RETRY_HTTP_CODES - which HTTP response codes to retry
8 |
9 | Failed pages are collected during the scraping process and rescheduled at the end,
10 | once the spider has finished crawling all regular (non-failed) pages.
11 | """
12 | from logging import getLogger, Logger
13 | from typing import Optional, Union
14 |
15 | from twisted.internet import defer
16 | from twisted.internet.error import (
17 | ConnectError,
18 | ConnectionDone,
19 | ConnectionLost,
20 | ConnectionRefusedError,
21 | DNSLookupError,
22 | TCPTimedOutError,
23 | TimeoutError,
24 | )
25 | from twisted.web.client import ResponseFailed
26 |
27 | from scrapy.core.downloader.handlers.http11 import TunnelError
28 | from scrapy.exceptions import NotConfigured
29 | from scrapy.http.request import Request
30 | from scrapy.spiders import Spider
31 | from scrapy.utils.python import global_object_name
32 | from scrapy.utils.response import response_status_message
33 |
34 | from scrapeops_scrapy.signals import scrapeops_signals
35 |
36 |
37 | retry_logger = getLogger(__name__)
38 |
39 |
40 | def get_retry_request(
41 | request: Request,
42 | *,
43 | spider: Spider,
44 | #response: Response,
45 | reason: Union[str, Exception] = 'unspecified',
46 | max_retry_times: Optional[int] = None,
47 | priority_adjust: Optional[int] = None,
48 | logger: Logger = retry_logger,
49 | stats_base_key: str = 'retry',
50 | ):
51 | """
52 | Returns a new :class:`~scrapy.Request` object to retry the specified
53 | request, or ``None`` if retries of the specified request have been
54 | exhausted.
55 |
56 | For example, in a :class:`~scrapy.Spider` callback, you could use it as
57 | follows::
58 |
59 | def parse(self, response):
60 | if not response.text:
61 | new_request_or_none = get_retry_request(
62 | response.request,
63 | spider=self,
64 | reason='empty',
65 | )
66 | return new_request_or_none
67 |
68 | *spider* is the :class:`~scrapy.Spider` instance which is asking for the
69 |     retry request. It is used to access the :ref:`settings <topics-settings>`
70 |     and :ref:`stats <topics-stats>`, and to provide extra logging context (see
71 | :func:`logging.debug`).
72 |
73 | *reason* is a string or an :class:`Exception` object that indicates the
74 | reason why the request needs to be retried. It is used to name retry stats.
75 |
76 | *max_retry_times* is a number that determines the maximum number of times
77 | that *request* can be retried. If not specified or ``None``, the number is
78 | read from the :reqmeta:`max_retry_times` meta key of the request. If the
79 | :reqmeta:`max_retry_times` meta key is not defined or ``None``, the number
80 | is read from the :setting:`RETRY_TIMES` setting.
81 |
82 | *priority_adjust* is a number that determines how the priority of the new
83 | request changes in relation to *request*. If not specified, the number is
84 | read from the :setting:`RETRY_PRIORITY_ADJUST` setting.
85 |
86 | *logger* is the logging.Logger object to be used when logging messages
87 |
88 | *stats_base_key* is a string to be used as the base key for the
89 | retry-related job stats
90 | """
91 | settings = spider.crawler.settings
92 | stats = spider.crawler.stats
93 | retry_times = request.meta.get('retry_times', 0) + 1
94 | if max_retry_times is None:
95 | max_retry_times = request.meta.get('max_retry_times')
96 | if max_retry_times is None:
97 | max_retry_times = settings.getint('RETRY_TIMES')
98 | if retry_times <= max_retry_times:
99 | logger.debug(
100 | "Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
101 | {'request': request, 'retry_times': retry_times, 'reason': reason},
102 | extra={'spider': spider}
103 | )
104 | new_request = request.copy()
105 | new_request.meta['retry_times'] = retry_times
106 |
107 |
108 | new_request.dont_filter = True
109 | if priority_adjust is None:
110 | priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
111 | new_request.priority = request.priority + priority_adjust
112 |
113 | if callable(reason):
114 | reason = reason()
115 | if isinstance(reason, Exception):
116 | reason = global_object_name(reason.__class__)
117 |
118 | stats.inc_value(f'{stats_base_key}/count')
119 | stats.inc_value(f'{stats_base_key}/reason_count/{reason}')
120 | return new_request
121 | else:
122 | stats.inc_value(f'{stats_base_key}/max_reached')
123 | logger.error(
124 | "Gave up retrying %(request)s (failed %(retry_times)d times): "
125 | "%(reason)s",
126 | {'request': request, 'retry_times': retry_times, 'reason': reason},
127 | extra={'spider': spider},
128 | )
129 | return None
130 |
131 |
132 |
133 | class RetryMiddleware:
134 |
135 | # IOError is raised by the HttpCompression middleware when trying to
136 | # decompress an empty response
137 | EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
138 | ConnectionRefusedError, ConnectionDone, ConnectError,
139 | ConnectionLost, TCPTimedOutError, ResponseFailed,
140 | IOError, TunnelError)
141 |
142 | def __init__(self, settings):
143 | if not settings.getbool('RETRY_ENABLED'):
144 | raise NotConfigured
145 | self.max_retry_times = settings.getint('RETRY_TIMES')
146 | self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
147 | self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
148 |
149 | @classmethod
150 | def from_crawler(cls, crawler):
151 | return cls(crawler.settings)
152 |
153 | def process_response(self, request, response, spider):
154 | spider.crawler.signals.send_catch_log(
155 | signal=scrapeops_signals.scrapeops_response_recieved,
156 | request=request,
157 | response=response,
158 | spider=spider)
159 |
160 | if request.meta.get('dont_retry', False):
161 | return response
162 | if response.status in self.retry_http_codes:
163 | reason = response_status_message(response.status)
164 | return self._retry(request, reason, spider) or response
165 | return response
166 |
167 | def process_exception(self, request, exception, spider):
168 | ex_class = global_object_name(exception.__class__)
169 | spider.crawler.signals.send_catch_log(
170 | signal=scrapeops_signals.scrapeops_exception_recieved,
171 | request=request,
172 | spider=spider,
173 | exception_class=ex_class)
174 |
175 | if (
176 | isinstance(exception, self.EXCEPTIONS_TO_RETRY)
177 | and not request.meta.get('dont_retry', False)
178 | ):
179 | return self._retry(request, exception, spider)
180 |
181 | def _retry(self, request, reason, spider):
182 | max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
183 | priority_adjust = request.meta.get('priority_adjust', self.priority_adjust)
184 | return get_retry_request(
185 | request,
186 | reason=reason,
187 | spider=spider,
188 | max_retry_times=max_retry_times,
189 | priority_adjust=priority_adjust,
190 | )
--------------------------------------------------------------------------------
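As the module docstring notes, retry behaviour is driven by the standard `RETRY_TIMES` and `RETRY_HTTP_CODES` settings. A settings sketch showing how this middleware might replace Scrapy's built-in retry middleware (the priority value is illustrative):

    # settings.py sketch: swap Scrapy's built-in retry middleware for the
    # ScrapeOps one so retries also emit the scrapeops_* signals defined above.
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,   # disable the built-in
        'scrapeops_scrapy.middleware.retry.RetryMiddleware': 550,     # priority is illustrative
    }
    RETRY_ENABLED = True
    RETRY_TIMES = 3                                             # retries per failed page
    RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]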
/scrapeops_scrapy/middleware/stats.py:
--------------------------------------------------------------------------------
1 | from scrapy.utils.python import global_object_name
2 |
3 | from scrapeops_scrapy.signals import scrapeops_signals
4 |
5 |
6 | class ScrapeOpsStats:
7 |
8 | def __init__(self):
9 | pass
10 |
11 | def process_response(self, request, response, spider):
12 | spider.crawler.signals.send_catch_log(
13 | signal=scrapeops_signals.scrapeops_response_recieved,
14 | request=request,
15 | response=response,
16 | spider=spider)
17 | return response
18 |
19 | def process_exception(self, request, exception, spider):
20 | ex_class = global_object_name(exception.__class__)
21 | spider.crawler.signals.send_catch_log(
22 | signal=scrapeops_signals.scrapeops_exception_recieved,
23 | request=request,
24 | spider=spider,
25 | exception_class=ex_class)
--------------------------------------------------------------------------------
/scrapeops_scrapy/normalizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/normalizer/__init__.py
--------------------------------------------------------------------------------
/scrapeops_scrapy/normalizer/domains.py:
--------------------------------------------------------------------------------
1 | from tld import get_tld
2 | from urllib.parse import urlparse, parse_qs
3 |
4 | class DomainNormalizer(object):
5 |
6 | def __init__(self):
7 | pass
8 |
9 | @staticmethod
10 | def get_domain(url):
11 |         #if 'http://' not in url or 'https://' not in url or 'socks5://' not in url
12 | try:
13 | if DomainNormalizer.if_localhost(url):
14 | return 'localhost'
15 | res = get_tld(url, as_object=True)
16 | return res.fld
17 | except Exception:
18 | return 'unknown'
19 |
20 | @staticmethod
21 | def get_full_domain(url):
22 | try:
23 | if DomainNormalizer.if_localhost(url):
24 | return 'localhost'
25 | res = get_tld(url, as_object=True)
26 | if res.subdomain != '':
27 | return res.subdomain + '.' + res.fld
28 | return res.fld
29 | except Exception:
30 | return 'unknown'
31 |
32 |
33 | @staticmethod
34 | def if_localhost(url):
35 | if 'http://localhost:' in url or 'http://127.0.0.1:' in url:
36 | return True
37 | return False
38 |
39 |
40 | @staticmethod
41 | def parse_url(url):
42 | parsed_url = urlparse(url)
43 | query_params = parse_qs(parsed_url.query)
44 | query_dict = {}
45 | for key, value in query_params.items():
46 | query_dict[key] = value[0]
47 | return query_dict
48 |
49 |
50 | @staticmethod
51 | def get_url_proxy_api(url=None, proxy_settings=None):
52 | url_identifier = proxy_settings.get('url_identifier')
53 | query_params = DomainNormalizer.parse_url(url)
54 | url = query_params.get(url_identifier)
55 | return url
56 |
57 |
58 | @staticmethod
59 | def get_page_type(url, domain_data):
60 | if domain_data.get('url_classification'):
61 | url_classifiers = domain_data.get('url_contains_page_types', {})
62 | for k, v in url_classifiers.items():
63 | if k in url:
64 | return v
65 | query_param_page_types = domain_data.get('query_param_page_types', {})
66 | query_params = DomainNormalizer.parse_url(url)
67 | for k, v in query_params.items():
68 | key_mapping = query_param_page_types.get(k, None)
69 | if key_mapping is not None:
70 |                 return key_mapping
71 | return 'none'
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
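`get_url_proxy_api()` recovers the real target URL from a proxy-API request by reading the query parameter named by `url_identifier`, and `get_page_type()` classifies URLs using substring and query-parameter rules. A worked sketch with hypothetical proxy settings and domain rules:

    # Worked sketch with hypothetical inputs (the proxy settings and domain
    # rules normally come from the ScrapeOps API).
    from scrapeops_scrapy.normalizer.domains import DomainNormalizer

    api_url = 'https://proxy.example.com/v1/?api_key=abc&url=https%3A%2F%2Fshop.example.com%2Fproduct%2F123'
    real_url = DomainNormalizer.get_url_proxy_api(url=api_url, proxy_settings={'url_identifier': 'url'})
    # parse_qs() decodes the value: real_url == 'https://shop.example.com/product/123'

    domain_data = {
        'url_classification': True,
        'url_contains_page_types': {'/product/': 'product'},
        'query_param_page_types': {},
    }
    DomainNormalizer.get_page_type(real_url, domain_data)   # -> 'product'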
/scrapeops_scrapy/normalizer/exceptions.py:
--------------------------------------------------------------------------------
1 | class ExceptionNormalizer(object):
2 |
3 | def __init__(self):
4 | pass
5 |
6 | @staticmethod
7 | def normalise_exception(exception_class):
8 |
9 | if 'ResponseNeverReceived' in exception_class:
10 | return 'ResponseNeverReceived'
11 |
12 | if 'Timeout' in exception_class:
13 | return 'Timeout'
14 |
15 | if 'TimedOut' in exception_class:
16 | return 'Timeout'
17 |
18 | if 'PotentialDataLoss' in exception_class:
19 | return 'PotentialDataLoss'
20 |
21 | if 'ConnectionLost' in exception_class:
22 | return 'ConnectionLost'
23 |
24 | return exception_class
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
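`normalise_exception()` collapses related Twisted/Scrapy exception class names into a handful of buckets. A small sketch:

    # Sketch: how raw exception class paths are grouped by normalise_exception().
    from scrapeops_scrapy.normalizer.exceptions import ExceptionNormalizer

    ExceptionNormalizer.normalise_exception('twisted.internet.error.TCPTimedOutError')
    # -> 'Timeout'
    ExceptionNormalizer.normalise_exception('builtins.OSError')
    # -> 'builtins.OSError'   (unknown classes pass through unchanged)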
/scrapeops_scrapy/normalizer/middleware.py:
--------------------------------------------------------------------------------
1 | from scrapeops_scrapy.core.api import SOPSRequest
2 | from scrapeops_scrapy.validators.response_validator import ResponseValidator
3 | from scrapeops_scrapy.normalizer.proxies import ProxyNormalizer
4 | from scrapeops_scrapy.normalizer.proxy_port_normalizer import ProxyPortStringNormalizer
5 |
6 |
7 | class RequestResponseMiddleware(object):
8 |
9 | PROXY_DOMAIN_NORMALIZATION = True
10 | RESPONSE_VALIDATION = True
11 | PROXY_ALERTS = False
12 | FAILED_URL_LOGGER_ENABLED = True
13 | LOG_MISSED_URLS = False
14 |
15 | def __init__(self, job_group_id, proxy_apis, generic_validators, error_logger, allowed_response_codes):
16 | self.job_group_id = job_group_id
17 | self._proxy_apis = proxy_apis
18 | self._data_coverage_validation = False
19 | self._domains = {}
20 | self._proxies = {}
21 | self._proxy_port_setups = {}
22 | self._generic_validators = generic_validators
23 | self._allowed_response_codes = allowed_response_codes
24 | self._error_logger = error_logger
25 | self._error_count = 0
26 | self._error_alerts_sent = {}
27 | self._missed_urls = {}
28 |
29 |
30 | def process(self, request_response_object, response):
31 | self.normalise_domain_proxy_data(request_response_object)
32 | self.check_proxy_responses(request_response_object, response)
33 | self.validate_response_data(request_response_object, response)
34 |
35 |
36 | def normalise_domain_proxy_data(self, request_response_object):
37 | if RequestResponseMiddleware.PROXY_DOMAIN_NORMALIZATION:
38 |
39 | proxy_api = self.normalise_proxy_api(request_response_object)
40 | if proxy_api is False:
41 | self.normalise_proxy_port(request_response_object)
42 | self.normalise_domain_data(request_response_object)
43 |
44 | if RequestResponseMiddleware.PROXY_DOMAIN_NORMALIZATION is False:
45 | request_response_object.fallback_domain_proxy_details(reason='disabled')
46 |
47 |
48 | def normalise_proxy_api(self, request_response_object):
49 | try:
50 | proxy_api, update = request_response_object.check_proxy_api(self._proxy_apis)
51 | if proxy_api and update:
52 | data, status = SOPSRequest().proxy_api_normalisation_request(request_response_object)
53 | if status.valid:
54 | self._proxy_apis[request_response_object.get_proxy_api_name()] = data.get('proxy_parsing_data')
55 | request_response_object.update_proxy_api(data.get('proxy_parsing_data'))
56 | else:
57 | if self._proxy_apis.get(request_response_object.get_proxy_api_name()) is None:
58 | self._proxy_apis[request_response_object.get_proxy_api_name()] = {}
59 | self._proxy_apis[request_response_object.get_proxy_api_name()]['proxy_setup'] = {}
60 | self._error_logger.log_error(reason='get_proxy_api_details_failed',
61 | error=status.error,
62 | data={'proxy_api': request_response_object.get_proxy_api_name()})
63 | request_response_object.fallback_proxy_details(proxy_type='proxy_api', proxy_apis=self._proxy_apis)
64 |
65 | except Exception:
66 | request_response_object.fallback_proxy_details(proxy_type='proxy_api', proxy_apis=self._proxy_apis)
67 |
68 | return proxy_api or False
69 |
70 |
71 | def normalise_proxy_port(self, request_response_object):
72 | try:
73 | if request_response_object.active_proxy_port():
74 | named_proxy, update = request_response_object.check_proxy_port_type(self._proxies)
75 | if named_proxy and update:
76 | data, status = SOPSRequest().proxy_port_normalisation_request(request_response_object)
77 | if status.valid:
78 | ProxyNormalizer.update_proxy_details(self._proxies, request_response_object, data, valid=True)
79 | ProxyPortStringNormalizer.proxy_port_test(self._proxies, request_response_object, data, valid=True)
80 | else:
81 | ProxyNormalizer.update_proxy_details(self._proxies, request_response_object, data, valid=False)
82 | self._error_logger.log_error(reason='get_proxy_port_details_failed',
83 | error=status.error,
84 | data={'proxy_port': request_response_object.get_raw_proxy()})
85 |
86 | ## Using No Proxy
87 | if request_response_object.active_proxy() is False:
88 | request_response_object.update_no_proxy()
89 |
90 | except Exception:
91 | request_response_object.fallback_proxy_details(proxy_type='proxy_port')
92 |
93 |
94 | def normalise_domain_data(self, request_response_object):
95 | try:
96 | ## Normalise domain/page type data
97 | unknown = request_response_object.check_domain(self._domains)
98 | if unknown:
99 | data, status = SOPSRequest().domain_normalisation_request(request_response_object)
100 | if status.valid:
101 | self._domains[request_response_object.get_domain()] = data.get('domain_parsing_data')
102 | request_response_object.update_page_type(data.get('domain_parsing_data'))
103 | else:
104 | if self._domains.get(request_response_object.get_domain()) is None:
105 | self._domains[request_response_object.get_domain()] = {}
106 | self._domains[request_response_object.get_domain()]['url_contains_page_types'] = {}
107 | self._domains[request_response_object.get_domain()]['query_param_page_types'] = {}
108 | self._domains[request_response_object.get_domain()]['validation_details'] = []
109 | self._error_logger.log_error(reason='get_domain_details_failed',
110 | error=status.error,
111 | data={'real_url': request_response_object.get_real_url()})
112 | request_response_object.fallback_domain_data()
113 |
114 | except Exception:
115 | request_response_object.fallback_domain_data()
116 |
117 |
118 | def check_proxy_responses(self, request_response_object, response):
119 | if RequestResponseMiddleware.PROXY_ALERTS:
120 | if request_response_object.active_proxy_api():
121 | proxy_details = self._proxy_apis.get(request_response_object.get_proxy_api_name())
122 | if proxy_details is not None:
123 | self.check_proxy_error_codes(request_response_object, proxy_details, response)
124 |
125 | if request_response_object.active_named_proxy():
126 | proxy_details = self._proxies.get(request_response_object.get_proxy_port_name())
127 | if proxy_details is not None:
128 | self.check_proxy_error_codes(request_response_object, proxy_details, response)
129 |
130 |
131 | def check_proxy_error_codes(self, request_response_object, proxy_details, response):
132 | error_codes = proxy_details.get('error_codes')
133 | if error_codes is not None:
134 | status_code = str(response.status)
135 | error_response = error_codes.get(status_code)
136 | if error_response is not None:
137 | if error_response.get('action') == 'alert' and self.should_alert(error_response, status_code):
138 | _, status = SOPSRequest().proxy_alert_request(request_response_object, self.job_group_id, error_response, self._error_alerts_sent.get(status_code))
139 | if status.valid:
140 | self._error_alerts_sent[status_code] += 1
141 | elif error_response.get('action') == 'monitor':
142 | self._error_count += 1
143 | if self._error_count > error_response.get('error_limit', 0) and self.should_alert(error_response, status_code):
144 | _, status = SOPSRequest().proxy_alert_request(request_response_object, self.job_group_id, error_response, self._error_alerts_sent.get(status_code))
145 | if status.valid:
146 | self._error_alerts_sent[status_code] += 1
147 |
148 |
149 | def should_alert(self, error_response, status_code):
150 | if self._error_alerts_sent.get(status_code) is None:
151 | self._error_alerts_sent[status_code] = 0
152 | return True
153 | if self._error_alerts_sent.get(status_code) is not None:
154 | if self._error_alerts_sent[status_code] < error_response.get('alert_limit'):
155 | return True
156 | return False
157 |
158 |
159 | def validate_response_data(self, request_response_object, response=None):
160 | if RequestResponseMiddleware.RESPONSE_VALIDATION and response is not None:
161 | if response.status == 200:
162 | domain_tests = ResponseValidator.get_domain_tests(request_response_object, self._domains)
163 | ResponseValidator.validate(request_response_object, response, domain_tests=domain_tests, generic_tests=self._generic_validators)
164 |
165 | if response.status != 200 and ResponseValidator.failed_scan(request_response_object, self._domains):
166 | ResponseValidator.validate(request_response_object, response, generic_tests=self._generic_validators)
167 |
168 |
169 | def failed_url(self, request_response_object, response=None):
170 | if RequestResponseMiddleware.FAILED_URL_LOGGER_ENABLED:
171 |             if (response.status < 200 or response.status > 300) and (response.status not in self._allowed_response_codes):
172 | if self._missed_urls.get('count') is None:
173 | self._missed_urls['count'] = 0
174 | self._missed_urls['count'] += 1
175 |
176 | if RequestResponseMiddleware.LOG_MISSED_URLS:
177 | if self._missed_urls.get(response.status) is None:
178 | self._missed_urls[response.status] = []
179 | self._missed_urls[response.status].append(request_response_object.get_real_url())
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
--------------------------------------------------------------------------------
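`should_alert()` caps how many proxy alerts are sent per status code using the `alert_limit` in the error-code block returned for that proxy. A sketch with a hypothetical `error_response` block:

    # Sketch: should_alert() rate-limits proxy alerts per status code. The
    # error_response block is hypothetical; real blocks arrive with the proxy
    # details from the ScrapeOps API.
    from scrapeops_scrapy.normalizer.middleware import RequestResponseMiddleware

    mw = RequestResponseMiddleware(job_group_id='job-1', proxy_apis={}, generic_validators=[],
                                   error_logger=None, allowed_response_codes=[])
    error_response = {'action': 'alert', 'alert_limit': 2}

    mw.should_alert(error_response, '403')    # True: first sighting, counter initialised to 0
    mw._error_alerts_sent['403'] = 2          # pretend two alerts were already sent
    mw.should_alert(error_response, '403')    # False: alert_limit reached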
/scrapeops_scrapy/normalizer/proxies.py:
--------------------------------------------------------------------------------
1 | import re
2 | import socket
3 |
4 | from base64 import b64decode
5 | from urllib.parse import unquote, urlparse
6 |
7 | from scrapeops_scrapy.normalizer.domains import DomainNormalizer
8 | from scrapeops_scrapy.exceptions import DecodeError
9 |
10 |
11 | class ProxyNormalizer(object):
12 |
13 | def __init__(self):
14 | pass
15 |
16 | @staticmethod
17 | def check_named_proxy(proxy_string):
18 | try:
19 | proxy_address = DomainNormalizer.get_full_domain(proxy_string)
20 | proxy_domain = DomainNormalizer.get_domain(proxy_string)
21 | return True, proxy_address, proxy_domain
22 | except Exception:
23 |             return False, 'ip_list_proxy', 'ip_list_proxy'  # return a 3-tuple to match the success path
24 |
25 | @staticmethod
26 | def remove_brackets(string):
27 | characters_to_remove = ['[',']']
28 | new_string = string
29 | for character in characters_to_remove:
30 | new_string = new_string.replace(character, "")
31 | return new_string
32 |
33 |
34 | @staticmethod
35 | def check_ip_address(proxy_string):
36 | s = ProxyNormalizer.remove_brackets(proxy_string)
37 | ipv6_split_string = re.split('://|@|/', s)
38 | for el in ipv6_split_string:
39 | if ProxyNormalizer.is_valid_ipv6_address(el): return True
40 |
41 | ipv4_split_string = re.split('://|:|@', proxy_string)
42 | for el in ipv4_split_string:
43 | if ProxyNormalizer.is_valid_ipv4_address(el): return True
44 | return False
45 |
46 |
47 | @staticmethod
48 | def is_valid_ipv4_address(address):
49 | try:
50 | socket.inet_pton(socket.AF_INET, address)
51 | except AttributeError: # no inet_pton here, sorry
52 | try:
53 | socket.inet_aton(address)
54 | except socket.error:
55 | return False
56 | return address.count('.') == 3
57 | except socket.error: # not a valid address
58 | return False
59 |
60 | return True
61 |
62 | @staticmethod
63 | def is_valid_ipv6_address(address):
64 | try:
65 | socket.inet_pton(socket.AF_INET6, address)
66 | except socket.error: # not a valid address
67 | return False
68 | return True
69 |
70 | @staticmethod
71 | def get_proxy_port(proxy_string):
72 | try:
73 | return urlparse(proxy_string).port
74 | except Exception:
75 | return '80'
76 |
77 | @staticmethod
78 | def get_proxy_host(proxy_string):
79 | try:
80 | return DomainNormalizer.get_full_domain(proxy_string)
81 | except Exception:
82 | return 'ip_list_proxy'
83 |
84 | @staticmethod
85 | def get_proxy_scheme(proxy_string):
86 | try:
87 | return urlparse(proxy_string).scheme
88 | except Exception:
89 | return ''
90 |
91 | @staticmethod
92 | def unknown_proxy_scheme(proxy_string):
93 | if ProxyNormalizer.get_proxy_scheme(proxy_string) == '':
94 | return True
95 | return False
96 |
97 | @staticmethod
98 | def convert_bytes_to_string(inputValue):
99 | if isinstance(inputValue, (str, int)):
100 | return inputValue
101 | if isinstance(inputValue, (bytes, bytearray)):
102 | return inputValue.decode('utf-8')
103 | if isinstance(inputValue, list):
104 | tempList = []
105 | for el in inputValue:
106 | if isinstance(el, (bytes, bytearray)):
107 | tempList.append(el.decode('utf-8'))
108 | elif isinstance(el, list):
109 | tempList.append([''])
110 | elif isinstance(el, dict):
111 | tempList.append({'': ''})
112 | else:
113 | tempList.append(el)
114 | return tempList
115 | return inputValue
116 |
117 | @staticmethod
118 | def convert_headers(raw_headers):
119 | header_dict = {}
120 | try:
121 | for key, value in raw_headers.items():
122 | k = ProxyNormalizer.convert_bytes_to_string(key)
123 | v = ProxyNormalizer.convert_bytes_to_string(value)
124 | header_dict[k] = v
125 | return header_dict
126 | except Exception:
127 | return header_dict
128 |
129 | @staticmethod
130 | def decode_basic_auth(auth_string):
131 |         """Decode a base64-encoded HTTP basic authentication string. Returns a tuple of
132 | the form (username, password), and raises a DecodeError exception if
133 | nothing could be decoded.
134 | """
135 | split = auth_string.strip().split(' ')
136 |
137 | # If split is only one element, try to decode the username and password
138 | # directly.
139 | if len(split) == 1:
140 | try:
141 | username, password = b64decode(split[0]).decode().split(':', 1)
142 | except Exception:
143 | raise DecodeError
144 |
145 | # If there are only two elements, check the first and ensure it says
146 | # 'basic' so that we know we're about to decode the right thing. If not,
147 | # bail out.
148 | elif len(split) == 2:
149 | if split[0].strip().lower() == 'basic':
150 | try:
151 | username, password = b64decode(split[1]).decode().split(':', 1)
152 | except Exception:
153 | raise DecodeError
154 | else:
155 | raise DecodeError
156 |
157 | # If there are more than 2 elements, something crazy must be happening.
158 | # Bail.
159 | else:
160 | raise DecodeError
161 |
162 | return unquote(username), unquote(password)
163 |
164 | @staticmethod
165 | def create_dict_if_none_exists(dict, key):
166 | if dict.get(key) is None:
167 | dict[key] = {}
168 |
169 | @staticmethod
170 | def update_proxy_details(proxy_dict, request_response_object, data, valid=False):
171 | proxy_name = request_response_object.get_proxy_port_name()
172 | if proxy_dict.get(proxy_name) is None:
173 | proxy_dict[proxy_name] = {}
174 |
175 | ## Update counter
176 | proxy_port_details = data.get('proxy_port_details')
177 | count = proxy_dict[proxy_name].get('count', 0)
178 | proxy_dict[proxy_name]['count'] = count + 1
179 | proxy_dict[proxy_name]['max_count'] = proxy_port_details.get('max_count', 3)
180 |
181 | if valid:
182 | proxy_dict[proxy_name]['normalization_actions'] = data.get('normalization_actions')
183 | proxy_dict[proxy_name]['fallback'] = data.get('fallback', 'port')
184 |
185 |
186 | proxy_setup_key = proxy_port_details.get('proxy_setup_key')
187 | proxy_setup_value = proxy_port_details.get('proxy_setup_value')
188 | if proxy_setup_value is None:
189 | proxy_setup_value = data.get('fallback', 'port_type=unknown')
190 | proxy_dict[proxy_name][proxy_setup_key] = proxy_setup_value
191 | proxy_dict[proxy_name]['known'] = proxy_port_details.get('proxy_known_domain', False)
192 | request_response_object.update_proxy_port(proxy_name, proxy_setup_value)
193 |
194 |
195 | else:
196 | proxy_dict[proxy_name]['fallback'] = 'port'
197 | request_response_object.fallback_proxy_details(proxy_type='proxy_port')
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
--------------------------------------------------------------------------------
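`decode_basic_auth()` accepts either a bare base64 credential or a full `Basic <token>` header value and returns the URL-unquoted `(username, password)` pair. A small sketch with made-up credentials:

    # Sketch: round-trip a Proxy-Authorization header value through
    # decode_basic_auth() (credentials are made up).
    from base64 import b64encode
    from scrapeops_scrapy.normalizer.proxies import ProxyNormalizer

    token = b64encode(b'scrapeops_user:secret%20pass').decode()
    ProxyNormalizer.decode_basic_auth(f'Basic {token}')
    # -> ('scrapeops_user', 'secret pass')   # unquote() turns %20 back into a space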
/scrapeops_scrapy/normalizer/proxy_port_normalizer.py:
--------------------------------------------------------------------------------
1 | from scrapeops_scrapy.core.api import SOPSRequest
2 |
3 |
4 | class ProxyPortStringNormalizer(object):
5 |
6 | def __init__(self):
7 | pass
8 |
9 |
10 | @staticmethod
11 | def run_proxy_string_normalization(request_response_object, normalization_actions):
12 |
13 | if normalization_actions is not None:
14 | for action_type, actions in normalization_actions.items():
15 | if actions is not None:
16 | if action_type == 'username':
17 | for action_block in actions:
18 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_normalized_proxy_port_username(), action_block)
19 | request_response_object.set_normalized_proxy_port_username(updated)
20 |
21 | if action_type == 'password':
22 | for action_block in actions:
23 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_normalized_proxy_port_password(), action_block)
24 | request_response_object.set_normalized_proxy_port_password(updated)
25 |
26 | if action_type == 'host':
27 | for action_block in actions:
28 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_normalized_proxy_port_host(), action_block)
29 | request_response_object.set_normalized_proxy_port_host(updated)
30 |
31 | if action_type == 'port':
32 | for action_block in actions:
33 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_normalized_proxy_port_port(), action_block)
34 | request_response_object.set_normalized_proxy_port_port(updated)
35 |
36 | if action_type == 'headers':
37 | for action_block in actions:
38 | updated = ProxyPortStringNormalizer.process_action(request_response_object.get_proxy_port_headers(), action_block)
39 | if updated is not None:
40 | request_response_object.update_normalized_proxy_port_header_string(updated)
41 |
42 |
43 | @staticmethod
44 | def process_action(inputValue, action_block):
45 |
46 | if action_block.get('action') == 'contains_replace':
47 | return ProxyPortStringNormalizer.contains_replace(inputValue, action_block)
48 |
49 | if action_block.get('action') == 'contains_replace_all':
50 | return ProxyPortStringNormalizer.contains_replace_all(inputValue, action_block)
51 |
52 | if action_block.get('action') == 'not_contains_replace_all':
53 | return ProxyPortStringNormalizer.not_contains_replace_all(inputValue, action_block)
54 |
55 | if action_block.get('action') == 'replace_key_value':
56 | return ProxyPortStringNormalizer.replace_key_value(inputValue, action_block)
57 |
58 | if action_block.get('action') == 'replace_key_seperator_value':
59 | return ProxyPortStringNormalizer.replace_key_seperator_value(inputValue, action_block)
60 |
61 | if action_block.get('action') == 'check_headers_contains':
62 | return ProxyPortStringNormalizer.check_headers_contains(inputValue, action_block)
63 |
64 | if action_block.get('action') == 'not_ends_in_replace':
65 | return ProxyPortStringNormalizer.not_ends_in_replace(inputValue, action_block)
66 |
67 | if action_block.get('action') == 'ends_in_replace':
68 | return ProxyPortStringNormalizer.ends_in_replace(inputValue, action_block)
69 |
70 | if action_block.get('action') == 'equals_replace':
71 | return ProxyPortStringNormalizer.equals_replace(inputValue, action_block)
72 |
73 | if action_block.get('action') == 'not_equals_replace':
74 | return ProxyPortStringNormalizer.not_equals_replace(inputValue, action_block)
75 |
76 | if action_block.get('action') == 'is_none_replace':
77 | return ProxyPortStringNormalizer.is_none_replace(inputValue, action_block)
78 |
79 | if action_block.get('action') == 'in_list_replace':
80 | return ProxyPortStringNormalizer.in_list_replace(inputValue, action_block)
81 |
82 | if action_block.get('action') == 'not_in_list_replace':
83 | return ProxyPortStringNormalizer.not_in_list_replace(inputValue, action_block)
84 |
85 |
86 | """
87 | Conditional Checks
88 | """
89 | @staticmethod
90 | def conditional_checks(inputString, condition=None):
91 | if condition is not None and condition.get('type') is not None:
92 |
93 | ## If substring in string
94 | if condition.get('type') == "contains":
95 | if condition.get('value') in inputString:
96 | return True
97 | return False
98 |
99 | if condition.get('type') == "not_contains":
100 | if condition.get('value') not in inputString:
101 | return True
102 | return False
103 |
104 | if condition.get('type') == "equals":
105 | if condition.get('value') == inputString:
106 | return True
107 | return False
108 |
109 | if condition.get('type') == "not_equal":
110 | if condition.get('value') != inputString:
111 | return True
112 | return False
113 |
114 | if condition.get('type') == "not_none":
115 | if inputString is not None:
116 | return True
117 | return False
118 |
119 |
120 | ## If all tests fail
121 | return False
122 |
123 | return True
124 |
125 | @staticmethod
126 | def get_condition_arguements(action_block):
127 | return action_block.get('condition'), action_block.get('arguements')
128 |
129 |
130 | """
131 | Actions
132 | """
133 |
134 | @staticmethod
135 | def replace_key_value(inputString, action_block):
136 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
137 |
138 | substring = arguements.get('substring')
139 | string_seperator = arguements.get('seperator')
140 | replacement = arguements.get('replacement')
141 |
142 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
143 | outputString = inputString
144 | splitString = inputString.split(string_seperator)
145 | for el in splitString:
146 | if substring.startswith('**'):
147 | if el.split('=')[0] == substring[2:]:
148 | outputString = outputString.replace(el, replacement)
149 | elif substring in el:
150 | outputString = outputString.replace(el, replacement)
151 | return outputString
152 |
153 | return inputString
154 |
155 |
156 | @staticmethod
157 | def replace_key_seperator_value(inputString, action_block):
158 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
159 |
160 | substring = arguements.get('substring')
161 | string_seperator = arguements.get('seperator')
162 | replacement = arguements.get('replacement')
163 | next_value = arguements.get('next_value')
164 |
165 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
166 | outputString = inputString
167 | splitString = inputString.split(string_seperator)
168 | for i, el in enumerate(splitString):
169 | if substring == el:
170 | if i + int(next_value) <= len(splitString):
171 | outputString = outputString.replace(string_seperator + splitString[i + int(next_value)], replacement)
172 | return outputString
173 |
174 | return inputString
175 |
176 |
177 | @staticmethod
178 | def check_headers_contains(inputheaders, action_block):
179 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
180 | value = condition.get('value')
181 | if inputheaders.get(value) is not None:
182 | header_value = inputheaders.get(value)
183 | if type(header_value) is list:
184 | header_value = header_value[0]
185 | value_check = arguements.get('check_type')
186 | if value_check == 'equals':
187 | if header_value == arguements.get('value'):
188 | return arguements.get('addition')
189 | if value_check == 'not_equal':
190 | if header_value != arguements.get('value'):
191 | return arguements.get('addition')
192 | if value_check is None:
193 | return arguements.get('addition')
194 | return None
195 |
196 | @staticmethod
197 | def not_ends_in_replace(inputString, action_block):
198 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
199 |
200 | if isinstance(inputString, str) is False:
201 | inputString = str(inputString)
202 |
203 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
204 | substring = arguements.get('substring')
205 | if inputString.endswith(substring) is False:
206 | return arguements.get('replacement')
207 |
208 | return inputString
209 |
210 | @staticmethod
211 | def ends_in_replace(inputString, action_block):
212 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
213 |
214 | if isinstance(inputString, str) is False:
215 | inputString = str(inputString)
216 |
217 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
218 | substring = arguements.get('substring')
219 | if inputString.endswith(substring):
220 | return arguements.get('replacement')
221 |
222 | return inputString
223 |
224 |
225 | @staticmethod
226 | def not_equals_replace(inputString, action_block):
227 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
228 |
229 | if isinstance(inputString, str) is False:
230 | inputString = str(inputString)
231 |
232 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
233 | substring = arguements.get('substring')
234 | if inputString != substring:
235 | return arguements.get('replacement')
236 |
237 | return inputString
238 |
239 |
240 | @staticmethod
241 | def equals_replace(inputString, action_block):
242 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
243 |
244 | if isinstance(inputString, str) is False:
245 | inputString = str(inputString)
246 |
247 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
248 | substring = arguements.get('substring')
249 | if inputString == substring:
250 | return arguements.get('replacement')
251 |
252 | return inputString
253 |
254 |
255 |
256 | @staticmethod
257 | def contains_replace(inputString, action_block):
258 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
259 |
260 | if isinstance(inputString, str) is False:
261 | inputString = str(inputString)
262 |
263 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
264 | substring = arguements.get('substring')
265 | replacement = arguements.get('replacement')
266 | return inputString.replace(substring, replacement)
267 |
268 | return inputString
269 |
270 |
271 | @staticmethod
272 | def not_contains_replace_all(inputString, action_block):
273 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
274 |
275 | if isinstance(inputString, str) is False:
276 | inputString = str(inputString)
277 |
278 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
279 | substring = arguements.get('substring')
280 | if substring not in inputString:
281 | return arguements.get('replacement')
282 |
283 | return inputString
284 |
285 | @staticmethod
286 | def contains_replace_all(inputString, action_block):
287 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
288 |
289 | if isinstance(inputString, str) is False:
290 | inputString = str(inputString)
291 |
292 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
293 | substring = arguements.get('substring')
294 | if substring in inputString:
295 | return arguements.get('replacement')
296 |
297 | return inputString
298 |
299 |
300 | @staticmethod
301 | def is_none_replace(inputString, action_block):
302 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
303 |
304 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
305 | if inputString is None:
306 | return arguements.get('replacement')
307 |
308 | return inputString
309 |
310 | @staticmethod
311 | def in_list_replace(inputString, action_block):
312 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
313 |
314 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
315 | list = arguements.get('list', '').split(',')
316 | if inputString in list:
317 | return arguements.get('replacement')
318 |
319 | return inputString
320 |
321 |
322 | @staticmethod
323 | def not_in_list_replace(inputString, action_block):
324 | condition, arguements = ProxyPortStringNormalizer.get_condition_arguements(action_block)
325 |
326 | if ProxyPortStringNormalizer.conditional_checks(inputString, condition=condition):
327 | list = arguements.get('list', '').split(',')
328 | if inputString not in list:
329 | return arguements.get('replacement')
330 |
331 | return inputString
332 |
333 |
334 | @staticmethod
335 | def proxy_port_test(proxy_dict, request_response_object, data, valid=False):
336 | if valid:
337 | proxy_name = request_response_object.get_proxy_port_name()
338 | test_request = data.get('test_request')
339 | proxy_port_details = data.get('proxy_port_details')
340 | proxy_setup_key = proxy_port_details.get('proxy_setup_key')
341 | if proxy_dict[proxy_name].get('sops_test_request') is None:
342 | proxy_dict[proxy_name]['sops_test_request'] = test_request
343 | test_request_count = proxy_dict[proxy_name]['sops_test_request'].get('count', 0)
344 | if test_request.get('send') and test_request_count < test_request.get('max_count', 1):
345 | proxy_dict[proxy_name]['sops_test_request']['count'] = test_request_count + 1
346 | json = SOPSRequest().proxy_test_request(test_request.get('url'), request_response_object)
347 | json['test_id'] = test_request.get('test_id')
348 | updated_data, status = SOPSRequest().proxy_port_normalisation_request(request_response_object, test_data=json)
349 | if status.valid:
350 | proxy_port_details = updated_data.get('proxy_port_details')
351 | proxy_setup_value = proxy_port_details.get('proxy_setup_value')
352 | proxy_dict[proxy_name][proxy_setup_key] = proxy_setup_value
353 |
354 |
355 |
356 |
357 |
358 |
--------------------------------------------------------------------------------
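Each normalization action is a dict with an `action` name, an optional `condition`, and an `arguements` block (the spelling used throughout this module); `process_action()` dispatches on the action name. A sketch with a hypothetical `contains_replace` action that masks a per-session token in a proxy username:

    # Sketch with a hypothetical action block (real blocks, including their
    # 'condition' and 'arguements' contents, are supplied by the ScrapeOps API).
    from scrapeops_scrapy.normalizer.proxy_port_normalizer import ProxyPortStringNormalizer

    action_block = {
        'action': 'contains_replace',
        'condition': {'type': 'contains', 'value': '-session-'},
        'arguements': {'substring': '-session-12345', 'replacement': '-session-*'},
    }
    ProxyPortStringNormalizer.process_action('customer-abc-session-12345', action_block)
    # -> 'customer-abc-session-*'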
/scrapeops_scrapy/normalizer/request_response.py:
--------------------------------------------------------------------------------
1 | from scrapeops_scrapy.normalizer.domains import DomainNormalizer
2 | from scrapeops_scrapy.normalizer.proxies import ProxyNormalizer
3 | from scrapeops_scrapy.normalizer.proxy_port_normalizer import ProxyPortStringNormalizer
4 |
5 |
6 | class BaseRequestResponse(object):
7 | """
8 | Normalised request/response data structure.
9 | """
10 |
11 | def __init__(self):
12 | self.signal_type = None
13 | self.request = None
14 | self.raw_url = None
15 | self.raw_proxy_port = None
16 | self.raw_domain = None
17 | self.raw_headers = None
18 |
19 | ## Proxy Checks
20 | self._active_proxy = None
21 | self._active_proxy_port=None
22 | self._real_url = None
23 | self._ip_proxy_list = False
24 | self._named_proxy = False
25 |
26 | ## Proxy Port
27 | self._proxy_port_name = None
28 | self._complete_proxy_port_string = None
29 | self._proxy_setup_key = None
30 |
31 | self._proxy_port_scheme = ''
32 | self._proxy_port_username = ''
33 | self._proxy_port_password = ''
34 | self._proxy_port_host = ''
35 | self._proxy_port_port = ''
36 | self._proxy_port_headers = {}
37 |
38 | self._normalized_proxy_port_username = None
39 | self._normalized_proxy_port_password = None
40 | self._normalized_proxy_port_host = None
41 | self._normalized_proxy_port_port = None
42 | self._normalized_proxy_port_header_string = None
43 |
44 |
45 | ## Proxy API
46 | self._proxy_api = False
47 | self._proxy_api_name = None
48 |
49 | ## Validation
50 | self._validation_test = None
51 | self._geo = None
52 | self._custom_tag = None
53 | self.json_response_keys = []
54 |
55 | ## Final
56 | self._domain = None
57 | self._page_type = None
58 | self._proxy_type = None
59 | self._proxy_name = None
60 | self._proxy_setup = None
61 |
62 |
63 | """
64 | Getters
65 | """
66 |
67 | def get_proxy_name(self):
68 | return self._proxy_name or 'unknown'
69 |
70 | def get_proxy_setup(self):
71 | return self._proxy_setup or 'unknown'
72 |
73 | def get_domain(self):
74 | return self._domain or 'unknown'
75 |
76 | def get_page_type(self):
77 | return self._page_type or 'unknown'
78 |
79 | def get_proxy_api_name(self):
80 | return self._proxy_api_name
81 |
82 | def get_proxy_port_name(self):
83 | return self._proxy_port_name
84 |
85 | def get_raw_proxy(self):
86 | return self.raw_proxy_port
87 |
88 | def get_real_url(self):
89 | return self._real_url or 'unknown'
90 |
91 | def get_validation_test(self):
92 | return self._validation_test or 'pass'
93 |
94 | def get_geo(self):
95 | return self._geo or 'none'
96 |
97 | def get_custom_tag(self):
98 | return self._custom_tag or 'none'
99 |
100 | def get_proxy_port_username(self):
101 | return self._proxy_port_username
102 |
103 | def get_proxy_port_password(self):
104 | return self._proxy_port_password
105 |
106 | def get_proxy_port_host(self):
107 | return self._proxy_port_host
108 |
109 | def get_proxy_port_port(self):
110 | return self._proxy_port_port
111 |
112 | def get_proxy_port_headers(self):
113 | if self._proxy_port_headers == {}:
114 | self._proxy_port_headers = ProxyNormalizer.convert_headers(self.raw_headers)
115 | return self._proxy_port_headers
116 |
117 | def get_complete_proxy_string(self):
118 | if self._complete_proxy_port_string is None:
119 | self._complete_proxy_port_string = "{}://{}:{}@{}:{}".format(self._proxy_port_scheme, self._proxy_port_username, self._proxy_port_password,
120 | self._proxy_port_host, self._proxy_port_port)
121 | return self._complete_proxy_port_string
122 |
123 | def get_normalized_proxy_port_username(self):
124 | if self._normalized_proxy_port_username is None:
125 | return self._proxy_port_username
126 | return self._normalized_proxy_port_username
127 |
128 | def get_normalized_proxy_port_password(self):
129 | if self._normalized_proxy_port_password is None:
130 | return self._proxy_port_password
131 | return self._normalized_proxy_port_password
132 |
133 | def get_normalized_proxy_port_host(self):
134 | if self._normalized_proxy_port_host is None:
135 | return self._proxy_port_host
136 | return self._normalized_proxy_port_host
137 |
138 | def get_normalized_proxy_port_port(self):
139 | if self._normalized_proxy_port_port is None:
140 | return self._proxy_port_port
141 | return self._normalized_proxy_port_port
142 |
143 | def get_normalized_proxy_port_header_string(self):
144 | if self._normalized_proxy_port_header_string is not None:
145 | return f' -H {self._normalized_proxy_port_header_string}'
146 | return ''
147 |
148 | def is_json_response(self):
149 | if len(self.json_response_keys) > 0:
150 | return True
151 | return False
152 |
153 | def get_json_response_keys(self):
154 | return self.json_response_keys
155 |
156 |
157 | """
158 | SETTERS
159 | """
160 |
161 | def set_normalized_proxy_port_username(self, username):
162 | self._normalized_proxy_port_username = username
163 |
164 | def set_normalized_proxy_port_password(self, password):
165 | self._normalized_proxy_port_password = password
166 |
167 | def set_normalized_proxy_port_host(self, host):
168 | self._normalized_proxy_port_host = host
169 |
170 | def set_normalized_proxy_port_port(self, port):
171 | self._normalized_proxy_port_port = port
172 |
173 | def update_normalized_proxy_port_header_string(self, header_string):
174 | if self._normalized_proxy_port_header_string is None:
175 | self._normalized_proxy_port_header_string = header_string
176 | else:
177 | self._normalized_proxy_port_header_string = f'{self._normalized_proxy_port_header_string} {header_string}'
178 |
179 |
180 | """
181 | Proxy Type Methods
182 | """
183 |
184 | def active_proxy(self):
185 | return True if self._active_proxy else False
186 |
187 | def active_proxy_port(self):
188 | return True if self._active_proxy_port else False
189 |
190 | def active_proxy_api(self):
191 | return self._proxy_api
192 |
193 | def active_named_proxy(self):
194 | return self._named_proxy
195 |
196 |
197 |
198 |
199 |
200 |
201 | class RequestResponse(BaseRequestResponse):
202 |
203 | def __init__(self, signal_type=None, request=None, response=None):
204 | BaseRequestResponse.__init__(self)
205 | self.signal_type = signal_type
206 | if request is not None or response is not None:
207 | self.request = response.request if request is None else request
208 | self.raw_url = request.url if response is None else response.url
209 | self.raw_proxy_port = self.request.meta.get('proxy')
210 | self.raw_domain = DomainNormalizer.get_domain(self.raw_url)
211 | self._active_proxy = self._active_proxy_port = False if self.raw_proxy_port is None else True
212 | self.raw_headers = self.request.headers
213 |
214 | """
215 | Domain Normalization
216 | """
217 |
218 | def check_domain(self, domain_obj):
219 | domain_details = domain_obj.get(self._domain)
220 | if domain_details is not None:
221 | self._page_type = DomainNormalizer.get_page_type(self._real_url, domain_data=domain_details)
222 | return False
223 | return True
224 |
225 |
226 | def update_page_type(self, domain_details):
227 | if domain_details is not None:
228 | self._page_type = DomainNormalizer.get_page_type(self._real_url, domain_data=domain_details)
229 |
230 |
231 | def fallback_domain_data(self):
232 | if self._domain is None:
233 | self._domain = DomainNormalizer.get_domain(self.raw_url)
234 | self._page_type = 'none'
235 |
236 |
237 | """
238 | Proxy Port Normalization
239 | """
240 |
241 | def check_proxy_port_type(self, proxy_ports):
242 | if ProxyNormalizer.check_ip_address(self.raw_proxy_port):
243 | self._proxy_type = 'proxy_ip_list'
244 | self._real_url = self.raw_url
245 | self._domain = self.raw_domain
246 | self._proxy_name = 'unknown_ip'
247 | self._proxy_setup = 'ip_address'
248 | return False, False
249 |
250 | self._named_proxy, self._proxy_port_host, self._proxy_port_name = ProxyNormalizer.check_named_proxy(self.raw_proxy_port)
251 |
252 | if self._named_proxy:
253 | self._proxy_type = 'named_proxy_port'
254 | self._real_url = self.raw_url
255 | self._domain = self.raw_domain
256 | self.get_proxy_port_details()
257 |
258 | proxy_details = proxy_ports.get(self._proxy_port_name)
259 |
260 | if proxy_details is not None:
261 |
262 | if proxy_details.get(self._complete_proxy_port_string) is not None:
263 | self._proxy_setup = proxy_details.get(self._complete_proxy_port_string)
264 | elif proxy_details.get(self._complete_proxy_port_string) is None and proxy_details.get('known', False):
265 | ProxyPortStringNormalizer.run_proxy_string_normalization(self, proxy_ports[self._proxy_port_name].get('normalization_actions'))
266 | self.create_normalized_proxy_port_string()
267 | self._proxy_setup = proxy_details.get(self._normalized_proxy_port_string)
268 |
269 | if self._proxy_setup is None:
270 | self._proxy_setup = proxy_details.get('fallback')
271 | if proxy_details.get('count') > proxy_details.get('max_count') or proxy_details.get('known') is False:
272 | return True, False
273 | ## Get details
274 | return True, True
275 | return True, False
276 |
277 | ## get proxy details
278 | return True, True
279 |
280 |
281 | def get_proxy_port_details(self):
282 | self._proxy_name = self._proxy_port_name
283 | self._proxy_port_port = ProxyNormalizer.get_proxy_port(self.raw_proxy_port)
284 | self._proxy_port_scheme = ProxyNormalizer.get_proxy_scheme(self.raw_proxy_port)
285 | if self.raw_headers.get('Proxy-Authorization') is not None:
286 | auth_string = self.raw_headers.get('Proxy-Authorization').decode('utf-8')
287 | self._proxy_port_username, self._proxy_port_password = ProxyNormalizer.decode_basic_auth(auth_string)
288 | self._complete_proxy_port_string = "{}://{}:{}@{}:{}".format(self._proxy_port_scheme, self._proxy_port_username, self._proxy_port_password,
289 | self._proxy_port_host, self._proxy_port_port)
290 |
291 | def create_normalized_proxy_port_string(self):
292 | username = self.get_normalized_proxy_port_username()
293 | password = self.get_normalized_proxy_port_password()
294 | host = self.get_normalized_proxy_port_host()
295 | port = self.get_normalized_proxy_port_port()
296 | header_string = self.get_normalized_proxy_port_header_string()
297 | self._normalized_proxy_port_string = "{}://{}:{}@{}:{}".format(self._proxy_port_scheme, username, password, host, port)
298 | if header_string != '':
299 | self._normalized_proxy_port_string = self._normalized_proxy_port_string + header_string
300 |
301 | def proxy_port_setup(self, proxy_details):
302 | proxy_setup = proxy_details.get('proxy_setup')
303 | if proxy_setup is None:
304 | return 'none'
305 | proxy_string = 'port'
306 | ## Generate settings string
307 | return proxy_string
308 |
309 | def update_proxy_port(self, proxy_name, proxy_setup_value):
310 | self._active_proxy = True
311 | self._proxy_api = False
312 | self._proxy_type = 'named_proxy_port'
313 | self._proxy_name = proxy_name
314 | self._proxy_setup = proxy_setup_value
315 |
316 |
317 |
318 | """
319 | Proxy API Normalization
320 | """
321 |
322 | def check_proxy_api(self, proxy_apis):
323 | proxy_details = proxy_apis.get(self.raw_domain)
324 | if proxy_details is not None:
325 | if proxy_details.get('proxy_setup') is None:
326 | self._proxy_api_name = proxy_details.get('proxy_name')
327 | return True, True
328 | self.update_proxy_api(proxy_details)
329 | return True, False
330 | return False, False
331 |
332 |
333 | def update_proxy_api(self, proxy_details):
334 | self._real_url = DomainNormalizer.get_url_proxy_api(url=self.raw_url, proxy_settings=proxy_details)
335 | self._domain = DomainNormalizer.get_domain(self._real_url)
336 | self._active_proxy = True
337 | self._proxy_api = True
338 | self._proxy_type = 'proxy_api'
339 | self._proxy_name = self._proxy_api_name = proxy_details.get('proxy_name')
340 | self._proxy_setup = self.proxy_api_setup(proxy_details) ## into new file
341 | self.json_response_keys = proxy_details.get('json_response_keys', [])
342 |
343 |
344 | def proxy_api_setup(self, proxy_details):
345 | proxy_string = 'api'
346 | proxy_setup = proxy_details.get('proxy_setup')
347 | if proxy_setup is None:
348 | return proxy_string
349 | query_params = DomainNormalizer.parse_url(self.raw_url)
350 | for key, value in query_params.items():
351 | key_mapping = proxy_setup.get(key)
352 | if key_mapping is not None:
353 | if key_mapping.startswith('**'):
354 | proxy_string = f'{proxy_string}_{key_mapping[2:]}'
355 | elif key_mapping.startswith('--'):
356 | proxy_string = f'{proxy_string}_{key_mapping[2:]}={value.lower()}'
357 | elif key_mapping.startswith('^^'):
358 | proxy_string = f'{proxy_string}_{key_mapping[2:]}=false'
359 | else:
360 | proxy_string = f'{proxy_string}_{key_mapping}=true'
361 | return proxy_string
362 |
363 |
364 |
365 | """
366 | Fallback Proxy Details
367 | """
368 |
369 | def update_no_proxy(self):
370 | self._proxy_type = self._proxy_name = 'no_proxy'
371 | self._proxy_setup = 'none'
372 | self._real_url = self.raw_url
373 | self._domain = self.raw_domain
374 |
375 | def fallback_proxy_details(self, proxy_type=None, proxy_apis=None):
376 | if proxy_type == 'proxy_api':
377 | proxy_details = proxy_apis.get(self.raw_domain)
378 | if proxy_details is not None:
379 | self.update_proxy_api(proxy_details)
380 | else:
381 | self._proxy_name = 'unknown_proxy_api' if self._proxy_api_name is None else self._proxy_api_name
382 | self._proxy_setup = 'fallback' if self._proxy_setup is None else self._proxy_setup
383 | else:
384 | self._proxy_name = 'unknown_proxy_port' if self._proxy_name is None else self._proxy_name
385 | self._proxy_setup = 'fallback' if self._proxy_setup is None else self._proxy_setup
386 |
387 |
388 | """
389 | Fallback Proxy + Domain Details
390 | """
391 |
392 | def fallback_domain_proxy_details(self, reason='fallback'):
393 | """
394 |         Fallback used when the domain/proxy details could not be normalised.
395 | """
396 | self._domain = DomainNormalizer.get_domain(self.raw_url)
397 | self._page_type = 'none'
398 | self._proxy_name = reason
399 | self._proxy_setup = 'none'
400 |
401 |
402 | """
403 | Response Validation Tests
404 | """
405 |
406 | def failed_validation_test(self, test):
407 | if self._validation_test is None:
408 | self._validation_test = test.get('validation_msg', 'failed')
409 | else:
410 | self._validation_test = f'{self._validation_test}&&{test.get("validation_msg", "failed")}'
411 | if test.get('validation_test_id', -1) != -1:
412 | self._validation_test = f'{self._validation_test}_{test.get("validation_test_id")}'
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
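
The '**' / '--' / '^^' prefixes used by proxy_api_setup() are easy to misread, so here is a standalone sketch of the same mapping convention (illustrative only, not SDK code; the mapping dict and proxy URL below are hypothetical): '**' keeps just the renamed key, '--' keeps the lowercased query value, '^^' forces the value to 'false', and an unprefixed mapping becomes 'key=true'.

    from urllib.parse import parse_qsl, urlparse

    def sketch_proxy_api_setup(url, proxy_setup):
        # Mirrors the prefix convention of proxy_api_setup() above, but parses
        # the query string with urllib instead of DomainNormalizer.parse_url().
        proxy_string = 'api'
        for key, value in dict(parse_qsl(urlparse(url).query)).items():
            key_mapping = proxy_setup.get(key)
            if key_mapping is None:
                continue
            if key_mapping.startswith('**'):
                proxy_string = f'{proxy_string}_{key_mapping[2:]}'
            elif key_mapping.startswith('--'):
                proxy_string = f'{proxy_string}_{key_mapping[2:]}={value.lower()}'
            elif key_mapping.startswith('^^'):
                proxy_string = f'{proxy_string}_{key_mapping[2:]}=false'
            else:
                proxy_string = f'{proxy_string}_{key_mapping}=true'
        return proxy_string

    mapping = {'api_key': '**key', 'render_js': 'js_rendering', 'country': '--country'}
    print(sketch_proxy_api_setup(
        'https://proxy.example.com/v1/?api_key=SECRET&render_js=true&country=US',
        mapping))
    # -> 'api_key_js_rendering=true_country=us'
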
--------------------------------------------------------------------------------
/scrapeops_scrapy/signals/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/signals/__init__.py
--------------------------------------------------------------------------------
/scrapeops_scrapy/signals/scrapeops_signals.py:
--------------------------------------------------------------------------------
1 | scrapeops_response_recieved = object()
2 | scrapeops_exception_recieved = object()
3 | scrapeops_response_rejected = object()
4 | scrapeops_item_rejected = object()
--------------------------------------------------------------------------------
/scrapeops_scrapy/signals/triggers.py:
--------------------------------------------------------------------------------
1 | from scrapeops_scrapy.signals import scrapeops_signals
2 |
3 | class ScrapeOpsTrigger(object):
4 |
5 | def __init__(self):
6 | pass
7 |
8 | @staticmethod
9 | def reject_response(crawler=None, response=None, reason=None):
10 | crawler.signals.send_catch_log(signal=scrapeops_signals.scrapeops_response_rejected,
11 | spider=crawler.spider,
12 | response=response,
13 | reason=reason,
14 | )
15 |
16 | @staticmethod
17 | def reject_item(crawler=None, response=None, item=None, reason=None):
18 | crawler.signals.send_catch_log(signal=scrapeops_signals.scrapeops_item_rejected,
19 | spider=crawler.spider,
20 | response=response,
21 | item=item,
22 | reason=reason,
23 | )
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
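
A minimal wiring sketch (illustrative, not part of the SDK source): the custom signals defined in scrapeops_signals.py are consumed through Scrapy's standard signal API, while ScrapeOpsTrigger fires them from spider code. The extension and helper below are hypothetical.

    from scrapeops_scrapy.signals import scrapeops_signals
    from scrapeops_scrapy.signals.triggers import ScrapeOpsTrigger

    class RejectionLogger:
        # Hypothetical Scrapy extension that listens for ScrapeOps rejections.
        @classmethod
        def from_crawler(cls, crawler):
            ext = cls()
            crawler.signals.connect(ext.response_rejected,
                                    signal=scrapeops_signals.scrapeops_response_rejected)
            crawler.signals.connect(ext.item_rejected,
                                    signal=scrapeops_signals.scrapeops_item_rejected)
            return ext

        def response_rejected(self, spider, response, reason):
            spider.logger.warning(f'response rejected ({reason}): {response.url}')

        def item_rejected(self, spider, response, item, reason):
            spider.logger.warning(f'item rejected ({reason}): {response.url}')

    def flag_blocked_page(spider, response):
        # Hypothetical spider-side helper: fires the signal the extension receives.
        ScrapeOpsTrigger.reject_response(crawler=spider.crawler, response=response,
                                         reason='blocked_page')
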
--------------------------------------------------------------------------------
/scrapeops_scrapy/stats/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/stats/__init__.py
--------------------------------------------------------------------------------
/scrapeops_scrapy/stats/failed_urls.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | class FailedUrlsHandler(object):
4 |
5 | FAILED_URL_LOGGER_ENABLED = True
6 | LOG_MISSED_URLS = False
7 | MAX_LOGGED_URLS = 100
8 |
9 | def __init__(self):
10 | self.failed_urls_count = 0
11 | self.failed_urls_list = []
12 | self.errback_free = True
13 |
14 | def log_failure(self, failure):
15 | if FailedUrlsHandler.FAILED_URL_LOGGER_ENABLED:
16 | self.failed_urls_count += 1
17 | if FailedUrlsHandler.LOG_MISSED_URLS and len(self.failed_urls_list) < FailedUrlsHandler.MAX_LOGGED_URLS:
18 | request = failure.request
19 | self.failed_urls_list.append(request.url)
20 |
21 | def get_url_count(self):
22 | return self.failed_urls_count
23 |
24 | def get_url_list(self):
25 | return self.failed_urls_list
26 |
27 | def disable_errback(self):
28 | self.errback_free = False
29 |
30 | def enabled(self):
31 | return self.errback_free
32 |
33 |
34 |
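
A short sketch of feeding this handler from a request errback (illustrative only; the spider below is hypothetical). log_failure() reads nothing but failure.request, so any Twisted Failure produced for a Scrapy request works.

    import scrapy
    from scrapeops_scrapy.stats.failed_urls import FailedUrlsHandler

    # Opt in to keeping the failed URLs themselves, not just the count.
    FailedUrlsHandler.LOG_MISSED_URLS = True

    class ExampleSpider(scrapy.Spider):
        name = 'example'

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.failed_urls = FailedUrlsHandler()

        def start_requests(self):
            yield scrapy.Request('https://example.com/missing',
                                 callback=self.parse, errback=self.on_error)

        def parse(self, response):
            pass

        def on_error(self, failure):
            self.failed_urls.log_failure(failure)
            self.logger.info(f'{self.failed_urls.get_url_count()} failed URLs so far')
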
--------------------------------------------------------------------------------
/scrapeops_scrapy/stats/logger.py:
--------------------------------------------------------------------------------
1 | ## scrapy
2 | from scrapy.utils.request import request_httprepr
3 |
4 | ## scrapeops
5 | from scrapeops_scrapy.stats.model import OverallStatsModel, PeriodicStatsModel
6 | from scrapeops_scrapy.utils import utils
7 | from scrapeops_scrapy.normalizer.exceptions import ExceptionNormalizer
8 | from scrapeops_scrapy.utils.utils import get_header_size, get_status_size
9 |
10 | import copy
11 |
12 |
13 | class StatsLogger(OverallStatsModel, PeriodicStatsModel):
14 |
15 | def __init__(self):
16 | OverallStatsModel.__init__(self)
17 | PeriodicStatsModel.__init__(self)
18 |
19 |
20 | def display_stats(self):
21 | self.display_periodic_stats()
22 | self.display_overall_stats()
23 |
24 |
25 | def check_periodic_stats(self):
26 | if self._periodic_stats == {}:
27 | self.set_value(self._periodic_stats, 'job_id', self.job_id)
28 |
29 |
30 | def spider_open_stats(self):
31 | self.set_value(self._overall_stats, 'job_id', self.job_id)
32 | self.set_value(self._overall_stats, 'job_name', self.job_group_name)
33 | self.set_value(self._overall_stats, 'job_start_time', self.start_time)
34 | self.set_value(self._overall_stats, 'job_finish_time', 0)
35 | self.set_value(self._overall_stats, 'job_run_time', 0)
36 | self.set_value(self._overall_stats, 'status', 'Live')
37 | self.set_value(self._overall_stats, 'middleware_enabled', self._scrapeops_middleware)
38 |
39 |
40 |
41 | def spider_close_stats(self, reason=None, crawler=None):
42 | finish_time = utils.current_time()
43 | self.aggregate_stats(crawler)
44 | self.set_value(self._overall_stats, 'job_finish_time', finish_time)
45 | self.set_value(self._overall_stats, 'job_run_time', finish_time - self.start_time)
46 | self.set_value(self._overall_stats, 'status', 'Finished')
47 | self.set_value(self._overall_stats, 'reason', reason)
48 | self.set_value(self._overall_stats, 'period_frequency', self._period_frequency)
49 |
50 |
51 | def generate_request_stats(self, request_response_object, request=None):
52 | proxy_name = request_response_object.get_proxy_name()
53 | proxy_setup = request_response_object.get_proxy_setup()
54 | domain_name = request_response_object.get_domain()
55 | page_type = request_response_object.get_page_type()
56 | custom_tag = request_response_object.get_custom_tag()
57 | reqlen = len(request_httprepr(request))
58 |
59 | ## periodic stats
60 | self.check_periodic_stats()
61 | self.inc_value(self._periodic_stats, f'requests|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{custom_tag}|count')
62 | self.inc_value(self._periodic_stats, f'requests|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{custom_tag}|bytes', count=reqlen)
63 |
64 | ## overall stats
65 | self.inc_value(self._overall_stats, f'requests|{request.method}|count')
66 | self.inc_value(self._overall_stats, f'requests|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{custom_tag}|count')
67 | self.inc_value(self._overall_stats, f'requests|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{custom_tag}|bytes', count=reqlen)
68 |
69 |
70 | def generate_response_stats(self, request_response_object, request=None, response=None):
71 | proxy_name = request_response_object.get_proxy_name()
72 | proxy_setup = request_response_object.get_proxy_setup()
73 | domain_name = request_response_object.get_domain()
74 | page_type = request_response_object.get_page_type()
75 | validation = request_response_object.get_validation_test()
76 | geo = request_response_object.get_geo()
77 | custom_tag = request_response_object.get_custom_tag()
78 | custom_signal = 'none'
79 | reslen = len(response.body) + get_header_size(response.headers) + get_status_size(response.status) + 4
80 | total_latency = request.meta.get('download_latency', 0)
81 |
82 | ## periodic stats
83 | self.check_periodic_stats()
84 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|count')
85 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|bytes', count=reslen)
86 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|retries', count=request.meta.get('retry_times', 0))
87 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|total_latency', count=total_latency)
88 | self.min_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|min_latency', total_latency)
89 | self.max_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|max_latency', total_latency)
90 |
91 | ## overall stats
92 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|count')
93 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|bytes', count=reslen)
94 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|retries', count=request.meta.get('retry_times', 0))
95 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|total_latency', count=total_latency)
96 | self.min_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|min_latency', total_latency)
97 | self.max_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{response.status}|{validation}|{geo}|{custom_tag}|{custom_signal}|max_latency', total_latency)
98 |
99 |
100 | def generate_item_stats(self, request_response_object, signal=None, response=None):
101 | if response is not None:
102 | request = response.request
103 | request_method = request.method
104 | status = response.status
105 | else:
106 | request_method = status = 'unknown'
107 | proxy_name = request_response_object.get_proxy_name()
108 | proxy_setup = request_response_object.get_proxy_setup()
109 | domain_name = request_response_object.get_domain()
110 | page_type = request_response_object.get_page_type()
111 | validation = request_response_object.get_validation_test()
112 | geo = request_response_object.get_geo()
113 | custom_tag = request_response_object.get_custom_tag()
114 | custom_signal = 'none'
115 | self.check_periodic_stats()
116 |
117 | if signal == 'item_scraped':
118 | self.inc_value(self._periodic_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items')
119 | self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items')
120 |
121 | elif signal == 'item_dropped':
122 | self.inc_value(self._periodic_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items_dropped')
123 | self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|items_dropped')
124 |
125 | elif signal == 'item_error':
126 | self.inc_value(self._periodic_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|item_errors')
127 | self.inc_value(self._overall_stats, f'responses|{request_method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{status}|{validation}|{geo}|{custom_tag}|{custom_signal}|item_errors')
128 |
129 |
130 |
131 | def generate_exception_stats(self, request_response_object, request=None, exception_class=None):
132 | proxy_name = request_response_object.get_proxy_name()
133 | proxy_setup = request_response_object.get_proxy_setup()
134 | domain_name = request_response_object.get_domain()
135 | page_type = request_response_object.get_page_type()
136 | validation = request_response_object.get_validation_test()
137 | geo = request_response_object.get_geo()
138 | custom_tag = request_response_object.get_custom_tag()
139 | custom_signal = 'none'
140 | exception_type = ExceptionNormalizer.normalise_exception(exception_class)
141 |         download_latency = request.meta.get('download_latency')
142 | if download_latency is None:
143 | start_time = request.meta.get('sops_time', 0)
144 | if start_time != 0: download_latency = utils.current_time() - start_time
145 | else: download_latency = 0
146 |
147 | self.check_periodic_stats()
148 | self.inc_value(self._periodic_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{exception_type}|{validation}|{geo}|{custom_tag}|{custom_signal}|count')
149 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{exception_type}|{validation}|{geo}|{custom_tag}|{custom_signal}|count')
150 | self.inc_value(self._overall_stats, f'responses|{request.method}|{proxy_name}|{proxy_setup}|{domain_name}|{page_type}|{exception_type}|{validation}|{geo}|{custom_tag}|{custom_signal}|total_latency', count=download_latency)
151 |
152 | def aggregate_stats(self, crawler=None, middleware=False):
153 | self.avg_latency()
154 | self.log_levels(crawler)
155 | if middleware is False:
156 | self.get_exception_stats(crawler)
157 |
158 |
159 | def avg_latency(self):
160 | for stat_type in [self._periodic_stats, self._overall_stats]:
161 | stats_copy = copy.deepcopy(stat_type)
162 | for key, value in stats_copy.items():
163 | if 'responses' in key and 'total_latency' in key:
164 | count_key = key.replace('total_latency', 'count')
165 | avg_latency = value / stats_copy.get(count_key)
166 | self.set_value(stat_type, key.replace('total_latency', 'avg_latency'), avg_latency)
167 |
168 |
169 | def log_levels(self, crawler):
170 | scrapy_stats = crawler.stats.get_stats()
171 | for log_level in ['WARNING', 'ERROR', 'CRITICAL']:
172 | log_key = 'log_count/' + log_level
173 | log_value = scrapy_stats.get(log_key, 0)
174 | previous_value = self._overall_stats.get(log_key, 0)
175 | self.set_value(self._periodic_stats, log_key, log_value - previous_value)
176 | self.set_value(self._overall_stats, log_key, log_value)
177 |
178 |
179 | def exception_type_check(self, key):
180 | if isinstance(key, str):
181 | return key.startswith('downloader/exception_type_count/')
182 | return False
183 |
184 | def get_exception_stats(self, crawler):
185 | scrapy_stats = crawler.stats.get_stats()
186 | if scrapy_stats.get('downloader/exception_count') is not None:
187 | exception_values = [ {k:v} for k,v in scrapy_stats.items() if self.exception_type_check(k)]
188 | for exception in exception_values:
189 | for key, value in exception.items():
190 | key_type = key.replace('downloader/exception_type_count/', '')
191 | try:
192 | exception_type = key_type.split('.')[-1]
193 | except Exception:
194 | exception_type = key_type
195 | self.set_value(self._overall_stats, f'responses|unknown|unknown|unknown|unknown|unknown|{exception_type}|unknown|unknown|unknown|unknown|count', value)
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
--------------------------------------------------------------------------------
/scrapeops_scrapy/stats/model.py:
--------------------------------------------------------------------------------
1 |
2 | class BaseStatsModel(object):
3 |
4 | def __init__(self):
5 | pass
6 |
7 | def get_value(self, stats, key, default=None):
8 | return stats.get(key, default)
9 |
10 | def set_value(self, stats, key, value):
11 | stats[key] = value
12 |
13 | def inc_value(self, stats, key, count=1, start=0, spider=None):
14 | d = stats
15 | d[key] = d.setdefault(key, start) + count
16 |
17 | def max_value(self, stats, key, value, spider=None):
18 | stats[key] = max(stats.setdefault(key, value), value)
19 |
20 | def min_value(self, stats, key, value, spider=None):
21 | stats[key] = min(stats.setdefault(key, value), value)
22 |
23 | def print_stats(self, statsType, stats):
24 | print(f'#### SCRAPEOPS {statsType.upper()} STATS ####')
25 | print('{')
26 | for key, value in stats.items():
27 | if key[0] != '_':
28 | print(f" '{key}': {value},")
29 | print('}')
30 |
31 |
32 | class PeriodicStatsModel(BaseStatsModel):
33 |
34 | def __init__(self):
35 | self._periodic_stats = {}
36 | self._periodic_errors = 0
37 | self._periodic_warnings = 0
38 | self._periodic_criticals = 0
39 |
40 | def get_periodic_stats(self):
41 | return self._periodic_stats
42 |
43 | def reset_periodic_stats(self):
44 | self._periodic_stats = {}
45 |
46 | def display_periodic_stats(self):
47 | stats = self.get_periodic_stats()
48 | self.print_stats('periodic', stats)
49 |
50 |
51 | class OverallStatsModel(BaseStatsModel):
52 |
53 | def __init__(self):
54 | self._overall_stats = {}
55 | self._overall_errors = 0
56 | self._overall_warnings = 0
57 | self._overall_criticals = 0
58 |
59 | def get_overall_stats(self):
60 | return self._overall_stats
61 |
62 | def display_overall_stats(self):
63 | stats = self.get_overall_stats()
64 | self.print_stats('overall', stats)
65 |
66 |
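
To make the pipe-delimited keys that StatsLogger builds a little more concrete, here is a small sketch using the helpers from this model (the proxy/domain metadata and byte counts are made up):

    from scrapeops_scrapy.stats.model import BaseStatsModel

    model = BaseStatsModel()
    stats = {}

    # StatsLogger composes keys like
    #   'requests|GET|<proxy>|<setup>|<domain>|<page_type>|<tag>|count'
    # and accumulates them with these helpers.
    model.inc_value(stats, 'requests|GET|my_proxy|port|example.com|none|none|count')
    model.inc_value(stats, 'requests|GET|my_proxy|port|example.com|none|none|bytes', count=312)
    model.min_value(stats, 'demo_min_latency', 0.8)
    model.min_value(stats, 'demo_min_latency', 0.3)   # keeps 0.3, the smaller value
    model.max_value(stats, 'demo_max_latency', 0.8)

    model.print_stats('demo', stats)
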
--------------------------------------------------------------------------------
/scrapeops_scrapy/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/tests/__init__.py
--------------------------------------------------------------------------------
/scrapeops_scrapy/tests/core.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | class ScrapeOpsTest:
4 |
5 | def __init__(self):
6 | self.active = True
7 | self.test_id = None
8 |
9 | def test_active(self):
10 | if self.active is True:
11 | return True
12 | return False
13 |
14 | def get_test_id(self):
15 | return self.test_id
16 |
17 | def generate_test_id(self):
18 | response = requests.post('https://api.scrapeops.io/api/v1/start_test?api_key=1234&sdk_type=scrapy')
19 | data = response.json()
20 | self.test_id = data.get('test_id', None)
21 | return self.test_id
22 |
23 | @staticmethod
24 | def generate_test_settings():
25 | return {
26 | 'RETRY_TIMES': 0,
27 | 'RETRY_ENABLED': False,
28 | }
29 |
30 |
--------------------------------------------------------------------------------
/scrapeops_scrapy/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/utils/__init__.py
--------------------------------------------------------------------------------
/scrapeops_scrapy/utils/error_handling.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | from scrapeops_scrapy.exceptions import ScrapeOpsAPIResponseError
4 |
5 | def exception_handler(func):
6 | @functools.wraps(func)
7 | def wrapper(*args, **kwargs):
8 | try:
9 | return func(*args, **kwargs)
10 |         except ScrapeOpsAPIResponseError:
11 |             pass
12 |         except Exception:
13 | pass
14 | return wrapper
15 |
16 |
17 |
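
The decorator simply swallows any exception and lets the wrapped call return None, so SDK housekeeping can never crash a spider. A quick sketch (the wrapped function is hypothetical):

    from scrapeops_scrapy.utils.error_handling import exception_handler

    @exception_handler
    def fetch_sdk_settings():
        # Any error raised here is caught by the decorator.
        raise RuntimeError('ScrapeOps API unreachable')

    print(fetch_sdk_settings())   # -> None, no traceback
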
--------------------------------------------------------------------------------
/scrapeops_scrapy/utils/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import sys
4 |
5 | import scrapy
6 | import scrapeops_scrapy
7 | import platform
8 |
9 | from scrapeops_scrapy.utils.error_handling import exception_handler
10 | from scrapy.utils.python import to_bytes
11 | from twisted.web import http
12 |
13 |
14 | def current_time():
15 | t = time.time()
16 | return int(round(t, 0))
17 |
18 | @exception_handler
19 | def get_args():
20 | arg_dict = {'raw_string': '', 'args': [], 'options': []}
21 | if sys.argv[0] == 'crawl' or sys.argv[0] == 'runspider':
22 | args = sys.argv[2:]
23 | else:
24 | args = sys.argv[1:]
25 | for index, arg in enumerate(args):
26 | arg_dict['raw_string'] += append_raw_string(arg)
27 | if arg.startswith('--'):
28 | arg_dict['options'].append(arg)
29 | if arg.startswith('-a'):
30 | try:
31 | if args[index + 1].startswith('-') is False and args[index + 1].startswith('--') is False: arg_dict['args'].append(args[index + 1])
32 | except Exception:
33 | arg_dict['args'].append(arg)
34 | return arg_dict
35 |
36 |
37 | def scrapeops_middleware_installed(spider_settings):
38 |     downloader_middlewares = spider_settings.get('DOWNLOADER_MIDDLEWARES', {})
39 |     if downloader_middlewares.get('scrapeops_scrapy.middleware.stats.ScrapeOpsStats') is not None:
40 |         return True
41 |     if downloader_middlewares.get('scrapeops_scrapy.middleware.retry.RetryMiddleware') is not None:
42 | return True
43 | return False
44 |
45 | @exception_handler
46 | def get_python_version():
47 |     version_string = sys.version
48 |     split_string = version_string.split(' ')
49 | return split_string[0]
50 |
51 | @exception_handler
52 | def get_scrapy_version():
53 | return scrapy.__version__
54 |
55 | @exception_handler
56 | def get_scrapeops_version():
57 | return scrapeops_scrapy.__version__
58 |
59 | @exception_handler
60 | def get_system_version():
61 | return platform.platform()
62 |
63 | def append_raw_string(arg):
64 | if ' ' in arg:
65 | return '"{}" '.format(arg)
66 | return "{} ".format(arg)
67 |
68 | def merge_dicts(x, y):
69 | z = x.copy()
70 | z.update(y)
71 | return z
72 |
73 | # from scrapy
74 | def get_header_size(headers):
75 | size = 0
76 | for key, value in headers.items():
77 | if isinstance(value, (list, tuple)):
78 | for v in value:
79 | size += len(b": ") + len(key) + len(v)
80 | return size + len(b'\r\n') * (len(headers.keys()) - 1)
81 |
82 |
83 | def get_status_size(response_status):
84 | return len(to_bytes(http.RESPONSES.get(response_status, b''))) + 15
85 | # resp.status + b"\r\n" + b"HTTP/1.1 <100-599> "
86 |
87 |
88 | def remove_url(string, replacement=""):
89 | return re.sub(r'http\S+', replacement, string)
90 |
91 |
92 |
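
A few of these helpers in isolation (a quick sketch with made-up inputs):

    from scrapeops_scrapy.utils.utils import get_status_size, merge_dicts, remove_url

    print(remove_url('failed to fetch https://example.com/page?id=1'))
    # -> 'failed to fetch ' (the URL is replaced with the empty string)

    print(merge_dicts({'retries': 1}, {'retries': 3, 'proxy': 'port'}))
    # -> {'retries': 3, 'proxy': 'port'} (the second dict wins on conflicts)

    print(get_status_size(200))
    # -> 17: len(b'OK') plus 15 bytes for the HTTP status-line framing
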
--------------------------------------------------------------------------------
/scrapeops_scrapy/validators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScrapeOps/scrapeops-scrapy-sdk/8d38824cc54ff6bd77ab8233e16c2bac1a269ad5/scrapeops_scrapy/validators/__init__.py
--------------------------------------------------------------------------------
/scrapeops_scrapy/validators/item_validator.py:
--------------------------------------------------------------------------------
1 | from itemadapter import ItemAdapter, is_item
2 |
3 | class ItemValidator(object):
4 |
5 | ITEM_COVERAGE_ENABLED = True
6 | INVALID_ITEM_URLS_LOGGING_ENABLED = False
7 | MAX_ITEM_URLS = 1000
8 |
9 | def __init__(self):
10 | self.item_coverage = {
11 | '_SOP_OVERAL_STATS': {
12 | 'num_items': 0,
13 | 'num_invalid_items': 0,
14 | 'num_total_fields': 0,
15 | 'num_invalid_fields': 0,
16 | }
17 | }
18 | self.items = 0
19 | self.invalid_items = 0
20 | self.invalid_items_urls = {}
21 |
22 |
23 |     def extract_name_fields_item(self, item):
24 | return
25 |
26 | def validate(self, request_response_object, item):
27 | if ItemValidator.ITEM_COVERAGE_ENABLED and is_item(item):
28 | try:
29 | self.increment_items()
30 | adapter = ItemAdapter(item)
31 | item_name = ItemValidator.get_item_name(item)
32 | dict_item = adapter.asdict()
33 | field_keys = dict_item.keys()
34 | if item_name is not None and field_keys is not None:
35 | domain = request_response_object.get_domain()
36 | invalid_fields = []
37 | valid_item = True
38 | self.check_item_exists(domain, item_name, field_keys)
39 | self.item_coverage[domain][item_name]['num_items'] += 1
40 | self.increment_total_fields(field_keys)
41 | for k in field_keys:
42 | if(dict_item.get(k) is not None and dict_item.get(k) != ''):
43 | self.item_coverage[domain][item_name]['coverage'][k] += 1
44 | else:
45 | valid_item = False
46 | self.increment_invalid_fields()
47 | invalid_fields.append(k)
48 |
49 | if valid_item is False:
50 | self.item_coverage[domain][item_name]['num_invalid_items'] += 1
51 | self.increment_invalid_items()
52 | if ItemValidator.INVALID_ITEM_URLS_LOGGING_ENABLED and len(invalid_fields) > 0:
53 | self.log_invalid_item_url(request_response_object.get_real_url(), item_name, invalid_fields)
54 | except Exception:
55 | pass
56 |
57 |
58 | def check_item_exists(self, domain, item_name, field_keys):
59 | if self.item_coverage.get(domain) is None:
60 | self.item_coverage[domain] = {}
61 | if self.item_coverage[domain].get(item_name) is None:
62 | self.item_coverage[domain][item_name] = {
63 | 'coverage': {},
64 | 'num_fields': 0,
65 | 'num_items': 0,
66 | 'num_invalid_items': 0,
67 | }
68 | self.item_coverage[domain][item_name]['num_fields'] = len(field_keys)
69 | for k in field_keys:
70 | self.item_coverage[domain][item_name]['coverage'][k] = 0
71 |
72 |
73 | def log_invalid_item_url(self, url, item_name, invalid_fields):
74 | if self.invalid_items_urls.get(item_name) is None:
75 | self.invalid_items_urls[item_name] = {}
76 | missing_fields_string = ItemValidator.generate_fields_key(invalid_fields)
77 | if self.invalid_items_urls[item_name].get(missing_fields_string) is None:
78 | self.invalid_items_urls[item_name][missing_fields_string] = []
79 | if url not in self.invalid_items_urls[item_name][missing_fields_string] and len(self.invalid_items_urls[item_name][missing_fields_string]) < ItemValidator.MAX_ITEM_URLS:
80 | self.invalid_items_urls[item_name][missing_fields_string].append(url)
81 |
82 |
83 | def increment_total_fields(self, fields):
84 | self.item_coverage['_SOP_OVERAL_STATS']['num_total_fields'] += len(fields)
85 |
86 | def increment_invalid_fields(self):
87 | self.item_coverage['_SOP_OVERAL_STATS']['num_invalid_fields'] += 1
88 |
89 | def increment_items(self):
90 | self.items += 1
91 | self.item_coverage['_SOP_OVERAL_STATS']['num_items'] += 1
92 |
93 | def increment_invalid_items(self):
94 | self.invalid_items += 1
95 | self.item_coverage['_SOP_OVERAL_STATS']['num_invalid_items'] += 1
96 |
97 | def get_item_coverage_data(self):
98 | return self.item_coverage
99 |
100 | def get_num_items(self):
101 | return self.items
102 |
103 | def get_num_invalid_items(self):
104 | return self.invalid_items
105 |
106 | def get_invalid_items_urls(self):
107 | return self.invalid_items_urls
108 |
109 | def get_field_coverage(self):
110 | overall_stats = self.item_coverage.get('_SOP_OVERAL_STATS')
111 | if overall_stats is None: return 0
112 | if overall_stats.get('num_total_fields', 0) == 0: return 0
113 | valid_fields = overall_stats.get('num_total_fields') - overall_stats.get('num_invalid_fields')
114 | return round((valid_fields / overall_stats.get('num_total_fields'))*100)
115 |
116 |
117 | @staticmethod
118 | def get_item_fields(item):
119 | return item.fields
120 |
121 | @staticmethod
122 | def get_item_name(item):
123 | return item.__class__.__name__
124 |
125 | @staticmethod
126 | def generate_fields_key(fields):
127 | missing_fields_string = ''
128 | for field in fields:
129 | if len(missing_fields_string) > 0:
130 | missing_fields_string += '&&'
131 | missing_fields_string += field
132 | return missing_fields_string
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
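
A sketch of the coverage bookkeeping in isolation (the item class and the request/response stub are hypothetical; only the two methods validate() actually calls are stubbed):

    import scrapy
    from scrapeops_scrapy.validators.item_validator import ItemValidator

    class Product(scrapy.Item):
        name = scrapy.Field()
        price = scrapy.Field()

    class FakeRequestResponse:
        # Stand-in for the SDK's normalizer wrapper.
        def get_domain(self):
            return 'example.com'

        def get_real_url(self):
            return 'https://example.com/p/1'

    validator = ItemValidator()
    validator.validate(FakeRequestResponse(), Product(name='Widget', price=None))

    print(validator.get_field_coverage())     # -> 50 (1 of 2 fields populated)
    print(validator.get_num_invalid_items())  # -> 1
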
--------------------------------------------------------------------------------
/scrapeops_scrapy/validators/response_validator.py:
--------------------------------------------------------------------------------
1 | from scrapy.http import Response
2 | from scrapeops_scrapy.utils.utils import get_header_size, get_status_size
3 | from random import randint
4 | import json
5 |
6 | class ResponseValidator(object):
7 |
8 | def __init__(self):
9 | pass
10 |
11 | @staticmethod
12 | def validate(request_response_object, response, domain_tests=None, generic_tests=None, geotargeting_tests=None):
13 | if domain_tests is not None:
14 | for test in domain_tests:
15 | if ResponseValidator.run_validation_test(request_response_object, response, test.get('validation_tests', [])) is False:
16 | request_response_object.failed_validation_test(test)
17 | break
18 |
19 | if generic_tests is not None:
20 | for test in generic_tests:
21 | if ResponseValidator.run_validation_test(request_response_object, response, test.get('validation_tests', [])) is False:
22 | request_response_object.failed_validation_test(test)
23 | break
24 |
25 |
26 | @staticmethod
27 | def run_validation_test(request_response_object, response, test_array):
28 | """
29 |         Returns True if the test passes, False if it fails.
30 | """
31 | fail_counter = 0
32 | for test in test_array:
33 |
34 | if test.get('test_type') == 'bytes_check':
35 | if ResponseValidator.bytes_check(response, test.get('threshold', 0), test.get('comparison_type')):
36 | fail_counter += 1
37 | else: return True
38 |
39 | if test.get('test_type') == 'response_length_check':
40 | if ResponseValidator.response_length_check(ResponseValidator.get_response_text(request_response_object, response), test.get('threshold', 0), test.get('comparison_type')):
41 | fail_counter += 1
42 | else: return True
43 |
44 | if test.get('test_type') == 'string_check' and test.get('test_location') == 'body':
45 | if ResponseValidator.string_check(ResponseValidator.get_response_text(request_response_object, response), test.get('text_check', ''), test.get('comparison_type'), text_slice=test.get('text_slice')):
46 | fail_counter += 1
47 | else: return True
48 |
49 | if test.get('test_type') == 'string_check' and test.get('test_location') == 'user_agent':
50 | pass
51 |
52 | if test.get('test_type') == 'string_check' and test.get('test_location') == 'url':
53 | if ResponseValidator.string_check(request_response_object.get_real_url(), test.get('text_check', ''), test.get('comparison_type'), text_slice=test.get('text_slice')):
54 | fail_counter += 1
55 | else: return True
56 |
57 |
58 | if fail_counter == len(test_array):
59 | return False
60 | return True
61 |
62 |
63 | @staticmethod
64 | def get_domain_tests(request_response_object, domains):
65 | domain_details = domains.get(request_response_object.get_domain())
66 | if domain_details is not None:
67 | return domain_details.get('validation_details')
68 | return None
69 |
70 |
71 | @staticmethod
72 | def failed_scan(request_response_object, domains):
73 | domain_details = domains.get(request_response_object.get_domain())
74 | if domain_details is not None:
75 | failed_scan_ratio = domain_details.get('failed_generic_scan', 0)
76 | if failed_scan_ratio == 0: return False
77 | if failed_scan_ratio == 1: return True
78 | if randint(1, failed_scan_ratio) == 1: return True
79 | return False
80 |
81 |
82 | @staticmethod
83 | def get_response_text(request_response_object, response):
84 | try:
85 | if isinstance(response, Response):
86 | if request_response_object.is_json_response():
87 | json_response = json.loads(response.text)
88 | json_response_keys = request_response_object.get_json_response_keys()
89 | for key in json_response_keys:
90 | json_response = json_response.get(key)
91 | return json_response or ''
92 | return response.text
93 | else: return ''
94 | except AttributeError:
95 | return ''
96 |
97 |
98 | @staticmethod
99 | def string_check(text, text_check, comparison, text_slice=None):
100 | if isinstance(text, str):
101 | if text_slice is not None:
102 | text = ResponseValidator.string_slice(text, text_slice)
103 | if comparison == 'contains' and text_check in text:
104 | return True
105 | elif comparison == 'not_contain' and text_check not in text:
106 | return True
107 | return False
108 |
109 |
110 | @staticmethod
111 | def string_slice(text, text_slice):
112 | if text_slice.get('active'):
113 | if (text_slice.get('slice_type') == 'first') and (len(text) > 0):
114 | return text[:text_slice.get('slice_upper_threshold', len(text))]
115 | if (text_slice.get('slice_type') == 'last') and (len(text) > 0):
116 |                 return text[-text_slice.get('slice_lower_threshold', 0):]
117 | if text_slice.get('slice_type') == 'range':
118 | return text[text_slice.get('slice_lower_threshold', 0):text_slice.get('slice_upper_threshold', len(text))]
119 | return text
120 |
121 |
122 | @staticmethod
123 | def bytes_check(response, threshold, comparison):
124 | if threshold == 0: return False
125 | reslen = len(response.body) + get_header_size(response.headers) + get_status_size(response.status) + 4
126 | return ResponseValidator.comparison_operators(reslen, threshold, comparison)
127 |
128 |
129 | @staticmethod
130 | def response_length_check(text, threshold, comparison):
131 | if threshold == 0: return False
132 | response_text_length = len(text)
133 | return ResponseValidator.comparison_operators(response_text_length, threshold, comparison)
134 |
135 |
136 | @staticmethod
137 | def comparison_operators(value, threshold, comparison):
138 | if comparison == 'less_than':
139 | return value < threshold
140 | if comparison == 'less_than_equal':
141 | return value <= threshold
142 | if comparison == 'greater_than':
143 | return value > threshold
144 | if comparison == 'greater_than_equal':
145 | return value >= threshold
146 | if comparison == 'equals':
147 | return value == threshold
148 | if comparison == 'not_equal':
149 | return value != threshold
150 | return False
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
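
A sketch of running a single validation test by hand (the test dict and stub below are hypothetical, but mirror the keys the validator reads). A 'contains' string check that matches counts towards failure, so run_validation_test() returns False for this blocked page.

    from scrapy.http import HtmlResponse
    from scrapeops_scrapy.validators.response_validator import ResponseValidator

    class FakeRequestResponse:
        # Stand-in for the SDK's normalizer wrapper; only what this test path needs.
        def is_json_response(self):
            return False

        def get_json_response_keys(self):
            return []

        def get_real_url(self):
            return 'https://example.com/blocked'

    response = HtmlResponse(url='https://example.com/blocked',
                            body=b'<html>Access Denied</html>', encoding='utf-8')

    ban_test = [{'test_type': 'string_check', 'test_location': 'body',
                 'text_check': 'Access Denied', 'comparison_type': 'contains'}]

    print(ResponseValidator.run_validation_test(FakeRequestResponse(), response, ban_test))
    # -> False: the body contains the ban text, so the response fails validation
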
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 |
4 | VERSION = '0.5.6'
5 | DESCRIPTION = 'ScrapeOps Scrapy SDK is a monitoring tool for your Scrapy spiders.'
6 |
7 | setup(name='scrapeops_scrapy',
8 | description=DESCRIPTION,
9 | long_description=DESCRIPTION,
10 | author="ScrapeOps",
11 | author_email="info@scrapeops.io",
12 | version=VERSION,
13 | license="BSD",
14 | url="https://github.com/ScrapeOps/scrapeops-scrapy-sdk",
15 | packages=find_packages(),
16 | install_requires=[
17 | "tld>=0.13",
18 | "requests>=2.32.0",
19 | "json5>=0.9.13",
20 |         # urllib3 is pinned because older requests releases (e.g. 2.29.0) do not support urllib3 2.0.0 - https://github.com/psf/requests/issues/6432
21 | "urllib3>=1.26.14",
22 | "itemadapter>=0.8.0",
23 | ],
24 | classifiers=[
25 | "Programming Language :: Python",
26 | "Programming Language :: Python :: 3",
27 | "Programming Language :: Python :: 3.8",
28 | "Programming Language :: Python :: 3.9",
29 | "Programming Language :: Python :: 3.10",
30 | "Programming Language :: Python :: 3.11",
31 | "License :: OSI Approved :: BSD License",
32 | "Operating System :: OS Independent",
33 | "Intended Audience :: Developers",
34 | ],
35 | python_requires=">=3.8",
36 | )
--------------------------------------------------------------------------------