├── .gitignore ├── setup.cfg ├── requirements └── dev.txt ├── .github └── workflows │ └── qa.yaml ├── setup.py ├── Makefile ├── CHANGELOG.rst ├── LICENSE.txt ├── scrapy_html_storage ├── filesys.py └── __init__.py ├── README.md └── tests └── test_html_storage_middleware.py /.gitignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | **/*.pyc 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | pytest==8.3.5 2 | pyhamcrest==1.8.5 3 | mock==1.3.0 4 | pylint==1.5.4 5 | Scrapy==2.11.2 6 | pyright==1.1.403 -------------------------------------------------------------------------------- /.github/workflows/qa.yaml: -------------------------------------------------------------------------------- 1 | name: QA 2 | 3 | on: 4 | push: 5 | branches: [ main, master ] 6 | pull_request: 7 | branches: [ main, master ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | 13 | container: python:3.8.10 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Check types 19 | run: make check-types 20 | 21 | - name: Run tests 22 | run: make test 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='scrapy-html-storage', 5 | version='0.4.0', 6 | description='Scrapy downloader middleware that stores response HTML files to disk.', 7 | long_description=open('README.md').read(), 8 | url='https://github.com/povilasb/scrapy-html-storage', 9 | author='Povilas Balciunas', 10 | author_email='balciunas90@gmail.com', 11 | license='MIT', 12 | packages=['scrapy_html_storage'], 13 | zip_safe=False 14 | ) 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | virtualenv_dir := .venv 2 | pip := $(virtualenv_dir)/bin/pip 3 | pytest := $(virtualenv_dir)/bin/py.test 4 | pylint := $(virtualenv_dir)/bin/pylint 5 | pyright := $(virtualenv_dir)/bin/pyright 6 | 7 | qa: lint test 8 | .PHONY: qa 9 | 10 | lint: 11 | $(pylint) scrapy_html_storage/ 12 | .PHONY: lint 13 | 14 | test: $(virtualenv_dir) 15 | PYTHONPATH=$(PYTHONPATH):. $(pytest) -s tests 16 | .PHONY: test 17 | 18 | check-types: $(virtualenv_dir) 19 | . $(virtualenv_dir)/bin/activate && \ 20 | PYTHONPATH=$(PYTHONPATH):. $(pyright) scrapy_html_storage 21 | .PHONY: check-types 22 | 23 | $(virtualenv_dir): requirements/dev.txt 24 | python3 -m venv $@ 25 | $(pip) install -r $< 26 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Change Log 3 | ========== 4 | 5 | All notable changes to this project will be documented in this file. 6 | This project adheres to `Semantic Versioning `_. 7 | 8 | [0.4.0] - 2018-06-24 9 | ==================== 10 | 11 | Fixed 12 | ----- 13 | 14 | * Middleware to work with latest Scrapy - 1.5.0. 15 | 16 | [0.3.0] - 2016-11-11 17 | ==================== 18 | 19 | Changed 20 | ------- 21 | 22 | * File name. .gz extension is not appended any more - you have to specify 23 | full file name. 24 | 25 | [0.2.0] - 2016-04-19 26 | ==================== 27 | 28 | Added 29 | ------ 30 | 31 | * Option to gzip response content before storing to disk. 32 | 33 | [0.1.0] - 2016-03-29 34 | ==================== 35 | 36 | Added 37 | ----- 38 | 39 | * Initial working version. 40 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Povilas Balciunas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /scrapy_html_storage/filesys.py: -------------------------------------------------------------------------------- 1 | """File system related facilities. 2 | """ 3 | 4 | import os 5 | import gzip 6 | 7 | 8 | def ensure_dir_exists(dir_path): 9 | """Create the specified directory if it does not exist. 10 | 11 | Creates all intermediate subdirectories needed for the leaf directory. 12 | 13 | Args: 14 | dir_path (str): directory to be created. 15 | """ 16 | if not os.path.exists(dir_path): 17 | os.makedirs(dir_path) 18 | 19 | 20 | def write_to_file(fname, html_body): 21 | """Writes text to file. 22 | 23 | Args: 24 | fname (str): save text to this file. 25 | html_body (str): results page HTML content. 26 | """ 27 | dir_path = os.path.dirname(fname) 28 | ensure_dir_exists(dir_path) 29 | 30 | with open(fname, 'w') as html_file: 31 | html_file.write(html_body) 32 | 33 | 34 | def write_to_gzip(fname, html_body): 35 | """Writes text to compressed file. 36 | 37 | Args: 38 | fname (str): save compressed text to this file. 39 | html_body (str): results page HTML content. 40 | """ 41 | dir_path = os.path.dirname(fname) 42 | ensure_dir_exists(dir_path) 43 | 44 | with gzip.open(fname, 'wb') as html_file: 45 | html_file.write(html_body) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A [Scrapy downloader middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html) that stores response HTMLs to disk. 4 | 5 | ## Usage 6 | 7 | Turn downloader on, e.g. specifying it in `settings.py`: 8 | 9 | ```python 10 | DOWNLOADER_MIDDLEWARES = { 11 | 'scrapy_html_storage.HtmlStorageMiddleware': 10, 12 | } 13 | ``` 14 | 15 | None of responses by default are saved to disk. 16 | You must select for which requests the response HTMLs will be saved: 17 | 18 | ```python 19 | def parse(self, response): 20 | """Processes start urls. 21 | 22 | Args: 23 | response (HtmlResponse): scrapy HTML response object. 24 | """ 25 | yield scrapy.Request( 26 | 'http://target.com', 27 | callback=self.parse_target, 28 | meta={ 29 | 'save_html': True, 30 | } 31 | ) 32 | ``` 33 | 34 | The file path where HTML will be stored is resolved with spider method 35 | `response_html_path`. E.g.: 36 | 37 | ```python 38 | class TargetSpider(scrapy.Spider): 39 | def response_html_path(self, request): 40 | """ 41 | Args: 42 | request (scrapy.http.request.Request): request that produced the 43 | response. 44 | """ 45 | return 'html/last_response.html' 46 | ``` 47 | 48 | ## Configuration 49 | 50 | HTML storage downloader middleware supports such options: 51 | 52 | * **gzip_output** (bool) - if True, HTML output will be stored in gzip format. 53 | Default is False. 54 | * **save_html_on_status** (list) - if not empty, sets list of response codes 55 | whitelisted for html saving. If list is empty or not provided, all response 56 | codes will be allowed for html saving. 57 | 58 | Sample: 59 | 60 | ```python 61 | HTML_STORAGE = { 62 | "gzip_output": True, 63 | "save_html_on_status": [200, 202], 64 | "save_by_url" = [ 65 | "https://website.com/index.html", 66 | ".*", 67 | "website.com/section-\w+.html", 68 | ] 69 | } 70 | ``` -------------------------------------------------------------------------------- /scrapy_html_storage/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import typing as t 3 | 4 | from scrapy.http import Request, Response 5 | from scrapy.settings import Settings 6 | from scrapy.crawler import Crawler 7 | from scrapy.spiders import Spider 8 | 9 | import scrapy_html_storage.filesys as fs 10 | 11 | 12 | class HtmlStorageMiddleware(object): 13 | """Scrapy downloader middleware that stores HTML files to local file system. 14 | """ 15 | 16 | def __init__(self, settings: Settings): 17 | self.settings = settings.get('HTML_STORAGE', {}) 18 | self.gzip_output = self.settings.get('gzip_output', False) 19 | self.save_html_on_codes = self.settings.get('save_html_on_codes', []) 20 | self._save_by_url = [re.compile(pattern) for pattern in self.settings.get('save_by_url', [])] 21 | 22 | @classmethod 23 | def from_crawler(cls, crawler: Crawler) -> "HtmlStorageMiddleware": 24 | return cls(crawler.settings) 25 | 26 | 27 | def process_response(self, request: Request, response: Response, spider: Spider) -> Response: 28 | """Stores response HTML body to file.""" 29 | if self._should_save_html(request, response): 30 | self._save_html_to( 31 | spider.response_html_path(request), # type: ignore - it is not defined in Spider, but exists during the runtime. 32 | response.text, 33 | ) 34 | 35 | return response 36 | 37 | 38 | def _save_html_to(self, path: str, html_body: str) -> None: 39 | """Store html to file. 40 | 41 | Optionally file will be gzipped. 42 | 43 | Args: 44 | str(path): file path to save html to. 45 | """ 46 | if self.gzip_output: 47 | fs.write_to_gzip(path, html_body) 48 | else: 49 | fs.write_to_file(path, html_body) 50 | 51 | 52 | def _should_save_html(self, request: Request, response: Response) -> bool: 53 | """Check if this request should be stored to disk.""" 54 | if not should_save_html_according_response_code( 55 | response.status, 56 | self.save_html_on_codes 57 | ): 58 | return False 59 | 60 | explicit_save_html = request.meta.get('save_html') 61 | if explicit_save_html is not None: 62 | return explicit_save_html 63 | 64 | for pattern in self._save_by_url: 65 | if pattern.match(request.url): 66 | return True 67 | 68 | return False 69 | 70 | 71 | 72 | 73 | def should_save_html_according_response_code(code: int, allowed_list: t.List[int]) -> bool: 74 | """ 75 | Args: 76 | code (int): response status code 77 | allowed_list (list): list of response status codes allowed to save html 78 | 79 | Returns: 80 | bool: True if allowed_list is empty (save all responses), or response 81 | code in allowed list. 82 | """ 83 | return not allowed_list or code in allowed_list 84 | -------------------------------------------------------------------------------- /tests/test_html_storage_middleware.py: -------------------------------------------------------------------------------- 1 | from hamcrest import assert_that, is_, has_properties 2 | from mock import MagicMock, patch, ANY 3 | import pytest 4 | 5 | from scrapy.settings import Settings 6 | 7 | from scrapy_html_storage import HtmlStorageMiddleware 8 | 9 | 10 | def make_request_mock(save_html=False, query='', results_page=None): 11 | """Constructs HTTP Request mock object. 12 | """ 13 | request_mock = MagicMock() 14 | request_mock.meta = { 15 | 'save_html': save_html, 16 | 'query': query, 17 | 'results_page': results_page, 18 | } 19 | 20 | return request_mock 21 | 22 | def make_response_mock(response_status): 23 | """ Constructs HTTP Response mock object. 24 | """ 25 | response_mock = MagicMock() 26 | response_mock.status = response_status 27 | 28 | return response_mock 29 | 30 | 31 | def make_allowed_response_codes_list(): 32 | return range(200, 300) 33 | 34 | 35 | def make_downloader(save_html_on_codes=[]): 36 | settings = Settings() 37 | settings.set('HTML_STORAGE', { 38 | 'gzip_output': True, 39 | 'save_html_on_codes': save_html_on_codes 40 | }) 41 | return HtmlStorageMiddleware(settings) 42 | 43 | 44 | @pytest.mark.parametrize('response_status,as_expected', [ 45 | (200, True), 46 | (299, True), 47 | (300, False), 48 | (404, False), 49 | ]) 50 | def test_should_save_html_returns_true_when_request_metainformation_has_special_key_set_and_appropriate_response_status(response_status, as_expected): 51 | request_mock = make_request_mock(save_html=True) 52 | response_mock = make_response_mock(response_status=response_status) 53 | downloader = make_downloader(make_allowed_response_codes_list()) 54 | 55 | save = downloader._should_save_html(request_mock, response_mock) 56 | 57 | assert_that(save, is_(as_expected)) 58 | 59 | 60 | @pytest.mark.parametrize('response_status', [200, 299, 300, 404]) 61 | def test_should_save_html_returns_true_when_request_metainformation_has_special_key_set_and_allowed_resonse_codes_list_is_empty(response_status): 62 | request_mock = make_request_mock(save_html=True) 63 | response_mock = make_response_mock(response_status=response_status) 64 | downloader = make_downloader() 65 | 66 | save = downloader._should_save_html(request_mock, response_mock) 67 | 68 | assert_that(save, is_(True)) 69 | 70 | 71 | @patch('scrapy_html_storage.filesys.write_to_file') 72 | def test_process_response_stores_response_body_to_file_if_request_asks_for_it( 73 | write_to_file_mock): 74 | downloader = HtmlStorageMiddleware(Settings()) 75 | request_mock = make_request_mock(save_html=True) 76 | response_mock = make_response_mock(response_status=200) 77 | 78 | downloader.process_response(request_mock, response_mock, MagicMock()) 79 | 80 | assert_that(write_to_file_mock.call_count, is_(1)) 81 | 82 | 83 | @patch('scrapy_html_storage.filesys.write_to_file') 84 | def test_process_response_saves_response_html_to_file_resolved_by_spider( 85 | write_to_file_mock): 86 | downloader = HtmlStorageMiddleware(Settings()) 87 | request_mock = make_request_mock(save_html=True) 88 | response_mock = make_response_mock(response_status=200) 89 | 90 | spider_mock = MagicMock() 91 | spider_mock.response_html_path.return_value = '/tmp/response.html' 92 | 93 | downloader.process_response(request_mock, response_mock, spider_mock) 94 | 95 | write_to_file_mock.assert_called_with('/tmp/response.html', ANY) 96 | 97 | 98 | @patch('scrapy_html_storage.filesys.write_to_gzip') 99 | def test_process_response_stores_response_body_to_gzip_file_if_this_setting_is_on( 100 | write_to_gzip_mock): 101 | downloader = HtmlStorageMiddleware(Settings()) 102 | downloader.gzip_output = True 103 | request_mock = make_request_mock(save_html=True) 104 | response_mock = make_response_mock(response_status=200) 105 | 106 | downloader.process_response(request_mock, response_mock, MagicMock()) 107 | 108 | assert_that(write_to_gzip_mock.call_count, is_(1)) 109 | 110 | 111 | def test_constructor_extracts_expected_settings(): 112 | settings = Settings() 113 | save_html_on_codes = make_allowed_response_codes_list() 114 | settings.set('HTML_STORAGE', { 115 | 'gzip_output': True, 116 | 'save_html_on_codes': save_html_on_codes 117 | }) 118 | 119 | downloader = HtmlStorageMiddleware(settings) 120 | 121 | assert_that(downloader, has_properties(dict( 122 | gzip_output=True, 123 | save_html_on_codes=save_html_on_codes 124 | ))) 125 | 126 | 127 | def test_constructor_sets_empty_settings_when_middleware_settings_are_not_specified(): 128 | settings = Settings() 129 | 130 | downloader = HtmlStorageMiddleware(settings) 131 | 132 | assert_that(downloader.settings, is_({})) 133 | 134 | 135 | @pytest.mark.parametrize('setting_name,expected', [ 136 | ('gzip_output', False), 137 | ]) 138 | def test_contructor_sets_default_settings_values_when_no_settings_are_specified( 139 | setting_name, expected): 140 | settings = Settings() 141 | settings.set('HTML_STORAGE', {}) 142 | 143 | downloader = HtmlStorageMiddleware(settings) 144 | 145 | assert_that(downloader.__dict__[setting_name], is_(expected)) 146 | --------------------------------------------------------------------------------