├── .gitignore
├── .travis.yml
├── CHANGELOG.rst
├── LICENSE.txt
├── Makefile
├── README.rst
├── requirements
    └── dev.txt
├── scrapy_html_storage
    ├── __init__.py
    └── filesys.py
├── setup.cfg
├── setup.py
└── tests
    └── test_html_storage_middleware.py


/.gitignore:
--------------------------------------------------------------------------------
1 | pyenv/
2 | **/*.pyc
3 | .coverage
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | python:
 4 |   - "2.7"
 5 | 
 6 | script:
 7 |   - make test
 8 | 
 9 | after_success:
10 |   COVERALLS_REPO_TOKEN=0SNOWwX2OjKNbZHPZD7HbtfCHYzY7dSZG coveralls
11 | 


--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Change Log
 3 | ==========
 4 | 
 5 | All notable changes to this project will be documented in this file.
 6 | This project adheres to `Semantic Versioning <http://semver.org/>`_.
 7 | 
 8 | [0.4.0] - 2018-06-24
 9 | ====================
10 | 
11 | Fixed
12 | -----
13 | 
14 | * Middleware to work with latest Scrapy - 1.5.0.
15 | 
16 | [0.3.0] - 2016-11-11
17 | ====================
18 | 
19 | Changed
20 | -------
21 | 
22 | * File name. .gz extension is not appended any more - you have to specify
23 |   full file name.
24 | 
25 | [0.2.0] - 2016-04-19
26 | ====================
27 | 
28 | Added
29 | ------
30 | 
31 | * Option to gzip response content before storing to disk.
32 | 
33 | [0.1.0] - 2016-03-29
34 | ====================
35 | 
36 | Added
37 | -----
38 | 
39 | * Initial working version.
40 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Povilas Balciunas
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | virtualenv_dir := pyenv
 2 | pip := $(virtualenv_dir)/bin/pip
 3 | pytest := $(virtualenv_dir)/bin/py.test
 4 | pylint := $(virtualenv_dir)/bin/pylint
 5 | coverage := $(virtualenv_dir)/bin/coverage
 6 | 
 7 | 
 8 | qa: lint test
 9 | .PHONY: qa
10 | 
11 | lint:
12 | 	$(pylint) scrapy_html_storage/
13 | .PHONY: lint
14 | 
15 | test: $(virtualenv_dir)
16 | 	PYTHONPATH=$(PYTHONPATH):. $(coverage) run \
17 | 		--source scrapy_html_storage $(pytest) -s tests
18 | 	$(coverage) report -m
19 | .PHONY: test
20 | 
21 | $(virtualenv_dir): requirements/dev.txt
22 | 	virtualenv $@
23 | 	$(pip) install -r $<
24 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | =====
 2 | About
 3 | =====
 4 | 
 5 | .. image:: https://travis-ci.org/povilasb/scrapy-html-storage.svg?branch=master
 6 | .. image:: https://coveralls.io/repos/github/povilasb/scrapy-html-storage/badge.svg?branch=master :target: https://coveralls.io/github/povilasb/scrapy-html-storage?branch=master
 7 | 
 8 | This is Scrapy downloader middleware that stores response HTMLs to disk.
 9 | 
10 | Usage
11 | =====
12 | 
13 | Turn downloader on, e.g. specifying it in `settings.py`::
14 | 
15 |     DOWNLOADER_MIDDLEWARES = {
16 |         'scrapy_html_storage.HtmlStorageMiddleware': 10,
17 |     }
18 | 
19 | None of responses by default are saved to disk.
20 | You must select for which requests the response HTMLs will be saved::
21 | 
22 |    def parse(self, response):
23 |         """Processes start urls.
24 | 
25 |         Args:
26 |             response (HtmlResponse): scrapy HTML response object.
27 |         """
28 |         yield scrapy.Request(
29 |             'http://target.com',
30 |             callback=self.parse_target,
31 |             meta={
32 |               'save_html': True,
33 |             }
34 |         )
35 | 
36 | The file path where HTML will be stored is resolved with spider method
37 | `response_html_path`. E.g.::
38 | 
39 |     class TargetSpider(scrapy.Spider):
40 |         def response_html_path(self, request):
41 |             """
42 |             Args:
43 |                 request (scrapy.http.request.Request): request that produced the
44 |                     response.
45 |             """
46 |             return 'html/last_response.html'
47 | 
48 | Configuration
49 | =============
50 | 
51 | HTML storage downloader middleware supports such options:
52 | 
53 | * **gzip_output** (bool) - if True, HTML output will be stored in gzip format.
54 |   Default is False.
55 | * **save_html_on_status** (list) - if not empty, sets list of response codes
56 |   whitelisted for html saving. If list is empty or not provided, all response
57 |   codes will be allowed for html saving.
58 | 
59 | Sample::
60 | 
61 |     HTML_STORAGE = {
62 |         'gzip_output': True,
63 |         'save_html_on_status': [200, 202]
64 |     }
65 | 


--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | pytest==2.7.2
2 | pyhamcrest==1.8.5
3 | mock==1.3.0
4 | pylint==1.5.4
5 | Scrapy==1.0.5
6 | coverage==4.0.3
7 | coveralls==1.1
8 | 


--------------------------------------------------------------------------------
/scrapy_html_storage/__init__.py:
--------------------------------------------------------------------------------
  1 | """Downloader middlewares.
  2 | """
  3 | 
  4 | import scrapy_html_storage.filesys as fs
  5 | 
  6 | 
  7 | class HtmlStorageMiddleware(object):
  8 |     """Scrapy downloader middleware that stores HTML files to local file system.
  9 |     """
 10 | 
 11 |     def __init__(self, settings):
 12 |         """
 13 |         Args:
 14 |             settings (scrapy.settings.Settings)
 15 |         """
 16 |         self.settings = settings.get('HTML_STORAGE', {})
 17 |         self.gzip_output = self.settings.get('gzip_output', False)
 18 |         self.save_html_on_codes = self.settings.get('save_html_on_codes', [])
 19 | 
 20 |     @classmethod
 21 |     def from_crawler(cls, crawler):
 22 |         """Contruct middleware with scrapy settings.
 23 | 
 24 |         Args:
 25 |             settings (scrapy.settings.Settings)
 26 | 
 27 |         Returns:
 28 |             HtmlStorageMiddleware:
 29 |         """
 30 |         return cls(crawler.settings)
 31 | 
 32 |     @classmethod
 33 |     def from_settings(self, settings):
 34 |         """Contruct middleware with scrapy settings.
 35 | 
 36 |         Args:
 37 |             settings (scrapy.settings.Settings)
 38 | 
 39 |         Returns:
 40 |             HtmlStorageMiddleware:
 41 |         """
 42 |         return HtmlStorageMiddleware(settings)
 43 | 
 44 | 
 45 |     def process_response(self, request, response, spider):
 46 |         """Stores response HTML body to file.
 47 | 
 48 |         Args:
 49 |             request (scrapy.http.request.Request): request which triggered
 50 |                 this response.
 51 |             response (scrapy.http.Response)
 52 |             spider: (scrapy.Spider): spider that triggered the request.
 53 |                 Spiders must set 'started_crawling' field to Unix timestamp.
 54 | 
 55 |         Returns:
 56 |             scrapy.http.response.Response: unmodified response object.
 57 |         """
 58 |         if self._should_save_html(request, response):
 59 |             self._save_html_to(spider.response_html_path(request), response.text)
 60 | 
 61 |         return response
 62 | 
 63 | 
 64 |     def _save_html_to(self, path, html_body):
 65 |         """Store html to file.
 66 | 
 67 |         Optionally file will be gzipped.
 68 | 
 69 |         Args:
 70 |             str(path): file path to save html to.
 71 |         """
 72 |         if self.gzip_output:
 73 |             fs.write_to_gzip(path, html_body)
 74 |         else:
 75 |             fs.write_to_file(path, html_body)
 76 | 
 77 | 
 78 |     def _should_save_html(self, request, response):
 79 |         """
 80 |         Args:
 81 |             request (scrapy.http.request.Request)
 82 |             response (scrapy.http.response.Response)
 83 | 
 84 |         Returns:
 85 |             bool: True if this request should be stored to disk, False otherwise.
 86 |         """
 87 |         return 'save_html' in request.meta and \
 88 |             should_save_html_according_response_code(
 89 |                 response.status,
 90 |                 self.save_html_on_codes
 91 |             )
 92 | 
 93 | 
 94 | def should_save_html_according_response_code(code, allowed_list):
 95 |     """
 96 |     Args:
 97 |         code (int): response status code
 98 |         allowed_list (list): list of response status codes allowed to save html
 99 | 
100 |     Returns:
101 |         bool: True if allowed_list is empty (save all responses), or response
102 |               code in allowed list.
103 |     """
104 |     return not allowed_list or code in allowed_list
105 | 


--------------------------------------------------------------------------------
/scrapy_html_storage/filesys.py:
--------------------------------------------------------------------------------
 1 | """File system related facilities.
 2 | """
 3 | 
 4 | import os
 5 | import gzip
 6 | 
 7 | 
 8 | def ensure_dir_exists(dir_path):
 9 |     """Create the specified directory if it does not exist.
10 | 
11 |     Creates all intermediate subdirectories needed for the leaf directory.
12 | 
13 |     Args:
14 |         dir_path (str): directory to be created.
15 |     """
16 |     if not os.path.exists(dir_path):
17 |         os.makedirs(dir_path)
18 | 
19 | 
20 | def write_to_file(fname, html_body):
21 |     """Writes text to file.
22 | 
23 |     Args:
24 |         fname (str): save text to this file.
25 |         html_body (str): results page HTML content.
26 |     """
27 |     dir_path = os.path.dirname(fname)
28 |     ensure_dir_exists(dir_path)
29 | 
30 |     with open(fname, 'w') as html_file:
31 |         html_file.write(html_body)
32 | 
33 | 
34 | def write_to_gzip(fname, html_body):
35 |     """Writes text to compressed file.
36 | 
37 |     Args:
38 |         fname (str): save compressed text to this file.
39 |         html_body (str): results page HTML content.
40 |     """
41 |     dir_path = os.path.dirname(fname)
42 |     ensure_dir_exists(dir_path)
43 | 
44 |     with gzip.open(fname, 'wb') as html_file:
45 |         html_file.write(html_body)
46 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     name='scrapy-html-storage',
 5 |     version='0.4.0',
 6 |     description='Scrapy downloader middleware that stores response HTML files to disk.',
 7 |     long_description=open('README.rst').read(),
 8 |     url='https://github.com/povilasb/scrapy-html-storage',
 9 |     author='Povilas Balciunas',
10 |     author_email='balciunas90@gmail.com',
11 |     license='MIT',
12 |     packages=['scrapy_html_storage'],
13 |     zip_safe=False
14 | )
15 | 


--------------------------------------------------------------------------------
/tests/test_html_storage_middleware.py:
--------------------------------------------------------------------------------
  1 | from hamcrest import assert_that, is_, has_properties
  2 | from mock import MagicMock, patch, ANY
  3 | import pytest
  4 | 
  5 | from scrapy.settings import Settings
  6 | 
  7 | from scrapy_html_storage import HtmlStorageMiddleware
  8 | 
  9 | 
 10 | def make_request_mock(save_html=False, query='', results_page=None):
 11 |     """Constructs HTTP Request mock object.
 12 |     """
 13 |     request_mock = MagicMock()
 14 |     request_mock.meta = {
 15 |         'save_html': save_html,
 16 |         'query': query,
 17 |         'results_page': results_page,
 18 |     }
 19 | 
 20 |     return request_mock
 21 | 
 22 | def make_response_mock(response_status):
 23 |     """ Constructs HTTP Response mock object.
 24 |     """
 25 |     response_mock = MagicMock()
 26 |     response_mock.status = response_status
 27 | 
 28 |     return response_mock
 29 | 
 30 | 
 31 | def make_allowed_response_codes_list():
 32 |     return range(200, 300)
 33 | 
 34 | 
 35 | def make_downloader(save_html_on_codes=[]):
 36 |     settings = Settings()
 37 |     settings.set('HTML_STORAGE', {
 38 |         'gzip_output': True,
 39 |         'save_html_on_codes': save_html_on_codes
 40 |     })
 41 |     return HtmlStorageMiddleware(settings)
 42 | 
 43 | 
 44 | @pytest.mark.parametrize('response_status,as_expected', [
 45 |     (200, True),
 46 |     (299, True),
 47 |     (300, False),
 48 |     (404, False),
 49 | ])
 50 | def test_should_save_html_returns_true_when_request_metainformation_has_special_key_set_and_appropriate_response_status(response_status, as_expected):
 51 |     request_mock = make_request_mock(save_html=True)
 52 |     response_mock = make_response_mock(response_status=response_status)
 53 |     downloader = make_downloader(make_allowed_response_codes_list())
 54 | 
 55 |     save = downloader._should_save_html(request_mock, response_mock)
 56 | 
 57 |     assert_that(save, is_(as_expected))
 58 | 
 59 | 
 60 | @pytest.mark.parametrize('response_status', [200, 299, 300, 404])
 61 | def test_should_save_html_returns_true_when_request_metainformation_has_special_key_set_and_allowed_resonse_codes_list_is_empty(response_status):
 62 |     request_mock = make_request_mock(save_html=True)
 63 |     response_mock = make_response_mock(response_status=response_status)
 64 |     downloader = make_downloader()
 65 | 
 66 |     save = downloader._should_save_html(request_mock, response_mock)
 67 | 
 68 |     assert_that(save, is_(True))
 69 | 
 70 | 
 71 | @patch('scrapy_html_storage.filesys.write_to_file')
 72 | def test_process_response_stores_response_body_to_file_if_request_asks_for_it(
 73 |         write_to_file_mock):
 74 |     downloader = HtmlStorageMiddleware(Settings())
 75 |     request_mock = make_request_mock(save_html=True)
 76 |     response_mock = make_response_mock(response_status=200)
 77 | 
 78 |     downloader.process_response(request_mock, response_mock, MagicMock())
 79 | 
 80 |     assert_that(write_to_file_mock.call_count, is_(1))
 81 | 
 82 | 
 83 | @patch('scrapy_html_storage.filesys.write_to_file')
 84 | def test_process_response_saves_response_html_to_file_resolved_by_spider(
 85 |         write_to_file_mock):
 86 |     downloader = HtmlStorageMiddleware(Settings())
 87 |     request_mock = make_request_mock(save_html=True)
 88 |     response_mock = make_response_mock(response_status=200)
 89 | 
 90 |     spider_mock = MagicMock()
 91 |     spider_mock.response_html_path.return_value = '/tmp/response.html'
 92 | 
 93 |     downloader.process_response(request_mock, response_mock, spider_mock)
 94 | 
 95 |     write_to_file_mock.assert_called_with('/tmp/response.html', ANY)
 96 | 
 97 | 
 98 | @patch('scrapy_html_storage.filesys.write_to_gzip')
 99 | def test_process_response_stores_response_body_to_gzip_file_if_this_setting_is_on(
100 |         write_to_gzip_mock):
101 |     downloader = HtmlStorageMiddleware(Settings())
102 |     downloader.gzip_output = True
103 |     request_mock = make_request_mock(save_html=True)
104 |     response_mock = make_response_mock(response_status=200)
105 | 
106 |     downloader.process_response(request_mock, response_mock, MagicMock())
107 | 
108 |     assert_that(write_to_gzip_mock.call_count, is_(1))
109 | 
110 | 
111 | def test_from_settings_constructs_middleware_with_the_specified_settings():
112 |     settings = Settings()
113 |     settings.set('HTML_STORAGE', {'test': 'settings'})
114 | 
115 |     downloader = HtmlStorageMiddleware.from_settings(settings)
116 | 
117 |     assert_that(downloader.settings, is_({'test': 'settings'}))
118 | 
119 | 
120 | def test_constructor_extracts_expected_settings():
121 |     settings = Settings()
122 |     save_html_on_codes = make_allowed_response_codes_list()
123 |     settings.set('HTML_STORAGE', {
124 |         'gzip_output': True,
125 |         'save_html_on_codes': save_html_on_codes
126 |     })
127 | 
128 |     downloader = HtmlStorageMiddleware(settings)
129 | 
130 |     assert_that(downloader, has_properties(dict(
131 |         gzip_output=True,
132 |         save_html_on_codes=save_html_on_codes
133 |     )))
134 | 
135 | 
136 | def test_constructor_sets_empty_settings_when_middleware_settings_are_not_specified():
137 |     settings = Settings()
138 | 
139 |     downloader = HtmlStorageMiddleware(settings)
140 | 
141 |     assert_that(downloader.settings, is_({}))
142 | 
143 | 
144 | @pytest.mark.parametrize('setting_name,expected', [
145 |     ('gzip_output', False),
146 | ])
147 | def test_contructor_sets_default_settings_values_when_no_settings_are_specified(
148 |         setting_name, expected):
149 |     settings = Settings()
150 |     settings.set('HTML_STORAGE', {})
151 | 
152 |     downloader = HtmlStorageMiddleware(settings)
153 | 
154 |     assert_that(downloader.__dict__[setting_name], is_(expected))
155 | 


--------------------------------------------------------------------------------