├── .bumpversion.cfg ├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── requirements.txt ├── scrapy_dotpersistence.py ├── setup.py ├── tests ├── __init__.py ├── conftest.py └── test_dotpersistence.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.3.0 3 | commit = True 4 | tag = True 5 | tag_name = v{new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | build/ 4 | dist/ 5 | .tox/ 6 | venv/ 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | branches: 3 | only: 4 | - master 5 | - /^\d\.\d+$/ 6 | - /^v\d\.\d+\.\d+(rc\d+|dev\d+)?$/ 7 | matrix: 8 | include: 9 | - python: 2.7 10 | env: TOXENV=py27 11 | - python: 3.5 12 | env: TOXENV=py35 13 | - python: 3.6 14 | env: TOXENV=py36 15 | - python: 3.7 16 | env: TOXENV=py37 17 | install: 18 | - pip install -U tox twine wheel codecov 19 | script: tox 20 | after_success: 21 | - codecov 22 | cache: 23 | directories: 24 | - $HOME/.cache/pip 25 | deploy: 26 | provider: pypi 27 | distributions: sdist bdist_wheel 28 | user: scrapyplugins 29 | on: 30 | tags: true 31 | condition: "$TOXENV == py27 && $TRAVIS_TAG =~ ^v[0-9]+[.][0-9]+[.][0-9]+(rc[0-9]+|dev[0-9]+)?$" 32 | password: 33 | secure: spW448Oc+56tdfjDUQB9CZU0Zf+Aq+RL0AUjLn11dYQSXVmED/wek6wYWWki9nGGozrucWM1VMK9n1mvZdBk4NKTSWkWcCdAlhcaycFnqJU5LMXeqv8wQZPk9vHPB+/pI0siUPCvBbIdUetX9jQsyzvG5yxj00R5c28ZqtE4QjzsUfa5pOqaqlHF65VKoJrIQWNxWUFodbgu8bbWhiZzcRMQwjC5G5HRNVAhPC7LORndXPrUqczl3RwlPfuq0OoHORaiWdevo/we8qQvBlYXi4Gb5Vqqhrg83a5mef8OajF8vfxLPXLwbHnyxWvzaCTP2Lnc7HFCUD5XVbG6OLGZJnFat0vBQmU+PJ5Ovpu0iEieMlW2qxkV/6Jk6VPzxgZDK3L6vaN7yLnGPWPnG7mG9Yka8TNCoCRzzvLVp7wCBWXVNRlNe+OAd4wHvacuAna+ki1mmJQu0pu4kKu12t1dZwS8AmqY/auIGUM09HeAr9U5noTQdwCIASTjsafs8wnlSLLfNrUYrb6Fd8fmjIcA7cfGXq73mtwuokNVJS4NK+iWYV+p19L1Zmi8km08PRqKo3anWCBkEQGwzvrWD7WRIrHsAXB4w+qv2zNFaCo3Xm3puJvO8yvWTn944K8D5ZS6qooQEDKrWNb5QCPTa1Z2w0uAR+0rgTJk6WwCE7sIhCg= 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Scrapy project 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of scrapy-dotpersistence nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =====================
2 | scrapy-dotpersistence
3 | =====================
4 | 
5 | Scrapy extension to sync the `.scrapy` folder to an S3 bucket.
6 | 
7 | Installation
8 | ============
9 | 
10 | You can install scrapy-dotpersistence using pip::
11 | 
12 |     pip install scrapy-dotpersistence
13 | 
14 | You can then enable the extension in your `settings.py`::
15 | 
16 |     EXTENSIONS = {
17 |         ...
18 |         'scrapy_dotpersistence.DotScrapyPersistence': 0
19 |     }
20 | 
21 | How to use it
22 | =============
23 | 
24 | Enable the extension through `settings.py`::
25 | 
26 |     DOTSCRAPY_ENABLED = True
27 | 
28 | Configure the extension through `settings.py`::
29 | 
30 |     ADDONS_AWS_ACCESS_KEY_ID = "ABC"
31 |     ADDONS_AWS_SECRET_ACCESS_KEY = "DEF"
32 |     ADDONS_AWS_USERNAME = "username"
33 |     ADDONS_S3_BUCKET = "test-bucket-name"
34 | 
35 | You can change the persistence folder path through the `DOTSCRAPY_DIR` environment variable::
36 | 
37 |     export DOTSCRAPY_DIR='/tmp/.scrapy'
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy
2 | 
--------------------------------------------------------------------------------
/scrapy_dotpersistence.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import subprocess
4 | from scrapy.exceptions import NotConfigured
5 | from scrapy import signals
6 | 
7 | 
8 | logger = logging.getLogger(__name__)
9 | 
10 | 
11 | class DotScrapyPersistence(object):
12 | 
13 |     @classmethod
14 |     def from_crawler(cls, crawler):
15 |         settings = crawler.settings
16 |         enabled = (settings.getbool('DOTSCRAPY_ENABLED') or
17 |                    settings.get('DOTSCRAPYPERSISTENCE_ENABLED'))
18 |         if not enabled:
19 |             raise NotConfigured
20 | 
21 |         bucket = settings.get('ADDONS_S3_BUCKET')
22 |         if bucket is None:
23 |             raise NotConfigured("ADDONS_S3_BUCKET is required")
24 | 
25 |         aws_access_key_id = settings.get('ADDONS_AWS_ACCESS_KEY_ID')
26 |         if aws_access_key_id is None:
27 |             raise NotConfigured("ADDONS_AWS_ACCESS_KEY_ID is required")
28 | 
29 |         aws_secret_access_key = settings.get('ADDONS_AWS_SECRET_ACCESS_KEY')
30 |         if aws_secret_access_key is None:
31 |             raise NotConfigured("ADDONS_AWS_SECRET_ACCESS_KEY is required")
32 | 
33 |         return cls(crawler, bucket, aws_access_key_id, aws_secret_access_key)
34 | 
35 |     def __init__(self, crawler, bucket, aws_access_key_id, aws_secret_access_key):
36 |         self._bucket = bucket
37 |         self.AWS_ACCESS_KEY_ID = aws_access_key_id
38 |         self.AWS_SECRET_ACCESS_KEY = aws_secret_access_key
39 | 
40 |         self._aws_username = 
crawler.settings.get('ADDONS_AWS_USERNAME') 41 | self._projectid = os.environ.get('SCRAPY_PROJECT_ID') 42 | self._spider = os.environ.get('SCRAPY_SPIDER', crawler.spider.name) 43 | self._localpath = os.environ.get( 44 | 'DOTSCRAPY_DIR', os.path.join(os.getcwd(), '.scrapy/')) 45 | self._env = { 46 | 'HOME': os.getenv('HOME'), 47 | 'PATH': os.getenv('PATH'), 48 | 'AWS_ACCESS_KEY_ID': self.AWS_ACCESS_KEY_ID, 49 | 'AWS_SECRET_ACCESS_KEY': self.AWS_SECRET_ACCESS_KEY 50 | } 51 | self._load_data() 52 | crawler.signals.connect(self._store_data, signals.engine_stopped) 53 | 54 | @property 55 | def _s3path(self): 56 | path = "/".join( 57 | filter(None, [self._bucket, self._aws_username, self._projectid]) 58 | ) 59 | return "s3://{0}/dot-scrapy/{1}/".format(path, self._spider) 60 | 61 | def _load_data(self): 62 | logger.info('Syncing .scrapy directory from %s' % self._s3path) 63 | cmd = ['aws', 's3', 'sync', self._s3path, self._localpath] 64 | self._call(cmd) 65 | 66 | def _store_data(self): 67 | # check for reason status here? 68 | logger.info('Syncing .scrapy directory to %s' % self._s3path) 69 | cmd = ['aws', 's3', 'sync', '--delete', 70 | self._localpath, self._s3path] 71 | self._call(cmd) 72 | 73 | def _call(self, cmd): 74 | p = subprocess.Popen(cmd, env=self._env, 75 | stdout=subprocess.PIPE, 76 | stderr=subprocess.STDOUT) 77 | stdout, _ = p.communicate() 78 | retcode = p.wait() 79 | if retcode != 0: 80 | msg = ('Failed to sync .scrapy: %(cmd)s\n%(stdout)s' % 81 | {'cmd': cmd, 'stdout': stdout[-1000:]}) 82 | logger.error(msg) 83 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='scrapy-dotpersistence', 5 | version='0.3.0', 6 | url='https://github.com/scrapy-plugins/scrapy-dotpersistence', 7 | description='Scrapy extension to sync `.scrapy` folder to an S3 bucket', 8 | long_description=open('README.rst').read(), 9 | author='Scrapy developers', 10 | license='BSD', 11 | py_modules=['scrapy_dotpersistence'], 12 | zip_safe=False, 13 | author_email='opensource@scrapinghub.com', 14 | platforms=['Any'], 15 | classifiers=[ 16 | 'Framework :: Scrapy', 17 | 'Development Status :: 5 - Production/Stable', 18 | 'Intended Audience :: Developers', 19 | 'License :: OSI Approved :: BSD License', 20 | 'Operating System :: OS Independent', 21 | 'Programming Language :: Python', 22 | 'Programming Language :: Python :: 2', 23 | 'Programming Language :: Python :: 2.7', 24 | 'Programming Language :: Python :: 3', 25 | 'Programming Language :: Python :: 3.5', 26 | 'Programming Language :: Python :: 3.6', 27 | 'Programming Language :: Python :: 3.7', 28 | 'Topic :: Utilities', 29 | ], 30 | install_requires=[ 31 | 'Scrapy>=1.0.3', 32 | 'awscli>=1.10.51', 33 | ], 34 | ) 35 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-dotpersistence/9b55f05e80809e13b3da9e9dc69416848cbc5fc1/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from scrapy.utils.test import get_crawler 3 | 4 | 5 | @pytest.fixture 6 | def settings(): 7 | """ Returns a dictionary with all required settings 
defined for the extension to work correctly """
8 |         return {
9 |             "DOTSCRAPY_ENABLED": True,
10 |             "ADDONS_S3_BUCKET": "s3_bucket",
11 |             "ADDONS_AWS_ACCESS_KEY_ID": "s3_access_key_id",
12 |             "ADDONS_AWS_SECRET_ACCESS_KEY": "s3_secret_access_key",
13 |         }
14 | 
15 | 
16 | @pytest.fixture
17 | def get_test_crawler():
18 |     def _crawler(settings_dict=None):
19 |         crawler = get_crawler(settings_dict=settings_dict)
20 |         crawler.spider = crawler._create_spider("test_spider")
21 |         return crawler
22 | 
23 |     return _crawler
24 | 
--------------------------------------------------------------------------------
/tests/test_dotpersistence.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase
3 | 
4 | import mock
5 | import pytest
6 | from scrapy.exceptions import NotConfigured
7 | from scrapy.settings import Settings
8 | 
9 | from scrapy_dotpersistence import DotScrapyPersistence
10 | 
11 | 
12 | class DotScrapyPersistenceTestCase(TestCase):
13 | 
14 |     def setUp(self):
15 |         self.mocked_proc = mock.MagicMock()
16 |         self.mocked_proc.communicate.return_value = ([], None)
17 |         self.mocked_proc.wait.return_value = 0
18 |         self.mocked_popen = mock.Mock()
19 |         self.mocked_popen.return_value = self.mocked_proc
20 |         self.patch = mock.patch('subprocess.Popen', self.mocked_popen)
21 |         crawler_mock = mock.Mock()
22 |         crawler_mock.settings = Settings({
23 |             'DOTSCRAPY_ENABLED': True,
24 |             'ADDONS_S3_BUCKET': 'test-bucket',
25 |             'ADDONS_AWS_ACCESS_KEY_ID': 'access-key',
26 |             'ADDONS_AWS_SECRET_ACCESS_KEY': 'secret-key',
27 |             'ADDONS_AWS_USERNAME': 'test-user',
28 |         })
29 |         os.environ.update({
30 |             'SCRAPY_JOB': '123/45/67',
31 |             'SCRAPY_PROJECT_ID': '123',
32 |             'SCRAPY_SPIDER': 'testspider',
33 |             'HOME': '/home/user',
34 |             'DOTSCRAPY_DIR': '/tmp/.scrapy',
35 |         })
36 |         self.patch.start()
37 |         self.instance = DotScrapyPersistence.from_crawler(crawler_mock)
38 | 
39 |     def test_from_crawler(self):
40 |         crawler_mock = mock.Mock()
41 |         crawler_mock.settings = Settings()
42 |         self.assertRaises(NotConfigured,
43 |                           DotScrapyPersistence.from_crawler,
44 |                           crawler_mock)
45 |         # add needed settings for from_crawler()
46 |         crawler_mock.settings.set('DOTSCRAPY_ENABLED', True)
47 |         crawler_mock.settings.set('ADDONS_S3_BUCKET', 's3-test-bucket')
48 |         crawler_mock.settings.set('ADDONS_AWS_ACCESS_KEY_ID', 's3-access-key')
49 |         crawler_mock.settings.set('ADDONS_AWS_SECRET_ACCESS_KEY', 's3-secret-key')
50 |         instance = DotScrapyPersistence.from_crawler(crawler_mock)
51 |         assert isinstance(instance, DotScrapyPersistence)
52 | 
53 |     def test_init(self):
54 |         assert self.instance._bucket == 'test-bucket'
55 |         assert self.instance._aws_username == 'test-user'
56 |         assert self.instance._projectid == '123'
57 |         assert self.instance._spider == 'testspider'
58 |         assert self.instance._localpath == '/tmp/.scrapy'
59 |         assert self.instance._env == {
60 |             'HOME': '/home/user',
61 |             'PATH': os.environ['PATH'],
62 |             'AWS_ACCESS_KEY_ID': 'access-key',
63 |             'AWS_SECRET_ACCESS_KEY': 'secret-key',
64 |         }
65 |         # brief check that __init__ called _load_data
66 |         self.assertEqual(
67 |             self.instance._s3path,
68 |             's3://test-bucket/test-user/123/dot-scrapy/testspider/')
69 |         assert self.mocked_popen.called
70 | 
71 |     def test_load_data(self):
72 |         mocked_call = mock.Mock()
73 |         self.instance._call = mocked_call
74 | 
75 |         self.instance._load_data()
76 |         s3_path1 = 's3://test-bucket/test-user/123/dot-scrapy/testspider/'
77 |         self.assertEqual(self.instance._s3path, s3_path1)
78 |         mocked_call.assert_called_with(
79 |             ['aws', 's3', 'sync', s3_path1, '/tmp/.scrapy'])
80 | 
81 |         # test the other s3_path without the AWS username folder
82 |         mocked_call.reset_mock()
83 |         self.instance._aws_username = None
84 |         self.instance._load_data()
85 |         s3_path2 = 's3://test-bucket/123/dot-scrapy/testspider/'
86 |         self.assertEqual(self.instance._s3path, s3_path2)
87 |         mocked_call.assert_called_with(
88 |             ['aws', 's3', 'sync', s3_path2, '/tmp/.scrapy'])
89 | 
90 |     def test_store_data(self):
91 |         mocked_call = mock.Mock()
92 |         self.instance._call = mocked_call
93 |         self.instance._store_data()
94 |         mocked_call.assert_called_with(
95 |             ['aws', 's3', 'sync', '--delete', '/tmp/.scrapy',
96 |              's3://test-bucket/test-user/123/dot-scrapy/testspider/'])
97 | 
98 |     def test_call(self):
99 |         # reset mocks after the init-related call
100 |         self.mocked_popen.reset_mock()
101 |         self.mocked_proc.reset_mock()
102 |         self.instance._call(["test", "cmd"])
103 |         self.mocked_popen.assert_called_with(
104 |             ['test', 'cmd'], stderr=-2, stdout=-1,
105 |             env={'HOME': '/home/user',
106 |                  'PATH': os.environ['PATH'],
107 |                  'AWS_ACCESS_KEY_ID': 'access-key',
108 |                  'AWS_SECRET_ACCESS_KEY': 'secret-key'})
109 |         self.mocked_proc.communicate.assert_called_with()
110 |         self.mocked_proc.wait.assert_called_with()
111 | 
112 |     def tearDown(self):
113 |         self.patch.stop()
114 |         del os.environ['SCRAPY_JOB']
115 |         del os.environ['SCRAPY_PROJECT_ID']
116 |         del os.environ['SCRAPY_SPIDER']
117 |         del os.environ['HOME']
118 |         del os.environ['DOTSCRAPY_DIR']
119 | 
120 | 
121 | @pytest.mark.parametrize(
122 |     "enable_setting", ["DOTSCRAPY_ENABLED", "DOTSCRAPYPERSISTENCE_ENABLED"]
123 | )
124 | def test_extension_enabled(mocker, get_test_crawler, enable_setting, settings):
125 |     mocker.patch.object(DotScrapyPersistence, "_load_data", autospec=True)
126 |     del settings["DOTSCRAPY_ENABLED"]
127 | 
128 |     settings[enable_setting] = True
129 | 
130 |     crawler = get_test_crawler(settings_dict=settings)
131 |     try:
132 |         extension = DotScrapyPersistence.from_crawler(crawler)  # noqa
133 |     except NotConfigured as excinfo:
134 |         pytest.fail(str(excinfo))
135 | 
136 | 
137 | @pytest.mark.parametrize(
138 |     "disable_setting", ["DOTSCRAPY_ENABLED", "DOTSCRAPYPERSISTENCE_ENABLED"]
139 | )
140 | def test_extension_disabled(mocker, get_test_crawler, disable_setting, settings):
141 |     mocker.patch.object(DotScrapyPersistence, "_load_data", autospec=True)
142 |     del settings["DOTSCRAPY_ENABLED"]
143 | 
144 |     settings[disable_setting] = False
145 | 
146 |     crawler = get_test_crawler(settings_dict=settings)
147 |     with pytest.raises(NotConfigured):
148 |         extension = DotScrapyPersistence.from_crawler(crawler)  # noqa
149 | 
150 | 
151 | @pytest.mark.parametrize(
152 |     "missing_setting",
153 |     ["ADDONS_S3_BUCKET", "ADDONS_AWS_ACCESS_KEY_ID", "ADDONS_AWS_SECRET_ACCESS_KEY"],
154 | )
155 | def test_aws_required_settings(mocker, get_test_crawler, settings, missing_setting):
156 |     mocker.patch.object(DotScrapyPersistence, "_load_data", autospec=True)
157 | 
158 |     del settings[missing_setting]
159 | 
160 |     crawler = get_test_crawler(settings_dict=settings)
161 |     with pytest.raises(NotConfigured):
162 |         extension = DotScrapyPersistence.from_crawler(crawler)  # noqa
163 | 
164 | 
165 | def test_s3path_in_scrapy_cloud_without_aws_username(
166 |     mocker, monkeypatch, get_test_crawler, settings
167 | ):
168 |     mocker.patch.object(DotScrapyPersistence, "_load_data", autospec=True)
169 |     monkeypatch.setenv("SCRAPY_PROJECT_ID", "123")
170 |     monkeypatch.setenv("SCRAPY_SPIDER", "test_spider")
171 |     crawler = get_test_crawler(settings)
172 | extension = DotScrapyPersistence.from_crawler(crawler) 173 | 174 | assert extension._s3path == "s3://s3_bucket/123/dot-scrapy/test_spider/" 175 | 176 | 177 | def test_s3path_in_scrapy_cloud_with_aws_username( 178 | mocker, monkeypatch, get_test_crawler, settings 179 | ): 180 | mocker.patch.object(DotScrapyPersistence, "_load_data", autospec=True) 181 | monkeypatch.setenv("SCRAPY_PROJECT_ID", "123") 182 | monkeypatch.setenv("SCRAPY_SPIDER", "test_spider") 183 | 184 | settings["ADDONS_AWS_USERNAME"] = "username" 185 | crawler = get_test_crawler(settings) 186 | extension = DotScrapyPersistence.from_crawler(crawler) 187 | 188 | assert extension._s3path == "s3://s3_bucket/username/123/dot-scrapy/test_spider/" 189 | 190 | 191 | def test_s3path_running_locally_without_aws_username( 192 | mocker, get_test_crawler, settings 193 | ): 194 | mocker.patch.object(DotScrapyPersistence, "_load_data", autospec=True) 195 | 196 | crawler = get_test_crawler(settings) 197 | extension = DotScrapyPersistence.from_crawler(crawler) 198 | 199 | assert extension._s3path == "s3://s3_bucket/dot-scrapy/test_spider/" 200 | 201 | 202 | def test_s3path_running_locally_with_aws_username(mocker, get_test_crawler, settings): 203 | mocker.patch.object(DotScrapyPersistence, "_load_data", autospec=True) 204 | 205 | settings["ADDONS_AWS_USERNAME"] = "username" 206 | crawler = get_test_crawler(settings) 207 | extension = DotScrapyPersistence.from_crawler(crawler) 208 | 209 | assert extension._s3path == "s3://s3_bucket/username/dot-scrapy/test_spider/" 210 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # tox.ini 2 | [tox] 3 | envlist = py27, py35, py36, py37 4 | 5 | [testenv] 6 | deps = 7 | -rrequirements.txt 8 | pytest 9 | pytest-cov 10 | pytest-mock 11 | mock 12 | commands = 13 | py.test --cov=scrapy_dotpersistence --cov-report= 14 | --------------------------------------------------------------------------------