├── tests ├── __init__.py ├── image │ ├── __init__.py │ ├── test_tool.py │ ├── test_upload.py │ ├── test_check.py │ ├── conftest.py │ ├── utils.py │ ├── test_test.py │ ├── test_init.py │ ├── test_run.py │ ├── test_utils.py │ ├── test_build.py │ └── test_list.py ├── samples │ ├── deploy_egg_sample_project │ │ ├── test_project │ │ │ └── __init__.py │ │ └── setup.py │ ├── migrate-eggs.zip │ ├── migrate-eggs-no-eggs.zip │ ├── deploy_egg_sample_project.zip │ ├── deploy_egg_sample_repo.git.zip │ ├── custom-images-examples-master.zip │ └── deploy_reqs_sample_project │ │ ├── other-egg-0.2.1.zip │ │ └── inflect-0.2.5.tar.gz ├── requirements.txt ├── conftest.py ├── test_fetch_eggs.py ├── test_logout.py ├── test_end_to_end.py ├── test_deploy_reqs.py ├── utils.py ├── test_jobresource.py ├── test_cancel.py ├── test_login.py ├── test_bootstrap.py ├── test_deploy_egg.py ├── test_schedule.py └── test_migrate_eggs.py ├── freeze ├── tests │ ├── __init__.py │ ├── testproject │ │ ├── testproject │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── example.py │ │ ├── scrapinghub.yml │ │ └── scrapy.cfg │ ├── run.py │ └── fakeserver.py ├── spider-down.ico └── hooks │ ├── runtime-hooks.py │ ├── hook-scrapinghub.py │ └── hook-shub.py ├── docs ├── changes.rst ├── requirements.txt ├── _static │ └── theme_overrides.css ├── index.rst ├── quickstart.rst ├── scheduling.rst ├── deploying.rst └── Makefile ├── setup.cfg ├── shub ├── version.py ├── __init__.py ├── __main__.py ├── image │ ├── __init__.py │ ├── check.py │ ├── run │ │ ├── wrapper.py │ │ └── __init__.py │ ├── upload.py │ ├── build.py │ ├── test.py │ ├── list.py │ ├── init.py │ └── push.py ├── logout.py ├── items.py ├── requests.py ├── tool.py ├── compat.py ├── fetch_eggs.py ├── log.py ├── login.py ├── deploy_reqs.py ├── cancel.py ├── copy_eggs.py ├── migrate_eggs.py ├── exceptions.py ├── schedule.py ├── bootstrap.py └── deploy_egg.py ├── .bumpversion.cfg ├── .readthedocs.yml ├── CHANGES.rst ├── .github └── workflows │ ├── checks.yml │ ├── freeze-release-publish.yml │ └── tests.yml ├── RELEASE.md ├── LICENSE ├── tox.ini ├── README.rst ├── setup.py └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/image/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /freeze/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../CHANGES.rst 2 | -------------------------------------------------------------------------------- /freeze/tests/testproject/testproject/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==7.2.6 2 | sphinx-rtd-theme==2.0.0 3 | -------------------------------------------------------------------------------- /tests/samples/deploy_egg_sample_project/test_project/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /freeze/spider-down.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/shub/HEAD/freeze/spider-down.ico -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | cleo 2 | flake8 3 | pipenv 4 | python-dateutil 5 | pytest 6 | pytest-cov 7 | -------------------------------------------------------------------------------- /tests/samples/migrate-eggs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/shub/HEAD/tests/samples/migrate-eggs.zip -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | [flake8] 4 | max-line-length = 110 5 | exclude = .tox,tests,freeze,dist 6 | -------------------------------------------------------------------------------- /freeze/tests/testproject/testproject/items.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class TestprojectItem(scrapy.Item): 5 | pass 6 | -------------------------------------------------------------------------------- /tests/samples/migrate-eggs-no-eggs.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/shub/HEAD/tests/samples/migrate-eggs-no-eggs.zip -------------------------------------------------------------------------------- /tests/samples/deploy_egg_sample_project.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/shub/HEAD/tests/samples/deploy_egg_sample_project.zip -------------------------------------------------------------------------------- /tests/samples/deploy_egg_sample_repo.git.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/shub/HEAD/tests/samples/deploy_egg_sample_repo.git.zip -------------------------------------------------------------------------------- /tests/samples/custom-images-examples-master.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/shub/HEAD/tests/samples/custom-images-examples-master.zip -------------------------------------------------------------------------------- /freeze/tests/testproject/testproject/pipelines.py: -------------------------------------------------------------------------------- 
1 | class TestprojectPipeline: 2 | def process_item(self, item, spider): 3 | return item 4 | -------------------------------------------------------------------------------- /shub/version.py: -------------------------------------------------------------------------------- 1 | import click 2 | import shub 3 | 4 | 5 | @click.command(help="Show shub version") 6 | def cli(): 7 | click.echo(shub.__version__) 8 | -------------------------------------------------------------------------------- /freeze/hooks/runtime-hooks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | os.environ['REQUESTS_CA_BUNDLE'] = os.path.join( 5 | sys._MEIPASS, 'requests', 'cacert.pem') 6 | -------------------------------------------------------------------------------- /freeze/tests/testproject/testproject/settings.py: -------------------------------------------------------------------------------- 1 | BOT_NAME = 'testproject' 2 | SPIDER_MODULES = ['testproject.spiders'] 3 | NEWSPIDER_MODULE = 'testproject.spiders' 4 | -------------------------------------------------------------------------------- /tests/samples/deploy_reqs_sample_project/other-egg-0.2.1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/shub/HEAD/tests/samples/deploy_reqs_sample_project/other-egg-0.2.1.zip -------------------------------------------------------------------------------- /tests/samples/deploy_reqs_sample_project/inflect-0.2.5.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/shub/HEAD/tests/samples/deploy_reqs_sample_project/inflect-0.2.5.tar.gz -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 2.16.0 3 | commit = True 4 | tag = True 5 | tag_name = v{new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | 9 | [bumpversion:file:shub/__init__.py] 10 | -------------------------------------------------------------------------------- /freeze/hooks/hook-scrapinghub.py: -------------------------------------------------------------------------------- 1 | from PyInstaller.utils.hooks import collect_data_files 2 | 3 | # Add the data files in the scrapinghub package (aka scrapinghub.VERSION). 4 | datas = collect_data_files('scrapinghub') 5 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def tempdir(tmpdir): 8 | cwd = os.getcwd() 9 | os.chdir(str(tmpdir)) 10 | yield tmpdir 11 | os.chdir(cwd) 12 | -------------------------------------------------------------------------------- /freeze/tests/testproject/testproject/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /freeze/hooks/hook-shub.py: -------------------------------------------------------------------------------- 1 | from PyInstaller.utils.hooks import collect_submodules 2 | 3 | # Add as hidden imports all submodules from shub. This is because shub 4 | # modules are loaded when it's executed. 5 | hiddenimports = collect_submodules('shub') 6 | -------------------------------------------------------------------------------- /docs/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | /* override table width restrictions */ 2 | /* https://github.com/snide/sphinx_rtd_theme/issues/117#issuecomment-41506687 */ 3 | .wy-table-responsive table td, .wy-table-responsive table th { 4 | white-space: normal; 5 | } 6 | -------------------------------------------------------------------------------- /shub/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.16.0' 2 | 3 | 4 | # Links to documentation to use over the project sources 5 | DOCS_LINK = "https://shub.readthedocs.io/en/stable/" 6 | DEPLOY_DOCS_LINK = DOCS_LINK + "deploying.html#deploying-dependencies" 7 | CONFIG_DOCS_LINK = DOCS_LINK + "configuration.html" 8 | -------------------------------------------------------------------------------- /shub/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import shub.tool 5 | 6 | 7 | prog_name = os.path.basename(sys.argv and sys.argv[0] or __file__) 8 | if prog_name == '__main__.py': 9 | # shub invoked via python -m shub 10 | prog_name = __package__ 11 | shub.tool.cli(prog_name=prog_name) 12 | -------------------------------------------------------------------------------- /freeze/tests/testproject/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | projects: 2 | default: 1 3 | next: 2 4 | nested: default/3 5 | nested2: numeric/4 6 | 7 | endpoints: 8 | default: http://localhost:7999/api/ 9 | nuneric: http://127.0.0.1:7999/api/ 10 | 11 | apikeys: 12 | default: abcdabcdabcdabcdabcdabcdabcdabcd 13 | 14 | -------------------------------------------------------------------------------- /freeze/tests/testproject/testproject/spiders/example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ExampleSpider(scrapy.Spider): 5 | name = "example" 6 | allowed_domains = ["example.com"] 7 | start_urls = ( 8 | 'http://www.example.com/', 9 | ) 10 | 11 | def parse(self, response): 12 | pass 13 | -------------------------------------------------------------------------------- /freeze/tests/testproject/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = testproject.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = testproject 12 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | formats: all 3 | sphinx: 4 | configuration: docs/conf.py 5 | fail_on_warning: true 6 | 7 | build: 8 | os: ubuntu-20.04 9 | tools: 10 | 
# For available versions, see: 11 | # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python 12 | python: "3.11" # Keep in sync with .github/workflows/checks.yml 13 | 14 | python: 15 | install: 16 | - requirements: docs/requirements.txt 17 | - path: . 18 | -------------------------------------------------------------------------------- /tests/image/test_tool.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | from unittest import TestCase 3 | from shub.image import cli 4 | 5 | 6 | class TestToolCli(TestCase): 7 | 8 | def test_cli(self): 9 | runner = CliRunner() 10 | result = runner.invoke(cli, ['--help']) 11 | assert result.exit_code == 0 12 | assert 'Manage project based on custom Docker image' in result.output 13 | assert 'Options:' in result.output 14 | assert 'Commands:' in result.output 15 | -------------------------------------------------------------------------------- /shub/image/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | import importlib 3 | 4 | 5 | @click.group(help="Manage project based on custom Docker image") 6 | def cli(): 7 | pass 8 | 9 | 10 | module_deps = [ 11 | "init", 12 | "build", 13 | "list", 14 | "test", 15 | "push", 16 | "deploy", 17 | "upload", 18 | "check", 19 | "run", 20 | ] 21 | 22 | for command in module_deps: 23 | module_path = "shub.image." + command 24 | command_module = importlib.import_module(module_path) 25 | cli.add_command(command_module.cli, command) 26 | -------------------------------------------------------------------------------- /shub/logout.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from shub.config import load_shub_config, GLOBAL_SCRAPINGHUB_YML_PATH 4 | from shub.utils import update_yaml_dict 5 | 6 | 7 | HELP = """ 8 | Remove the Scrapinghub API key that is saved in your global configuration 9 | file (~/.scrapinghub.yml), if any. 10 | """ 11 | 12 | SHORT_HELP = "Forget saved Scrapinghub API key" 13 | 14 | 15 | @click.command(help=HELP, short_help=SHORT_HELP) 16 | def cli(): 17 | global_conf = load_shub_config(load_local=False, load_env=False) 18 | if 'default' not in global_conf.apikeys: 19 | click.echo("You are not logged in.") 20 | return 0 21 | 22 | with update_yaml_dict(GLOBAL_SCRAPINGHUB_YML_PATH) as conf: 23 | del conf['apikeys']['default'] 24 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. shub.image documentation master file, created by 2 | sphinx-quickstart on Tue May 3 16:20:52 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to shub's documentation! 7 | ================================ 8 | 9 | ``shub`` is the Scrapinghub command line client. It allows you to deploy 10 | projects or dependencies, schedule spiders, and retrieve scraped data or logs 11 | without leaving the command line. 12 | 13 | Contents 14 | -------- 15 | 16 | .. 
toctree:: 17 | :maxdepth: 2 18 | 19 | quickstart 20 | configuration 21 | deploying 22 | scheduling 23 | deploy-custom-image 24 | custom-images-contract 25 | changes 26 | -------------------------------------------------------------------------------- /shub/image/check.py: -------------------------------------------------------------------------------- 1 | import click 2 | import requests 3 | 4 | from shub.image.utils import load_status_url 5 | 6 | SHORT_HELP = "Check a deploy task's status url saved in a temporary file." 7 | 8 | HELP = """ 9 | A command to check your release task state for asynchronous deploy mode. 10 | Does a simple GET request to Dash with an URL which it reads from a 11 | temporary file. 12 | """ 13 | 14 | 15 | @click.command(help=HELP, short_help=SHORT_HELP) 16 | @click.option("--id", type=int, help="status id to check deploy results") 17 | def cli(id): 18 | status_url = load_status_url(id) 19 | status_req = requests.get(status_url, timeout=300) 20 | status_req.raise_for_status() 21 | result = status_req.json() 22 | click.echo(f"Deploy results: {result}") 23 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Changes 3 | ======= 4 | 5 | 2.16.0 (2025-11-05) 6 | =================== 7 | 8 | - Drop support for Python 3.6, 3.7 & 3.8. 9 | - Add support for Python 3.13 & 3.14. 10 | - Fix ``shub image upload`` documentation. 11 | - Add support for current Poetry versions. 12 | - Fix link to get apikey in ``shub login`` command. 13 | - Modernize generated Dockerfile (``shub image init`` command). 14 | 15 | 16 | 2.15.4 (2024-02-08) 17 | =================== 18 | 19 | - Support Docker server 25+. 20 | 21 | 22 | 2.15.3 (2024-01-23) 23 | =================== 24 | 25 | - Fix ``shub image deploy`` failing on Python 3.8 and 3.9. 26 | 27 | 28 | 2.15.2 (2024-01-17) 29 | =================== 30 | 31 | - Add support for Python 3.12. 32 | 33 | - Remove remnants of Python 2 support. 34 | 35 | - Start a changelog. 
36 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: Checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | checks: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | include: 17 | - python-version: 3 18 | env: 19 | TOXENV: flake8 20 | - python-version: "3.11" 21 | env: 22 | TOXENV: docs 23 | 24 | steps: 25 | - uses: actions/checkout@v2 26 | 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | 32 | - name: Run check 33 | env: ${{ matrix.env }} 34 | run: | 35 | pip install -U pip 36 | pip install -U tox 37 | tox 38 | -------------------------------------------------------------------------------- /tests/samples/deploy_egg_sample_project/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | setup( 5 | name='test_project', 6 | version='1.2.0', 7 | packages=['test_project'], 8 | description='Test Project', 9 | author='Scrapinghub', 10 | author_email='info@scrapinghub.com', 11 | maintainer='Scrapinghub', 12 | maintainer_email='info@scrapinghub.com', 13 | license='BSD', 14 | include_package_data=True, 15 | zip_safe=False, 16 | install_requires=[], 17 | classifiers=[ 18 | 'Development Status :: 5 - Production/Stable', 19 | 'Intended Audience :: Developers', 20 | 'Natural Language :: English', 21 | 'License :: OSI Approved :: BSD License', 22 | 'Programming Language :: Python', 23 | 'Programming Language :: Python :: 2.7', 24 | 'Operating System :: OS Independent', 25 | 'Environment :: Console', 26 | 'Topic :: Internet :: WWW/HTTP', 27 | ], 28 | ) 29 | -------------------------------------------------------------------------------- /tests/image/test_upload.py: -------------------------------------------------------------------------------- 1 | from unittest import mock, TestCase 2 | 3 | from click.testing import CliRunner 4 | 5 | from shub.image.upload import cli 6 | 7 | 8 | class TestUploadCli(TestCase): 9 | 10 | @mock.patch('shub.image.deploy.deploy_cmd') 11 | @mock.patch('shub.image.push.push_cmd') 12 | @mock.patch('shub.image.build.build_cmd') 13 | def test_cli(self, build, push, deploy): 14 | runner = CliRunner() 15 | result = runner.invoke( 16 | cli, ["dev", "-v", "--version", "test", 17 | "--username", "user", "--password", "pass", 18 | "--email", "mail", "--async", "--apikey", "apikey", 19 | "--skip-tests", "--no-cache", "-f", "Dockerfile", "--reauth"]) 20 | assert result.exit_code == 0 21 | build.assert_called_with('dev', 'test', True, True, (), filename='Dockerfile') 22 | push.assert_called_with( 23 | 'dev', 'test', 'user', 'pass', 'mail', "apikey", False, reauth=True, 24 | skip_tests=True) 25 | deploy.assert_called_with( 26 | 'dev', 'test', 'user', 'pass', 'mail', "apikey", False, True) 27 | -------------------------------------------------------------------------------- /tests/test_fetch_eggs.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from collections import namedtuple 3 | from unittest import mock 4 | 5 | from click.testing import CliRunner 6 | 7 | from shub import fetch_eggs 8 | from shub.exceptions import InvalidAuthException, RemoteErrorException 9 | 10 | from .utils import AssertInvokeRaisesMixin, 
mock_conf 11 | 12 | 13 | FakeResponse = namedtuple('FakeResponse', ['status_code']) 14 | 15 | 16 | @mock.patch('shub.fetch_eggs.requests', autospec=True) 17 | class FetchEggsTest(AssertInvokeRaisesMixin, unittest.TestCase): 18 | 19 | def setUp(self): 20 | self.runner = CliRunner() 21 | self.conf = mock_conf(self) 22 | 23 | def test_raises_auth_exception(self, requests_mock): 24 | fake_response = FakeResponse(403) 25 | requests_mock.get.return_value = fake_response 26 | self.assertInvokeRaises(InvalidAuthException, fetch_eggs.cli) 27 | 28 | def test_raises_exception_if_request_error(self, requests_mock): 29 | fake_response = FakeResponse(400) 30 | requests_mock.get.return_value = fake_response 31 | self.assertInvokeRaises(RemoteErrorException, fetch_eggs.cli) 32 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | Release procedure for shub 2 | ========================== 3 | 4 | The GitHub Actions build is configured to release `shub` to PyPI whenever 5 | a new tag (starting with `v`, e.g. `v2.13.0`) is committed. 6 | 7 | The steps to do a release are: 8 | 9 | 1. Install [bumpversion](https://pypi.python.org/pypi/bumpversion) 10 | 11 | 2. Make sure you're at the tip of master, and then run: 12 | 13 | bumpversion VERSION_PART 14 | 15 | In place of `VERSION_PART`, use one of `patch`, `minor` or `major`, meaning 16 | the part of the version number to be updated. 17 | 18 | This will create a new commit and tag updating the version number. 19 | 20 | 3. Push the changes and the new tag to trigger the release: 21 | 22 | git push origin master --tags 23 | 24 | 4. Once the build finishes, run `pip install shub` in a temporary virtualenv 25 | and make sure it's installing the latest version. 26 | 27 | 5. Update the release information at: 28 | 29 | https://github.com/scrapinghub/shub/releases 30 | 31 | The GitHub action will automatically create a release draft and attach the 32 | platform-specific binaries (built with the `freeze` tox environment) to it. 
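As a quick recap, a typical minor release boils down to the following (a sketch only; substitute `patch` or `major` as the version part if needed):

    pip install bumpversion
    bumpversion minor
    git push origin master --tags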
33 | -------------------------------------------------------------------------------- /tests/test_logout.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | import unittest 3 | from unittest import mock 4 | 5 | from click.testing import CliRunner 6 | 7 | from shub import config, logout 8 | 9 | 10 | @mock.patch('shub.config.GLOBAL_SCRAPINGHUB_YML_PATH', new='.scrapinghub.yml') 11 | @mock.patch('shub.config.NETRC_PATH', new='.netrc') 12 | @mock.patch('shub.logout.GLOBAL_SCRAPINGHUB_YML_PATH', new='.scrapinghub.yml') 13 | class LogoutTestCase(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.runner = CliRunner() 17 | 18 | def test_remove_key(self): 19 | GLOBAL_SH_YML = textwrap.dedent(""" 20 | apikeys: 21 | default: LOGGED_IN_KEY 22 | """) 23 | with self.runner.isolated_filesystem(): 24 | with open('.scrapinghub.yml', 'w') as f: 25 | f.write(GLOBAL_SH_YML) 26 | conf = config.load_shub_config() 27 | self.assertIn('default', conf.apikeys) 28 | self.runner.invoke(logout.cli) 29 | conf = config.load_shub_config() 30 | self.assertNotIn('default', conf.apikeys) 31 | 32 | @mock.patch('shub.logout.update_yaml_dict') 33 | def test_fail_on_not_logged_in(self, mock_uyd): 34 | with self.runner.isolated_filesystem(): 35 | self.runner.invoke(logout.cli) 36 | self.assertFalse(mock_uyd.called) 37 | -------------------------------------------------------------------------------- /shub/items.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from shub.utils import job_resource_iter, get_job 4 | 5 | 6 | HELP = """ 7 | Given a job ID, fetch items for that job from Scrapy Cloud and output them as 8 | JSON lines. 9 | 10 | A job ID consists of the Scrapinghub project ID, the numerical spider ID, and 11 | the job ID, separated by forward slashes, e.g.: 12 | 13 | shub items 12345/2/15 14 | 15 | You can also provide the Scrapinghub job URL instead: 16 | 17 | shub items https://app.zyte.com/p/12345/2/15 18 | 19 | You can omit the project ID if you have a default target defined in your 20 | scrapinghub.yml: 21 | 22 | shub items 2/15 23 | 24 | Or use any target defined in your scrapinghub.yml: 25 | 26 | shub items production/2/15 27 | 28 | If the job is still running, you can watch the items as they are being scraped 29 | by providing the -f flag: 30 | 31 | shub items -f 2/15 32 | """ 33 | 34 | SHORT_HELP = "Fetch items from Scrapy Cloud" 35 | 36 | 37 | @click.command(help=HELP, short_help=SHORT_HELP) 38 | @click.argument('job_id') 39 | @click.option('-f', '--follow', help='output new items as they are scraped', 40 | is_flag=True) 41 | @click.option('-n', '--tail', help='output last N items only', type=int) 42 | def cli(job_id, follow, tail): 43 | job = get_job(job_id) 44 | for item in job_resource_iter(job, job.items, output_json=True, 45 | follow=follow, tail=tail): 46 | click.echo(item) 47 | -------------------------------------------------------------------------------- /shub/requests.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from shub.utils import job_resource_iter, get_job 4 | 5 | 6 | HELP = """ 7 | Given a job ID, fetch requests made for that job from Scrapy Cloud and output 8 | them as JSON lines. 
9 | 10 | A job ID consists of the Scrapinghub project ID, the numerical spider ID, and 11 | the job ID, separated by forward slashes, e.g.: 12 | 13 | shub requests 12345/2/15 14 | 15 | You can also provide the Scrapinghub job URL instead: 16 | 17 | shub requests https://app.zyte.com/p/12345/2/15 18 | 19 | You can omit the project ID if you have a default target defined in your 20 | scrapinghub.yml: 21 | 22 | shub requests 2/15 23 | 24 | Or use any target defined in your scrapinghub.yml: 25 | 26 | shub requests production/2/15 27 | 28 | If the job is still running, you can watch the requests as they are being made 29 | by providing the -f flag: 30 | 31 | shub requests -f 2/15 32 | """ 33 | 34 | SHORT_HELP = "Fetch requests from Scrapy Cloud" 35 | 36 | 37 | @click.command(help=HELP, short_help=SHORT_HELP) 38 | @click.argument('job_id') 39 | @click.option('-f', '--follow', help='output new requests as they are made', 40 | is_flag=True) 41 | @click.option('-n', '--tail', help='output last N requests only', type=int) 42 | def cli(job_id, follow, tail): 43 | job = get_job(job_id) 44 | for item in job_resource_iter(job, job.requests, output_json=True, 45 | follow=follow, tail=tail): 46 | click.echo(item) 47 | -------------------------------------------------------------------------------- /tests/test_end_to_end.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from click.testing import CliRunner 3 | from shub import tool 4 | import os 5 | 6 | 7 | @unittest.skipUnless(os.getenv('USING_TOX'), 8 | 'End to end tests only run via TOX') 9 | class ShubEndToEndTests(unittest.TestCase): 10 | def setUp(self): 11 | self.runner = CliRunner() 12 | 13 | def run_subcmd(self, subcmd): 14 | return self.runner.invoke(tool.cli, [subcmd]).output 15 | 16 | def test_usage_is_displayed_if_no_arg_is_provided(self): 17 | output = self.run_subcmd('') 18 | usage_is_displayed = output.startswith('Usage:') 19 | self.assertTrue(usage_is_displayed) 20 | 21 | def test_deploy_egg_isnt_broken(self): 22 | output = self.run_subcmd('deploy-egg') 23 | error = 'Unexpected output: %s' % output 24 | self.assertTrue('specify target' in output, error) 25 | 26 | def test_deploy_reqs_isnt_broken(self): 27 | output = self.run_subcmd('deploy-reqs') 28 | error = 'Unexpected output: %s' % output 29 | self.assertTrue('specify target' in output, error) 30 | 31 | def test_deploy_isnt_broken(self): 32 | output = self.run_subcmd('deploy') 33 | error = 'Unexpected output: %s' % output 34 | self.assertTrue('Cannot find project' in output, error) 35 | 36 | def test_fetch_eggs_isnt_broken(self): 37 | output = self.run_subcmd('fetch-eggs') 38 | error = 'Unexpected output: %s' % output 39 | self.assertTrue('specify target' in output, error) 40 | -------------------------------------------------------------------------------- /tests/image/test_check.py: -------------------------------------------------------------------------------- 1 | from unittest import mock, TestCase 2 | 3 | from click.testing import CliRunner 4 | 5 | from shub import exceptions as shub_exceptions 6 | from shub.image.check import cli 7 | from shub.image import utils 8 | 9 | from .utils import FakeProjectDirectory 10 | 11 | 12 | class TestCheckCli(TestCase): 13 | 14 | @mock.patch('requests.get') 15 | def test_cli(self, mocked): 16 | # the test creates .releases file locally 17 | # this context manager cleans it in the end 18 | with FakeProjectDirectory(): 19 | runner = CliRunner() 20 | result = runner.invoke(cli, []) 21 | 
assert result.exit_code == \ 22 | shub_exceptions.NotFoundException.exit_code 23 | deploy_id1 = utils.store_status_url('http://linkA', 2) 24 | deploy_id2 = utils.store_status_url('http://linkB', 2) 25 | utils.store_status_url('http://linkC', 2) 26 | 27 | # get latest (deploy 3) 28 | result = runner.invoke(cli, []) 29 | assert result.exit_code == 0 30 | mocked.assert_called_with('http://linkC', timeout=300) 31 | 32 | # get deploy by id 33 | result = runner.invoke(cli, ["--id", deploy_id2]) 34 | assert result.exit_code == 0 35 | mocked.assert_called_with('http://linkB', timeout=300) 36 | 37 | # get non-existing deploy 38 | result = runner.invoke(cli, ["--id", deploy_id1]) 39 | assert result.exit_code == \ 40 | shub_exceptions.NotFoundException.exit_code 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2016 Scrapinghub, Inc 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of extruct nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /shub/tool.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import click 4 | 5 | import shub 6 | from shub.utils import update_available 7 | 8 | 9 | HELP = """ 10 | shub is the Scrapinghub command-line client. It allows you to deploy projects 11 | or dependencies, schedule spiders, and retrieve scraped data or logs without 12 | leaving the command line. 
13 | """ 14 | 15 | SHORT_HELP = "Scrapinghub command-line client" 16 | 17 | EPILOG = """ 18 | For usage and help on a specific command, run it with a --help flag, e.g.: 19 | 20 | shub schedule --help 21 | """ 22 | 23 | CONTEXT_SETTINGS = {'help_option_names': ['-h', '--help']} 24 | 25 | 26 | @click.group(help=HELP, short_help=SHORT_HELP, epilog=EPILOG, 27 | context_settings=CONTEXT_SETTINGS) 28 | @click.version_option(shub.__version__) 29 | def cli(): 30 | update_url = update_available() 31 | if update_url: 32 | click.echo("INFO: A newer version of shub is available. Update " 33 | "via pip or get it at {}".format(update_url), err=True) 34 | 35 | 36 | commands = [ 37 | "bootstrap", 38 | "deploy", 39 | "login", 40 | "deploy_egg", 41 | "fetch_eggs", 42 | "deploy_reqs", 43 | "logout", 44 | "version", 45 | "items", 46 | "schedule", 47 | "log", 48 | "requests", 49 | "copy_eggs", 50 | "migrate_eggs", 51 | "image", 52 | "cancel", 53 | ] 54 | 55 | for command in commands: 56 | module_path = "shub." + command 57 | command_module = importlib.import_module(module_path) 58 | command_name = command.replace('_', '-') # easier to type 59 | cli.add_command(command_module.cli, command_name) 60 | -------------------------------------------------------------------------------- /tests/test_deploy_reqs.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import tempfile 4 | from unittest import mock 5 | 6 | from click.testing import CliRunner 7 | 8 | from shub import deploy_reqs 9 | 10 | from .utils import mock_conf 11 | 12 | 13 | class TestDeployReqs(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.runner = CliRunner() 17 | self.conf = mock_conf(self) 18 | 19 | @unittest.skip('flaky') 20 | def test_can_decompress_downloaded_packages_and_call_deploy_reqs(self): 21 | requirements_file = self._write_tmp_requirements_file() 22 | with mock.patch('shub.utils.build_and_deploy_egg') as m: 23 | self.runner.invoke( 24 | deploy_reqs.cli, 25 | ('-r', requirements_file), 26 | ) 27 | self.assertEqual(m.call_count, 2) 28 | for args, kwargs in m.call_args_list: 29 | project, endpoint, apikey = args 30 | self.assertEqual(project, 1) 31 | self.assertIn('https://app.zyte.com', endpoint) 32 | self.assertEqual(apikey, self.conf.apikeys['default']) 33 | 34 | def _write_tmp_requirements_file(self): 35 | basepath = 'tests/samples/deploy_reqs_sample_project/' 36 | eggs = ['other-egg-0.2.1.zip', 'inflect-0.2.5.tar.gz'] 37 | tmp_dir = tempfile.mkdtemp(prefix="shub-test-deploy-reqs") 38 | requirements_file = os.path.join(tmp_dir, 'requirements.txt') 39 | 40 | with open(requirements_file, 'w') as f: 41 | for egg in eggs: 42 | f.write(os.path.abspath(os.path.join(basepath, egg)) + "\n") 43 | 44 | return requirements_file 45 | -------------------------------------------------------------------------------- /shub/compat.py: -------------------------------------------------------------------------------- 1 | def to_unicode(text, encoding=None, errors='strict'): 2 | """Return the unicode representation of `text`. 3 | 4 | If `text` is already a ``unicode`` object, return it as-is. 5 | If `text` is a ``bytes`` object, decode it using `encoding`. 6 | 7 | Otherwise, raise an error. 
8 | 9 | """ 10 | if isinstance(text, str): 11 | return text 12 | if not isinstance(text, (bytes, bytearray)): 13 | raise TypeError('to_unicode must receive a bytes, str or unicode ' 14 | 'object, got %s' % type(text).__name__) 15 | if encoding is None: 16 | encoding = 'utf-8' 17 | return text.decode(encoding, errors) 18 | 19 | 20 | def to_bytes(text, encoding=None, errors='strict'): 21 | """Return the binary representation of `text`. 22 | 23 | If `text` is already a ``bytes`` object, return it as-is. 24 | If `text` is a ``unicode`` object, encode it using `encoding`. 25 | 26 | Otherwise, raise an error.""" 27 | if isinstance(text, bytes): 28 | return text 29 | if isinstance(text, bytearray): 30 | return bytes(text) 31 | if not isinstance(text, str): 32 | raise TypeError('to_bytes must receive a unicode, str or bytes ' 33 | 'object, got %s' % type(text).__name__) 34 | if encoding is None: 35 | encoding = 'utf-8' 36 | return text.encode(encoding, errors) 37 | 38 | 39 | def to_native_str(text, encoding=None, errors='strict'): 40 | """Return ``str`` representation of `text`. 41 | 42 | ``str`` representation means ``bytes`` in PY2 and ``unicode`` in PY3. 43 | 44 | """ 45 | return to_unicode(text, encoding, errors) 46 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = flake8,min,min-poetry,py,poetry 3 | 4 | [testenv] 5 | basepython = python3 6 | setenv = 7 | USING_TOX=1 8 | deps = 9 | -r tests/requirements.txt 10 | commands = 11 | pytest --cov=shub --cov-report=term-missing --cov-report=html --cov-report=xml {posargs:shub tests} 12 | 13 | [testenv:min] 14 | basepython = python3.9 15 | deps = 16 | {[testenv]deps} 17 | pipenv<2024.3.0 18 | 19 | [testenv:min-poetry] 20 | basepython = python3.9 21 | deps = 22 | {[testenv]deps} 23 | poetry-core<2 24 | 25 | [testenv:poetry] 26 | deps = 27 | {[testenv:min]deps} 28 | poetry-core 29 | poetry-plugin-export 30 | 31 | [testenv:freeze] 32 | install_command = 33 | python -m pip install {opts} {packages} 34 | deps = 35 | pyinstaller==4.10 36 | pytest 37 | packaging==20.4 38 | setuptools==59.8.0 # https://github.com/pypa/setuptools/issues/3089 39 | ; address https://github.com/pyinstaller/pyinstaller/issues/2162 with hidden imports 40 | setuptools>=44.0 41 | commands = 42 | pyinstaller --clean -y -F -n shub \ 43 | --distpath=./dist_bin \ 44 | --additional-hooks-dir=./freeze/hooks \ 45 | --runtime-hook=./freeze/hooks/runtime-hooks.py \ 46 | --icon=./freeze/spider-down.ico \ 47 | --hidden-import=packaging \ 48 | --hidden-import=packaging.specifiers \ 49 | ./shub/__main__.py 50 | pytest -vv {toxinidir}/freeze/tests/run.py 51 | 52 | [testenv:flake8] 53 | deps = 54 | flake8>=3.7.9 55 | commands = 56 | flake8 --exclude=.git,.tox,venv* {posargs:shub tests} 57 | 58 | [testenv:docs] 59 | changedir = docs 60 | deps = 61 | -rdocs/requirements.txt 62 | commands = 63 | sphinx-build -W -b html . {envtmpdir}/html 64 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Scrapinghub command line client 2 | =============================== 3 | 4 | .. image:: https://img.shields.io/pypi/v/shub.svg 5 | :target: https://pypi.python.org/pypi/shub 6 | :alt: PyPI Version 7 | 8 | .. 
image:: https://img.shields.io/pypi/pyversions/shub.svg 9 | :target: https://pypi.python.org/pypi/shub 10 | :alt: Python Versions 11 | 12 | .. image:: https://github.com/scrapinghub/shub/actions/workflows/tests.yml/badge.svg 13 | :target: https://github.com/scrapinghub/shub/actions/workflows/tests.yml 14 | :alt: Tests 15 | 16 | .. image:: https://img.shields.io/codecov/c/github/scrapinghub/shub/master.svg 17 | :target: https://codecov.io/github/scrapinghub/shub?branch=master 18 | :alt: Coverage report 19 | 20 | ``shub`` is the Scrapinghub command line client. It allows you to deploy 21 | projects or dependencies, schedule spiders, and retrieve scraped data or logs 22 | without leaving the command line. 23 | 24 | 25 | Requirements 26 | ------------ 27 | 28 | * Python >= 3.9 29 | 30 | 31 | Installation 32 | ------------ 33 | 34 | If you have ``pip`` installed on your system, you can install ``shub`` from 35 | the Python Package Index:: 36 | 37 | pip install shub 38 | 39 | Please note: 40 | 41 | * if you are using Python < 3.6, you should pin `shub` to `2.13.0` or lower. 42 | * if you are using Python < 3.9, you should pin `shub` to `2.15.4` or lower. 43 | 44 | We also supply stand-alone binaries. You can find them in our `latest GitHub 45 | release`_. 46 | 47 | .. _`latest Github release`: https://github.com/scrapinghub/shub/releases/latest 48 | 49 | 50 | Documentation 51 | ------------- 52 | 53 | Documentation is available online via Read the Docs: 54 | https://shub.readthedocs.io/, or in the ``docs`` directory. 55 | -------------------------------------------------------------------------------- /freeze/tests/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shlex 4 | import shutil 5 | import tempfile 6 | from os.path import abspath, dirname, join 7 | from subprocess import Popen, PIPE 8 | 9 | import pytest 10 | from . 
import fakeserver 11 | 12 | SHUB = abspath(join(dirname(__file__), '../../dist_bin/shub')) 13 | 14 | 15 | @pytest.fixture(scope='module') 16 | def apipipe(): 17 | return fakeserver.run(("127.0.0.1", 7999)) 18 | 19 | 20 | @pytest.fixture 21 | def scrapyproject(request): 22 | cwd = os.getcwd() 23 | tmpdir = os.path.join(tempfile.mkdtemp(), 'project') 24 | 25 | def _fin(): 26 | os.chdir(cwd) 27 | shutil.rmtree(tmpdir, ignore_errors=True) 28 | request.addfinalizer(_fin) 29 | shutil.copytree(abspath(join(dirname(__file__), 'testproject')), tmpdir) 30 | os.chdir(tmpdir) 31 | return tmpdir 32 | 33 | 34 | def shub(shub_args): 35 | cmd = [SHUB] 36 | if isinstance(shub_args, str): 37 | shub_args = shlex.split(shub_args) 38 | if shub_args is not None: 39 | cmd.extend(shub_args) 40 | return Popen(cmd, stdout=PIPE, stderr=PIPE) 41 | 42 | 43 | def test_version(): 44 | stdout, stderr = shub('version').communicate() 45 | assert re.match(br'\d+[.]\d+[.]\d+$', stdout.strip()) 46 | 47 | 48 | def test_deploy_without_project(): 49 | stdout, stderr = shub('deploy').communicate() 50 | assert stdout == b'' 51 | assert b'Cannot find project' in stderr 52 | 53 | 54 | def test_deploy_default_project(apipipe, scrapyproject): 55 | p = shub('deploy') 56 | assert apipipe.poll(15) 57 | req = apipipe.recv() 58 | assert req['path'] == '/api/scrapyd/addversion.json' 59 | apipipe.send((200, None, {'status': 'ok'})) 60 | stdout, stderr = p.communicate() 61 | assert b'{"status": "ok"}' in stdout 62 | -------------------------------------------------------------------------------- /shub/fetch_eggs.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urljoin 2 | 3 | import click 4 | import requests 5 | 6 | from shub.config import get_target_conf 7 | from shub.exceptions import InvalidAuthException, RemoteErrorException 8 | 9 | 10 | HELP = """ 11 | Download all eggs deployed to a Scrapy CLoud project into a zip file. 12 | 13 | You can either fetch to your default target (as defined in scrapinghub.yml), 14 | or explicitly supply a numerical project ID or a target defined in 15 | scrapinghub.yml (see shub deploy). 16 | """ 17 | 18 | SHORT_HELP = "Download project eggs from Scrapy Cloud" 19 | 20 | 21 | @click.command(help=HELP, short_help=SHORT_HELP) 22 | @click.argument("target", required=False, default='default') 23 | def cli(target): 24 | targetconf = get_target_conf(target) 25 | destfile = 'eggs-%s.zip' % targetconf.project_id 26 | fetch_eggs(targetconf.project_id, targetconf.endpoint, targetconf.apikey, 27 | destfile) 28 | 29 | 30 | def fetch_eggs(project, endpoint, apikey, destfile): 31 | auth = (apikey, '') 32 | url = urljoin(endpoint, "eggs/bundle.zip") 33 | rsp = requests.get(url=url, params={'project': project}, auth=auth, 34 | stream=True, timeout=300) 35 | 36 | _assert_response_is_valid(rsp) 37 | 38 | click.echo("Downloading eggs to %s" % destfile) 39 | 40 | with open(destfile, 'wb') as f: 41 | for chunk in rsp.iter_content(chunk_size=1024): 42 | if chunk: 43 | f.write(chunk) 44 | f.flush() 45 | 46 | 47 | def _assert_response_is_valid(rsp): 48 | if rsp.status_code == 403: 49 | raise InvalidAuthException 50 | elif rsp.status_code != 200: 51 | msg = 'Eggs could not be fetched. 
Status: %d' % rsp.status_code 52 | raise RemoteErrorException(msg) 53 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name='shub', 6 | version='2.16.0', 7 | packages=find_packages(exclude=('tests', 'tests.*')), 8 | url="https://shub.readthedocs.io/en/stable/", 9 | description='Scrapinghub Command Line Client', 10 | long_description=open('README.rst').read(), 11 | author='Scrapinghub', 12 | author_email='info@scrapinghub.com', 13 | maintainer='Scrapinghub', 14 | maintainer_email='info@scrapinghub.com', 15 | license='BSD', 16 | entry_points={ 17 | 'console_scripts': ['shub = shub.tool:cli'] 18 | }, 19 | include_package_data=True, 20 | zip_safe=False, 21 | python_requires='>=3.9', 22 | install_requires=[ 23 | 'click', 24 | 'docker', 25 | 'importlib-metadata; python_version < "3.10"', 26 | 'packaging', 27 | 'pip', 28 | 'PyYAML', 29 | 'retrying', 30 | 'requests', 31 | 'scrapinghub>=2.3.1', 32 | 'setuptools', 33 | 'tqdm==4.55.1', 34 | 'toml', 35 | ], 36 | classifiers=[ 37 | 'Development Status :: 5 - Production/Stable', 38 | 'Intended Audience :: Developers', 39 | 'Natural Language :: English', 40 | 'License :: OSI Approved :: BSD License', 41 | 'Programming Language :: Python', 42 | 'Programming Language :: Python :: 3.9', 43 | 'Programming Language :: Python :: 3.10', 44 | 'Programming Language :: Python :: 3.11', 45 | 'Programming Language :: Python :: 3.12', 46 | 'Programming Language :: Python :: 3.13', 47 | 'Programming Language :: Python :: 3.14', 48 | 'Operating System :: OS Independent', 49 | 'Environment :: Console', 50 | 'Topic :: Internet :: WWW/HTTP', 51 | ], 52 | ) 53 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | ========== 4 | Quickstart 5 | ========== 6 | 7 | Installation 8 | ------------ 9 | 10 | If you have ``pip`` installed on your system, you can install shub from the 11 | Python Package Index:: 12 | 13 | pip install shub 14 | 15 | We also supply stand-alone binaries. You can find them in our `latest GitHub 16 | release`_. 17 | 18 | .. _`latest Github release`: https://github.com/scrapinghub/shub/releases/latest 19 | 20 | 21 | Getting help 22 | ------------ 23 | 24 | To see all available commands, run:: 25 | 26 | shub 27 | 28 | For help on a specific command, run it with a ``--help`` flag, e.g.:: 29 | 30 | shub schedule --help 31 | 32 | 33 | .. _basic-usage: 34 | 35 | Basic usage 36 | ----------- 37 | 38 | Start by logging in:: 39 | 40 | shub login 41 | 42 | This will save your Scrapinghub API key to a file in your home directory 43 | (``~/.scrapinghub.yml``) and is necessary for access to projects associated 44 | with your Scrapinghub account. Alternatively, you can set your Scrapinghub 45 | API key as an environment variable (``SHUB_APIKEY``), check :ref:`an appropriate 46 | section ` for details. 47 | 48 | Next, navigate to a Scrapy project that you wish to upload to Scrapinghub. You 49 | can deploy it to Scrapy Cloud via:: 50 | 51 | shub deploy 52 | 53 | On the first call, this will guide you through a wizard to save your project ID 54 | into a YAML file named ``scrapinghub.yml``, living next to your ``scrapy.cfg``. 55 | From anywhere within the project directory tree, you can now deploy via ``shub 56 | deploy``. 
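The wizard writes plain YAML. A minimal ``scrapinghub.yml`` sketch (with ``12345`` standing in for your own
project ID) looks like::

    projects:
      default: 12345

Additional targets (for example a separate production project) can be added to the same file later; see the
configuration section for the full format.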
57 | 58 | Next, schedule one of your spiders to run on Scrapy Cloud:: 59 | 60 | shub schedule myspider 61 | 62 | You can watch its log or the scraped items while the spider is running by 63 | supplying the job ID:: 64 | 65 | shub log -f 2/34 66 | shub items -f 2/34 67 | -------------------------------------------------------------------------------- /freeze/tests/fakeserver.py: -------------------------------------------------------------------------------- 1 | import json 2 | import multiprocessing 3 | from threading import Thread 4 | from socketserver import TCPServer 5 | from http.server import SimpleHTTPRequestHandler 6 | import urllib.parse 7 | 8 | class Handler(SimpleHTTPRequestHandler): 9 | 10 | def _do_any(self): 11 | method = self.command 12 | path, _, querystr = self.path.partition('?') 13 | query = urllib.parse.parse_qs(querystr) 14 | content_len = int(self.headers.get('content-length', 0)) 15 | body = self.rfile.read(content_len) 16 | headers = self.headers.get_params() 17 | print(self) 18 | 19 | self.server.pipe.send({ 20 | 'path': path, 'query': query, 'body': body, 21 | 'method': self.command, 'headers': headers, 22 | }) 23 | if not self.server.pipe.poll(10): 24 | self.send_error(500, 'Pipe hung') 25 | 26 | status, headers, body = self.server.pipe.recv() 27 | if not isinstance(body, bytes): 28 | body = json.dumps(body).encode('utf8') + b'\n' 29 | 30 | self.send_response(status) 31 | for hn, hv in headers or (): 32 | self.send_header(hn, hv) 33 | self.end_headers() 34 | self.wfile.write(body) 35 | 36 | do_GET = _do_any 37 | do_PUT = _do_any 38 | do_POST = _do_any 39 | do_DELETE = _do_any 40 | do_PATCH = _do_any 41 | 42 | 43 | def threadit(target, *args, **kw): 44 | t = Thread(target=target, name=target.__name__, args=args, kwargs=kw) 45 | t.daemon = True 46 | t.start() 47 | return t 48 | 49 | 50 | def run(bind_at): 51 | p1, p2 = multiprocessing.Pipe() 52 | 53 | class MyTCPServer(TCPServer): 54 | allow_reuse_address = True 55 | pipe = p2 56 | 57 | httpd = MyTCPServer(bind_at, Handler) 58 | threadit(httpd.serve_forever) 59 | return p1 60 | 61 | 62 | -------------------------------------------------------------------------------- /shub/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | from shub.utils import job_resource_iter, get_job 5 | 6 | import click 7 | 8 | 9 | HELP = """ 10 | Given a job ID, fetch the log of that job from Scrapy Cloud and print it. 
11 | 12 | A job ID consists of the Scrapinghub project ID, the numerical spider ID, and 13 | the job ID, separated by forward slashes, e.g.: 14 | 15 | shub log 12345/2/15 16 | 17 | You can also provide the Scrapinghub job URL instead: 18 | 19 | shub log https://app.zyte.com/p/12345/2/15 20 | 21 | You can omit the project ID if you have a default target defined in your 22 | scrapinghub.yml: 23 | 24 | shub log 2/15 25 | 26 | Or use any target defined in your scrapinghub.yml: 27 | 28 | shub log production/2/15 29 | 30 | If the job is still running, you can watch the log as it is being written by 31 | providing the -f flag: 32 | 33 | shub log -f 2/15 34 | """ 35 | 36 | SHORT_HELP = "Fetch log from Scrapy Cloud" 37 | 38 | 39 | @click.command(help=HELP, short_help=SHORT_HELP) 40 | @click.argument('job_id') 41 | @click.option('-f', '--follow', help='output new log entries as they are ' 42 | 'produced', is_flag=True) 43 | @click.option('-n', '--tail', help='output last N log entries only', type=int) 44 | @click.option('--json', 'json_', help='output log entries in JSON', is_flag=True, default=False) 45 | def cli(job_id, follow, tail, json_): 46 | job = get_job(job_id) 47 | for item in job_resource_iter(job, job.logs, follow=follow, tail=tail, output_json=json_): 48 | if json_: 49 | click.echo(item) 50 | else: 51 | click.echo( 52 | "{} {} {}".format( 53 | datetime.utcfromtimestamp(item['time']/1000), 54 | logging.getLevelName(int(item['level'])), 55 | item['message'] 56 | ) 57 | ) 58 | -------------------------------------------------------------------------------- /tests/image/conftest.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from unittest import mock 3 | 4 | import pytest 5 | 6 | from shub.image.utils import ProgressBar 7 | 8 | try: 9 | # https://stackoverflow.com/a/55000090 10 | from inspect import getfullargspec as get_args 11 | except ImportError: 12 | from inspect import getargspec as get_args 13 | 14 | from .utils import ( 15 | FakeProjectDirectory, add_scrapy_fake_config, add_sh_fake_config, 16 | add_fake_dockerfile, add_fake_setup_py, 17 | ) 18 | 19 | 20 | @pytest.fixture 21 | def docker_client_mock(): 22 | """Docker client mock""" 23 | client_mock = mock.Mock() 24 | with mock.patch('shub.image.utils.get_docker_client') as m: 25 | m.return_value = client_mock 26 | yield client_mock 27 | 28 | 29 | @pytest.fixture 30 | def project_dir(): 31 | """Fake project directory""" 32 | with FakeProjectDirectory() as tmpdir: 33 | add_scrapy_fake_config(tmpdir) 34 | add_sh_fake_config(tmpdir) 35 | add_fake_dockerfile(tmpdir) 36 | add_fake_setup_py(tmpdir) 37 | yield tmpdir 38 | 39 | 40 | @pytest.fixture 41 | def monkeypatch_bar_rate(monkeypatch): 42 | # Converting to List instead to unpacking the Tuple 43 | # because get_args returns different tuple sizes between py versions. 
44 | args = list(get_args(ProgressBar.format_meter))[0] 45 | rate_arg_idx = args.index('rate') 46 | 47 | def override_rate(func): 48 | 49 | @wraps(func) 50 | def wrapper(*args, **kwargs): 51 | args = list(args) 52 | if 'rate' in args: 53 | args[rate_arg_idx] = 10 ** 6 54 | elif 'rate' in kwargs: 55 | kwargs['rate'] = 10 ** 6 56 | return func(*args, **kwargs) 57 | 58 | return wrapper 59 | 60 | monkeypatch.setattr('shub.image.utils.ProgressBar.format_meter', 61 | staticmethod(override_rate(ProgressBar.format_meter))) 62 | -------------------------------------------------------------------------------- /shub/login.py: -------------------------------------------------------------------------------- 1 | import click 2 | import requests 3 | from urllib.parse import urljoin 4 | 5 | from shub.config import (load_shub_config, GLOBAL_SCRAPINGHUB_YML_PATH, 6 | ShubConfig) 7 | from shub.exceptions import AlreadyLoggedInException 8 | from shub.utils import update_yaml_dict 9 | 10 | 11 | HELP = """ 12 | Add your Scrapinghub API key to your global configuration file 13 | (~/.scrapinghub.yml). This is necessary to gain access to projects associated 14 | with your Scrapinghub account. 15 | 16 | You can find your API key in Scrapinghub's dashboard: 17 | https://app.zyte.com/account/apikey 18 | """ 19 | 20 | SHORT_HELP = "Save your Scrapinghub API key" 21 | 22 | 23 | @click.command(help=HELP, short_help=SHORT_HELP) 24 | def cli(): 25 | global_conf = load_shub_config(load_local=False, load_env=False) 26 | if 'default' in global_conf.apikeys: 27 | raise AlreadyLoggedInException 28 | 29 | conf = load_shub_config() 30 | key = _get_apikey( 31 | suggestion=conf.apikeys.get('default'), 32 | endpoint=global_conf.endpoints.get('default'), 33 | ) 34 | with update_yaml_dict(GLOBAL_SCRAPINGHUB_YML_PATH) as conf: 35 | conf.setdefault('apikeys', {}) 36 | conf['apikeys']['default'] = key 37 | 38 | 39 | def _get_apikey(suggestion='', endpoint=None): 40 | suggestion_txt = ' (%s)' % suggestion if suggestion else '' 41 | click.echo( 42 | "Enter your API key from https://app.zyte.com/o/settings/apikey" 43 | ) 44 | while True: 45 | key = input('API key%s: ' % suggestion_txt) or suggestion 46 | click.echo("Validating API key...") 47 | if _is_valid_apikey(key, endpoint=endpoint): 48 | click.echo("API key is OK, you are logged in now.") 49 | return key 50 | else: 51 | click.echo("API key failed, try again.") 52 | 53 | 54 | def _is_valid_apikey(key, endpoint=None): 55 | endpoint = endpoint or ShubConfig.DEFAULT_ENDPOINT 56 | validate_api_key_endpoint = urljoin(endpoint, "v2/users/me") 57 | r = requests.get(validate_api_key_endpoint, params={'apikey': key}) 58 | return r.status_code == 200 59 | -------------------------------------------------------------------------------- /tests/image/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | from contextlib import contextmanager 5 | 6 | 7 | SH_CONFIG_FILE = """ 8 | projects: 9 | dev: 10 | id: 12345 11 | image: registry.io/user/project 12 | xyz: 13 | id: 32167 14 | image: images.scrapinghub.com/project/32167 15 | endpoints: 16 | dev: https://dash-fake 17 | apikeys: 18 | default: abcdef 19 | """ 20 | 21 | SH_SETUP_FILE = """ 22 | from setuptools import setup 23 | setup( 24 | name = 'project', version = '1.0', 25 | entry_points = {'scrapy': ['settings = test.settings']}, 26 | scripts = ['bin/scriptA.py', 'scriptB.py'] 27 | ) 28 | """ 29 | 30 | 31 | @contextmanager 32 | def FakeProjectDirectory(): 
33 | tmpdir = os.path.realpath(tempfile.mkdtemp()) 34 | current = os.getcwd() 35 | os.chdir(tmpdir) 36 | try: 37 | yield tmpdir 38 | finally: 39 | os.chdir(current) 40 | shutil.rmtree(tmpdir) 41 | 42 | 43 | def add_scrapy_fake_config(tmpdir): 44 | # add fake scrapy.cfg 45 | config_path = os.path.join(tmpdir, 'scrapy.cfg') 46 | with open(config_path, 'w') as config_file: 47 | config_file.write("[settings]\ndefault=test.settings") 48 | 49 | 50 | def add_sh_fake_config(tmpdir): 51 | # add fake SH config 52 | sh_config_path = os.path.join(tmpdir, 'scrapinghub.yml') 53 | with open(sh_config_path, 'w') as sh_config_file: 54 | sh_config_file.write(SH_CONFIG_FILE) 55 | 56 | 57 | def add_fake_requirements(tmpdir): 58 | """Add fake requirements""" 59 | reqs_path = os.path.join(tmpdir, 'fake-requirements.txt') 60 | with open(reqs_path, 'w') as reqs_file: 61 | reqs_file.write("mock\nrequests") 62 | 63 | 64 | def add_fake_dockerfile(tmpdir): 65 | """Add fake Dockerfile""" 66 | docker_path = os.path.join(tmpdir, 'Dockerfile') 67 | with open(docker_path, 'w') as docker_file: 68 | docker_file.write("FROM python:2.7") 69 | 70 | 71 | def add_fake_setup_py(tmpdir): 72 | """Add fake setup.py for extract scripts tests""" 73 | setup_path = os.path.join(tmpdir, 'setup.py') 74 | with open(setup_path, 'w') as setup_file: 75 | setup_file.write(SH_SETUP_FILE) 76 | -------------------------------------------------------------------------------- /.github/workflows/freeze-release-publish.yml: -------------------------------------------------------------------------------- 1 | name: Freeze, Release & Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - v* 9 | pull_request: 10 | branches: 11 | - master 12 | 13 | jobs: 14 | freeze: 15 | name: "Freeze: ${{ matrix.os }}" 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ubuntu-latest, macos-latest, windows-latest] 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: "3.10" 29 | 30 | - name: Install tox 31 | run: pip install tox 32 | 33 | - name: Build binary 34 | run: tox -e freeze 35 | 36 | - name: Pack binary (Windows) 37 | if: ${{ runner.os == 'Windows' }} 38 | run: 7z a shub-Windows.zip dist_bin/shub.exe 39 | 40 | - name: Pack binary (Linux/macOS) 41 | if: ${{ runner.os != 'Windows' }} 42 | run: tar -czvf shub-${{ runner.os }}.tar.gz dist_bin/shub 43 | 44 | - name: Upload binary 45 | uses: actions/upload-artifact@v4 46 | with: 47 | name: shub-${{ runner.os }} 48 | path: | 49 | shub-${{ runner.os }}.tar.gz 50 | shub-${{ runner.os }}.zip 51 | 52 | release: 53 | if: startsWith(github.ref, 'refs/tags/v') 54 | needs: freeze 55 | runs-on: ubuntu-latest 56 | 57 | steps: 58 | - name: Download binaries 59 | uses: actions/download-artifact@v2 60 | with: 61 | path: binaries 62 | 63 | - name: Display structure of downloaded files 64 | run: ls -R binaries 65 | 66 | - name: Draft release 67 | uses: softprops/action-gh-release@v1 68 | with: 69 | draft: true 70 | files: binaries/** 71 | 72 | publish: 73 | if: startsWith(github.ref, 'refs/tags/v') 74 | runs-on: ubuntu-latest 75 | 76 | steps: 77 | - uses: actions/checkout@v2 78 | 79 | - name: Set up Python 80 | uses: actions/setup-python@v5 81 | with: 82 | python-version: "3.10" 83 | 84 | - name: Publish to PyPI 85 | run: | 86 | pip install --upgrade pip 87 | pip install --upgrade setuptools wheel twine 88 | python setup.py sdist bdist_wheel 89 | export TWINE_USERNAME=__token__ 90 | 
export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }} 91 | twine upload dist/* 92 | -------------------------------------------------------------------------------- /shub/deploy_reqs.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | import tempfile 4 | import shutil 5 | 6 | from shub import DEPLOY_DOCS_LINK 7 | from shub.config import get_target_conf 8 | from shub.utils import (build_and_deploy_eggs, decompress_egg_files, 9 | download_from_pypi) 10 | 11 | 12 | HELP = """ 13 | Build eggs of your project's requirements and deploy them to Scrapy Cloud. 14 | 15 | You can either deploy to your default target (as defined in scrapinghub.yml), 16 | or explicitly supply a numerical project ID or a target defined in 17 | scrapinghub.yml (see shub deploy). 18 | 19 | By default, requirements will be read from requirements.txt. You may supply a 20 | different file name with the -r option: 21 | 22 | shub deploy-reqs -r myreqs.txt 23 | 24 | The requirements file must be in a format parsable by pip. 25 | """ 26 | 27 | SHORT_HELP = "[DEPRECATED] Build and deploy eggs from requirements.txt" 28 | 29 | 30 | @click.command(help=HELP, short_help=SHORT_HELP) 31 | @click.argument("target", required=False, default="default") 32 | @click.option("-r", "--requirements-file", default='requirements.txt', 33 | type=click.STRING) 34 | def cli(target, requirements_file): 35 | click.secho( 36 | "deploy-reqs was deprecated, define a requirements file in your " 37 | "scrapinghub.yml instead. See {}".format(DEPLOY_DOCS_LINK), 38 | err=True, fg='yellow', 39 | ) 40 | main(target, requirements_file) 41 | 42 | 43 | def main(target, requirements_file): 44 | targetconf = get_target_conf(target) 45 | requirements_full_path = os.path.abspath(requirements_file) 46 | eggs_tmp_dir = _mk_and_cd_eggs_tmpdir() 47 | _download_egg_files(eggs_tmp_dir, requirements_full_path) 48 | decompress_egg_files() 49 | build_and_deploy_eggs(targetconf.project_id, targetconf.endpoint, 50 | targetconf.apikey) 51 | 52 | 53 | def _mk_and_cd_eggs_tmpdir(): 54 | tmpdir = tempfile.mkdtemp(prefix="eggs") 55 | os.chdir(tmpdir) 56 | os.mkdir('eggs') 57 | os.chdir('eggs') 58 | return os.path.join(tmpdir, 'eggs') 59 | 60 | 61 | def _download_egg_files(eggs_dir, requirements_file): 62 | editable_src_dir = tempfile.mkdtemp(prefix='pipsrc') 63 | 64 | click.echo('Downloading eggs...') 65 | try: 66 | download_from_pypi(eggs_dir, reqfile=requirements_file, 67 | extra_args=["--src", editable_src_dir]) 68 | finally: 69 | shutil.rmtree(editable_src_dir, ignore_errors=True) 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/vim,python 3 | # Edit at https://www.gitignore.io/?templates=vim,python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # pipenv 75 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 76 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 77 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 78 | # install all needed dependencies. 79 | #Pipfile.lock 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | .spyproject 90 | 91 | # Rope project settings 92 | .ropeproject 93 | 94 | # Mr Developer 95 | .mr.developer.cfg 96 | .project 97 | .pydevproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | .dmypy.json 105 | dmypy.json 106 | 107 | # Pyre type checker 108 | .pyre/ 109 | 110 | ### Vim ### 111 | # Swap 112 | [._]*.s[a-v][a-z] 113 | [._]*.sw[a-p] 114 | [._]s[a-rt-v][a-z] 115 | [._]ss[a-gi-z] 116 | [._]sw[a-p] 117 | 118 | # Session 119 | Session.vim 120 | Sessionx.vim 121 | 122 | # Temporary 123 | .netrwhist 124 | *~ 125 | # Auto-generated tag files 126 | tags 127 | # Persistent undo 128 | [._]*.un~ 129 | 130 | # End of https://www.gitignore.io/api/vim,python 131 | 132 | .vscode 133 | -------------------------------------------------------------------------------- /shub/image/run/wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | A helper wrapper over start-crawl to run a custom image locally. 4 | 5 | The wrapper is used in `shub image run` command as an entrypoint 6 | to create a FIFO file inside a Docker container, enforce using it 7 | to communicate with crawl process and start the crawl process. 8 | 9 | The initial version handles and prints only LOG entries to mimic 10 | Scrapy behavior when running locally, however it could be easily 11 | extended in the future. 12 | 13 | Reading about SH custom image contract should bring you more context 14 | https://shub.readthedocs.io/en/stable/custom-images-contract.html. 15 | 16 | FIFO based communication protocol is described well in 17 | https://doc.scrapinghub.com/scrapy-cloud-write-entrypoint.html 18 | 19 | TODO As a custom image isn't necessarily based on Python, the wrapper 20 | should be rewritten in the future with something more basic and 21 | lightweight, to get rid of dependence on Python. 
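For illustration, every line written to the FIFO starts with a three-letter
entry type, a space, and a JSON payload; the only type handled below is LOG,
whose entries look roughly like this (field values are made up):

    LOG {"time": 1449834387621, "level": 20, "message": "Spider opened"}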
22 | """ 23 | 24 | 25 | import os 26 | import sys 27 | import json 28 | import logging 29 | import datetime 30 | from multiprocessing import Process 31 | from shutil import which 32 | 33 | 34 | def _consume_from_fifo(fifo_path): 35 | """Start reading/printing entries from FIFO.""" 36 | with open(fifo_path) as fifo: 37 | while True: 38 | line = fifo.readline() 39 | # returns an empty string only in the end of the file 40 | if not line: 41 | return 42 | entry_type, entry_raw = line[:3], line[4:] 43 | _print_fifo_entry(entry_type, json.loads(entry_raw)) 44 | 45 | 46 | def _print_fifo_entry(message_type, message): 47 | """Print only specific entries.""" 48 | if message_type == 'LOG': 49 | timestamp = _millis_to_str(message['time']) 50 | loglevel = logging.getLevelName(message['level']) 51 | # mimic Scrapy logging format as much as possible 52 | print('{} {} {}'.format(timestamp, loglevel, message['message'])) 53 | 54 | 55 | def _millis_to_str(millis): 56 | """Convert a datatime in ms to a formatted string.""" 57 | datetime_ts = datetime.datetime.fromtimestamp(millis / 1000.0) 58 | return datetime_ts.strftime('%Y-%m-%d %H:%M:%S') 59 | 60 | 61 | def main(): 62 | """Main wrapper entrypoint.""" 63 | # create a named pipe for communication 64 | fifo_path = os.environ.get('SHUB_FIFO_PATH') 65 | os.mkfifo(fifo_path) 66 | # create and start a consumer process to read from the fifo: 67 | # non-daemon to allow it to finish reading from pipe before exit. 68 | Process(target=_consume_from_fifo, args=[fifo_path]).start() 69 | # replace current process with original start-crawl 70 | os.execv(which('start-crawl'), sys.argv) 71 | 72 | 73 | if __name__ == '__main__': 74 | sys.exit(main()) 75 | -------------------------------------------------------------------------------- /docs/scheduling.rst: -------------------------------------------------------------------------------- 1 | .. _scheduling: 2 | 3 | ===================================== 4 | Scheduling jobs and fetching job data 5 | ===================================== 6 | 7 | shub allows you to schedule a spider run from the command line:: 8 | 9 | shub schedule SPIDER 10 | 11 | where ``SPIDER`` should match the spider's name. By default, shub will schedule 12 | the spider in your default project (as defined in ``scrapinghub.yml``). 
You may 13 | also explicitly specify the project to use:: 14 | 15 | shub schedule project_alias_or_id/SPIDER 16 | 17 | You can supply spider arguments and job-specific settings through the ``-a`` 18 | and ``-s`` options:: 19 | 20 | $ shub schedule myspider -a ARG1=VALUE -a ARG2=VALUE 21 | Spider myspider scheduled, job ID: 12345/2/15 22 | Watch the log on the command line: 23 | shub log -f 2/15 24 | or print items as they are being scraped: 25 | shub items -f 2/15 26 | or watch it running in Scrapinghub's web interface: 27 | https://app.zyte.com/p/12345/job/2/15 28 | 29 | :: 30 | 31 | $ shub schedule 33333/myspider -s LOG_LEVEL=DEBUG 32 | Spider myspider scheduled, job ID: 33333/2/15 33 | Watch the log on the command line: 34 | shub log -f 2/15 35 | or print items as they are being scraped: 36 | shub items -f 2/15 37 | or watch it running in Scrapinghub's web interface: 38 | https://app.zyte.com/p/33333/job/2/15 39 | 40 | You can also specify the amount of Scrapy Cloud units (``-u``) and the priority (``-p``):: 41 | 42 | $ shub schedule myspider -p 3 -u 3 43 | Spider myspider scheduled, job ID: 12345/2/16 44 | Watch the log on the command line: 45 | shub log -f 2/16 46 | or print items as they are being scraped: 47 | shub items -f 2/16 48 | or watch it running in Scrapinghub's web interface: 49 | https://app.zyte.com/p/12345/job/2/16 50 | 51 | shub provides commands to retrieve log entries, scraped items, or requests from 52 | jobs. If the job is still running, you can provide the ``-f`` (follow) option 53 | to receive live updates:: 54 | 55 | $ shub log -f 2/15 56 | 2016-01-02 16:38:35 INFO Log opened. 57 | 2016-01-02 16:38:35 INFO [scrapy.log] Scrapy 1.0.3.post6+g2d688cd started 58 | ... 59 | # shub will keep updating the log until the job finishes or you hit CTRL+C 60 | 61 | :: 62 | 63 | $ shub items 2/15 64 | {"name": "Example product", description": "Example description"} 65 | {"name": "Another product", description": "Another description"} 66 | 67 | :: 68 | 69 | $ shub requests 1/1/1 70 | {"status": 200, "fp": "1ff11f1543809f1dbd714e3501d8f460b92a7a95", "rs": 138137, "_key": "1/1/1/0", "url": "http://blog.scrapinghub.com", "time": 1449834387621, "duration": 238, "method": "GET"} 71 | {"status": 200, "fp": "418a0964a93e139166dbf9b33575f10f31f17a1", "rs": 138137, "_key": "1/1/1/0", "url": "http://blog.scrapinghub.com", "time": 1449834390881, "duration": 163, "method": "GET"} 72 | -------------------------------------------------------------------------------- /shub/cancel.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from scrapinghub import ScrapinghubAPIError 4 | from scrapinghub.client.utils import parse_job_key 5 | 6 | from shub.utils import get_scrapinghub_client_from_config 7 | from shub.config import get_target_conf 8 | from shub.exceptions import ( 9 | ShubException, 10 | BadParameterException, 11 | SubcommandException, 12 | ) 13 | 14 | 15 | HELP = """ 16 | Cancel multiple jobs from Scrapy Cloud. 
17 | 18 | The cancel command expects the project ID (target) followed by 19 | the pair containing the spider ID and Job ID: 20 | 21 | shub cancel 12345 1/1 1/2 1/3 22 | 23 | If the project ID is not defined it is going to use the default 24 | project (as defined in scrapinghub.yml): 25 | 26 | shub cancel 1/1 1/2 1/3 27 | 28 | The cancel command requires a confirmation that could be skipped 29 | with the flag --force/-f: 30 | 31 | shub cancel --force 1/1 1/2 1/3 32 | """ 33 | 34 | 35 | SHORT_HELP = "Cancel multiple jobs from Scrapy Cloud" 36 | 37 | 38 | @click.command(help=HELP, short_help=SHORT_HELP) 39 | @click.argument("target_or_key") 40 | @click.argument("keys", nargs=-1) 41 | @click.option('--force', '-f', is_flag=True, 42 | help='It ignores the confirmation prompt') 43 | def cli(target_or_key, keys, force): 44 | # target_or_key contains a target or just another job key 45 | if "/" in target_or_key: 46 | keys = (target_or_key,) + keys 47 | target = "default" 48 | else: 49 | target = target_or_key 50 | 51 | targetconf = get_target_conf(target) 52 | project_id = targetconf.project_id 53 | client = get_scrapinghub_client_from_config(targetconf) 54 | project = client.get_project(project_id) 55 | 56 | try: 57 | job_keys = [validate_job_key(project_id, key) for key in keys] 58 | except (BadParameterException, SubcommandException) as err: 59 | click.echo('Error during keys validation: %s' % str(err)) 60 | exit(1) 61 | 62 | if not force: 63 | jobs_str = ", ".join([str(job) for job in job_keys]) 64 | click.confirm( 65 | 'Do you want to cancel these %s jobs? \n\n%s \n\nconfirm?' 66 | % (len(job_keys), jobs_str), 67 | abort=True 68 | ) 69 | 70 | try: 71 | output = project.jobs.cancel( 72 | keys=[str(job) for job in job_keys] 73 | ) 74 | except (ValueError, ScrapinghubAPIError) as err: 75 | raise ShubException(str(err)) 76 | 77 | click.echo(output) 78 | 79 | 80 | def validate_job_key(project_id, short_key): 81 | job_key = f"{project_id}/{short_key}" 82 | 83 | if len(short_key.split("/")) != 2: 84 | raise BadParameterException( 85 | "keys must be defined as /" 86 | ) 87 | 88 | try: 89 | return parse_job_key(job_key) 90 | except ValueError as err: 91 | raise BadParameterException(str(err)) 92 | except Exception as err: 93 | raise SubcommandException(str(err)) 94 | -------------------------------------------------------------------------------- /shub/copy_eggs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from urllib.parse import urljoin 3 | from tempfile import mkdtemp 4 | import click 5 | import requests 6 | from shutil import rmtree 7 | 8 | from shub.config import get_target_conf 9 | from shub.fetch_eggs import fetch_eggs 10 | from shub.utils import decompress_egg_files, _deploy_dependency_egg 11 | 12 | SHORT_HELP = "Sync eggs from one project with other project" 13 | 14 | HELP = SHORT_HELP + """ 15 | 16 | Fetch all eggs from one project and upload them to other project. 
This allows 17 | you to easily clone requirements from an old project into a new one.""" 18 | 19 | 20 | @click.command(help=HELP, short_help=SHORT_HELP) 21 | @click.option("--source_project", 22 | prompt="From which projects should I download eggs?") 23 | @click.option("--new_project", 24 | prompt="To which project should I upload eggs?") 25 | @click.option("-m", "--copy-main", default=False, is_flag=True, 26 | help="copy main Scrapy project egg") 27 | def cli(source_project, new_project, copy_main): 28 | source = get_target_conf(source_project) 29 | target = get_target_conf(new_project) 30 | copy_eggs(source.project_id, source.endpoint, source.apikey, 31 | target.project_id, target.endpoint, target.apikey, 32 | copy_main) 33 | 34 | 35 | def copy_eggs(project, endpoint, apikey, new_project, new_endpoint, new_apikey, 36 | copy_main): 37 | egg_versions = get_eggs_versions(project, endpoint, apikey) 38 | temp_dir = mkdtemp() 39 | destfile = os.path.join(temp_dir, 'eggs-%s.zip' % project) 40 | fetch_eggs(project, endpoint, apikey, destfile) 41 | # Decompress project bundle (so temp_dir will contain all project eggs) 42 | decompress_egg_files(directory=temp_dir) 43 | destdir = os.path.join(temp_dir, f"eggs-{project}") 44 | for egg_name in os.listdir(destdir): 45 | if egg_name == "__main__.egg" and not copy_main: 46 | continue 47 | name = egg_name.partition(".egg")[0] 48 | try: 49 | version = egg_versions[name] 50 | except KeyError: 51 | click.secho( 52 | "WARNING: The following egg belongs to a Dash Addon: %s. " 53 | "Please manually enable the corresponding Addon in the target " 54 | "project." % name, 55 | fg='yellow', 56 | bold=True, 57 | ) 58 | continue 59 | egg_path = os.path.join(destdir, egg_name) 60 | egg_info = (egg_name, egg_path) 61 | _deploy_dependency_egg(new_project, new_endpoint, new_apikey, 62 | name=name, version=version, egg_info=egg_info) 63 | rmtree(temp_dir) 64 | 65 | 66 | def get_eggs_versions(project, endpoint, apikey): 67 | click.echo(f'Getting eggs list from project {project}...') 68 | list_endpoint = urljoin(endpoint, "eggs/list.json") 69 | response = requests.get(list_endpoint, params={"project": project}, 70 | auth=(apikey, '')) 71 | response.raise_for_status() 72 | obj = response.json() 73 | return {x['name']: x['version'] for x in obj['eggs']} 74 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from unittest import mock 4 | 5 | from click.testing import CliRunner 6 | from tqdm.utils import _supports_unicode 7 | 8 | from shub import config 9 | 10 | 11 | class AssertInvokeRaisesMixin: 12 | def assertInvokeRaises(self, exc, *args, **kwargs): 13 | """ 14 | Invoke self.runner (or a new runner if nonexistent) with given *args 15 | and **kwargs, assert that it raised an exception of type exc, and 16 | return the runner's result. 
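        For example, as used by the command tests in this suite:

            result = self.assertInvokeRaises(ShubException, cancel.cli,
                                             ('123456', '1/1', '1/2'),
                                             input='y\n')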
17 | """ 18 | runner = getattr(self, 'runner', None) or CliRunner() 19 | kwargs['standalone_mode'] = False 20 | result = runner.invoke(*args, **kwargs) 21 | self.assertIsInstance(result.exception, exc) 22 | return result 23 | 24 | 25 | def mock_conf(testcase, target=None, attr=None, conf=None): 26 | if not conf: 27 | conf = config.ShubConfig() 28 | conf.projects.update({ 29 | 'default': 1, 30 | 'prod': 2, 31 | 'vagrant': 'vagrant/3', 32 | 'custom1': {'id': 4, 'image': False}, 33 | 'custom2': {'id': 5, 'image': True}, 34 | 'custom3': {'id': 6, 'image': 'custom/image'}, 35 | }) 36 | conf.endpoints.update({ 37 | 'vagrant': 'https://vagrant_ep/api/', 38 | }) 39 | conf.apikeys.update({ 40 | 'default': 32 * '1', 41 | 'vagrant': 32 * '2', 42 | }) 43 | conf.version = 'version' 44 | if target: 45 | if attr: 46 | patcher = mock.patch.object(target, attr, return_value=conf, 47 | autospec=True) 48 | else: 49 | patcher = mock.patch(target, return_value=conf, autospec=True) 50 | else: 51 | patcher = mock.patch('shub.config.load_shub_config', return_value=conf, 52 | autospec=True) 53 | patcher.start() 54 | testcase.addCleanup(patcher.stop) 55 | return conf 56 | 57 | 58 | def _is_tqdm_in_ascii_mode(): 59 | """Small helper deciding about placeholders for tqdm progress bars.""" 60 | with CliRunner().isolation(): 61 | return not _supports_unicode(sys.stdout) 62 | 63 | 64 | def format_expected_progress(progress): 65 | """Replace unicode symbols in progress string for tqdm in ascii mode.""" 66 | if _is_tqdm_in_ascii_mode(): 67 | to_replace = {'█': '#', '▏': '2', '▎': '3', '▌': '5', '▋': '6'} 68 | for sym in to_replace: 69 | progress = progress.replace(sym, to_replace[sym]) 70 | return progress 71 | 72 | 73 | def clean_progress_output(output): 74 | """Return output cleaned from \\r, \\n, and ANSI escape sequences""" 75 | return re.sub( 76 | r"""(?x) # Matches: 77 | \n|\r| # 1. newlines or carriage returns, or 78 | (\x1b\[|\x9b) # 2. ANSI control sequence introducer ("ESC[" or single 79 | # byte \x9b) + 80 | [^@-_]*[@-_]| # private mode characters + command character, or 81 | \x1b[@-_] # 3. ANSI control codes without sequence introducer 82 | # ("ESC" + single command character) 83 | """, 84 | '', output) 85 | -------------------------------------------------------------------------------- /shub/image/upload.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from shub.config import list_targets_callback 4 | from shub.image import build 5 | from shub.image import push 6 | from shub.image import deploy 7 | from shub.image import utils 8 | 9 | 10 | SHORT_HELP = "Shortcut command for build-push-deploy chain" 11 | HELP = """ 12 | Upload command is a handy shortcut to rebuild and redeploy your project 13 | (in other words it does consecutive calls of build-push-deploy cmds). 14 | 15 | Obviously it accepts all the options for the commands above. 
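An illustrative invocation, assuming a "default" target is defined in your
scrapinghub.yml:

    shub image upload default --version 1.0 --skip-tests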
16 | """ 17 | 18 | 19 | @click.command(help=HELP, short_help=SHORT_HELP) 20 | @click.argument("target", required=False, default="default") 21 | @click.option("-l", "--list-targets", is_flag=True, is_eager=True, 22 | expose_value=False, callback=list_targets_callback, 23 | help="List available project names defined in your config") 24 | @click.option("-d", "--debug", help="debug mode", is_flag=True, 25 | callback=utils.deprecate_debug_parameter) 26 | @click.option("-v", "--verbose", is_flag=True, 27 | help="stream upload logs to console") 28 | @click.option("-V", "--version", help="release version") 29 | @click.option("--username", help="docker registry name") 30 | @click.option("--password", help="docker registry password") 31 | @click.option("--email", help="docker registry email") 32 | @click.option("--apikey", help="SH apikey to use built-in registry") 33 | @click.option("--insecure", is_flag=True, help="use insecure registry") 34 | @click.option("--async", "async_", is_flag=True, help="[DEPRECATED] enable asynchronous mode", 35 | callback=utils.deprecate_async_parameter) 36 | @click.option("-S", "--skip-tests", help="skip testing image", is_flag=True) 37 | @click.option("-R", "--reauth", is_flag=True, 38 | help="re-authenticate to registry before pushing") 39 | @click.option("-n", "--no-cache", is_flag=True, 40 | help="Do not use cache when building the image") 41 | @click.option("-b", "--build-arg", multiple=True, 42 | help="Allow to pass build arguments to docker client.") 43 | @click.option("-f", "--file", "filename", default='Dockerfile', 44 | help="Name of the Dockerfile (Default is 'PATH/Dockerfile')") 45 | def cli(target, debug, verbose, version, username, password, email, 46 | apikey, insecure, async_, skip_tests, reauth, no_cache, build_arg, filename): 47 | upload_cmd(target, version, username, password, email, apikey, insecure, 48 | async_, skip_tests, reauth, no_cache, build_arg, filename) 49 | 50 | 51 | def upload_cmd(target, version, username=None, password=None, email=None, 52 | apikey=None, insecure=False, async_=False, skip_tests=False, 53 | reauth=False, no_cache=False, build_arg=(), filename='Dockerfile'): 54 | build.build_cmd(target, version, skip_tests, no_cache, build_arg, filename=filename) 55 | # skip tests for push command anyway because they run in build command if not skipped 56 | push.push_cmd(target, version, username, password, email, apikey, 57 | insecure, skip_tests=True, reauth=reauth) 58 | deploy.deploy_cmd(target, version, username, password, email, 59 | apikey, insecure, async_) 60 | -------------------------------------------------------------------------------- /shub/migrate_eggs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | 4 | import errno 5 | 6 | from shub.compat import to_unicode 7 | from urllib.parse import urljoin 8 | 9 | from io import BytesIO 10 | 11 | import click 12 | import requests 13 | 14 | from shub.config import get_target_conf, ShubConfig 15 | 16 | HELP = """ 17 | Migrate eggs stored in Dash's "Code & Deploy" section. 18 | 19 | Eggs that are available in PYPI will be stored in requirements.txt file. 20 | The rest will be stored in user provided directory and send to Dash 21 | for each deployment. 22 | 23 | After the operation is completed, please review changes made to 24 | scrapinghub.yml and requirements.txt files. 
25 | """ 26 | 27 | SHORT_HELP = "Migrate dash eggs to requirements.txt and project's directory" 28 | 29 | 30 | @click.command(help=HELP, short_help=SHORT_HELP) 31 | @click.argument("target", required=False, default='default') 32 | def cli(target): 33 | main(target) 34 | 35 | 36 | def main(target): 37 | targetconf = get_target_conf(target) 38 | 39 | url = urljoin(targetconf.endpoint, 'migrate-eggs.zip') 40 | params = {'project': targetconf.project_id} 41 | auth = (targetconf.apikey, '') 42 | 43 | response = requests.get(url, auth=auth, params=params, stream=True) 44 | 45 | with zipfile.ZipFile(BytesIO(response.content), 'r') as mfile: 46 | Migrator(mfile).start() 47 | 48 | 49 | class Migrator: 50 | def __init__(self, mfile): 51 | self.mfile = mfile 52 | self.sh_yml = './scrapinghub.yml' 53 | self.conf = ShubConfig() 54 | self.conf.load_file(self.sh_yml) 55 | 56 | self.req_content = to_unicode(self.mfile.read('requirements.txt')) 57 | self.eggs = [] 58 | 59 | for filename in self.mfile.namelist(): 60 | if filename.endswith('.egg'): 61 | self.eggs.append(filename) 62 | 63 | def start(self): 64 | if self.eggs: 65 | self.migrate_eggs() 66 | 67 | self.migrate_requirements_txt() 68 | 69 | self.conf.save(self.sh_yml) 70 | 71 | def migrate_eggs(self): 72 | eggsdir = './eggs' 73 | msg = f"Eggs will be stored in {eggsdir}, are you sure ? " 74 | click.confirm(msg) 75 | try: 76 | os.mkdir(eggsdir) 77 | except OSError as e: 78 | if e.errno != errno.EEXIST: 79 | raise 80 | 81 | for filename in self.eggs: 82 | filepath = os.path.join(eggsdir, filename) 83 | if filepath in self.conf.eggs: 84 | continue 85 | 86 | self.conf.eggs.append(filepath) 87 | self.mfile.extract(filename, eggsdir) 88 | 89 | def migrate_requirements_txt(self): 90 | req_file = self.conf.requirements_file or './requirements.txt' 91 | 92 | if os.path.isfile(req_file): 93 | y = click.confirm( 94 | 'requirements.txt already exists, ' 95 | 'are you sure to override it ?' 
96 | ) 97 | if not y: 98 | click.echo('Aborting') 99 | return 100 | 101 | self.conf.requirements_file = req_file 102 | 103 | with open(self.conf.requirements_file, 'w') as reqfile: 104 | reqfile.write(self.req_content) 105 | -------------------------------------------------------------------------------- /tests/image/test_test.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | 3 | import pytest 4 | from click.testing import CliRunner 5 | 6 | from shub import exceptions as shub_exceptions 7 | from shub.image.test import ( 8 | cli, _run_docker_command, _check_image_size, _check_start_crawl_entry, 9 | IMAGE_SIZE_LIMIT, 10 | ) 11 | 12 | from .utils import FakeProjectDirectory 13 | from .utils import add_sh_fake_config 14 | 15 | 16 | class MockedNotFound(Exception): 17 | """Mocking docker.errors.NotFound""" 18 | 19 | 20 | @pytest.fixture 21 | def docker_client(): 22 | client = mock.Mock() 23 | client.create_container.return_value = {'Id': '12345'} 24 | client.wait.return_value = {'Error': None, 'StatusCode': 0} 25 | client.logs.return_value = 'some-logs' 26 | return client 27 | 28 | 29 | def test_test_cli(monkeypatch, docker_client): 30 | """ This test mocks docker library to test the function itself """ 31 | monkeypatch.setattr('docker.errors.NotFound', MockedNotFound) 32 | monkeypatch.setattr('shub.image.utils.get_docker_client', 33 | lambda *args, **kwargs: docker_client) 34 | with FakeProjectDirectory() as tmpdir: 35 | add_sh_fake_config(tmpdir) 36 | runner = CliRunner() 37 | result = runner.invoke( 38 | cli, ["dev", "-v", "--version", "test"]) 39 | assert result.exit_code == 0 40 | 41 | 42 | def test_check_image_exists(monkeypatch, docker_client): 43 | assert _check_image_size('img', docker_client) is None 44 | 45 | monkeypatch.setattr('docker.errors.NotFound', MockedNotFound) 46 | docker_client.inspect_image.side_effect = MockedNotFound 47 | with pytest.raises(shub_exceptions.NotFoundException): 48 | _check_image_size('image', docker_client) 49 | 50 | 51 | def test_check_image_size(monkeypatch, docker_client): 52 | docker_client.inspect_image.return_value = {'Size': IMAGE_SIZE_LIMIT} 53 | assert _check_image_size('img', docker_client) is None 54 | 55 | docker_client.inspect_image.return_value = {'Size': IMAGE_SIZE_LIMIT + 1} 56 | with pytest.raises(shub_exceptions.CustomImageTooLargeException): 57 | _check_image_size('image', docker_client) 58 | 59 | 60 | def test_start_crawl(docker_client): 61 | assert _check_start_crawl_entry('image', docker_client) is None 62 | docker_client.create_container.assert_called_with( 63 | image='image', command=['which', 'start-crawl']) 64 | docker_client.wait.return_value = {'Error': None, 'StatusCode': 1} 65 | with pytest.raises(shub_exceptions.NotFoundException): 66 | _check_start_crawl_entry('image', docker_client) 67 | 68 | docker_client.wait.return_value = {'Error': None, 'StatusCode': 0} 69 | docker_client.logs.return_value = '' 70 | with pytest.raises(shub_exceptions.NotFoundException): 71 | _check_start_crawl_entry('image', docker_client) 72 | 73 | 74 | def test_run_docker_command(docker_client): 75 | assert _run_docker_command( 76 | docker_client, 'image-name', ['some', 'cmd']) == \ 77 | (0, 'some-logs') 78 | docker_client.create_container.assert_called_with( 79 | image='image-name', command=['some', 'cmd']) 80 | docker_client.start.assert_called_with({'Id': '12345'}) 81 | docker_client.wait.assert_called_with(container='12345') 82 | docker_client.logs.assert_called_with( 83 | 
container='12345', stdout=True, stderr=False, 84 | stream=False, timestamps=False) 85 | docker_client.remove_container.assert_called_with({'Id': '12345'}) 86 | -------------------------------------------------------------------------------- /shub/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | shub-specific exceptions. 3 | 4 | Exit codes follow the sysexits.h convention: 5 | https://www.freebsd.org/cgi/man.cgi?query=sysexits&sektion=3 6 | """ 7 | 8 | 9 | import sys 10 | import warnings 11 | 12 | from click import BadParameter, ClickException 13 | 14 | 15 | class ShubException(ClickException): 16 | def __init__(self, msg=None): 17 | super().__init__(msg or self.default_msg) 18 | 19 | 20 | class MissingAuthException(ShubException): 21 | # EX_NOPERM would be more appropriate here but would forbid distinguishing 22 | # this from InvalidAuth by exit code 23 | exit_code = 67 # EX_NOUSER 24 | default_msg = "Not logged in. Please run 'shub login' first." 25 | 26 | 27 | class InvalidAuthException(ShubException): 28 | exit_code = 77 # EX_NOPERM 29 | default_msg = ("Authentication failure. Please make sure that your API key" 30 | " is valid.") 31 | 32 | 33 | class AlreadyLoggedInException(ShubException): 34 | exit_code = 0 35 | default_msg = ("You are already logged in. To change credentials, use " 36 | "'shub logout' first.") 37 | 38 | 39 | class ConfigParseException(ShubException): 40 | exit_code = 65 # EX_DATAERR 41 | default_msg = "Unable to parse configuration." 42 | 43 | 44 | class BadConfigException(ShubException): 45 | exit_code = 78 # EX_CONFIG 46 | # Should be initialised with more specific message 47 | default_msg = "Please check your scrapinghub.yml." 48 | 49 | 50 | class NotFoundException(ShubException): 51 | # Should be initialised with more specific message 52 | exit_code = 69 # EX_UNAVAILABLE 53 | default_msg = "Not found." 54 | 55 | 56 | class BadParameterException(BadParameter): 57 | exit_code = 64 # EX_USAGE 58 | 59 | 60 | class SubcommandException(ShubException): 61 | exit_code = 65 # EX_DATAERR 62 | default_msg = "Error while calling subcommand." 63 | 64 | 65 | class RemoteErrorException(ShubException): 66 | exit_code = 76 # EX_PROTOCOL 67 | # Should be initialised with more specific message 68 | default_msg = "Remote error." 69 | 70 | 71 | class DeployRequestTooLargeException(ShubException): 72 | exit_code = 65 # EX_DATAERR 73 | default_msg = ("Deploy request is too large. Please make sure that your " 74 | "project egg(s) size is less than 50MB in total.") 75 | 76 | 77 | class CustomImageTooLargeException(ShubException): 78 | exit_code = 65 # EX_DATAERR 79 | default_msg = ("Custom Docker image is too large. 
Please make sure that " 80 | "your image size is less than 3GB.") 81 | 82 | 83 | class ShubWarning(Warning): 84 | """Base class for custom warnings.""" 85 | 86 | 87 | class ShubDeprecationWarning(ShubWarning): 88 | """Warning category for deprecated features, since the default 89 | DeprecationWarning is silenced on Python 2.7+ 90 | """ 91 | 92 | 93 | def print_warning(msg, category=ShubWarning): 94 | """Helper to use Python warnings with custom formatter.""" 95 | 96 | def custom_showwarning(message, *args, **kwargs): 97 | # ignore everything except the message 98 | try: 99 | sys.stderr.write("WARNING: " + str(message) + '\n') 100 | # stderr is invalid - this warning just gets lost 101 | except (OSError, UnicodeError): 102 | pass 103 | 104 | old_showwarning = warnings.showwarning 105 | try: 106 | warnings.showwarning = custom_showwarning 107 | warnings.warn(msg, category=category) 108 | finally: 109 | warnings.showwarning = old_showwarning 110 | -------------------------------------------------------------------------------- /tests/test_jobresource.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import time 3 | import json 4 | from unittest import mock 5 | 6 | from click.testing import CliRunner 7 | 8 | from shub import items, log, requests 9 | 10 | 11 | class JobResourceTest(unittest.TestCase): 12 | 13 | def setUp(self): 14 | self.runner = CliRunner() 15 | 16 | def _test_prints_objects(self, cmd_mod, resource_name): 17 | objects = ['Object 1', 'Object 2'] 18 | jobid = '1/2/3' 19 | with mock.patch.object(cmd_mod, 'get_job', autospec=True) as mock_gj: 20 | # Patch job.items.iter_json() to return our objects 21 | mock_gj.return_value._metadata_updated = time.time() 22 | mock_resource = getattr(mock_gj.return_value, resource_name) 23 | mock_resource.iter_json.return_value = objects 24 | result = self.runner.invoke(cmd_mod.cli, (jobid,)) 25 | mock_gj.assert_called_once_with(jobid) 26 | self.assertIn("\n".join(objects), result.output) 27 | 28 | def _test_forwards_follow(self, cmd_mod): 29 | with mock.patch.object(cmd_mod, 'get_job'), \ 30 | mock.patch.object(cmd_mod, 'job_resource_iter', autospec=True) \ 31 | as mock_jri: 32 | self.runner.invoke(cmd_mod.cli, ('1/2/3',)) 33 | self.assertFalse(mock_jri.call_args[1]['follow']) 34 | self.runner.invoke(cmd_mod.cli, ('1/2/3', '-f')) 35 | self.assertTrue(mock_jri.call_args[1]['follow']) 36 | 37 | def test_items(self): 38 | self._test_prints_objects(items, 'items') 39 | self._test_forwards_follow(items) 40 | 41 | def test_requests(self): 42 | self._test_prints_objects(requests, 'requests') 43 | self._test_forwards_follow(requests) 44 | 45 | def test_log(self): 46 | objects = [ 47 | {'time': 0, 'level': 20, 'message': 'message 1'}, 48 | {'time': 1450874471000, 'level': 50, 'message': 'message 2'}, 49 | ] 50 | jobid = '1/2/3' 51 | with mock.patch.object(log, 'get_job', autospec=True) as mock_gj: 52 | mock_gj.return_value._metadata_updated = time.time() 53 | mock_gj.return_value.logs.iter_values.return_value = objects 54 | result = self.runner.invoke(log.cli, (jobid,)) 55 | mock_gj.assert_called_once_with(jobid) 56 | self.assertIn('1970-01-01 00:00:00 INFO message 1', result.output) 57 | self.assertIn('2015-12-23 12:41:11 CRITICAL message 2', result.output) 58 | with mock.patch.object(log, 'get_job', autospec=True) as mock_gj: 59 | with mock.patch.object(log, 'job_resource_iter', autospec=True) as mock_res_iter: 60 | mock_res_iter.return_value = [json.dumps(x) for x in objects] 61 | result = 
self.runner.invoke(log.cli, (jobid, '--json')) 62 | self.assertTrue(mock_res_iter.call_args[1].get('output_json')) 63 | for idx, line in enumerate(result.output.splitlines()): 64 | self.assertEqual(json.loads(line), objects[idx]) 65 | self._test_forwards_follow(log) 66 | 67 | def test_log_unicode(self): 68 | objects = [ 69 | {'time': 0, 'level': 20, 'message': 'jarzębina'} 70 | ] 71 | jobid = '1/2/3' 72 | with mock.patch.object(log, 'get_job', autospec=True) as mock_gj: 73 | mock_gj.return_value._metadata_updated = time.time() 74 | mock_gj.return_value.logs.iter_values.return_value = objects 75 | result = self.runner.invoke(log.cli, (jobid,)) 76 | mock_gj.assert_called_once_with(jobid) 77 | self.assertIn('1970-01-01 00:00:00 INFO jarzębina', result.output) 78 | -------------------------------------------------------------------------------- /tests/test_cancel.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from collections import namedtuple 3 | from unittest import mock 4 | 5 | from click.testing import CliRunner 6 | 7 | from shub import cancel 8 | from shub.exceptions import ( 9 | BadParameterException, 10 | ShubException, 11 | ) 12 | 13 | from .utils import AssertInvokeRaisesMixin, mock_conf 14 | 15 | 16 | class CancelTest(AssertInvokeRaisesMixin, unittest.TestCase): 17 | 18 | def setUp(self): 19 | self.runner = CliRunner() 20 | self.conf = mock_conf(self) 21 | 22 | @mock.patch('shub.cancel.get_scrapinghub_client_from_config') 23 | def test_simple_cancel_call(self, mock_client): 24 | client = mock_client.return_value 25 | mock_proj = client.get_project.return_value 26 | mock_proj.jobs.cancel.return_value = {'count': 2} 27 | 28 | result = self.runner.invoke( 29 | cancel.cli, ('123456', '1/1', '1/2',), input='y\n' 30 | ) 31 | 32 | self.assertTrue("{'count': 2}" in result.output) 33 | self.assertEqual(0, result.exit_code) 34 | self.assertEqual(mock_proj.jobs.cancel.call_count, 1) 35 | mock_proj.jobs.cancel.assert_called_with( 36 | keys=['123456/1/1', '123456/1/2'] 37 | ) 38 | 39 | @mock.patch('shub.cancel.get_target_conf') 40 | @mock.patch('shub.cancel.get_scrapinghub_client_from_config') 41 | def test_cancel_default_project(self, mock_client, targetconf): 42 | client = mock_client.return_value 43 | mock_proj = client.get_project.return_value 44 | mock_proj.jobs.cancel.return_value = {'count': 2} 45 | 46 | Target = namedtuple('Target', 'project_id') 47 | targetconf.return_value = Target(project_id='123456') 48 | 49 | result = self.runner.invoke( 50 | cancel.cli, ('1/1', '1/2',), input='y\n' 51 | ) 52 | 53 | self.assertTrue("{'count': 2}" in result.output) 54 | self.assertEqual(0, result.exit_code) 55 | self.assertEqual(mock_proj.jobs.cancel.call_count, 1) 56 | mock_proj.jobs.cancel.assert_called_with( 57 | keys=['123456/1/1', '123456/1/2'] 58 | ) 59 | 60 | @mock.patch('shub.cancel.get_scrapinghub_client_from_config') 61 | def test_invalid_job_key(self, mock_client): 62 | self.assertInvokeRaises( 63 | SystemExit, 64 | cancel.cli, 65 | ('123456', '1/1', '1',), 66 | input='y\n' 67 | ) 68 | 69 | self.assertInvokeRaises( 70 | SystemExit, 71 | cancel.cli, 72 | ('123456', '1/abc', '1',), 73 | input='y\n' 74 | ) 75 | 76 | @mock.patch('shub.cancel.get_scrapinghub_client_from_config') 77 | def test_cancel_failure(self, mock_client): 78 | client = mock_client.return_value 79 | mock_proj = client.get_project.return_value 80 | mock_proj.jobs.cancel.side_effect = ValueError('Error msg') 81 | 82 | self.assertInvokeRaises( 83 | ShubException, 84 
| cancel.cli, 85 | ('123456', '1/1', '1/2',), 86 | input='y\n', 87 | ) 88 | 89 | @mock.patch('shub.cancel.get_scrapinghub_client_from_config') 90 | def test_cancel_abort(self, mock_client): 91 | client = mock_client.return_value 92 | client.get_project.return_value 93 | 94 | result = self.runner.invoke( 95 | cancel.cli, ('123456', '1/1', '1/2',), input='N\n', 96 | ) 97 | self.assertTrue('Aborted!' in result.output) 98 | 99 | def test_validate_job_key(self): 100 | with self.assertRaises(BadParameterException): 101 | cancel.validate_job_key('123456', '1') 102 | 103 | with self.assertRaises(BadParameterException): 104 | cancel.validate_job_key('123456', '1/abc') 105 | 106 | with self.assertRaises(BadParameterException): 107 | cancel.validate_job_key('123456', '') 108 | -------------------------------------------------------------------------------- /tests/test_login.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import textwrap 3 | from unittest.mock import patch, MagicMock 4 | 5 | import yaml 6 | from click.testing import CliRunner 7 | from yaml import CLoader as Loader 8 | 9 | from shub import login 10 | from shub.exceptions import AlreadyLoggedInException 11 | 12 | from .utils import AssertInvokeRaisesMixin 13 | 14 | 15 | VALID_KEY = 32 * '1' 16 | 17 | 18 | @patch('shub.login.GLOBAL_SCRAPINGHUB_YML_PATH', new='.scrapinghub.yml') 19 | @patch('shub.config.GLOBAL_SCRAPINGHUB_YML_PATH', new='.scrapinghub.yml') 20 | @patch('shub.config.NETRC_PATH', new='.netrc') 21 | @patch('shub.config.get_sources', new=MagicMock(return_value=[])) 22 | class LoginTest(AssertInvokeRaisesMixin, unittest.TestCase): 23 | 24 | def setUp(self): 25 | self.runner = CliRunner() 26 | 27 | def _run(self, user_input=VALID_KEY, files=None, fs=None, **kwargs): 28 | """Invokes the login cli on an isolated filesystem""" 29 | 30 | def write_local_test_files(): 31 | for path, content in (files or {}).items(): 32 | with open(path, 'w') as f: 33 | f.write(content) 34 | 35 | def invoke(): 36 | return self.runner.invoke(login.cli, input=user_input, **kwargs) 37 | 38 | def run(): 39 | write_local_test_files() 40 | with patch.object(login, '_is_valid_apikey', return_value=True): 41 | return invoke() 42 | 43 | if fs: 44 | return run() 45 | 46 | with self.runner.isolated_filesystem() as fs: 47 | return run() 48 | 49 | def test_write_key_to_new_file(self): 50 | with self.runner.isolated_filesystem() as fs: 51 | self._run(fs=fs) 52 | with open('.scrapinghub.yml') as f: 53 | conf = yaml.load(f, Loader=Loader) 54 | self.assertEqual(conf['apikeys']['default'], VALID_KEY) 55 | 56 | def test_write_key_to_existing_file(self): 57 | VALID_SCRAPINGHUB_YML = textwrap.dedent(""" 58 | endpoints: 59 | other: some_endpoint 60 | """) 61 | with self.runner.isolated_filesystem() as fs: 62 | files = {'.scrapinghub.yml': VALID_SCRAPINGHUB_YML} 63 | self._run(files=files, fs=fs) 64 | with open('.scrapinghub.yml') as f: 65 | conf = yaml.load(f, Loader=Loader) 66 | self.assertEqual(conf['apikeys']['default'], VALID_KEY) 67 | self.assertEqual(conf['endpoints']['other'], "some_endpoint") 68 | 69 | def test_suggest_project_key(self): 70 | PROJECT_SH_YML = textwrap.dedent(""" 71 | apikeys: 72 | default: KEY_SUGGESTION 73 | """) 74 | files = {'scrapinghub.yml': PROJECT_SH_YML} 75 | result = self._run(files=files) 76 | err = 'Unexpected output: %s' % result.output 77 | self.assertTrue('KEY_SUGGESTION' in result.output, err) 78 | 79 | def test_suggest_env_key(self): 80 | result = 
self._run(env={'SHUB_APIKEY': 'SHUB_APIKEY_VALUE'}) 81 | err = 'Unexpected output: %s' % result.output 82 | self.assertTrue('SHUB_APIKEY_VALUE' in result.output, err) 83 | 84 | def test_use_suggestion_to_log_in(self): 85 | apikey_suggestion = 'SHUB_APIKEY_VALUE' 86 | with self.runner.isolated_filesystem() as fs: 87 | self._run( 88 | env={'SHUB_APIKEY': apikey_suggestion}, 89 | user_input='\n', 90 | fs=fs, 91 | ) 92 | with open('.scrapinghub.yml') as f: 93 | conf = yaml.load(f, Loader=Loader) 94 | self.assertEqual(conf['apikeys']['default'], apikey_suggestion) 95 | 96 | def test_login_attempt_after_login_doesnt_lead_to_an_error(self): 97 | with self.runner.isolated_filesystem() as fs: 98 | self._run(fs=fs) 99 | self.assertInvokeRaises(AlreadyLoggedInException, login.cli, 100 | input=VALID_KEY) 101 | -------------------------------------------------------------------------------- /tests/test_bootstrap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import zipfile 3 | from unittest import mock 4 | 5 | import pytest 6 | import requests 7 | import yaml 8 | from click.testing import CliRunner 9 | 10 | from shub.bootstrap import cli, EXAMPLE_REPO, list_projects, unzip_project 11 | from shub.exceptions import ( 12 | BadParameterException, NotFoundException, RemoteErrorException) 13 | 14 | 15 | BOOTSTRAP_PROJECTS = """ 16 | projA: 17 | path: projects_dir/projA 18 | description: PROJECT_A_DESC 19 | 20 | projB: 21 | path: projects_dir/projB 22 | description: PROJECT_B_DESC 23 | """ 24 | 25 | 26 | REPO_ZIP_PATH = os.path.join(os.path.dirname(__file__), 'samples', 27 | 'custom-images-examples-master.zip') 28 | 29 | 30 | @pytest.fixture 31 | def requests_get_mock(): 32 | with mock.patch('shub.bootstrap.requests.get') as m: 33 | yield m 34 | 35 | 36 | @pytest.fixture 37 | def github_responses(requests_get_mock): 38 | requests_get_mock.return_value.text = BOOTSTRAP_PROJECTS 39 | with open(REPO_ZIP_PATH, 'rb') as f: 40 | requests_get_mock.return_value.content = f.read() 41 | 42 | 43 | def test_list_projects(capsys): 44 | projects = yaml.safe_load(BOOTSTRAP_PROJECTS) 45 | list_projects(projects) 46 | out, _ = capsys.readouterr() 47 | assert 'projA' in out 48 | assert 'PROJECT_A_DESC' in out 49 | assert 'projB' in out 50 | assert 'PROJECT_B_DESC' in out 51 | assert 'projects_dir' not in out 52 | 53 | 54 | def test_unzip_project(tempdir): 55 | target_dir = str(tempdir.join('projA')) 56 | project = {'path': 'projects_dir/projA'} 57 | repo_zip = zipfile.ZipFile(REPO_ZIP_PATH) 58 | assert not os.path.exists(target_dir) 59 | unzip_project(repo_zip, project, target_dir) 60 | assert os.path.exists(target_dir) 61 | assert os.path.isfile(os.path.join(target_dir, 'a_file')) 62 | assert os.path.isdir(os.path.join(target_dir, 'a_dir')) 63 | assert os.path.isfile(os.path.join(target_dir, 'a_dir', 'a_dir_file')) 64 | 65 | 66 | @pytest.mark.usefixtures('github_responses') 67 | def test_cli_lists_projects(): 68 | result = CliRunner().invoke(cli, ['-l']) 69 | assert result.exit_code == 0 70 | assert 'projA' in result.output 71 | assert 'PROJECT_A_DESC' in result.output 72 | 73 | 74 | @pytest.mark.usefixtures('github_responses') 75 | def test_cli_clones_project_into_default_dir(tempdir): 76 | target_dir = str(tempdir.join('projA')) 77 | assert not os.path.exists(target_dir) 78 | result = CliRunner().invoke(cli, ['projA']) 79 | assert result.exit_code == 0 80 | assert os.path.isdir(target_dir) 81 | assert os.path.isfile(os.path.join(target_dir, 'a_file')) 82 | 83 | 84 | 
@pytest.mark.usefixtures('github_responses') 85 | def test_cli_clones_project_into_target_dir(tempdir): 86 | target_dir = str(tempdir.join('target_dir')) 87 | assert not os.path.exists(target_dir) 88 | result = CliRunner().invoke(cli, ['projA', 'target_dir']) 89 | assert result.exit_code == 0 90 | assert os.path.isdir(target_dir) 91 | assert os.path.isfile(os.path.join(target_dir, 'a_file')) 92 | 93 | 94 | def test_cli_fails_on_existing_target_dir(tempdir): 95 | os.mkdir('target_dir') 96 | result = CliRunner().invoke(cli, ['some_project', 'target_dir']) 97 | assert result.exit_code == BadParameterException.exit_code 98 | assert "exists" in result.output 99 | 100 | 101 | @pytest.mark.usefixtures('github_responses') 102 | def test_cli_fails_on_unknown_project(): 103 | result = CliRunner().invoke(cli, ['nonexistent']) 104 | assert result.exit_code == NotFoundException.exit_code 105 | assert "shub bootstrap -l" in result.output 106 | 107 | 108 | def test_cli_links_to_repo_on_http_error(requests_get_mock): 109 | requests_get_mock.return_value.raise_for_status.side_effect = ( 110 | requests.HTTPError) 111 | result = CliRunner().invoke(cli, ['some_project']) 112 | assert result.exit_code == RemoteErrorException.exit_code 113 | assert EXAMPLE_REPO in result.output 114 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | tests-ubuntu: 13 | name: "Test: py${{ matrix.python-version }}, Ubuntu" 14 | runs-on: ${{ matrix.os || 'ubuntu-latest' }} 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | include: 19 | - python-version: '3.9' 20 | os: ubuntu-22.04 21 | tox-env: min 22 | - python-version: '3.9' 23 | tox-env: min-poetry 24 | os: ubuntu-22.04 25 | - python-version: '3.9' 26 | - python-version: '3.10' 27 | - python-version: '3.11' 28 | - python-version: '3.12' 29 | - python-version: '3.13' 30 | - python-version: '3.14' 31 | - python-version: '3.14' 32 | tox-env: poetry 33 | 34 | steps: 35 | - uses: actions/checkout@v2 36 | 37 | - name: Set up Python ${{ matrix.python-version }} 38 | uses: actions/setup-python@v5 39 | with: 40 | python-version: ${{ matrix.python-version }} 41 | 42 | - name: Install tox 43 | run: pip install tox 44 | 45 | - name: Run tests 46 | run: tox -e ${{ matrix.tox-env || 'py' }} 47 | 48 | - name: Run off-tox tests 49 | # https://github.com/scrapinghub/shub/issues/441 50 | run: | 51 | python -m venv venv 52 | . venv/bin/activate 53 | pip install . 
54 | python -c "from shub.image.utils import get_docker_client; get_docker_client(validate=False)" 55 | 56 | - name: coverage 57 | uses: codecov/codecov-action@v5 58 | with: 59 | token: ${{ secrets.CODECOV_TOKEN }} 60 | 61 | tests-macos: 62 | name: "Test: py${{ matrix.python-version }}, macOS" 63 | runs-on: ${{ matrix.os || 'macos-latest' }} 64 | strategy: 65 | fail-fast: false 66 | matrix: 67 | include: 68 | - python-version: '3.9' 69 | tox-env: min 70 | os: macos-13 71 | - python-version: '3.9' 72 | tox-env: min-poetry 73 | os: macos-13 74 | - python-version: '3.9' 75 | - python-version: '3.10' 76 | - python-version: '3.11' 77 | - python-version: '3.12' 78 | - python-version: '3.13' 79 | - python-version: '3.14' 80 | - python-version: '3.14' 81 | tox-env: poetry 82 | 83 | steps: 84 | - uses: actions/checkout@v2 85 | 86 | - name: Set up Python ${{ matrix.python-version }} 87 | uses: actions/setup-python@v5 88 | with: 89 | python-version: ${{ matrix.python-version }} 90 | 91 | - name: Install tox 92 | run: pip install tox 93 | 94 | - name: Run tests 95 | run: tox -e ${{ matrix.tox-env || 'py' }} 96 | 97 | - name: coverage 98 | uses: codecov/codecov-action@v5 99 | with: 100 | token: ${{ secrets.CODECOV_TOKEN }} 101 | 102 | tests-windows: 103 | name: "Test: py${{ matrix.python-version }}, Windows" 104 | runs-on: windows-latest 105 | strategy: 106 | fail-fast: false 107 | matrix: 108 | include: 109 | - python-version: '3.9' 110 | tox-env: min 111 | - python-version: '3.9' 112 | tox-env: min-poetry 113 | - python-version: '3.9' 114 | - python-version: '3.10' 115 | - python-version: '3.11' 116 | - python-version: '3.12' 117 | - python-version: '3.13' 118 | - python-version: '3.14' 119 | - python-version: '3.14' 120 | tox-env: poetry 121 | 122 | steps: 123 | - uses: actions/checkout@v2 124 | 125 | - name: Set up Python ${{ matrix.python-version }} 126 | uses: actions/setup-python@v5 127 | with: 128 | python-version: ${{ matrix.python-version }} 129 | 130 | - name: Install tox 131 | run: pip install tox 132 | 133 | - name: Run tests 134 | run: tox -e ${{ matrix.tox-env || 'py' }} 135 | 136 | - name: coverage 137 | uses: codecov/codecov-action@v5 138 | with: 139 | token: ${{ secrets.CODECOV_TOKEN }} 140 | -------------------------------------------------------------------------------- /shub/schedule.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import click 4 | from scrapinghub import ScrapinghubClient, ScrapinghubAPIError 5 | from urllib.parse import urljoin 6 | 7 | from shub.exceptions import RemoteErrorException 8 | from shub.config import get_target_conf 9 | 10 | 11 | HELP = """ 12 | Schedule a spider to run on Scrapy Cloud, optionally with provided spider 13 | arguments and job-specific settings. 14 | 15 | The `spider` argument should match the spider's name, e.g.: 16 | 17 | shub schedule myspider 18 | 19 | By default, shub will schedule the spider in your default project (as defined 20 | in scrapinghub.yml). 
You may also explicitly specify the project to use by 21 | supplying its ID: 22 | 23 | shub schedule 12345/myspider 24 | 25 | Or by supplying an identifier defined in scrapinghub.yml: 26 | 27 | shub schedule production/myspider 28 | 29 | Spider arguments can be supplied through the -a option: 30 | 31 | shub schedule myspider -a ARG1=VALUE1 -a ARG2=VALUE2 32 | 33 | Similarly, job-specific settings can be supplied through the -s option: 34 | 35 | shub schedule myspider -s SETTING=VALUE -s LOG_LEVEL=DEBUG 36 | """ 37 | 38 | SHORT_HELP = "Schedule a spider to run on Scrapy Cloud" 39 | DEFAULT_PRIORITY = 2 40 | 41 | 42 | @click.command(help=HELP, short_help=SHORT_HELP) 43 | @click.argument('spider', type=click.STRING) 44 | @click.option('-a', '--argument', 45 | help='Spider argument (-a name=value)', multiple=True) 46 | @click.option('-s', '--set', 47 | help='Job-specific setting (-s name=value)', multiple=True) 48 | @click.option('-p', '--priority', type=int, default=DEFAULT_PRIORITY, 49 | help='Job priority (-p number). From 0 (lowest) to 4 (highest)') 50 | @click.option('-e', '--environment', multiple=True, 51 | help='Job environment variable (-e VAR=VAL)') 52 | @click.option('-u', '--units', type=int, 53 | help='Amount of Scrapy Cloud units (-u number)') 54 | @click.option('-t', '--tag', 55 | help='Job tags (-t tag)', multiple=True) 56 | def cli(spider, argument, set, environment, priority, units, tag): 57 | try: 58 | target, spider = spider.rsplit('/', 1) 59 | except ValueError: 60 | target = 'default' 61 | targetconf = get_target_conf(target) 62 | job_key = schedule_spider(targetconf.project_id, targetconf.endpoint, 63 | targetconf.apikey, spider, argument, set, 64 | priority, units, tag, environment) 65 | watch_url = urljoin( 66 | targetconf.endpoint, 67 | '../p/{}/{}/{}'.format(*job_key.split('/')), 68 | ) 69 | short_key = job_key.split('/', 1)[1] if target == 'default' else job_key 70 | click.echo(f"Spider {spider} scheduled, job ID: {job_key}") 71 | click.echo("Watch the log on the command line:\n shub log -f {}" 72 | "".format(short_key)) 73 | click.echo("or print items as they are being scraped:\n shub items -f " 74 | "{}".format(short_key)) 75 | click.echo("or watch it running in Zyte's web interface:\n {}" 76 | "".format(watch_url)) 77 | 78 | 79 | def schedule_spider(project, endpoint, apikey, spider, arguments=(), settings=(), 80 | priority=DEFAULT_PRIORITY, units=None, tag=(), environment=()): 81 | client = ScrapinghubClient(apikey, dash_endpoint=endpoint) 82 | try: 83 | project = client.get_project(project) 84 | args = dict(x.split('=', 1) for x in arguments) 85 | cmd_args = args.pop('cmd_args', None) 86 | meta = args.pop('meta', None) 87 | job = project.jobs.run( 88 | spider=spider, 89 | meta=json.loads(meta) if meta else {}, 90 | cmd_args=cmd_args, 91 | job_args=args, 92 | job_settings=dict(x.split('=', 1) for x in settings), 93 | priority=priority, 94 | units=units, 95 | add_tag=tag, 96 | environment=dict(x.split('=', 1) for x in environment), 97 | ) 98 | return job.key 99 | except ScrapinghubAPIError as e: 100 | raise RemoteErrorException(str(e)) 101 | -------------------------------------------------------------------------------- /tests/test_deploy_egg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import unittest 5 | from unittest import mock 6 | from zipfile import ZipFile 7 | 8 | from shub import deploy_egg 9 | from shub.exceptions import BadParameterException 10 | 11 | 12 | 
class FakeRequester: 13 | """Used to mock shub.utils#make_deploy_request""" 14 | def fake_request(self, *args): 15 | self.url = args[0] 16 | self.data = args[1] 17 | self.files = args[2] 18 | self.auth = args[3] 19 | 20 | 21 | @mock.patch.dict(os.environ, {'SHUB_APIKEY': '1234'}) 22 | class TestDeployEgg(unittest.TestCase): 23 | 24 | def setUp(self): 25 | self.curdir = os.getcwd() 26 | self.fake_requester = FakeRequester() 27 | deploy_egg.utils.make_deploy_request = self.fake_requester.fake_request 28 | self.tmp_dir = tempfile.mkdtemp(prefix="shub-test-deploy-eggs") 29 | 30 | def tearDown(self): 31 | os.chdir(self.curdir) 32 | if os.path.exists(self.tmp_dir): 33 | shutil.rmtree(self.tmp_dir) 34 | 35 | def test_parses_project_information_correctly(self): 36 | # this test's assertions are based on the values 37 | # defined on this folder's setup.py file 38 | shutil.rmtree(self.tmp_dir) 39 | shutil.copytree('tests/samples/deploy_egg_sample_project', self.tmp_dir) 40 | os.chdir(self.tmp_dir) 41 | 42 | data = self.call_main_and_check_request_data() 43 | self.assertEqual('1.2.0', data['version']) 44 | 45 | def test_can_clone_a_git_repo_and_deploy_the_egg(self): 46 | self._unzip_git_repo_to(self.tmp_dir) 47 | repo = os.path.join(self.tmp_dir, 'deploy_egg_sample_repo.git') 48 | 49 | self.call_main_and_check_request_data(from_url=repo) 50 | data = self.call_main_and_check_request_data() 51 | 52 | self.assertTrue('master' in data['version']) 53 | 54 | @unittest.skip('flaky') 55 | def test_can_deploy_an_egg_from_pypi(self): 56 | basepath = os.path.abspath('tests/samples/') 57 | pkg = os.path.join(basepath, 'deploy_egg_sample_project.zip') 58 | self.call_main_and_check_request_data(from_pypi=pkg) 59 | 60 | def test_can_clone_checkout_and_deploy_the_egg(self): 61 | self._unzip_git_repo_to(self.tmp_dir) 62 | repo = os.path.join(self.tmp_dir, 'deploy_egg_sample_repo.git') 63 | 64 | branch = 'dev' 65 | data = self.call_main_and_check_request_data(from_url=repo, git_branch=branch) 66 | self.assertTrue('dev' in data['version']) 67 | 68 | def test_fails_on_invalid_repo(self): 69 | self._unzip_git_repo_to(self.tmp_dir) 70 | repo = os.path.join(self.tmp_dir, 'deploy_egg_sample_repo.git') 71 | shutil.rmtree(os.path.join(repo, '.git')) 72 | 73 | with self.assertRaises(BadParameterException): 74 | self.call_main_and_check_request_data(from_url=repo) 75 | 76 | def test_fails_on_invalid_branch(self): 77 | self._unzip_git_repo_to(self.tmp_dir) 78 | repo = os.path.join(self.tmp_dir, 'deploy_egg_sample_repo.git') 79 | with self.assertRaises(BadParameterException): 80 | self.call_main_and_check_request_data( 81 | from_url=repo, git_branch='nonexisting') 82 | 83 | def _unzip_git_repo_to(self, path): 84 | zipped_repo = os.path.abspath('tests/samples/deploy_egg_sample_repo.git.zip') 85 | ZipFile(zipped_repo).extractall(path) 86 | 87 | def call_main_and_check_request_data(self, project_id=0, from_url=None, 88 | git_branch=None, from_pypi=None): 89 | # WHEN 90 | deploy_egg.main(project_id, from_url, git_branch, from_pypi) 91 | 92 | data = self.fake_requester.data 93 | files = self.fake_requester.files 94 | 95 | # THEN 96 | # the egg was successfully built, let's check the data 97 | # that is sent to the scrapy cloud 98 | self.assertTrue('test_project', files['egg'][0]) 99 | self.assertEqual(project_id, data['project']) 100 | self.assertEqual('test_project', data['name']) 101 | 102 | return data 103 | -------------------------------------------------------------------------------- /shub/bootstrap.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | import zipfile 5 | 6 | import click 7 | import requests 8 | import yaml 9 | from click.formatting import HelpFormatter 10 | from io import BytesIO 11 | 12 | from shub.exceptions import ( 13 | BadParameterException, NotFoundException, RemoteErrorException) 14 | 15 | 16 | EXAMPLE_REPO = "scrapinghub/custom-images-examples" 17 | AVAILABLE_PROJECTS_URL = ( 18 | "https://raw.githubusercontent.com/%s/master/bootstrap_projects.yml" 19 | "" % EXAMPLE_REPO) 20 | 21 | HELP = """ 22 | Through custom images, Scrapinghub allows you to run crawlers written in any 23 | language you want. To get you started, we prepared a few examples projects in 24 | different programming languages and frameworks. You can find them in our custom 25 | images repository at: 26 | 27 | https://github.com/scrapinghub/custom-images-examples 28 | 29 | The 'shub bootstrap' command clones an example project to the current directory 30 | so that you can start hacking right away. 31 | 32 | Run 33 | 34 | shub bootstrap -l 35 | 36 | to get a list of all available example projects, then run 37 | 38 | shub bootstrap PROJECTNAME 39 | 40 | to clone it. 41 | """ 42 | 43 | SHORT_HELP = "Clone custom image example project" 44 | 45 | 46 | def list_projects_callback(ctx, param, value): 47 | if not value or ctx.resilient_parsing: 48 | return 49 | projects = get_available_projects() 50 | list_projects(projects) 51 | ctx.exit() 52 | 53 | 54 | @click.command(help=HELP, short_help=SHORT_HELP) 55 | @click.option('-l', '--list', 'list_projects', help='list available projects', 56 | is_flag=True, callback=list_projects_callback, 57 | expose_value=False, is_eager=True) 58 | @click.argument('project') 59 | @click.argument('target_dir', required=False) 60 | def cli(project, target_dir): 61 | target_dir = os.path.normpath( 62 | os.path.join(os.getcwd(), target_dir or project)) 63 | if os.path.exists(target_dir): 64 | raise BadParameterException( 65 | "Target directory %s already exists, please delete it or supply a " 66 | "non-existing target." % target_dir) 67 | projects = get_available_projects() 68 | if project not in projects: 69 | raise NotFoundException( 70 | "There is no example project named '%s'. Run 'shub bootstrap -l' " 71 | "to get a list of all available projects." % project) 72 | click.echo("Downloading custom image examples") 73 | repo_zip = get_repo_zip(EXAMPLE_REPO) 74 | click.echo(f"Cloning project '{project}' into {target_dir}") 75 | unzip_project(repo_zip, project=projects[project], target_dir=target_dir) 76 | 77 | 78 | def get_available_projects(): 79 | try: 80 | resp = requests.get(AVAILABLE_PROJECTS_URL) 81 | resp.raise_for_status() 82 | except (requests.HTTPError, requests.ConnectionError) as e: 83 | raise RemoteErrorException( 84 | "There was an error while getting the list of available projects " 85 | "from GitHub: %s.\n\nPlease check your connection or go to\n %s\n" 86 | "to browse the custom image examples manually." 
87 | "" % (e, "https://github.com/%s" % EXAMPLE_REPO)) 88 | return yaml.safe_load(resp.text) 89 | 90 | 91 | def list_projects(projects): 92 | formatter = HelpFormatter() 93 | with formatter.section("Available projects"): 94 | formatter.write_dl( 95 | sorted( 96 | [(name, info['description']) 97 | for name, info in projects.items()], 98 | key=lambda x: x[0])) 99 | click.echo(formatter.getvalue().strip()) 100 | 101 | 102 | def get_repo_zip(repo): 103 | zip_url = "https://github.com/%s/archive/master.zip" % repo 104 | resp = requests.get(zip_url) 105 | return zipfile.ZipFile(BytesIO(resp.content)) 106 | 107 | 108 | def unzip_project(repo_zip, project, target_dir): 109 | filenames = repo_zip.namelist() 110 | repo_dirname = filenames[0] 111 | project_filenames = [ 112 | fn 113 | for fn in filenames 114 | if fn.startswith(repo_dirname + project['path']) 115 | ] 116 | tempdir = tempfile.mkdtemp() 117 | repo_zip.extractall(path=tempdir, members=project_filenames) 118 | shutil.move( 119 | os.path.join(tempdir, repo_dirname, project['path']), 120 | target_dir, 121 | ) 122 | shutil.rmtree(tempdir) 123 | -------------------------------------------------------------------------------- /shub/deploy_egg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from shutil import which 4 | 5 | import click 6 | 7 | from shub import utils, DEPLOY_DOCS_LINK 8 | from shub.config import get_target_conf 9 | from shub.exceptions import (BadParameterException, NotFoundException, 10 | SubcommandException) 11 | from shub.utils import (decompress_egg_files, download_from_pypi, 12 | run_cmd) 13 | 14 | 15 | HELP = """ 16 | Build a Python egg from source and deploy it to Scrapy Cloud. 17 | 18 | You can either deploy to your default target (as defined in scrapinghub.yml), 19 | or explicitly supply a numerical project ID or a target defined in 20 | scrapinghub.yml (see shub deploy). 21 | 22 | By default, shub will try to build the egg using the current folder's setup.py. 23 | You can also build the egg from a remote (git/mercurial/bazaar) repository by 24 | using the --from-url option: 25 | 26 | shub deploy-egg --from-url https://github.com/scrapinghub/shub.git 27 | 28 | For git repositories, you may additionally specify the branch to be checked 29 | out: 30 | 31 | shub deploy-egg --from-url https://xy.git --git-branch my-feature 32 | 33 | Alternatively, you can build the egg from a PyPI package: 34 | 35 | shub deploy-egg --from-pypi shub 36 | """ 37 | 38 | SHORT_HELP = "[DEPRECATED] Build and deploy egg from source" 39 | 40 | 41 | @click.command(help=HELP, short_help=SHORT_HELP) 42 | @click.argument("target", required=False, default='default') 43 | @click.option("--from-url", help="Git, bazaar or mercurial repository URL") 44 | @click.option("--git-branch", help="Git branch to checkout") 45 | @click.option("--from-pypi", help="Name of package on pypi") 46 | def cli(target, from_url=None, git_branch=None, from_pypi=None): 47 | click.secho( 48 | "deploy-egg was deprecated, define the eggs you would like to deploy " 49 | "in your scrapinghub.yml instead. 
See {}".format(DEPLOY_DOCS_LINK), 50 | err=True, fg='yellow', 51 | ) 52 | main(target, from_url, git_branch, from_pypi) 53 | 54 | 55 | def main(target, from_url=None, git_branch=None, from_pypi=None): 56 | targetconf = get_target_conf(target) 57 | 58 | if from_pypi: 59 | _fetch_from_pypi(from_pypi) 60 | decompress_egg_files() 61 | utils.build_and_deploy_eggs(targetconf.project_id, targetconf.endpoint, 62 | targetconf.apikey) 63 | return 64 | 65 | if from_url: 66 | _checkout(from_url, git_branch) 67 | 68 | if not os.path.isfile('setup.py'): 69 | error = "No setup.py -- are you running from a valid Python project?" 70 | raise NotFoundException(error) 71 | 72 | utils.build_and_deploy_egg(targetconf.project_id, targetconf.endpoint, 73 | targetconf.apikey) 74 | 75 | 76 | def _checkout(repo, git_branch=None, target_dir='egg-tmp-clone'): 77 | tmpdir = tempfile.mkdtemp(prefix='shub-deploy-egg-from-url') 78 | 79 | click.echo("Cloning the repository to a tmp folder...") 80 | os.chdir(tmpdir) 81 | 82 | vcs_commands = [ 83 | ['git', 'clone', repo, target_dir], 84 | ['hg', 'clone', repo, target_dir], 85 | ['bzr', 'branch', repo, target_dir], 86 | ] 87 | missing_exes = [] 88 | for cmd in vcs_commands: 89 | exe = which(cmd[0]) 90 | if not exe: 91 | missing_exes.append(cmd[0]) 92 | continue 93 | try: 94 | run_cmd([exe] + cmd[1:]) 95 | except SubcommandException: 96 | pass 97 | else: 98 | break 99 | else: 100 | if missing_exes: 101 | click.secho( 102 | "shub was unable to find the following VCS executables and " 103 | "could not try to check out your repository with these: %s" 104 | "" % ', '.join(missing_exes), fg='yellow') 105 | raise BadParameterException( 106 | "\nERROR: The provided repository URL is not valid: %s\n") 107 | 108 | os.chdir(target_dir) 109 | 110 | if git_branch: 111 | try: 112 | run_cmd([which('git'), 'checkout', git_branch]) 113 | except SubcommandException: 114 | raise BadParameterException("Branch %s is not valid" % git_branch) 115 | click.echo("%s branch was checked out" % git_branch) 116 | 117 | 118 | def _fetch_from_pypi(pkg): 119 | tmpdir = tempfile.mkdtemp(prefix='shub-deploy-egg-from-pypi') 120 | click.echo('Fetching %s from pypi' % pkg) 121 | download_from_pypi(tmpdir, pkg=pkg) 122 | click.echo('Package fetched successfully') 123 | os.chdir(tmpdir) 124 | -------------------------------------------------------------------------------- /docs/deploying.rst: -------------------------------------------------------------------------------- 1 | .. _deploying: 2 | 3 | =================================== 4 | Deploying projects and dependencies 5 | =================================== 6 | 7 | Deploying projects 8 | ------------------ 9 | 10 | To deploy a Scrapy project to Scrapy Cloud, navigate into the project's folder 11 | and run:: 12 | 13 | shub deploy [TARGET] 14 | 15 | where ``[TARGET]`` is either a project name defined in ``scrapinghub.yml`` or a 16 | numerical Scrapinghub project ID. 
If you have configured a default target in 17 | your ``scrapinghub.yml``, you can leave out the parameter completely:: 18 | 19 | $ shub deploy 20 | Packing version 3af023e-master 21 | Deploying to Scrapy Cloud project "12345" 22 | {"status": "ok", "project": 12345, "version": "3af023e-master", "spiders": 1} 23 | Run your spiders at: https://app.zyte.com/p/12345/ 24 | 25 | You can also deploy your project from a Python egg, or build one without 26 | deploying:: 27 | 28 | $ shub deploy --egg egg_name --version 1.0.0 29 | Using egg: egg_name 30 | Deploying to Scrapy Cloud project "12345" 31 | {"status": "ok", "project": 12345, "version": "1.0.0", "spiders": 1} 32 | Run your spiders at: https://app.zyte.com/p/12345/ 33 | 34 | :: 35 | 36 | $ shub deploy --build-egg egg_name 37 | Writing egg to egg_name 38 | 39 | 40 | .. _deploying-dependencies: 41 | 42 | Deploying dependencies 43 | ---------------------- 44 | 45 | Sometimes your project will depend on third party libraries that are not 46 | available on Scrapy Cloud. You can easily upload these by specifying a 47 | `requirements file`_:: 48 | 49 | # project_directory/scrapinghub.yml 50 | 51 | projects: 52 | default: 12345 53 | prod: 33333 54 | 55 | requirements: 56 | file: requirements.txt 57 | 58 | Note that this requirements file is an *extension* of the `Scrapy Cloud 59 | stack`_, and therefore should not contain packages that are already part of the 60 | stack, such as ``scrapy``. 61 | 62 | In case you use `pipenv`_ you may also specify a ``Pipfile``:: 63 | 64 | # project_directory/scrapinghub.yml 65 | 66 | projects: 67 | default: 12345 68 | prod: 33333 69 | 70 | requirements: 71 | file: Pipfile 72 | 73 | In this case the ``Pipfile`` must be locked and ``pipenv`` available in the 74 | environment. 75 | 76 | .. note:: 77 | 78 | To install pipenv tool, use ``pip install pipenv`` or check `its documentation 79 | `_. 80 | 81 | A requirements.txt file will be created out of the ``Pipfile`` so like the 82 | requirements file above, it should not contain packages that are already part 83 | of the stack. 84 | 85 | If you use `Poetry`_ you can specify your ``pyproject.toml``:: 86 | 87 | # project_directory/scrapinghub.yml 88 | 89 | projects: 90 | default: 12345 91 | prod: 33333 92 | 93 | requirements: 94 | file: pyproject.toml 95 | 96 | This will use Poetry's ``export`` command to create a requirements.txt file. For 97 | Poetry >= 2.0 this command is no longer installed by default and needs to manually 98 | added as described in the 99 | `plugin's documentation `_. 100 | If ``poetry.lock`` does not exist yet, it will be created during this process. 101 | 102 | .. note:: 103 | 104 | `Poetry`_ is a tool for dependency management and packaging in Python. 105 | 106 | When your dependencies cannot be specified in a requirements file, e.g. 107 | because they are not publicly available, you can supply them as Python eggs:: 108 | 109 | # project_directory/scrapinghub.yml 110 | 111 | projects: 112 | default: 12345 113 | prod: 33333 114 | 115 | requirements: 116 | file: requirements.txt 117 | eggs: 118 | - privatelib.egg 119 | - path/to/otherlib.egg 120 | 121 | Alternatively, if you cannot or don't want to supply Python eggs, you can also 122 | build your own Docker image to be used on Scrapy Cloud. See 123 | :ref:`deploy-custom-image`. 124 | 125 | .. _requirements file: https://pip.pypa.io/en/stable/user_guide/#requirements-files 126 | 127 | .. _pipenv: https://github.com/pypa/pipenv 128 | 129 | .. _Poetry: https://poetry.eustace.io/ 130 | 131 | .. 
_choose-custom-stack: 132 | 133 | Choosing a Scrapy Cloud stack 134 | ----------------------------- 135 | 136 | You can specify the `Scrapy Cloud stack`_ to deploy your spider to by adding a 137 | ``stack`` entry to your configuration:: 138 | 139 | # project_directory/scrapinghub.yml 140 | 141 | projects: 142 | default: 12345 143 | stack: scrapy:1.3-py3 144 | 145 | It is also possible to define the stack per project for advanced use cases:: 146 | 147 | # project_directory/scrapinghub.yml 148 | 149 | projects: 150 | default: 151 | id: 12345 152 | stack: scrapy:1.3-py3 153 | prod: 33333 # will use Scrapinghub's default stack 154 | 155 | .. _`Scrapy Cloud stack`: https://helpdesk.scrapinghub.com/support/solutions/articles/22000200402-scrapy-cloud-stacks 156 | -------------------------------------------------------------------------------- /tests/image/test_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from click.testing import CliRunner 5 | 6 | from shub.exceptions import BadConfigException 7 | from shub.image.init import cli 8 | from shub.image.init import _format_system_deps 9 | from shub.image.init import _format_system_env 10 | from shub.image.init import _format_requirements 11 | from shub.image.init import _wrap 12 | 13 | from .utils import add_fake_requirements 14 | 15 | 16 | @pytest.fixture 17 | def project_dir(project_dir): 18 | """Overriden project_dir fixture without Dockerfile""" 19 | os.remove(os.path.join(project_dir, 'Dockerfile')) 20 | return project_dir 21 | 22 | 23 | def test_cli_default_settings(project_dir): 24 | dockerfile_path = os.path.join(project_dir, 'Dockerfile') 25 | assert not os.path.exists(dockerfile_path) 26 | runner = CliRunner() 27 | result = runner.invoke(cli, []) 28 | assert result.exit_code == 0 29 | msg = f'Dockerfile is saved to {dockerfile_path}' 30 | assert msg in result.output 31 | assert os.path.exists(dockerfile_path) 32 | 33 | 34 | @pytest.mark.usefixtures('project_dir') 35 | def test_cli_list_recommended_reqs(): 36 | runner = CliRunner() 37 | result = runner.invoke(cli, ["--list-recommended-reqs"]) 38 | assert result.exit_code == 0 39 | assert "Recommended Python deps list:" in result.output 40 | 41 | 42 | def test_cli_abort_if_dockerfile_exists(project_dir): 43 | dockerfile_path = os.path.join(project_dir, 'Dockerfile') 44 | open(dockerfile_path, 'w').close() 45 | runner = CliRunner() 46 | result = runner.invoke(cli, [], input='yes\n') 47 | assert result.exit_code == 1 48 | assert 'Found a Dockerfile in the project directory, aborting' in result.output 49 | assert os.path.exists(os.path.join(project_dir, 'Dockerfile')) 50 | with open(dockerfile_path) as f: 51 | assert f.read() == '' 52 | 53 | 54 | def test_cli_create_setup_py(project_dir): 55 | setup_py_path = os.path.join(project_dir, 'setup.py') 56 | os.remove(setup_py_path) 57 | runner = CliRunner() 58 | result = runner.invoke(cli, [], input='yes\n') 59 | assert result.exit_code == 0 60 | assert os.path.isfile(setup_py_path) 61 | 62 | 63 | def test_wrap(): 64 | short_cmd = "run short command wrapping another one short" 65 | assert _wrap(short_cmd) == short_cmd 66 | assert _wrap(short_cmd + ' ' + short_cmd) == ( 67 | short_cmd + ' ' + ' '.join(short_cmd.split()[:3]) + 68 | " \\\n " + ' '.join(short_cmd.split()[3:])) 69 | 70 | 71 | def test_format_system_deps(): 72 | # no deps at all 73 | assert _format_system_deps('-', None) is None 74 | # base deps only 75 | assert _format_system_deps('a,b,cd', None) == ( 76 | 
"RUN apt-get update -qq && \\\n" 77 | " apt-get install -qy a b cd && \\\n" 78 | " rm -rf /var/lib/apt/lists/*") 79 | # base & additional deps only 80 | assert _format_system_deps('a,b,cd', 'ef,hk,b') == ( 81 | "RUN apt-get update -qq && \\\n" 82 | " apt-get install -qy a b cd ef hk && \\\n" 83 | " rm -rf /var/lib/apt/lists/*") 84 | # additional deps only 85 | assert _format_system_deps('-', 'ef,hk,b') == ( 86 | "RUN apt-get update -qq && \\\n" 87 | " apt-get install -qy b ef hk && \\\n" 88 | " rm -rf /var/lib/apt/lists/*") 89 | 90 | 91 | def test_format_system_env(): 92 | assert _format_system_env(None) == 'ENV TERM xterm' 93 | assert _format_system_env('test.settings') == ( 94 | "ENV TERM xterm\n" 95 | "ENV SCRAPY_SETTINGS_MODULE test.settings") 96 | 97 | 98 | def test_format_requirements(project_dir): 99 | add_fake_requirements(project_dir) 100 | basereqs = os.path.join(project_dir, 'requirements.txt') 101 | if os.path.exists(basereqs): 102 | os.remove(basereqs) 103 | # use given requirements 104 | assert _format_requirements( 105 | os.getcwd(), 'fake-requirements.txt') == ( 106 | "COPY ./fake-requirements.txt /app/requirements.txt\n" 107 | "RUN pip install --no-cache-dir -r requirements.txt") 108 | assert not os.path.exists(basereqs) 109 | # using base requirements 110 | assert _format_requirements( 111 | os.getcwd(), 'requirements.txt') == ( 112 | "COPY ./requirements.txt /app/requirements.txt\n" 113 | "RUN pip install --no-cache-dir -r requirements.txt") 114 | assert os.path.exists(basereqs) 115 | os.remove(basereqs) 116 | 117 | 118 | def test_no_scrapy_cfg(project_dir): 119 | os.remove(os.path.join(project_dir, 'scrapy.cfg')) 120 | runner = CliRunner() 121 | result = runner.invoke(cli, []) 122 | assert result.exit_code == BadConfigException.exit_code 123 | error_msg = ( 124 | 'Error: Cannot find Scrapy project settings. 
Please ensure that current ' 125 | 'directory contains scrapy.cfg with settings section, see example at ' 126 | 'https://doc.scrapy.org/en/latest/topics/commands.html#default-structure-of-scrapy-projects' 127 | ) 128 | assert error_msg in result.output 129 | assert not os.path.exists(os.path.join(project_dir, 'Dockerfile')) 130 | -------------------------------------------------------------------------------- /tests/test_schedule.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest import mock 3 | 4 | from click.testing import CliRunner 5 | from scrapinghub import ScrapinghubAPIError 6 | 7 | from shub import schedule 8 | from shub.exceptions import RemoteErrorException 9 | 10 | from .utils import mock_conf 11 | 12 | 13 | class ScheduleTest(unittest.TestCase): 14 | 15 | def setUp(self): 16 | self.runner = CliRunner() 17 | self.conf = mock_conf(self) 18 | 19 | @mock.patch('shub.schedule.schedule_spider', autospec=True) 20 | def test_schedules_job_if_input_is_ok(self, mock_schedule): 21 | proj, endpoint, apikey = self.conf.get_target('default') 22 | # Default 23 | self.runner.invoke(schedule.cli, ['spider']) 24 | mock_schedule.assert_called_with( 25 | proj, endpoint, apikey, 'spider', (), (), 2, None, (), ()) 26 | # Other project 27 | self.runner.invoke(schedule.cli, ['123/spider']) 28 | mock_schedule.assert_called_with( 29 | 123, endpoint, apikey, 'spider', (), (), 2, None, (), ()) 30 | # Other endpoint 31 | proj, endpoint, apikey = self.conf.get_target('vagrant') 32 | self.runner.invoke(schedule.cli, ['vagrant/spider']) 33 | mock_schedule.assert_called_with( 34 | proj, endpoint, apikey, 'spider', (), (), 2, None, (), ()) 35 | # Other project at other endpoint 36 | self.runner.invoke(schedule.cli, ['vagrant/456/spider']) 37 | mock_schedule.assert_called_with( 38 | 456, endpoint, apikey, 'spider', (), (), 2, None, (), ()) 39 | 40 | @mock.patch('shub.schedule.ScrapinghubClient', autospec=True) 41 | def test_schedule_invalid_spider(self, mock_client): 42 | mock_proj = mock_client.return_value.get_project.return_value 43 | mock_proj.jobs.run.side_effect = ScrapinghubAPIError('') 44 | with self.assertRaises(RemoteErrorException): 45 | schedule.schedule_spider(1, 'https://endpoint/api/', 46 | 'FAKE_API_KEY', 'fake_spider') 47 | 48 | @mock.patch('shub.schedule.ScrapinghubClient', autospec=True) 49 | def test_schedule_spider_calls_project_jobs_run(self, mock_client): 50 | mock_proj = mock_client.return_value.get_project.return_value 51 | schedule.schedule_spider(1, 'https://endpoint/api/', 52 | 'FAKE_API_KEY', 'fake_spider') 53 | self.assertTrue(mock_proj.jobs.run) 54 | 55 | @mock.patch('shub.schedule.ScrapinghubClient', autospec=True) 56 | def test_forwards_args_and_settings(self, mock_client): 57 | mock_proj = mock_client.return_value.get_project.return_value 58 | self.runner.invoke( 59 | schedule.cli, 60 | "testspider -s SETT=99 -a ARG=val1 --set SETTWITHEQUAL=10=10 " 61 | "--argument ARGWITHEQUAL=val2=val2".split(' '), 62 | ) 63 | job_args = mock_proj.jobs.run.call_args[1]['job_args'] 64 | self.assertLessEqual( 65 | {'ARG': 'val1', 'ARGWITHEQUAL': 'val2=val2'}.items(), 66 | job_args.items(), 67 | ) 68 | job_settings = mock_proj.jobs.run.call_args[1]['job_settings'] 69 | self.assertEqual( 70 | {'SETT': '99', 'SETTWITHEQUAL': '10=10'}, 71 | job_settings, 72 | ) 73 | 74 | @mock.patch('shub.schedule.ScrapinghubClient', autospec=True) 75 | def test_forwards_tags(self, mock_client): 76 | mock_proj = 
mock_client.return_value.get_project.return_value 77 | self.runner.invoke(schedule.cli, 'testspider -t tag1 -t tag2 --tag tag3'.split()) 78 | call_kwargs = mock_proj.jobs.run.call_args[1] 79 | assert call_kwargs['add_tag'] == ('tag1', 'tag2', 'tag3') 80 | 81 | @mock.patch('shub.schedule.ScrapinghubClient', autospec=True) 82 | def test_forwards_priority(self, mock_client): 83 | mock_proj = mock_client.return_value.get_project.return_value 84 | # short option name 85 | self.runner.invoke(schedule.cli, 'testspider -p 3'.split()) 86 | call_kwargs = mock_proj.jobs.run.call_args[1] 87 | assert call_kwargs['priority'] == 3 88 | # long option name 89 | self.runner.invoke(schedule.cli, 'testspider --priority 1'.split()) 90 | call_kwargs = mock_proj.jobs.run.call_args[1] 91 | assert call_kwargs['priority'] == 1 92 | 93 | @mock.patch('shub.schedule.ScrapinghubClient', autospec=True) 94 | def test_forwards_units(self, mock_client): 95 | mock_proj = mock_client.return_value.get_project.return_value 96 | # no units specified 97 | self.runner.invoke(schedule.cli, 'testspider'.split()) 98 | call_kwargs = mock_proj.jobs.run.call_args[1] 99 | assert call_kwargs['units'] is None 100 | # short option name 101 | self.runner.invoke(schedule.cli, 'testspider -u 4'.split()) 102 | call_kwargs = mock_proj.jobs.run.call_args[1] 103 | assert call_kwargs['units'] == 4 104 | # long option name 105 | self.runner.invoke(schedule.cli, 'testspider --units 3'.split()) 106 | call_kwargs = mock_proj.jobs.run.call_args[1] 107 | assert call_kwargs['units'] == 3 108 | 109 | @mock.patch('shub.schedule.ScrapinghubClient', autospec=True) 110 | def test_forwards_environment(self, mock_client): 111 | mock_proj = mock_client.return_value.get_project.return_value 112 | self.runner.invoke( 113 | schedule.cli, 114 | "testspider -e VAR1=VAL1 --environment VAR2=VAL2".split(' '), 115 | ) 116 | call_kwargs = mock_proj.jobs.run.call_args[1] 117 | self.assertLessEqual( 118 | {'VAR1': 'VAL1', 'VAR2': 'VAL2'}.items(), 119 | call_kwargs['environment'].items(), 120 | ) 121 | -------------------------------------------------------------------------------- /shub/image/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import click 5 | 6 | from shub import exceptions as shub_exceptions 7 | from shub.config import load_shub_config, list_targets_callback 8 | from shub.image import utils 9 | from shub.image.test import test_cmd 10 | from shub.utils import create_scrapinghub_yml_wizard 11 | 12 | 13 | SHORT_HELP = 'Build release image.' 14 | 15 | HELP = """ 16 | Build command uses your Dockerfile to build an image and tag it properly. 17 | 18 | Internally, this command is a simple wrapper to `docker build` and uses 19 | docker daemon on your system to build an image. Also it can generate 20 | project version for you, and locate root project directory by itself. 21 | 22 | Image should be set via scrapinghub.yml, section "images". If version is not 23 | provided, the tool uses VCS-based stamp over project directory (the same as 24 | shub utils itself). 
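For example, assuming a "production" target is defined in your
scrapinghub.yml, you can build (and, unless -S/--skip-tests is passed, also
test) a versioned image with:

    shub image build production -V 1.0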
25 | """ 26 | 27 | BUILD_STEP_REGEX = re.compile(r'Step (\d+)/(\d+) :.*') 28 | BUILD_SUCCESS_REGEX = re.compile(r'Successfully built ([0-9a-f]+)') 29 | 30 | 31 | @click.command(help=HELP, short_help=SHORT_HELP) 32 | @click.argument("target", required=False, default="default") 33 | @click.option("-l", "--list-targets", is_flag=True, is_eager=True, 34 | expose_value=False, callback=list_targets_callback, 35 | help="List available project names defined in your config") 36 | @click.option("-d", "--debug", help="debug mode", is_flag=True, 37 | callback=utils.deprecate_debug_parameter) 38 | @click.option("-v", "--verbose", is_flag=True, 39 | help="stream build logs to console") 40 | @click.option("-V", "--version", help="release version") 41 | @click.option("-S", "--skip-tests", help="skip testing image", is_flag=True) 42 | @click.option("-n", "--no-cache", is_flag=True, 43 | help="Do not use cache when building the image") 44 | @click.option("-b", "--build-arg", multiple=True, 45 | help="Allow to pass build arguments to docker client.") 46 | @click.option("-f", "--file", "filename", default='Dockerfile', 47 | help="Name of the Dockerfile (Default is 'PATH/Dockerfile')") 48 | def cli(target, debug, verbose, version, skip_tests, no_cache, build_arg, filename): 49 | build_cmd(target, version, skip_tests, no_cache, build_arg, filename=filename) 50 | 51 | 52 | def build_cmd(target, version, skip_tests, no_cache, build_arg, filename='Dockerfile'): 53 | config = load_shub_config() 54 | create_scrapinghub_yml_wizard(config, target=target, image=True) 55 | client = utils.get_docker_client() 56 | project_dir = utils.get_project_dir() 57 | image = config.get_image(target) 58 | image_name = utils.format_image_name(image, version) 59 | build_args = dict(a.split('=', 1) for a in build_arg) 60 | if not os.path.exists(os.path.join(project_dir, filename)): 61 | raise shub_exceptions.NotFoundException( 62 | "Dockerfile is not found and it is required because project '{}' is configured " 63 | "to deploy Docker images. Please add a Dockerfile that will be used to build " 64 | "the image and retry this command. If you want to migrate an existing Scrapy project " 65 | "you can use `shub image init` command to create a Dockerfile.".format(target)) 66 | if utils.is_verbose(): 67 | build_progress_cls = _LoggedBuildProgress 68 | else: 69 | build_progress_cls = _BuildProgress 70 | click.echo(f"Building {image_name}.") 71 | events = client.build( 72 | path=project_dir, 73 | tag=image_name, 74 | decode=True, 75 | dockerfile=filename, 76 | nocache=no_cache, 77 | rm=True, 78 | buildargs=build_args, 79 | ) 80 | build_progress = build_progress_cls(events) 81 | build_progress.show() 82 | click.echo(f"The image {image_name} build is completed.") 83 | # Test the image content after building it 84 | if not skip_tests: 85 | test_cmd(target, version) 86 | 87 | 88 | class _LoggedBuildProgress(utils.BaseProgress): 89 | """Visualize build progress in verbose mode. 90 | 91 | Output all the events received from the docker daemon. 92 | """ 93 | def handle_event(self, event): 94 | super().handle_event(event) 95 | if 'stream' in event: 96 | self.handle_stream_event(event) 97 | 98 | def handle_stream_event(self, event): 99 | utils.debug_log("{}".format(event['stream'].rstrip())) 100 | 101 | 102 | class _BuildProgress(_LoggedBuildProgress): 103 | """Visualize build progress in non-verbose mode. 104 | 105 | Show total progress bar. 
106 | """ 107 | 108 | def __init__(self, events): 109 | super().__init__(events) 110 | self.bar = utils.create_progress_bar( 111 | total=1, 112 | desc='Steps', 113 | # don't need rate here, let's simplify the bar 114 | bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}', 115 | ) 116 | self.is_built = False 117 | 118 | def show(self): 119 | super().show() 120 | if self.bar: 121 | self.bar.close() 122 | if not self.is_built: 123 | raise shub_exceptions.RemoteErrorException( 124 | "Build image operation failed") 125 | 126 | def handle_stream_event(self, event): 127 | if BUILD_SUCCESS_REGEX.search(event['stream']): 128 | self.is_built = True 129 | return 130 | step_row = BUILD_STEP_REGEX.match(event['stream']) 131 | if not step_row: 132 | return 133 | step_id, total = (int(val) for val in step_row.groups()) 134 | self.bar.total = max(self.bar.total, total) 135 | # ignore onbuild sub-steps 136 | if step_id > self.bar.n and self.bar.total == total: 137 | self.bar.update() 138 | -------------------------------------------------------------------------------- /shub/image/test.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from shub import exceptions as shub_exceptions 4 | from shub.config import load_shub_config, list_targets_callback 5 | from shub.image import utils 6 | 7 | SHORT_HELP = "Test a built image with Scrapy Cloud contract" 8 | HELP = """ 9 | A command to test an image after build step to make sure it fits contract. 10 | 11 | It consists of the following steps: 12 | 13 | 1) check that image exists on local machine 14 | 2) check that image has start-crawl entrypoint 15 | 3) check that image has shub-image-info entrypoint 16 | 17 | If any of the checks fails - the test command fails as a whole. By default, 18 | the test command is also executed automatically as a part of build command 19 | in its end (if you do not provide -S/--skip-tests parameter explicitly). 20 | """ 21 | 22 | IMAGE_SIZE_LIMIT = 3 * 1024 * 1024 * 1024 # 3GB 23 | CONTRACT_CMD_NOT_FOUND_WARNING = ( 24 | 'Command %s is not found in the image. ' 25 | 'Please make sure you provided it according to Scrapy Cloud contract ' 26 | '(https://shub.readthedocs.io/en/stable/custom-images-contract.html) or ' 27 | 'added scrapinghub-entrypoint-scrapy>=0.8.0 to your requirements file ' 28 | 'if you use Scrapy.' 29 | ) 30 | LIST_SPIDERS_DEPRECATED_WARNING = ( 31 | 'list-spiders command is deprecated in favour of shub-image-info command: ' 32 | 'its format is described well in Scrapy Cloud contract ' 33 | '(https://shub.readthedocs.io/en/stable/custom-images-contract.html), ' 34 | 'please review and update your code.' 35 | ) 36 | IMAGE_TOO_LARGE_WARNING = ( 37 | 'Custom image for the project is too large (more than 3GB), it can lead ' 38 | 'to various performance issues when running it in Scrapy Cloud. ' 39 | 'Please reduce the image size or ask support team for help ' 40 | '(one of the recommended articles to start with is ' 41 | 'https://www.codacy.com/blog/five-ways-to-slim-your-docker-images/).' 
42 | ) 43 | 44 | 45 | @click.command(help=HELP, short_help=SHORT_HELP) 46 | @click.argument("target", required=False, default="default") 47 | @click.option("-l", "--list-targets", is_flag=True, is_eager=True, 48 | expose_value=False, callback=list_targets_callback, 49 | help="List available project names defined in your config") 50 | @click.option("-d", "--debug", help="debug mode", is_flag=True, 51 | callback=utils.deprecate_debug_parameter) 52 | @click.option("-v", "--verbose", is_flag=True, 53 | help="stream test logs to console") 54 | @click.option("-V", "--version", help="release version") 55 | def cli(target, debug, verbose, version): 56 | test_cmd(target, version) 57 | 58 | 59 | def test_cmd(target, version): 60 | config = load_shub_config() 61 | image = config.get_image(target) 62 | version = version or config.get_version() 63 | image_name = utils.format_image_name(image, version) 64 | docker_client = utils.get_docker_client() 65 | for check in [_check_image_size, 66 | _check_start_crawl_entry, 67 | _check_shub_image_info_entry]: 68 | check(image_name, docker_client) 69 | 70 | 71 | def _check_image_size(image_name, docker_client): 72 | """Check that the image exists on local machine and validate its size.""" 73 | # if there's no docker lib, the command will fail earlier 74 | # with an exception when getting a client in get_docker_client() 75 | from docker.errors import NotFound 76 | try: 77 | size = docker_client.inspect_image(image_name).get('Size') 78 | if size and isinstance(size, int) and size > IMAGE_SIZE_LIMIT: 79 | raise shub_exceptions.CustomImageTooLargeException( 80 | IMAGE_TOO_LARGE_WARNING) 81 | except NotFound as exc: 82 | utils.debug_log(exc) 83 | raise shub_exceptions.NotFoundException( 84 | "The image doesn't exist yet, please use build command at first.") 85 | 86 | 87 | def _check_shub_image_info_entry(image_name, docker_client): 88 | """Check that the image has shub-image-info entrypoint""" 89 | status, logs = _run_docker_command( 90 | docker_client, image_name, ['which', 'shub-image-info']) 91 | if status != 0 or not logs: 92 | _check_fallback_to_list_spiders(image_name, docker_client) 93 | 94 | 95 | def _check_fallback_to_list_spiders(image_name, docker_client): 96 | status, logs = _run_docker_command( 97 | docker_client, image_name, ['which', 'list-spiders']) 98 | if status != 0 or not logs: 99 | raise shub_exceptions.NotFoundException( 100 | CONTRACT_CMD_NOT_FOUND_WARNING % 'shub-image-info (& list-spiders)') 101 | else: 102 | click.echo(LIST_SPIDERS_DEPRECATED_WARNING) 103 | 104 | 105 | def _check_start_crawl_entry(image_name, docker_client): 106 | """Check that the image has start-crawl entrypoint""" 107 | status, logs = _run_docker_command( 108 | docker_client, image_name, ['which', 'start-crawl']) 109 | if status != 0 or not logs: 110 | raise shub_exceptions.NotFoundException( 111 | CONTRACT_CMD_NOT_FOUND_WARNING % 'start-crawl') 112 | 113 | 114 | def _run_docker_command(client, image_name, command): 115 | """A helper to execute an arbitrary cmd with given docker image""" 116 | container = client.create_container(image=image_name, command=command) 117 | try: 118 | client.start(container) 119 | statuscode = client.wait(container=container['Id'])['StatusCode'] 120 | logs = client.logs(container=container['Id'], stdout=True, 121 | stderr=True if statuscode else False, 122 | stream=False, timestamps=False) 123 | utils.debug_log(f"{command} results:\n{logs}") 124 | return statuscode, logs 125 | finally: 126 | client.remove_container(container) 127 | 
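# A quick way to exercise these checks by hand, as a sketch: it assumes a
# running local Docker daemon and a "dev" target in scrapinghub.yml (the
# target name is only an example):
#
#     shub image build dev --skip-tests   # build the image without the automatic test pass
#     shub image test dev --verbose       # then run the contract checks above explicitly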
-------------------------------------------------------------------------------- /shub/image/run/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import stat 3 | import shlex 4 | import signal 5 | import os.path 6 | from os import stat as os_stat 7 | from shutil import copyfile 8 | 9 | import click 10 | 11 | from shub.config import load_shub_config 12 | from shub.image import utils 13 | 14 | 15 | SHORT_HELP = 'Run custom image locally.' 16 | HELP = """ 17 | Run a custom Docker image locally. 18 | 19 | The command should be helpful to ensure that your custom image is properly 20 | written and do some preliminary local tests before pushing it to Scrapy Cloud. 21 | 22 | Most of the command parameters coincide with parameters for 'shub schedule' 23 | command to simplfy its usage. 24 | 25 | The `spider` argument should match the spider's name, e.g.: 26 | 27 | shub image run myspider 28 | 29 | A more advanced example of using non-default target with settings/arguments: 30 | 31 | shub image run production/myspider -a ARG1=VAL1 -s LOG_LEVEL=DEBUG 32 | """ 33 | 34 | SCRAPINGHUB_VOLUME = '/scrapinghub' 35 | WRAPPER_FILENAME = 'start-crawl-local' 36 | WRAPPER_LOCAL_PATH = os.path.join(os.path.dirname(__file__), 'wrapper.py') 37 | WRAPPER_IMAGE_PATH = os.path.join(SCRAPINGHUB_VOLUME, WRAPPER_FILENAME) 38 | 39 | 40 | @click.command(help=HELP, short_help=SHORT_HELP) 41 | @click.argument("spider", type=click.STRING) 42 | @click.option('-a', '--argument', 'args', 43 | help='Spider argument (-a name=value)', multiple=True) 44 | @click.option('-s', '--set', 'settings', 45 | help='Job-specific setting (-s name=value)', multiple=True) 46 | @click.option('-e', '--environment', multiple=True, 47 | help='Job environment variable (-e VAR=VAL)') 48 | @click.option("-V", "--version", help="use custom release version") 49 | @click.option("-v", "--verbose", is_flag=True, 50 | help="stream additional logs to console") 51 | @click.option("-k", "--keep-volume", help="Keep volume folder", is_flag=True) 52 | def cli(spider, args, settings, environment, version, verbose, keep_volume): 53 | run_cmd(spider, args, settings, environment, version, keep_volume) 54 | 55 | 56 | def run_cmd(spider, args, settings, environment, version, keep_volume): 57 | try: 58 | target, spider = spider.rsplit('/', 1) 59 | except ValueError: 60 | target = 'default' 61 | 62 | config = load_shub_config() 63 | image = config.get_image(target) 64 | version = version or config.get_version() 65 | image_name = utils.format_image_name(image, version) 66 | docker_client = utils.get_docker_client() 67 | 68 | env = _format_environment(spider, args, settings, environment) 69 | _run_with_docker(docker_client, image_name, env, keep_volume) 70 | 71 | 72 | def _format_environment(spider, args, settings, environment): 73 | """Convert all input crawl args to environment variables.""" 74 | # required defaults, can be overwritten with meta if needed 75 | job_data = {'spider': spider, 'key': '1/2/3', 'auth': ''} 76 | 77 | args = dict(x.split('=', 1) for x in args) 78 | cmd_args = shlex.split(args.pop('cmd_args', '')) 79 | if spider.startswith('py:'): 80 | job_data['job_cmd'] = [spider] + cmd_args 81 | else: 82 | job_data['spider_args'] = args 83 | meta = args.pop('meta', None) 84 | if meta: 85 | job_data.update(json.loads(meta)) 86 | 87 | job_environment = dict(x.split('=', 1) for x in environment) 88 | job_settings = dict(x.split('=', 1) for x in settings) 89 | return { 90 | 'SHUB_JOBKEY': 
job_data['key'], 91 | 'SHUB_SPIDER': spider, 92 | 'SHUB_JOB_DATA': _json_dumps(job_data), 93 | 'SHUB_JOB_ENV': _json_dumps(job_environment), 94 | 'SHUB_SETTINGS': _json_dumps({'job_settings': job_settings}), 95 | 'PYTHONUNBUFFERED': 1, 96 | } 97 | 98 | 99 | def _json_dumps(data): 100 | return json.dumps(data, sort_keys=True, separators=(',', ':')) 101 | 102 | 103 | def _run_with_docker(client, image_name, env, keep_volume=False): 104 | """Run a local docker container with the given custom image.""" 105 | 106 | def _signal_handler(sig, _): 107 | client.kill(container, sig) 108 | 109 | tmpdir_kw = {'prefix': 'shub-image-run-', 'cleanup': not keep_volume} 110 | with utils.make_temp_directory(**tmpdir_kw) as volume_dir: 111 | container = _create_container(client, image_name, env, volume_dir) 112 | try: 113 | client.start(container) 114 | signal.signal(signal.SIGINT, _signal_handler) 115 | signal.signal(signal.SIGTERM, _signal_handler) 116 | for log in client.logs(container, stream=True): 117 | click.echo(log.rstrip()) 118 | finally: 119 | client.remove_container(container, force=True) 120 | 121 | 122 | def _create_container(client, image_name, environment, volume_dir): 123 | """Create a docker container and customize its setup.""" 124 | # copy start-crawl wrapper to the volume temporary directory 125 | wrapper_cont_path = os.path.join(volume_dir, WRAPPER_FILENAME) 126 | copyfile(WRAPPER_LOCAL_PATH, wrapper_cont_path) 127 | wrapper_perms = os_stat(wrapper_cont_path).st_mode | stat.S_IEXEC 128 | os.chmod(wrapper_cont_path, wrapper_perms) # must be executable 129 | fifo_path = os.path.join(volume_dir, 'scrapinghub.fifo') 130 | environment['SHUB_FIFO_PATH'] = fifo_path 131 | # keep using default /scrapinghub volume but mount it as a temporary 132 | # directory in the host /tmp/ to have access to the files in needed 133 | binds = {volume_dir: {'bind': SCRAPINGHUB_VOLUME, 'mode': 'rw'}} 134 | host_config = client.create_host_config(binds=binds) 135 | return client.create_container( 136 | image=image_name, 137 | command=[WRAPPER_IMAGE_PATH], 138 | environment=environment, 139 | volumes=[volume_dir], 140 | host_config=host_config, 141 | ) 142 | -------------------------------------------------------------------------------- /tests/image/test_run.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import tempfile 3 | from unittest import mock 4 | 5 | try: 6 | from StringIO import StringIO 7 | except ImportError: 8 | from io import StringIO 9 | 10 | import pytest 11 | from click.testing import CliRunner 12 | 13 | from shub.image.run import cli, _json_dumps, WRAPPER_IMAGE_PATH 14 | from shub.image.run.wrapper import _consume_from_fifo, _millis_to_str 15 | from shub.image.utils import make_temp_directory 16 | 17 | 18 | def _format_job_data(spider='spider', auth='', **kwargs): 19 | data = {'key': '1/2/3', 'spider': spider, 'auth': auth} 20 | data.update(kwargs) 21 | return _json_dumps(data) 22 | 23 | 24 | @pytest.mark.usefixtures('project_dir') 25 | def test_cli(docker_client_mock): 26 | docker_client_mock.create_host_config.return_value = {'host': 'config'} 27 | docker_client_mock.create_container.return_value = 'contID' 28 | docker_client_mock.logs.return_value = ['some', 'logs'] 29 | # wrap make_temp_directory to validate its call args 30 | tmp_dir_fun = 'shub.image.utils.make_temp_directory' 31 | with mock.patch(tmp_dir_fun, wraps=make_temp_directory) as tmp_dir_mock: 32 | result = CliRunner().invoke(cli, ["dev/spider"]) 33 | assert 
result.exit_code == 0, result.stdout 34 | assert tmp_dir_mock.call_args[1] == { 35 | 'prefix': 'shub-image-run-', 'cleanup': True 36 | } 37 | docker_client_mock.start.assert_called_with('contID') 38 | docker_client_mock.logs.assert_called_with('contID', stream=True) 39 | docker_client_mock.remove_container.assert_called_with('contID', force=True) 40 | # validate create_container args 41 | docker_client_mock.create_container.assert_called_once() 42 | call_args = docker_client_mock.create_container.call_args[1] 43 | assert call_args['command'] == [WRAPPER_IMAGE_PATH] 44 | # validate environment 45 | call_env = call_args['environment'] 46 | fifo_path = call_env.pop('SHUB_FIFO_PATH') 47 | assert fifo_path.endswith('scrapinghub.fifo') 48 | job_data = _format_job_data(spider_args={}) 49 | expected_env = { 50 | 'SHUB_JOBKEY': '1/2/3', 51 | 'SHUB_SPIDER': 'spider', 52 | 'SHUB_JOB_DATA': job_data, 53 | 'SHUB_JOB_ENV': '{}', 54 | 'SHUB_SETTINGS': '{"job_settings":{}}', 55 | 'PYTHONUNBUFFERED': 1, 56 | } 57 | assert call_env == expected_env 58 | # validate other configuration parts 59 | assert call_args['host_config'] == {'host': 'config'} 60 | assert call_args['image'] == 'registry.io/user/project:1.0' 61 | assert call_args['volumes'] == [os.path.dirname(fifo_path)] 62 | 63 | 64 | @pytest.mark.usefixtures('project_dir') 65 | def test_cli_with_args(docker_client_mock): 66 | docker_client_mock.logs.return_value = [] 67 | result = CliRunner().invoke(cli, ( 68 | 'dev/spider -a arg0= -a arg1=val1 --argument arg2=val2 ' 69 | '-s SET1=VAL1 --set SET2=VAL2 ' 70 | '-e ENV1=ENVVAL1 --environment ENV2=ENVVAL2 ' 71 | '-a meta={"auth":"custom"}'.split(' ') 72 | )) 73 | assert result.exit_code == 0, result.stdout 74 | call_args = docker_client_mock.create_container.call_args[1] 75 | call_env = call_args['environment'] 76 | expected_settings = {"job_settings": {"SET1": "VAL1", "SET2": "VAL2"}} 77 | assert call_env['SHUB_SETTINGS'] == _json_dumps(expected_settings) 78 | expected_env = {"ENV1": "ENVVAL1", "ENV2": "ENVVAL2"} 79 | assert call_env['SHUB_JOB_ENV'] == _json_dumps(expected_env) 80 | expected_jobdata = {"arg0": "", "arg1": "val1", "arg2": "val2"} 81 | assert call_env['SHUB_JOB_DATA'] == _format_job_data( 82 | spider_args=expected_jobdata, auth='custom' 83 | ) 84 | 85 | 86 | @pytest.mark.usefixtures('project_dir') 87 | def test_cli_with_version(docker_client_mock): 88 | docker_client_mock.logs.return_value = [] 89 | result = CliRunner().invoke(cli, ['dev/spider', '-V', 'custom']) 90 | assert result.exit_code == 0, result.stdout 91 | call_args = docker_client_mock.create_container.call_args[1] 92 | assert call_args['image'] == 'registry.io/user/project:custom' 93 | 94 | 95 | @pytest.mark.usefixtures('project_dir') 96 | def test_cli_with_script(docker_client_mock): 97 | docker_client_mock.logs.return_value = [] 98 | script_args = "--flag1 --flag2=0 val1 val2" 99 | result = CliRunner().invoke(cli, [ 100 | 'dev/py:testargs.py', '-a', 'cmd_args="%s"' % script_args 101 | ]) 102 | assert result.exit_code == 0, result.stdout 103 | call_args = docker_client_mock.create_container.call_args[1] 104 | call_env = call_args['environment'] 105 | assert call_env['SHUB_JOB_DATA'] == _format_job_data( 106 | spider='py:testargs.py', 107 | job_cmd=["py:testargs.py", script_args], 108 | ) 109 | 110 | 111 | # Separate section for wrapper tests. 
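# FIFO_TEST_DATA below mimics what a crawl process writes to the named pipe
# set up by `shub image run`: one message per line, a short type prefix such
# as "LOG" or "ITM" followed by a JSON payload. As the assertions show, the
# wrapper prints LOG entries to stdout as "<local time> <level name> <message>"
# and ignores item ("ITM") entries.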
112 | 113 | FIFO_TEST_TS = 1485269941065 114 | FIFO_TEST_DATA = """\ 115 | LOG {"time": %(ts)d, "level": 20, "message": "Some message"} 116 | ITM {"key": "value", "should-be": "ignored"} 117 | LOG {"time": %(ts)d, "level": 30, "message": "Other message"}\ 118 | """ % {'ts': FIFO_TEST_TS} 119 | 120 | 121 | @mock.patch('sys.stdout', new_callable=StringIO) 122 | def test_consume_from_fifo(mock_stdout): 123 | try: 124 | # XXX work-around to use NamedTemporaryFile on Windows 125 | # https://github.com/appveyor/ci/issues/2547 126 | with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp: 127 | filename = temp.name 128 | temp.write(FIFO_TEST_DATA) 129 | temp.seek(0) 130 | _consume_from_fifo(filename) 131 | finally: 132 | os.remove(filename) 133 | local_datetime_string = _millis_to_str(FIFO_TEST_TS) 134 | assert mock_stdout.getvalue() == ( 135 | '{date} INFO Some message\n' 136 | '{date} WARNING Other message\n'.format(date=local_datetime_string) 137 | ) 138 | -------------------------------------------------------------------------------- /tests/image/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | from unittest import mock, TestCase 5 | 6 | import pytest 7 | 8 | from shub.exceptions import BadConfigException, BadParameterException, NotFoundException 9 | from shub.image.utils import ( 10 | get_credentials, 11 | get_docker_client, 12 | get_image_registry, 13 | get_project_dir, 14 | format_image_name, 15 | load_status_url, 16 | store_status_url, 17 | STATUS_FILE_LOCATION, 18 | ) 19 | 20 | from .utils import FakeProjectDirectory, add_sh_fake_config 21 | 22 | 23 | class ReleaseUtilsTest(TestCase): 24 | 25 | def test_get_project_dir(self): 26 | self.assertRaises(BadConfigException, get_project_dir) 27 | with FakeProjectDirectory() as tmpdir: 28 | add_sh_fake_config(tmpdir) 29 | assert get_project_dir() == tmpdir 30 | 31 | def test_get_docker_client(self): 32 | mocked_docker = mock.Mock() 33 | sys.modules['docker'] = mocked_docker 34 | client_mock = mock.Mock() 35 | 36 | class DockerClientMock: 37 | 38 | def __init__(self, *args, **kwargs): 39 | client_mock(*args, **kwargs) 40 | 41 | def version(self): 42 | return {} 43 | 44 | mocked_docker.APIClient = DockerClientMock 45 | assert get_docker_client() 46 | client_mock.assert_called_with(base_url=None, tls=None, version='auto') 47 | # set basic test environment 48 | os.environ['DOCKER_HOST'] = 'http://127.0.0.1' 49 | os.environ['DOCKER_API_VERSION'] = '1.40' 50 | assert get_docker_client() 51 | client_mock.assert_called_with( 52 | base_url='http://127.0.0.1', tls=None, version='1.40') 53 | # test for tls 54 | os.environ['DOCKER_TLS_VERIFY'] = '1' 55 | os.environ['DOCKER_CERT_PATH'] = 'some-path' 56 | mocked_tls = mock.Mock() 57 | mocked_docker.tls.TLSConfig.return_value = mocked_tls 58 | assert get_docker_client() 59 | client_mock.assert_called_with( 60 | base_url='http://127.0.0.1', 61 | tls=mocked_tls, 62 | version='1.40') 63 | mocked_docker.tls.TLSConfig.assert_called_with( 64 | client_cert=(os.path.join('some-path', 'cert.pem'), 65 | os.path.join('some-path', 'key.pem')), 66 | verify=os.path.join('some-path', 'ca.pem'), 67 | assert_hostname=False) 68 | 69 | def test_format_image_name(self): 70 | assert format_image_name('simple', 'tag') == 'simple:tag' 71 | assert format_image_name('user/simple', 'tag') == 'user/simple:tag' 72 | assert format_image_name('registry/user/simple', 'tag') == \ 73 | 'registry/user/simple:tag' 74 | assert 
format_image_name('registry:port/user/simple', 'tag') == \ 75 | 'registry:port/user/simple:tag' 76 | assert format_image_name('registry:port/user/simple:test', 'tag') == \ 77 | 'registry:port/user/simple:tag' 78 | with mock.patch('shub.config.load_shub_config') as mocked: 79 | config = mock.Mock() 80 | config.get_version.return_value = 'test-version' 81 | mocked.return_value = config 82 | assert format_image_name('test', None) == 'test:test-version' 83 | 84 | def test_get_credentials(self): 85 | assert get_credentials(insecure=True) == (None, None) 86 | with pytest.raises(BadParameterException): 87 | get_credentials(username='user', insecure=True) 88 | with pytest.raises(BadParameterException): 89 | get_credentials(password='pass', insecure=True) 90 | assert get_credentials(apikey='apikey') == ('apikey', ' ') 91 | assert get_credentials( 92 | username='user', password='pass') == ('user', 'pass') 93 | with pytest.raises(BadParameterException): 94 | get_credentials(username='user') 95 | with pytest.raises(BadParameterException): 96 | get_credentials(password='pass') 97 | assert get_credentials(target_apikey='tapikey') == ('tapikey', ' ') 98 | 99 | def test_get_image_registry(self): 100 | assert get_image_registry('ubuntu:12.04') is None 101 | assert get_image_registry('someuser/image:tagA') is None 102 | assert get_image_registry('registry.io/imageA') == 'registry.io' 103 | assert get_image_registry('registry.io/user/name:tag') == 'registry.io' 104 | assert get_image_registry('registry:8012/image') == 'registry:8012' 105 | assert get_image_registry('registry:8012/user/repo') == 'registry:8012' 106 | 107 | 108 | class StatusUrlsTest(TestCase): 109 | 110 | def setUp(self): 111 | self.curdir = os.getcwd() 112 | self.tmp_dir = tempfile.gettempdir() 113 | os.chdir(self.tmp_dir) 114 | self.status_file = os.path.join(self.tmp_dir, STATUS_FILE_LOCATION) 115 | if os.path.exists(self.status_file): 116 | os.remove(self.status_file) 117 | 118 | def tearDown(self): 119 | os.chdir(self.curdir) 120 | 121 | def test_load_status_url(self): 122 | self.assertRaises(NotFoundException, load_status_url, 0) 123 | # try with void file 124 | open(self.status_file, 'a').close() 125 | self.assertRaises(BadConfigException, load_status_url, 0) 126 | # try with data 127 | with open(self.status_file, 'w') as f: 128 | f.write('1: http://link1\n2: https://link2\n') 129 | self.assertRaises(NotFoundException, load_status_url, 0) 130 | assert load_status_url(1) == 'http://link1' 131 | assert load_status_url(2) == 'https://link2' 132 | 133 | def test_store_status_url(self): 134 | assert not os.path.exists(self.status_file) 135 | # create and add first entry 136 | store_status_url('http://test0', 2) 137 | assert os.path.exists(self.status_file) 138 | with open(self.status_file) as f: 139 | assert f.read() == '0: http://test0\n' 140 | # add another one 141 | store_status_url('http://test1', 2) 142 | with open(self.status_file) as f: 143 | assert f.read() == '0: http://test0\n1: http://test1\n' 144 | # replacement 145 | assert store_status_url('http://test2', 2) == 2 146 | with open(self.status_file) as f: 147 | assert f.read() == '1: http://test1\n2: http://test2\n' 148 | # existing 149 | assert store_status_url('http://test1', 2) == 1 150 | with open(self.status_file) as f: 151 | assert f.read() == '1: http://test1\n2: http://test2\n' 152 | -------------------------------------------------------------------------------- /tests/test_migrate_eggs.py: -------------------------------------------------------------------------------- 
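# The tests below drive shub.migrate_eggs.main() against the fixture archives
# in tests/samples/ (migrate-eggs.zip and migrate-eggs-no-eggs.zip serve as
# mocked download responses) and assert on what ends up in ./eggs,
# requirements.txt and scrapinghub.yml afterwards.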
1 | import os 2 | import unittest 3 | from unittest import mock 4 | 5 | import yaml 6 | from click.testing import CliRunner 7 | from yaml import CLoader as Loader 8 | 9 | from shub.migrate_eggs import main 10 | from shub.config import Target 11 | 12 | 13 | class MigrateEggsTest(unittest.TestCase): 14 | REQ_LIST = [ 15 | 'boto==2.38.0', 16 | 'dateparser==0.3.3', 17 | 'decorator==4.0.10', 18 | 'dicttoxml==1.6.6', 19 | 'httpretty==0.8.0', 20 | 'hubstorage==0.16.1', 21 | 'jdatetime==1.7.2', 22 | 'six==1.9.0', 'spur==0.3.15', 23 | 'SQLAlchemy==1.0.5', 24 | 'sqlitedict==1.3.0', 25 | 'urllib3==1.11', 26 | 'wheel==0.24.0', 27 | 'wsgiref==0.1.2', 28 | ] 29 | 30 | def run(self, *a, **kw): 31 | runner = CliRunner() 32 | with runner.isolated_filesystem(): 33 | super().run(*a, **kw) 34 | 35 | def setUp(self): 36 | self.clickm = mock.patch('shub.migrate_eggs.click').start() 37 | gtc = mock.patch('shub.migrate_eggs.get_target_conf').start() 38 | self.requestsm = mock.patch('shub.migrate_eggs.requests').start() 39 | 40 | self.curr_dir = os.path.dirname(os.path.realpath(__file__)) 41 | 42 | with open('./scrapinghub.yml', 'w') as f: 43 | f.write('') 44 | 45 | gtc.return_value = Target( 46 | project_id=123, 47 | endpoint='endpoint1', 48 | apikey='apikey1', 49 | stack='', 50 | image='', 51 | requirements_file='', 52 | version='', 53 | eggs=[], 54 | ) 55 | 56 | self.addCleanup(mock.patch.stopall) 57 | 58 | def walksorted(self): 59 | return [ 60 | (sorted(dirs), sorted(files)) 61 | for _, dirs, files in os.walk('.') 62 | ] 63 | 64 | def _assert_requirements_content(self): 65 | with open('./requirements.txt') as f: 66 | content = f.read() 67 | self.assertIn('DISABLE_DASH_EGGS', content) 68 | requirements = [line for line in content.split('\n') if '==' in line] 69 | self.assertListEqual(requirements, self.REQ_LIST) 70 | 71 | def test_full(self): 72 | migrate_zip = os.path.join(self.curr_dir, 'samples/migrate-eggs.zip') 73 | with open(migrate_zip, 'rb') as f: 74 | self.requestsm.get().content = f.read() 75 | 76 | main('default') 77 | self.clickm.confirm.assert_called_with( 78 | 'Eggs will be stored in ./eggs, are you sure ? 
' 79 | ) 80 | 81 | files = self.walksorted() 82 | 83 | self.assertEqual( 84 | files[0], 85 | (['eggs'], ['requirements.txt', 'scrapinghub.yml']), 86 | ) 87 | self.assertEqual( 88 | files[1], 89 | ([], ['1.egg', '2.egg', '3.egg']) 90 | ) 91 | 92 | with open('./scrapinghub.yml') as f: 93 | abc = yaml.load(f, Loader=Loader) 94 | eggs = abc['requirements'].pop('eggs') 95 | eggs = [e.replace('\\', '/') for e in eggs] 96 | self.assertEqual( 97 | eggs, 98 | [ 99 | './eggs/1.egg', 100 | './eggs/2.egg', 101 | './eggs/3.egg', 102 | ], 103 | ) 104 | self.assertDictEqual( 105 | abc, 106 | { 107 | 'requirements': { 108 | 'file': './requirements.txt' 109 | }, 110 | } 111 | ) 112 | 113 | self._assert_requirements_content() 114 | 115 | for i in range(1, 4): 116 | i = str(i) 117 | with open('./eggs/%s.egg' % i) as f: 118 | self.assertEqual(f.read().strip(), i) 119 | 120 | def test_no_eggs(self): 121 | file_ = 'samples/migrate-eggs-no-eggs.zip' 122 | migrate_zip = os.path.join(self.curr_dir, file_) 123 | with open(migrate_zip, 'rb') as f: 124 | self.requestsm.get().content = f.read() 125 | 126 | main('default') 127 | self.assertFalse(self.clickm.confirm.called) 128 | 129 | files = self.walksorted() 130 | self.assertListEqual( 131 | files, 132 | [([], ['requirements.txt', 'scrapinghub.yml'])] 133 | ) 134 | 135 | with open('./scrapinghub.yml') as f: 136 | abc = yaml.load(f, Loader=Loader) 137 | self.assertDictEqual( 138 | abc, 139 | { 140 | 'requirements': { 141 | 'file': './requirements.txt' 142 | }, 143 | } 144 | ) 145 | 146 | self._assert_requirements_content() 147 | 148 | def test_override_reqs_file(self): 149 | file_ = 'samples/migrate-eggs-no-eggs.zip' 150 | migrate_zip = os.path.join(self.curr_dir, file_) 151 | with open(migrate_zip, 'rb') as f: 152 | self.requestsm.get().content = f.read() 153 | with open('./requirements.txt', 'w') as f: 154 | f.write('smth==1.2.3') 155 | 156 | self.clickm.confirm.return_value = False 157 | main('default') 158 | self.clickm.confirm.assert_called_with( 159 | 'requirements.txt already exists, are you sure to override it ?' 160 | ) 161 | 162 | files = self.walksorted() 163 | self.assertListEqual( 164 | files, 165 | [([], ['requirements.txt', 'scrapinghub.yml'])] 166 | ) 167 | 168 | with open('./scrapinghub.yml') as f: 169 | self.assertEqual(f.read(), '') 170 | 171 | with open('./requirements.txt') as f: 172 | content = f.read() 173 | self.assertEqual(content, 'smth==1.2.3') 174 | 175 | self.clickm.reset_mock() 176 | self.clickm.confirm.return_value = True 177 | main('default') 178 | 179 | self.clickm.confirm.assert_called_with( 180 | 'requirements.txt already exists, are you sure to override it ?' 
181 | ) 182 | 183 | files = self.walksorted() 184 | self.assertListEqual( 185 | files, 186 | [([], ['requirements.txt', 'scrapinghub.yml'])] 187 | ) 188 | 189 | with open('./scrapinghub.yml') as f: 190 | abc = yaml.load(f, Loader=Loader) 191 | self.assertDictEqual( 192 | abc, 193 | { 194 | 'requirements': { 195 | 'file': './requirements.txt' 196 | }, 197 | } 198 | ) 199 | 200 | self._assert_requirements_content() 201 | -------------------------------------------------------------------------------- /shub/image/list.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import click 4 | import docker 5 | import requests 6 | from urllib.parse import urljoin 7 | 8 | from shub.exceptions import ShubException 9 | from shub.config import load_shub_config, list_targets_callback 10 | from shub.image import utils 11 | 12 | 13 | SETTING_TYPES = ['project_settings', 14 | 'organization_settings', 15 | 'enabled_addons'] 16 | 17 | SHORT_HELP = 'List spiders.' 18 | 19 | HELP = """ 20 | List command tries to run your image locally and get a spiders list. 21 | 22 | Internally, this command is a simple wrapper to `docker run` and uses 23 | docker daemon on your system to run a new container using your image. 24 | Before creating the container, there's a Dash call to get your project 25 | settings to get your spiders list properly (respecting SPIDERS_MODULE 26 | setting, etc). 27 | 28 | Image should be set via scrapinghub.yml, section "images". If version is not 29 | provided, the tool uses VCS-based stamp over project directory (the same as 30 | shub utils itself). 31 | """ 32 | 33 | 34 | @click.command(help=HELP, short_help=SHORT_HELP) 35 | @click.argument("target", required=False, default="default") 36 | @click.option("-l", "--list-targets", is_flag=True, is_eager=True, 37 | expose_value=False, callback=list_targets_callback, 38 | help="List available project names defined in your config") 39 | @click.option("-d", "--debug", help="debug mode", is_flag=True, 40 | callback=utils.deprecate_debug_parameter) 41 | @click.option("-v", "--verbose", is_flag=True, help="stream logs to console") 42 | @click.option("-s", "--silent", is_flag=True, 43 | help="don't warn if Dash project is not defined in config") 44 | @click.option("-V", "--version", help="release version") 45 | def cli(target, debug, verbose, silent, version): 46 | list_cmd_full(target, silent, version) 47 | 48 | 49 | def list_cmd_full(target, silent, version): 50 | config = load_shub_config() 51 | image = config.get_image(target) 52 | version = version or config.get_version() 53 | image_name = utils.format_image_name(image, version) 54 | target_conf = config.get_target_conf(target) 55 | metadata = list_cmd(image_name, 56 | target_conf.project_id, 57 | target_conf.endpoint, 58 | target_conf.apikey) 59 | for spider in metadata.get('spiders', []): 60 | click.echo(spider) 61 | 62 | 63 | def list_cmd(image_name, project, endpoint, apikey): 64 | """Short version of list cmd to use with deploy cmd.""" 65 | settings = _get_project_settings(project, endpoint, apikey) 66 | environment = {'JOB_SETTINGS': json.dumps(settings)} 67 | exit_code, logs = _run_cmd_in_docker_container( 68 | image_name, 'shub-image-info', environment) 69 | if exit_code == 0: 70 | return _extract_metadata_from_image_info_output(logs) 71 | # shub-image-info command not found, fallback to list-spiders 72 | elif exit_code == 127: 73 | # FIXME we should pass some value for SCRAPY_PROJECT_ID anyway 74 | # to handle `scrapy list` cmd 
properly via sh_scrapy entrypoint 75 | # environment['SCRAPY_PROJECT_ID'] = str(project) if project else '' 76 | exit_code, logs = _run_cmd_in_docker_container( 77 | image_name, 'list-spiders', environment) 78 | if exit_code != 0: 79 | click.echo(logs) 80 | raise ShubException('Container with list cmd exited with code %s' % exit_code) 81 | return { 82 | 'project_type': 'scrapy', 83 | 'spiders': utils.valid_spiders(logs.splitlines()), 84 | } 85 | else: 86 | click.echo(logs) 87 | raise ShubException( 88 | 'Container with shub-image-info cmd exited with code %s' % exit_code) 89 | 90 | 91 | def _get_project_settings(project, endpoint, apikey): 92 | utils.debug_log(f'Getting settings for {project} project:') 93 | req = requests.get( 94 | urljoin(endpoint, '/api/settings/get.json'), 95 | params={'project': project}, 96 | auth=(apikey, ''), 97 | timeout=300, 98 | allow_redirects=False 99 | ) 100 | req.raise_for_status() 101 | utils.debug_log(f"Response: {req.json()}") 102 | return {k: v for k, v in req.json().items() if k in SETTING_TYPES} 103 | 104 | 105 | def _run_cmd_in_docker_container(image_name, command, environment): 106 | """Run a command inside the image container.""" 107 | client = utils.get_docker_client() 108 | container = client.create_container( 109 | image=image_name, 110 | command=[command], 111 | environment=environment, 112 | ) 113 | if 'Id' not in container: 114 | raise ShubException("Create container error:\n %s" % container) 115 | try: 116 | client.start(container) 117 | except docker.errors.APIError as e: 118 | explanation = utils.ensure_unicode(e.explanation or '') 119 | if 'executable file not found' in explanation: 120 | # docker.errors.APIError: 500 Server Error: 121 | # Internal Server Error ("Cannot start container xxx: 122 | # [8] System error: exec: "shub-image-info": 123 | # executable file not found in $PATH") 124 | return 127, None 125 | raise 126 | statuscode = client.wait(container=container['Id'])['StatusCode'] 127 | logs = client.logs( 128 | container=container['Id'], stream=False, timestamps=False, 129 | stdout=True, stderr=True if statuscode else False, 130 | ) 131 | return statuscode, utils.ensure_unicode(logs) 132 | 133 | 134 | def _extract_metadata_from_image_info_output(output): 135 | """Extract and validate spiders list from `shub-image-info` output.""" 136 | 137 | def raise_shub_image_info_error(error): 138 | """Helper to raise ShubException with prefix and output""" 139 | msg = f"shub-image-info: {error} \n[output '{output}']" 140 | raise ShubException(msg) 141 | 142 | try: 143 | metadata = json.loads(output) 144 | project_type = metadata.get('project_type') 145 | except (AttributeError, ValueError): 146 | raise_shub_image_info_error('output is not a valid JSON dict') 147 | if not isinstance(project_type, str): 148 | raise_shub_image_info_error('"project_type" key is required and must be a string') 149 | 150 | spiders_list = metadata.get('spiders') 151 | if not isinstance(spiders_list, list): 152 | raise_shub_image_info_error('"spiders" key is required and must be a list') 153 | spiders, scripts = [], [] 154 | for name in spiders_list: 155 | if not (name and isinstance(name, str)): 156 | raise_shub_image_info_error("spider name can't be empty or non-string") 157 | if project_type == 'scrapy' and name.startswith('py:'): 158 | scripts.append(name[3:]) 159 | else: 160 | spiders.append(name) 161 | return { 162 | 'project_type': project_type, 163 | 'spiders': utils.valid_spiders(spiders), 164 | 'scripts': utils.valid_spiders(scripts), 165 | } 166 | 
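A rough sketch of the metadata contract that `_extract_metadata_from_image_info_output` above enforces (the payload and spider names are hypothetical, not part of the project):

    import json

    from shub.image.list import _extract_metadata_from_image_info_output

    # Hypothetical output of the `shub-image-info` command run inside the image.
    payload = json.dumps({
        "project_type": "scrapy",
        "spiders": ["toscrape", "py:report.py"],
    })

    metadata = _extract_metadata_from_image_info_output(payload)
    # metadata carries 'project_type', 'spiders' and 'scripts' keys; for a
    # scrapy project the "py:"-prefixed entries are moved under 'scripts'.
    print(metadata)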
-------------------------------------------------------------------------------- /shub/image/init.py: -------------------------------------------------------------------------------- 1 | import os 2 | import textwrap 3 | from string import Template 4 | 5 | import click 6 | 7 | from shub import exceptions as shub_exceptions 8 | from shub import utils as shub_utils 9 | 10 | 11 | DOCKER_APP_DIR = '/app' 12 | DOCKERFILE_TEMPLATE = """\ 13 | FROM $base_image 14 | $system_deps 15 | $system_env 16 | RUN mkdir -p {docker_app_dir} 17 | WORKDIR {docker_app_dir} 18 | $requirements 19 | COPY . {docker_app_dir} 20 | RUN python -m pip install . 21 | """.format(docker_app_dir=DOCKER_APP_DIR) 22 | 23 | DEFAULT_BASE_IMAGE = "scrapinghub/scrapinghub-stack-scrapy:2.13-20250721" 24 | RECOMMENDED_PYTHON_DEPS = [ 25 | 'guppy==0.1.10', 26 | ] 27 | 28 | SHORT_HELP = "Create Dockerfile for existing Scrapy project." 29 | 30 | HELP = """ 31 | Init command creates a Dockerfile for existing Scrapy project. This tool is for users 32 | who want to create a custom Docker image and don't have a Dockerfile yet. If generated 33 | Dockerfile doesn't fit your project feel free to edit it. 34 | 35 | Python packages 36 | 37 | If there's a requirements.txt file in the project directory - it will be added to the 38 | Dockerfile. Also it's possible to provide a path to requirements file via --requirements 39 | option. Otherwise new requirements.txt file will be created in the project directory 40 | with the recommended Python packages. Use --list-recommended-reqs to list them. 41 | 42 | It's recommended to include scrapinghub-entrypoint-scrapy package - it is a 43 | support layer that passes data from the job to Scrapinghub storage. Otherwise 44 | you will need to send data to Scrapinghub storage using HTTP API. 45 | 46 | System packages 47 | 48 | You can extend list of system packages installed in the image via --add-deps option. 49 | """ 50 | 51 | 52 | def list_recommended_python_reqs(ctx, param, value): 53 | """List recommended Python requirements""" 54 | if not value: 55 | return 56 | click.echo("Recommended Python deps list:") 57 | for dep in RECOMMENDED_PYTHON_DEPS: 58 | click.echo(f'- {dep}') 59 | ctx.exit() 60 | 61 | 62 | def _deprecate_base_deps_parameter(ctx, param, value): 63 | if value: 64 | click.echo("WARNING: --base-deps parameter is deprecated. 
" 65 | "Please use --add-deps parameter instead.", 66 | err=True) 67 | return value 68 | 69 | 70 | @click.command(help=HELP, short_help=SHORT_HELP) 71 | @click.option("--list-recommended-reqs", is_flag=True, is_eager=True, 72 | expose_value=False, callback=list_recommended_python_reqs, 73 | help="list recommended python requirements") 74 | @click.option("--project", default="default", 75 | help="project name to get settings module from scrapy.cfg") 76 | @click.option("--base-image", default=DEFAULT_BASE_IMAGE, 77 | help="base docker image name") 78 | @click.option("--base-deps", default='', 79 | help="[DEPRECATED] a comma-separated list with base system dependencies", 80 | callback=_deprecate_base_deps_parameter) 81 | @click.option("--add-deps", 82 | help="a comma-separated list with additional system dependencies") 83 | @click.option("--requirements", default="requirements.txt", 84 | help="path to requirements.txt") 85 | def cli(project, base_image, base_deps, add_deps, requirements): 86 | closest_scrapy_cfg = shub_utils.closest_file('scrapy.cfg') 87 | scrapy_config = shub_utils.get_config() 88 | if not closest_scrapy_cfg or not scrapy_config.has_option('settings', project): 89 | raise shub_exceptions.BadConfigException( 90 | 'Cannot find Scrapy project settings. Please ensure that current directory ' 91 | 'contains scrapy.cfg with settings section, see example at ' 92 | 'https://doc.scrapy.org/en/latest/topics/commands.html#default-structure-of-scrapy-projects') # NOQA 93 | project_dir = os.path.dirname(closest_scrapy_cfg) 94 | dockefile_path = os.path.join(project_dir, 'Dockerfile') 95 | if os.path.exists(dockefile_path): 96 | raise shub_exceptions.ShubException('Found a Dockerfile in the project directory, aborting') 97 | settings_module = scrapy_config.get('settings', 'default') 98 | shub_utils.create_default_setup_py(settings=settings_module) 99 | values = { 100 | 'base_image': base_image, 101 | 'system_deps': _format_system_deps(base_deps, add_deps), 102 | 'system_env': _format_system_env(settings_module), 103 | 'requirements': _format_requirements(project_dir, requirements), 104 | } 105 | values = {key: value if value else '' for key, value in values.items()} 106 | source = Template(DOCKERFILE_TEMPLATE) 107 | results = source.substitute(values) 108 | results = results.replace('\n\n', '\n') 109 | with open(dockefile_path, 'w') as dockerfile: 110 | dockerfile.write(results) 111 | click.echo(f"Dockerfile is saved to {dockefile_path}") 112 | 113 | 114 | def _format_system_deps(base_deps, add_deps): 115 | """Prepare a list with system dependencies install cmds""" 116 | system_deps = base_deps.split(',') if base_deps != '-' else [] 117 | if add_deps: 118 | system_add_deps = add_deps.split(',') 119 | system_deps = list(set(system_deps + system_add_deps)) 120 | system_deps = sorted(filter(None, system_deps)) 121 | if not system_deps: 122 | return 123 | commands = ["apt-get update -qq", 124 | "apt-get install -qy {}".format(' '.join(system_deps)), 125 | "rm -rf /var/lib/apt/lists/*"] 126 | return 'RUN ' + ' && \\\n '.join( 127 | [_wrap(cmd) for cmd in commands]) 128 | 129 | 130 | def _wrap(text): 131 | """Wrap dependencies with separator""" 132 | lines = textwrap.wrap(text, subsequent_indent=' ', 133 | break_long_words=False, 134 | break_on_hyphens=False) 135 | return ' \\\n'.join(lines) 136 | 137 | 138 | def _format_system_env(settings_module): 139 | rows = ['ENV TERM xterm'] 140 | if settings_module: 141 | rows.append('ENV SCRAPY_SETTINGS_MODULE %s' % settings_module) 142 | return 
'\n'.join(rows) 143 | 144 | 145 | def _format_requirements(project_dir, requirements): 146 | """Prepare cmds for project requirements""" 147 | rel_reqs_path = os.path.relpath( 148 | os.path.join(project_dir, requirements), project_dir) 149 | if os.path.isfile(rel_reqs_path): 150 | if rel_reqs_path.startswith('../'): 151 | raise shub_exceptions.BadParameterException( 152 | "Requirements file must be inside your project directory, " 153 | "otherwise it will not be included in the Docker build context.") 154 | else: 155 | # let's create requirements.txt with base dependencies 156 | with open(rel_reqs_path, 'w') as reqs_file: 157 | reqs_file.writelines("%s\n" % line for line in RECOMMENDED_PYTHON_DEPS) 158 | click.echo('Created base requirements.txt in project dir.') 159 | rows = [ 160 | f'COPY ./{rel_reqs_path} {DOCKER_APP_DIR}/requirements.txt', 161 | 'RUN pip install --no-cache-dir -r requirements.txt', 162 | ] 163 | return '\n'.join(rows) 164 | -------------------------------------------------------------------------------- /tests/image/test_build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from unittest import mock 4 | 5 | import pytest 6 | from click.testing import CliRunner 7 | 8 | from shub import exceptions as shub_exceptions 9 | from shub.image.build import cli 10 | 11 | from ..utils import clean_progress_output, format_expected_progress 12 | 13 | 14 | @pytest.fixture 15 | def test_mock(): 16 | """Mock for shub image test command""" 17 | with mock.patch('shub.image.build.test_cmd') as m: 18 | yield m 19 | 20 | 21 | def test_cli(docker_client_mock, project_dir, test_mock): 22 | docker_client_mock.build.return_value = [ 23 | {"stream": "all is ok"}, 24 | {"stream": "Successfully built 12345"} 25 | ] 26 | runner = CliRunner() 27 | result = runner.invoke(cli, ["dev", "-v"]) 28 | assert result.exit_code == 0 29 | docker_client_mock.build.assert_called_with( 30 | decode=True, 31 | path=project_dir, 32 | tag='registry.io/user/project:1.0', 33 | dockerfile='Dockerfile', 34 | nocache=False, 35 | rm=True, 36 | buildargs={} 37 | ) 38 | test_mock.assert_called_with("dev", None) 39 | 40 | 41 | def test_cli_with_nocache(docker_client_mock, project_dir, test_mock): 42 | docker_client_mock.build.return_value = [ 43 | {"stream": "all is ok"}, 44 | {"stream": "Successfully built 12345"} 45 | ] 46 | runner = CliRunner() 47 | result = runner.invoke(cli, ["dev", "-v", "--no-cache"]) 48 | assert result.exit_code == 0 49 | docker_client_mock.build.assert_called_with( 50 | decode=True, 51 | path=project_dir, 52 | tag='registry.io/user/project:1.0', 53 | dockerfile='Dockerfile', 54 | nocache=True, 55 | rm=True, 56 | buildargs={} 57 | ) 58 | test_mock.assert_called_with("dev", None) 59 | 60 | 61 | def test_cli_with_buildargs(docker_client_mock, project_dir, test_mock): 62 | docker_client_mock.build.return_value = [ 63 | {"stream": "all is ok"}, 64 | {"stream": "Successfully built 12345"} 65 | ] 66 | runner = CliRunner() 67 | result = runner.invoke(cli, ["dev", "-v", "-b", "AWS_KEY=asdasdeg", "-b", 68 | "AWS_SEC=ashthku", "-b", "PARAM=query=4"]) 69 | assert result.exit_code == 0 70 | docker_client_mock.build.assert_called_with( 71 | decode=True, 72 | path=project_dir, 73 | tag='registry.io/user/project:1.0', 74 | dockerfile='Dockerfile', 75 | nocache=False, 76 | rm=True, 77 | buildargs={'AWS_KEY': 'asdasdeg', 'AWS_SEC': 'ashthku', 'PARAM': 'query=4'} 78 | ) 79 | test_mock.assert_called_with("dev", None) 80 | 81 | 82 | def 
test_cli_with_progress(docker_client_mock, project_dir, test_mock): 83 | docker_client_mock.build.return_value = [ 84 | {"stream": "Step 1/3 : FROM some_image"}, 85 | {"stream": "some internal actions"}, 86 | {"stream": "Step 2/3 : RUN cmd1"}, 87 | {"stream": "some other actions"}, 88 | {"stream": "Step 3/3 : RUN cmd2"}, 89 | {"stream": "Successfully built 12345"} 90 | ] 91 | runner = CliRunner() 92 | result = runner.invoke(cli, ["dev"]) 93 | assert result.exit_code == 0 94 | expected = format_expected_progress( 95 | r'Building registry\.io/user/project:1\.0\.' 96 | r'Steps: 0%\| +\| 0/1' 97 | r'Steps: 100%\|█+\| 3/3' 98 | r'The image registry\.io/user/project:1\.0 build is completed\.' 99 | ) 100 | assert re.search(clean_progress_output(result.output), expected) 101 | 102 | 103 | def test_cli_custom_version(docker_client_mock, project_dir, test_mock): 104 | docker_client_mock.build.return_value = [ 105 | {"stream": "all is ok"}, 106 | {"stream": "Successfully built 12345"} 107 | ] 108 | runner = CliRunner() 109 | result = runner.invoke(cli, ["dev", "--version", "test"]) 110 | assert result.exit_code == 0 111 | docker_client_mock.build.assert_called_with( 112 | decode=True, 113 | path=project_dir, 114 | tag='registry.io/user/project:test', 115 | dockerfile='Dockerfile', 116 | nocache=False, 117 | rm=True, 118 | buildargs={} 119 | ) 120 | test_mock.assert_called_with("dev", "test") 121 | 122 | 123 | def test_cli_no_dockerfile(docker_client_mock, project_dir): 124 | docker_client_mock.build.return_value = [ 125 | {"error": "Minor", "errorDetail": "Testing output"}, 126 | {"stream": "Successfully built 12345"} 127 | ] 128 | os.remove(os.path.join(project_dir, 'Dockerfile')) 129 | runner = CliRunner() 130 | result = runner.invoke(cli, ["dev"]) 131 | assert result.exit_code == shub_exceptions.NotFoundException.exit_code 132 | 133 | 134 | @pytest.mark.usefixtures('project_dir') 135 | def test_cli_fail(docker_client_mock): 136 | docker_client_mock.build.return_value = [ 137 | {"error": "Minor", "errorDetail": "Test"} 138 | ] 139 | runner = CliRunner() 140 | result = runner.invoke(cli, ["dev"]) 141 | assert result.exit_code == shub_exceptions.RemoteErrorException.exit_code 142 | 143 | 144 | @pytest.mark.parametrize('skip_tests_flag', ['-S', '--skip-tests']) 145 | def test_cli_skip_tests(docker_client_mock, test_mock, project_dir, skip_tests_flag): 146 | docker_client_mock.build.return_value = [ 147 | {"stream": "all is ok"}, 148 | {"stream": "Successfully built 12345"} 149 | ] 150 | runner = CliRunner() 151 | result = runner.invoke(cli, ["dev", skip_tests_flag]) 152 | assert result.exit_code == 0 153 | docker_client_mock.build.assert_called_with( 154 | decode=True, 155 | path=project_dir, 156 | tag='registry.io/user/project:1.0', 157 | dockerfile='Dockerfile', 158 | nocache=False, 159 | rm=True, 160 | buildargs={} 161 | ) 162 | assert test_mock.call_count == 0 163 | 164 | 165 | @pytest.mark.parametrize('file_param', ['-f', '--file']) 166 | def test_cli_custom_dockerfile(docker_client_mock, project_dir, test_mock, file_param): 167 | docker_client_mock.build.return_value = [ 168 | {"stream": "all is ok"}, 169 | {"stream": "Successfully built 12345"} 170 | ] 171 | runner = CliRunner() 172 | result = runner.invoke(cli, ["dev", file_param, "Dockerfile"]) 173 | assert result.exit_code == 0 174 | docker_client_mock.build.assert_called_with( 175 | decode=True, 176 | path=project_dir, 177 | tag='registry.io/user/project:1.0', 178 | dockerfile='Dockerfile', 179 | nocache=False, 180 | rm=True, 181 | 
buildargs={} 182 | ) 183 | test_mock.assert_called_with("dev", None) 184 | 185 | 186 | @pytest.mark.usefixtures('project_dir') 187 | @pytest.mark.parametrize('file_param', ['-f', '--file']) 188 | def test_cli_missing_custom_dockerfile(docker_client_mock, file_param): 189 | docker_client_mock.build.return_value = [ 190 | {"error": "Minor", "errorDetail": "Testing output"}, 191 | {"stream": "Successfully built 12345"} 192 | ] 193 | runner = CliRunner() 194 | result = runner.invoke(cli, ["dev", file_param, "Dockerfile-missing"]) 195 | assert result.exit_code == shub_exceptions.NotFoundException.exit_code 196 | -------------------------------------------------------------------------------- /tests/image/test_list.py: -------------------------------------------------------------------------------- 1 | import json 2 | from unittest import mock 3 | 4 | import docker 5 | import pytest 6 | from click.testing import CliRunner 7 | 8 | from shub.exceptions import BadParameterException, ShubException 9 | from shub.image.list import cli, list_cmd 10 | from shub.image.list import _run_cmd_in_docker_container 11 | from shub.image.list import _extract_metadata_from_image_info_output 12 | 13 | 14 | def _mock_docker_client(wait_code=0, logs=None): 15 | client_mock = mock.Mock() 16 | client_mock.create_container.return_value = {'Id': '1234'} 17 | client_mock.wait.return_value = {'Error': None, 'StatusCode': wait_code} 18 | client_mock.logs.return_value = logs or '' 19 | return client_mock 20 | 21 | 22 | def _get_settings_mock(settings=None): 23 | settings_mock = mock.Mock() 24 | settings_mock.json.return_value = settings or {} 25 | return settings_mock 26 | 27 | 28 | def _convert_str(data, to_binary=False): 29 | """Helper to convert str to corresponding string or binary type. 30 | 31 | `data` has `str` type (in both Python 2/3), the function converts it 32 | to corresponding string or binary representation depending on Python 33 | version and boolean `to_binary` parameter. 
34 | """ 35 | if to_binary: 36 | return data.encode('utf-8') 37 | return data 38 | 39 | 40 | def test_cli_no_scrapinghub_config(): 41 | result = CliRunner().invoke(cli, ["dev", "-v", "--version", "test"]) 42 | assert result.exit_code == BadParameterException.exit_code 43 | assert 'Could not find target "dev"' in result.output 44 | 45 | 46 | @pytest.mark.usefixtures('project_dir') 47 | @pytest.mark.parametrize('is_binary_logs', [True, False]) 48 | @mock.patch('shub.image.utils.get_docker_client') 49 | @mock.patch('requests.get') 50 | def test_cli(requests_get_mock, get_docker_client_mock, is_binary_logs): 51 | """Case when shub-image-info succeeded.""" 52 | requests_get_mock.return_value = _get_settings_mock() 53 | mocked_logs = json.dumps({'project_type': 'scrapy', 54 | 'spiders': ['abc', 'def']}) 55 | mocked_logs = _convert_str(mocked_logs, to_binary=is_binary_logs) 56 | docker_client = _mock_docker_client(logs=mocked_logs) 57 | get_docker_client_mock.return_value = docker_client 58 | result = CliRunner().invoke(cli, ["dev", "-v", "-s", "--version", "test"]) 59 | assert result.exit_code == 0 60 | assert result.output.endswith('abc\ndef\n') 61 | requests_get_mock.assert_called_with( 62 | 'https://app.zyte.com/api/settings/get.json', 63 | allow_redirects=False, auth=('abcdef', ''), 64 | params={'project': 12345}, timeout=300) 65 | 66 | 67 | @pytest.mark.usefixtures('project_dir') 68 | @mock.patch('shub.image.utils.get_docker_client') 69 | @mock.patch('requests.get') 70 | def test_cli_image_info_error(requests_get_mock, get_docker_client_mock): 71 | """Case when shub-image-info command failed with unknown exit code.""" 72 | requests_get_mock.return_value = _get_settings_mock() 73 | docker_client = _mock_docker_client(wait_code=1, logs='some-error') 74 | get_docker_client_mock.return_value = docker_client 75 | result = CliRunner().invoke(cli, ["dev", "-v", "--version", "test"]) 76 | assert result.exit_code == 1 77 | assert 'Container with shub-image-info cmd exited with code 1' in result.output 78 | 79 | 80 | @pytest.mark.usefixtures('project_dir') 81 | @mock.patch('shub.image.utils.get_docker_client') 82 | @mock.patch('requests.get') 83 | def test_cli_image_info_not_found(requests_get_mock, get_docker_client_mock): 84 | """Case when shub-image-info cmd not found with fallback to list-spiders.""" 85 | requests_get_mock.return_value = _get_settings_mock({'SETTING': 'VALUE'}) 86 | docker_client = _mock_docker_client() 87 | docker_client.wait.side_effect = [ 88 | {'Error': None, 'StatusCode': 127}, 89 | {'Error': None, 'StatusCode': 0} 90 | ] 91 | docker_client.logs.side_effect = ["not-found", "spider1\nspider2\n"] 92 | get_docker_client_mock.return_value = docker_client 93 | result = CliRunner().invoke(cli, ["dev", "-v", "--version", "test"]) 94 | assert result.exit_code == 0 95 | assert 'spider1\nspider2' in result.output 96 | 97 | 98 | @pytest.mark.usefixtures('project_dir') 99 | @mock.patch('shub.image.utils.get_docker_client') 100 | @mock.patch('requests.get') 101 | def test_cli_both_commands_failed(requests_get_mock, get_docker_client_mock): 102 | """Case when shub-image-info cmd not found with fallback to list-spiders.""" 103 | requests_get_mock.return_value = _get_settings_mock({'SETTING': 'VALUE'}) 104 | docker_client = _mock_docker_client(wait_code=127, logs='not-found') 105 | get_docker_client_mock.return_value = docker_client 106 | result = CliRunner().invoke(cli, ["dev", "-v", "--version", "test"]) 107 | assert result.exit_code == 1 108 | assert 'Container with list cmd exited with 
code 127' in result.output 109 | 110 | 111 | @mock.patch('shub.image.utils.get_docker_client') 112 | def test_run_cmd_in_docker_container(get_docker_client_mock): 113 | docker_client = _mock_docker_client(logs='abc\ndef\ndsd') 114 | get_docker_client_mock.return_value = docker_client 115 | test_env = {'TEST_ENV1': 'VAL1', 'TEST_ENV2': 'VAL2'} 116 | result = _run_cmd_in_docker_container('image', 'test-cmd', test_env) 117 | assert result[0] == 0 118 | assert result[1] == 'abc\ndef\ndsd' 119 | docker_client.create_container.assert_called_with( 120 | command=['test-cmd'], environment=test_env, image='image') 121 | docker_client.start.assert_called_with({'Id': '1234'}) 122 | docker_client.wait.assert_called_with(container="1234") 123 | docker_client.logs.assert_called_with( 124 | container='1234', stderr=False, stdout=True, 125 | stream=False, timestamps=False) 126 | 127 | 128 | @pytest.mark.parametrize('is_binary_explanation', [True, False]) 129 | @mock.patch('shub.image.list._get_project_settings', return_value={}) 130 | @mock.patch('shub.image.utils.get_docker_client') 131 | def test_shub_image_info_fallback(get_docker_client_mock, _, 132 | is_binary_explanation): 133 | error_msg = ('Cannot start container xxx: [8] System error: exec:' 134 | ' "shub-image-info": executable file not found in $PATH') 135 | error_msg = _convert_str(error_msg, to_binary=is_binary_explanation) 136 | exception = docker.errors.APIError(mock.Mock(), mock.Mock(), 137 | explanation=error_msg) 138 | get_docker_client_mock().create_container.return_value = {'Id': 'id'} 139 | get_docker_client_mock().start.side_effect = [ 140 | exception, 141 | None, 142 | ] 143 | get_docker_client_mock().wait.return_value = {'Error': None, 'StatusCode': 0} 144 | get_docker_client_mock().logs.return_value = 'abc\ndef\n' 145 | result = list_cmd('image_name', 111, 'endpoint', 'apikey') 146 | assert get_docker_client_mock().start.call_count == 2 147 | assert result == {'spiders': ['abc', 'def'], 'project_type': 'scrapy'} 148 | 149 | 150 | @pytest.mark.parametrize('output,error_msg', [ 151 | ('bad-json', 'output is not a valid JSON dict'), 152 | (['data'], 'output is not a valid JSON dict'), 153 | ({'spiders': []}, '"project_type" key is required and must be a string'), 154 | ({'project_type': 1}, '"project_type" key is required and must be a string'), 155 | ({'project_type': 'scrapy'}, '"spiders" key is required and must be a list'), 156 | ({'project_type': 'scrapy', 'spiders': 'bad-data'}, '"spiders" key is required and must be a list'), 157 | ({'project_type': 'scrapy', 'spiders': ['']}, "spider name can't be empty or non-string"), 158 | ({'project_type': 'scrapy', 'spiders': [123]}, "spider name can't be empty or non-string"), 159 | ]) 160 | def test_extract_metadata_from_image_info_output_failures(output, error_msg): 161 | with pytest.raises(ShubException) as exc: 162 | _extract_metadata_from_image_info_output(json.dumps(output)) 163 | assert error_msg in exc.value.message 164 | -------------------------------------------------------------------------------- /shub/image/push.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import click 4 | 5 | from shub import exceptions as shub_exceptions 6 | from shub.config import load_shub_config, list_targets_callback 7 | from shub.image import utils 8 | from shub.image.test import test_cmd 9 | from shub.image.utils import get_image_registry 10 | 11 | SHORT_HELP = 'Push an image to a specified docker registry' 12 | 
13 | HELP = """ 14 | A command to push your image to the specified Docker registry. 15 | 16 | The command is a simple wrapper around the `docker push` command and uses the 17 | Docker daemon on your system to push the image. The only differences are that it 18 | can generate the correct image version and provides easy registry login logic. 19 | 20 | The optional params are mostly related to registry authorization. 21 | By default, the tool tries to call the registry in an insecure manner; 22 | otherwise you have to enter your credentials (at least username/password). 23 | """ 24 | 25 | LOGIN_ERROR_MSG = 'Please authorize with docker login' 26 | 27 | 28 | @click.command(help=HELP, short_help=SHORT_HELP) 29 | @click.argument("target", required=False, default="default") 30 | @click.option("-l", "--list-targets", is_flag=True, is_eager=True, 31 | expose_value=False, callback=list_targets_callback, 32 | help="List available project names defined in your config") 33 | @click.option("-d", "--debug", help="debug mode", is_flag=True, 34 | callback=utils.deprecate_debug_parameter) 35 | @click.option("-v", "--verbose", is_flag=True, 36 | help="stream push logs to console") 37 | @click.option("-V", "--version", help="release version") 38 | @click.option("--username", help="docker registry name") 39 | @click.option("--password", help="docker registry password") 40 | @click.option("--email", help="docker registry email") 41 | @click.option("--apikey", help="SH apikey to use built-in registry") 42 | @click.option("--insecure", is_flag=True, help="use insecure registry") 43 | @click.option("-S", "--skip-tests", help="skip testing image", is_flag=True) 44 | @click.option("-R", "--reauth", is_flag=True, 45 | help="re-authenticate to registry") 46 | def cli(target, debug, verbose, version, username, password, email, apikey, 47 | insecure, skip_tests, reauth): 48 | push_cmd(target, version, username, password, email, apikey, insecure, 49 | skip_tests, reauth) 50 | 51 | 52 | def push_cmd(target, version, username, password, email, apikey, insecure, 53 | skip_tests, reauth): 54 | # Test the image content after building it 55 | if not skip_tests: 56 | test_cmd(target, version) 57 | 58 | client = utils.get_docker_client() 59 | config = load_shub_config() 60 | image = config.get_image(target) 61 | username, password = utils.get_credentials( 62 | username=username, password=password, insecure=insecure, 63 | apikey=apikey, target_apikey=config.get_apikey(target)) 64 | 65 | if username: 66 | _execute_push_login( 67 | client, image, username, password, email, reauth) 68 | image_name = utils.format_image_name(image, version) 69 | click.echo(f"Pushing {image_name} to the registry.") 70 | events = client.push(image_name, stream=True, decode=True) 71 | if utils.is_verbose(): 72 | push_progress_cls = _LoggedPushProgress 73 | else: 74 | push_progress_cls = _PushProgress 75 | push_progress = push_progress_cls(events) 76 | push_progress.show() 77 | click.echo(f"The image {image_name} pushed successfully.") 78 | 79 | 80 | def _execute_push_login(client, image, username, password, email, reauth): 81 | """Log in if credentials are provided for the registry""" 82 | registry = get_image_registry(image) 83 | resp = client.login(username=username, password=password, 84 | email=email, registry=registry, reauth=reauth) 85 | if not (isinstance(resp, dict) and 'username' in resp or 86 | ('Status' in resp and resp['Status'] == 'Login Succeeded')): 87 | raise shub_exceptions.RemoteErrorException( 88 | "Docker registry login error.") 89 |
click.echo(f"Login to {registry} succeeded.") 90 | 91 | 92 | class _LoggedPushProgress(utils.BaseProgress): 93 | """Visualize push progress in verbose mode. 94 | 95 | Output all the events received from the docker daemon. 96 | """ 97 | def handle_event(self, event): 98 | if 'error' in event and LOGIN_ERROR_MSG in event['error']: 99 | click.echo( 100 | "Something went wrong when trying to authenticate to Docker " 101 | "registry when pushing the image. Please ensure your " 102 | "credentials are correct and try again with --reauth flag.") 103 | raise shub_exceptions.RemoteErrorException( 104 | "Docker registry authentication error") 105 | super().handle_event(event) 106 | if 'status' in event: 107 | self.handle_status_event(event) 108 | 109 | def handle_status_event(self, event): 110 | msg = "Logs:{} {}".format(event['status'], event.get('progress')) 111 | utils.debug_log(msg) 112 | 113 | 114 | class _PushProgress(_LoggedPushProgress): 115 | """Visualize push progress in non-verbose mode. 116 | 117 | Show total progress bar and separate bar for each pushed layer. 118 | """ 119 | 120 | def __init__(self, push_events): 121 | super().__init__(push_events) 122 | # Total bar repesents total progress in terms of amount of layers. 123 | self.total_bar = self._create_total_bar() 124 | self.layers = set() 125 | # XXX: has to be OrderedDict to make tqdm.write/click.echo work as expected. 126 | # Otherwise it writes at random position, usually in the middle of the progress bars. 127 | self.layers_bars = OrderedDict() 128 | 129 | def handle_status_event(self, event): 130 | layer_id = event.get('id') 131 | status = event.get('status') 132 | progress = event.get('progressDetail') 133 | # `preparing` events are correlated with amount of layers to push 134 | if status in ('Preparing', 'Waiting'): 135 | self._add_layer(layer_id) 136 | # the events are final and used to update total bar once per layer 137 | elif status in ('Layer already exists', 'Pushed'): 138 | self._add_layer(layer_id) 139 | self.total_bar.update() 140 | # `pushing` events represents actual push process per layer 141 | elif event.get('status') == 'Pushing' and progress: 142 | progress_current = progress.get('current', 0) 143 | progress_total = max(progress.get('total', 0), progress_current) 144 | if layer_id not in self.layers_bars: 145 | if not progress_total: 146 | return 147 | # create a progress bar per pushed layer 148 | self.layers_bars[layer_id] = self._create_bar_per_layer( 149 | layer_id, progress_total, progress_current) 150 | bar = self.layers_bars[layer_id] 151 | bar.total = max(bar.total, progress_total) 152 | bar.update(max(progress_current - bar.n, 0)) 153 | 154 | def _add_layer(self, layer_id): 155 | self.layers.add(layer_id) 156 | self.total_bar.total = max(self.total_bar.total, len(self.layers)) 157 | self.total_bar.refresh() 158 | 159 | def show(self): 160 | super().show() 161 | self.total_bar.close() 162 | for bar in self.layers_bars.values(): 163 | bar.close() 164 | 165 | def _create_total_bar(self): 166 | return utils.create_progress_bar( 167 | total=1, 168 | desc='Layers', 169 | # don't need rate here, let's simplify the bar 170 | bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt}' 171 | ) 172 | 173 | def _create_bar_per_layer(self, layer_id, total, initial): 174 | return utils.create_progress_bar( 175 | desc=layer_id, 176 | total=total, 177 | initial=initial, 178 | unit='B', 179 | unit_scale=True, 180 | # don't need estimates here, keep only rate 181 | bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{rate_fmt}]', 182 | ) 
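A minimal sketch of how push credentials are resolved before the registry login, based on the `get_credentials` behaviour exercised by the image utils tests above (the literal key values are placeholders):

    from shub.image.utils import get_credentials

    # --insecure skips registry authentication entirely.
    assert get_credentials(insecure=True) == (None, None)
    # An API key (explicit --apikey, or the target's apikey as a fallback) is
    # sent as the username with a single space as the password.
    assert get_credentials(apikey='SOMEAPIKEY') == ('SOMEAPIKEY', ' ')
    assert get_credentials(target_apikey='TARGETKEY') == ('TARGETKEY', ' ')
    # Username and password must always be provided together.
    assert get_credentials(username='user', password='pass') == ('user', 'pass')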
183 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | livehtml: 60 | sphinx-autobuild -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 61 | 62 | dirhtml: 63 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 64 | @echo 65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 66 | 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
71 | 72 | pickle: 73 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 74 | @echo 75 | @echo "Build finished; now you can process the pickle files." 76 | 77 | json: 78 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 79 | @echo 80 | @echo "Build finished; now you can process the JSON files." 81 | 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 87 | 88 | qthelp: 89 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 90 | @echo 91 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 92 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 93 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/shub.image.qhcp" 94 | @echo "To view the help file:" 95 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/shub.image.qhc" 96 | 97 | applehelp: 98 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 99 | @echo 100 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 101 | @echo "N.B. You won't be able to view it unless you put it in" \ 102 | "~/Library/Documentation/Help or install it in your application" \ 103 | "bundle." 104 | 105 | devhelp: 106 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 107 | @echo 108 | @echo "Build finished." 109 | @echo "To view the help file:" 110 | @echo "# mkdir -p $$HOME/.local/share/devhelp/shub.image" 111 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/shub.image" 112 | @echo "# devhelp" 113 | 114 | epub: 115 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 116 | @echo 117 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 118 | 119 | latex: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo 122 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 123 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 124 | "(use \`make latexpdf' here to do that automatically)." 125 | 126 | latexpdf: 127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 128 | @echo "Running LaTeX files through pdflatex..." 129 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 130 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 131 | 132 | latexpdfja: 133 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 134 | @echo "Running LaTeX files through platex and dvipdfmx..." 135 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 136 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 137 | 138 | text: 139 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 140 | @echo 141 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 142 | 143 | man: 144 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 145 | @echo 146 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 147 | 148 | texinfo: 149 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 150 | @echo 151 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 152 | @echo "Run \`make' in that directory to run these through makeinfo" \ 153 | "(use \`make info' here to do that automatically)." 154 | 155 | info: 156 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 157 | @echo "Running Texinfo files through makeinfo..." 158 | make -C $(BUILDDIR)/texinfo info 159 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 
160 | 161 | gettext: 162 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 163 | @echo 164 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 165 | 166 | changes: 167 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 168 | @echo 169 | @echo "The overview file is in $(BUILDDIR)/changes." 170 | 171 | linkcheck: 172 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 173 | @echo 174 | @echo "Link check complete; look for any errors in the above output " \ 175 | "or in $(BUILDDIR)/linkcheck/output.txt." 176 | 177 | doctest: 178 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 179 | @echo "Testing of doctests in the sources finished, look at the " \ 180 | "results in $(BUILDDIR)/doctest/output.txt." 181 | 182 | coverage: 183 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 184 | @echo "Testing of coverage in the sources finished, look at the " \ 185 | "results in $(BUILDDIR)/coverage/python.txt." 186 | 187 | xml: 188 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 189 | @echo 190 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 191 | 192 | pseudoxml: 193 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 194 | @echo 195 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 196 | --------------------------------------------------------------------------------