├── .github
│   └── workflows
│       └── build.yaml
├── Dockerfile
├── LICENSE.md
├── README.md
├── docker-compose.yml
└── scrapyd.conf

/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
name: Docker Build Images

on:
  workflow_dispatch:
  push:
    branches:
      - master
    paths-ignore:
      - README.md

env:
  PLATFORMS: 'linux/amd64,linux/arm64'
  IMAGE_NAME: 'easypi/scrapyd'
  SCRAPYD_VERSION: '1.5.0'
  SCRAPY_VERSION: '2.13.0'

jobs:
  release:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      - uses: docker/setup-qemu-action@v2

      - uses: docker/setup-buildx-action@v2

      - name: Login to DockerHub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - uses: docker/build-push-action@v4
        with:
          platforms: ${{ env.PLATFORMS }}
          build-args: |
            SCRAPYD_VERSION=${{ env.SCRAPYD_VERSION }}
            SCRAPY_VERSION=${{ env.SCRAPY_VERSION }}
          tags: |
            ${{ env.IMAGE_NAME }}:${{ env.SCRAPYD_VERSION }}-${{ env.SCRAPY_VERSION }}
            ${{ env.IMAGE_NAME }}:latest
            # ghcr.io/${{ env.IMAGE_NAME }}:${{ env.SCRAPYD_VERSION }}-${{ env.SCRAPY_VERSION }}
            # ghcr.io/${{ env.IMAGE_NAME }}:latest
          push: true

      - if: ${{ always() }}
        uses: slackapi/slack-github-action@v1.24.0
        with:
          channel-id: github
          payload: |
            {
              "attachments": [
                {
                  "color": "${{ job.status == 'success' && 'good' || 'danger' }}",
                  "title": "${{ github.repository }}",
                  "title_link": "https://github.com/${{ github.repository }}",
                  "text": "docker build image result",
                  "fields": [
                    {
                      "title": "Image",
                      "value": "",
                      "short": true
                    },
                    {
                      "title": "Status",
                      "value": "${{ job.status }}",
                      "short": true
                    }
                  ],
                  "footer": "GitHub Actions",
                  "footer_icon": "https://github.githubassets.com/favicon.ico"
                }
              ]
            }
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

--------------------------------------------------------------------------------
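For local experimentation, the multi-arch build that this workflow performs can be approximated with Docker Buildx. This is only a sketch and not part of the repository: the platform, build-arg, and tag values simply mirror the workflow's `env` block, and `--push` assumes you are logged in to a registry you control.

```bash
# Approximate the CI job locally (assumes Docker Buildx and QEMU binfmt emulation are available).
docker buildx create --use --name multiarch >/dev/null 2>&1 || docker buildx use multiarch
docker buildx build \
  --platform linux/amd64,linux/arm64 \
  --build-arg SCRAPYD_VERSION=1.5.0 \
  --build-arg SCRAPY_VERSION=2.13.0 \
  --tag easypi/scrapyd:1.5.0-2.13.0 \
  --tag easypi/scrapyd:latest \
  --push \
  .
```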
/Dockerfile:
--------------------------------------------------------------------------------
#
# Dockerfile for scrapyd
#

FROM debian:bookworm
LABEL maintainer="EasyPi Software Foundation"

ARG TARGETPLATFORM
ARG SCRAPY_VERSION=2.13.0
ARG SCRAPYD_VERSION=1.5.0
ARG SCRAPYD_CLIENT_VERSION=2.0.2
ARG SCRAPY_SPLASH_VERSION=0.11.1
ARG SCRAPYRT_VERSION=v0.16.0
ARG SPIDERMON_VERSION=1.24.0
ARG SCRAPY_POET_VERSION=0.26.0
ARG SCRAPY_PLAYWRIGHT_VERSION=v0.0.43

SHELL ["/bin/bash", "-c"]

RUN set -xe \
    && echo ${TARGETPLATFORM} \
    && apt-get update \
    && apt-get install -y autoconf \
                          build-essential \
                          curl \
                          git \
                          libffi-dev \
                          libssl-dev \
                          libtool \
                          libxml2 \
                          libxml2-dev \
                          libxslt1.1 \
                          libxslt1-dev \
                          python3 \
                          python3-cryptography \
                          python3-dev \
                          python3-distutils \
                          python3-pil \
                          python3-pip \
                          tini \
                          vim-tiny \
    && if [[ ${TARGETPLATFORM} = "linux/arm/v7" ]]; then apt install -y cargo; fi \
    && rm -f /usr/lib/python3.11/EXTERNALLY-MANAGED \
    && pip install --no-cache-dir boto3 dateparser ipython \
                   https://github.com/scrapy/scrapy/archive/refs/tags/$SCRAPY_VERSION.zip \
                   https://github.com/scrapy/scrapyd/archive/refs/tags/$SCRAPYD_VERSION.zip \
                   https://github.com/scrapy/scrapyd-client/archive/refs/tags/$SCRAPYD_CLIENT_VERSION.zip \
                   https://github.com/scrapy-plugins/scrapy-splash/archive/refs/tags/$SCRAPY_SPLASH_VERSION.zip \
                   https://github.com/scrapinghub/scrapyrt/archive/refs/tags/$SCRAPYRT_VERSION.zip \
                   https://github.com/scrapinghub/spidermon/archive/refs/tags/$SPIDERMON_VERSION.zip \
                   https://github.com/scrapinghub/scrapy-poet/archive/refs/tags/$SCRAPY_POET_VERSION.zip \
                   https://github.com/scrapy-plugins/scrapy-playwright/archive/refs/tags/$SCRAPY_PLAYWRIGHT_VERSION.zip \
    && mkdir -p /etc/bash_completion.d \
    && curl -sSL https://github.com/scrapy/scrapy/raw/master/extras/scrapy_bash_completion -o /etc/bash_completion.d/scrapy_bash_completion \
    && echo 'source /etc/bash_completion.d/scrapy_bash_completion' >> /root/.bashrc \
    && if [[ ${TARGETPLATFORM} = "linux/arm/v7" ]]; then apt purge -y --auto-remove cargo; fi \
    && apt-get purge -y --auto-remove autoconf \
                                      build-essential \
                                      curl \
                                      libffi-dev \
                                      libssl-dev \
                                      libtool \
                                      libxml2-dev \
                                      libxslt1-dev \
                                      python3-dev \
    && rm -rf /var/lib/apt/lists/*

COPY ./scrapyd.conf /etc/scrapyd/
VOLUME /etc/scrapyd/ /var/lib/scrapyd/
EXPOSE 6800

ENTRYPOINT ["tini", "--"]
CMD ["scrapyd", "--pidfile="]

--------------------------------------------------------------------------------
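To try the image straight from this Dockerfile, something along these lines should work. This is a sketch rather than part of the repository: the image name `my-scrapyd` and the `./data` host path are illustrative, while the port and volume values come from the `EXPOSE` and `VOLUME` instructions above.

```bash
# Build the image (build args default to the ARG values above) and run it in the foreground.
docker build -t my-scrapyd .
docker run --rm -p 6800:6800 -v "$PWD/data:/var/lib/scrapyd" my-scrapyd

# From another shell, confirm the daemon is up.
curl http://localhost:6800/daemonstatus.json
```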
/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 EasyPi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scrapyd
=======

[![](https://github.com/easypi/docker-scrapyd/actions/workflows/build.yaml/badge.svg)](https://github.com/EasyPi/docker-scrapyd)

[![](http://dockeri.co/image/easypi/scrapyd)](https://hub.docker.com/r/easypi/scrapyd)

[scrapy][1] is an open source and collaborative framework for extracting the
data you need from websites in a fast, simple, yet extensible way.

[scrapyd][2] is a service for running Scrapy spiders. It allows you to deploy
your Scrapy projects and control their spiders using an HTTP JSON API.

[scrapyd-client][3] is a client for scrapyd. It provides the scrapyd-deploy
utility, which allows you to deploy your project to a Scrapyd server.

[scrapy-splash][4] provides Scrapy+JavaScript integration using Splash.

[scrapyrt][5] allows you to easily add an HTTP API to your existing Scrapy project.

[spidermon][6] is a framework to build monitors for Scrapy spiders.

[scrapy-poet][7] is the web-poet Page Object pattern implementation for Scrapy.

[scrapy-playwright][8] is a Scrapy Download Handler which performs requests using Playwright for Python.

This image is based on `debian:bookworm`. The latest stable versions of the following eight Python packages are installed:

- scrapy==2.13.0
- scrapyd==1.5.0
- scrapyd-client==2.0.2
- scrapy-splash==0.11.1
- scrapyrt==0.16.0
- spidermon==1.24.0
- scrapy-poet==0.26.0
- scrapy-playwright==0.0.43

```bash
# fetch latest versions
echo "scrapy scrapyd scrapyd-client scrapy-splash scrapyrt spidermon scrapy-poet scrapy-playwright" |
xargs -n1 pip --disable-pip-version-check index versions 2>/dev/null |
grep -v Available
```

Please use this image as a base image for your own project.

:warning: Scrapy (since [2.0.0][9]) has dropped support for Python 2.7, which reached end-of-life on 2020-01-01.

## docker-compose.yml

```yaml
version: "3.8"

services:

  scrapyd:
    image: easypi/scrapyd
    ports:
      - "6800:6800"
    volumes:
      - ./data:/var/lib/scrapyd
      - /usr/local/lib/python3.11/dist-packages
    restart: unless-stopped

  scrapy:
    image: easypi/scrapyd
    command: bash
    volumes:
      - .:/code
    working_dir: /code
    restart: unless-stopped

  scrapyrt:
    image: easypi/scrapyd
    command: scrapyrt -i 0.0.0.0 -p 9080
    ports:
      - "9080:9080"
    volumes:
      - .:/code
    working_dir: /code
    restart: unless-stopped
```

## Run it as a background daemon for scrapyd

```bash
$ docker-compose up -d scrapyd
$ docker-compose logs -f scrapyd
$ docker cp scrapyd_scrapyd_1:/var/lib/scrapyd/items .
$ tree items
└── myproject
    └── myspider
        └── ad6153ee5b0711e68bc70242ac110005.jl
```
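Once the scrapyd service is up, its JSON API (the endpoints are listed in the `[services]` section of `scrapyd.conf` below) can be used to inspect and control it. The project, spider, and job values in this sketch are placeholders, not part of the repository:

```bash
# Check the daemon and list what it knows about (values below are placeholders).
curl http://localhost:6800/daemonstatus.json
curl http://localhost:6800/listprojects.json
curl "http://localhost:6800/listspiders.json?project=myproject"
curl "http://localhost:6800/listjobs.json?project=myproject"

# Cancel a running job using the id reported by listjobs.json.
curl http://localhost:6800/cancel.json -d project=myproject -d job=<job_id>
```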
```bash
$ mkvirtualenv -p python3 webbot
$ pip install scrapy scrapyd-client

$ scrapy startproject myproject
$ cd myproject
$ setvirtualenvproject

$ scrapy genspider myspider mydomain.com
$ scrapy edit myspider
$ scrapy list

$ vi scrapy.cfg
$ scrapyd-client deploy
$ curl http://localhost:6800/schedule.json -d project=myproject -d spider=myspider
$ curl http://localhost:6800/daemonstatus.json
$ firefox http://localhost:6800
```

File: scrapy.cfg

```ini
[settings]
default = myproject.settings

[deploy]
url = http://localhost:6800/
project = myproject
```

## Run it as an interactive shell for scrapy

```bash
$ cat > stackoverflow_spider.py << _EOF_
import scrapy

class StackOverflowSpider(scrapy.Spider):
    name = 'stackoverflow'
    start_urls = ['http://stackoverflow.com/questions?sort=votes']

    def parse(self, response):
        for href in response.css('.question-summary h3 a::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        yield {
            'title': response.css('h1 a::text').extract()[0],
            'votes': response.css('.question div[itemprop="upvoteCount"]::text').extract()[0],
            'body': response.css('.question .postcell').extract()[0],
            'tags': response.css('.question .post-tag::text').extract(),
            'link': response.url,
        }
_EOF_

$ docker-compose run --rm scrapy
>>> scrapy runspider stackoverflow_spider.py -o top-stackoverflow-questions.jl
>>> cat top-stackoverflow-questions.jl
>>> exit
```

## Run it as a realtime crawler for scrapyrt

```bash
$ git clone https://github.com/scrapy/quotesbot.git .
$ docker-compose up -d scrapyrt
$ curl -s 'http://localhost:9080/crawl.json?spider_name=toscrape-css&callback=parse&url=http://quotes.toscrape.com/&max_requests=5' | jq -c '.items[]'
```
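The cloned quotesbot project can also be deployed to the scrapyd service from the first section instead of being crawled through scrapyrt. This is a sketch, not part of the repository; it assumes scrapyd-client is installed on the host and that the `[deploy]` section of quotesbot's `scrapy.cfg` points at `http://localhost:6800/` (the project and spider names assume the quotesbot defaults):

```bash
# Deploy the project to scrapyd and schedule one of its spiders.
$ scrapyd-client deploy
$ curl http://localhost:6800/schedule.json -d project=quotesbot -d spider=toscrape-css
$ curl "http://localhost:6800/listjobs.json?project=quotesbot"
```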
[1]: https://github.com/scrapy/scrapy
[2]: https://github.com/scrapy/scrapyd
[3]: https://github.com/scrapy/scrapyd-client
[4]: https://github.com/scrapy-plugins/scrapy-splash
[5]: https://github.com/scrapinghub/scrapyrt
[6]: https://github.com/scrapinghub/spidermon
[7]: https://github.com/scrapinghub/scrapy-poet
[8]: https://github.com/scrapy-plugins/scrapy-playwright
[9]: https://docs.scrapy.org/en/latest/news.html#scrapy-2-0-0-2020-03-03

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: "3.8"
services:
  scrapyd:
    image: easypi/scrapyd
    ports:
      - "6800:6800"
    volumes:
      - ./data:/var/lib/scrapyd
      - /usr/local/lib/python3.11/dist-packages
    restart: unless-stopped

--------------------------------------------------------------------------------
/scrapyd.conf:
--------------------------------------------------------------------------------
#
# Doc: https://scrapyd.readthedocs.io/en/latest/config.html
#

[scrapyd]
eggs_dir = /var/lib/scrapyd/eggs
logs_dir = /var/lib/scrapyd/logs
items_dir = /var/lib/scrapyd/items
dbs_dir = /var/lib/scrapyd/dbs
jobs_to_keep = 5
max_proc = 0
max_proc_per_cpu = 4
finished_to_keep = 100
poll_interval = 5
bind_address = 0.0.0.0
http_port = 6800
username =
password =
prefix_header = x-forwarded-prefix
debug = off
runner = scrapyd.runner
application = scrapyd.app.application
launcher = scrapyd.launcher.Launcher
spiderqueue = scrapyd.spiderqueue.SqliteSpiderQueue
webroot = scrapyd.website.Root
eggstorage = scrapyd.eggstorage.FilesystemEggStorage

[services]
schedule.json = scrapyd.webservice.Schedule
cancel.json = scrapyd.webservice.Cancel
addversion.json = scrapyd.webservice.AddVersion
listprojects.json = scrapyd.webservice.ListProjects
listversions.json = scrapyd.webservice.ListVersions
listspiders.json = scrapyd.webservice.ListSpiders
delproject.json = scrapyd.webservice.DeleteProject
delversion.json = scrapyd.webservice.DeleteVersion
listjobs.json = scrapyd.webservice.ListJobs
daemonstatus.json = scrapyd.webservice.DaemonStatus

--------------------------------------------------------------------------------
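The `username` and `password` options in the `[scrapyd]` section above are left empty, so the web UI and JSON API are unauthenticated. If you set them, every request needs HTTP basic auth; a sketch with purely illustrative credentials:

```bash
# With username/password set in scrapyd.conf, pass HTTP basic auth on every API call.
curl -u myuser:mypassword http://localhost:6800/daemonstatus.json
curl -u myuser:mypassword http://localhost:6800/schedule.json -d project=myproject -d spider=myspider
```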