├── .gitignore ├── LICENSE ├── README.md ├── deployment.md ├── local_output └── readme.md ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg ├── setup.py └── tutorial ├── __init__.py ├── items.py ├── middlewares.py ├── models.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py ├── quotes_spider.py ├── quotes_spider_v1.py └── quotes_spider_v2.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .DS_Store 107 | 108 | /local_output/*.html 109 | /local_output/*.json 110 | 111 | # sqlite 112 | *.db 113 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Harry Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy Tutorial 2 | 3 | This repo contains the code for my tutorial: A Minimalist End-to-End Scrapy Tutorial (https://medium.com/p/11e350bcdec0). 4 | 5 | The website to crawl is [http://quotes.toscrape.com](http://quotes.toscrape.com). 6 | 7 | ## Setup 8 | Tested with Python 3.6 via virtual environment: 9 | ```shell 10 | $ python3.6 -m venv venv 11 | $ source venv/bin/activate 12 | $ pip install -r requirements.txt 13 | ``` 14 | 15 | ## Run 16 | 17 | Run `scrapy crawl quotes` at the project top level. 18 | 19 | Note that spider name is defined in the spider class, e.g., `quotes_spider.py`: 20 | ```python 21 | class QuotesSpider(scrapy.Spider): 22 | name = "quotes" 23 | ``` 24 | 25 | ## Deployment 26 | 27 | See deployment.md 28 | 29 | ## Versions 30 | 31 | I keep different versions for learning purposes using git tags: 32 | 33 | ### Version 1 (tag v1.0) 34 | 35 | Key Concepts: basic spider setup, project folder structure, saving files as json and html files, using Scrap shell,Following links, etc. 36 | 37 | Local outputs (json and html pages) are stored in "local-output" folder, which is ignored in .gitignore. 38 | 39 | For example: 40 | 41 | scrapy crawl quotes saves a set of html pages to /local_output 42 | scrapy crawl quotes -o ./local_output/quotes.json saves the output to a json file 43 | 44 | 45 | 46 | To create the initial project folder, run `scrapy startproject tutorial` (only need to do this once) I removed the top level `tutorial` folder and add additional files and folders as shown below: 47 | 48 | ``` 49 | tutorial/ 50 | scrapy.cfg # deploy configuration file 51 | 52 | 53 | tutorial/ # project's Python module, you'll import your code from here 54 | __init__.py 55 | 56 | items.py # project items definition file 57 | 58 | middlewares.py # project middlewares file 59 | 60 | pipelines.py # project pipelines file 61 | 62 | settings.py # project settings file 63 | 64 | spiders/ # a directory where you'll later put your spiders 65 | __init__.py 66 | ``` 67 | `self.log('Saved file %s' % filename)` outputs to the log console. `yield` also outputs the DEBUG info in the console, e.g.: 68 | 69 | Screen Shot 2019-08-13 at 3 30 44 PM 70 | 71 | 72 | ### Version 2 (tag v2.0) 73 | 74 | The major change is to use Items. 75 | 76 | Why use Items? 77 | 78 | - clearly specify the structured data to be collected - a central place to look 79 | - leverage pre and post processors for Items via ItemLoaders (you can also define additional custom processors) 80 | - Use item pipelines to save data to databases (Version 3) 81 | - Better code organization - you know where to look for certain processing code 82 | 83 | ### Version 3 (tag v3.0) 84 | 85 | - Add database support via SQLAlchemy and use Item pipeline to save items into database (sqlite and mysql) 86 | - Add instructions on deploying ScrapingHub.com 87 | 88 | Three tables: Authors, Quotes, Tags. 
89 | 90 | - One-to-Many between Authors and Quotes 91 | - Many-to-Many between Tags and Quotes 92 | - Many-to-Many between Tags and Authors 93 | 94 | Database schema is defined in `/tutorial/models.py` file and connection string is specified in `/tutorial/settings.py`. 95 | Add a pipleline file and enable the pipeline in `/tutorial/settings.py` (The number 0-1000 specifies the execution order of the pipelines). 96 | 97 | ``` 98 | ITEM_PIPELINES = { 99 | 'tutorial.pipelines.SaveQuotesPipeline': 300, 100 | } 101 | ``` 102 | 103 | Use the following commands to check local SQLite database. https://sqlitebrowser.org can be used as a GUI tool. 104 | 105 | ``` 106 | $ man sqlite3 107 | $ sqlite3 scrapy_quotes.db 108 | sqlite> .tables 109 | sqlite> .schema quote 110 | sqlite> .quit 111 | ``` 112 | ### Test SQLAlchemy in Shell 113 | 114 | Once you setup models and pipelines, you can run `scrapy shell` to test the database part. Just paste the code block below and open sqlite database to check the results. 115 | 116 | ```python 117 | from sqlalchemy.orm import sessionmaker 118 | from tutorial.models import Quote, Author, Tag, db_connect, create_table 119 | engine = db_connect() 120 | create_table(engine) 121 | Session = sessionmaker(bind=engine) 122 | session = Session() 123 | 124 | quote1 = Quote() 125 | author1 = Author() 126 | author1.name = "Linus Torvalds" 127 | author1.bio = "Linus Torvalds is the creator the Linux kernel and Git." 128 | quote1.quote_content = "Talk is cheap. Show me the code." 129 | quote1.author = author1 130 | tag1 = Tag(name="linux") 131 | tag2 = Tag(name="git") 132 | tag3 = Tag(name="simple") 133 | quote1.tags.append(tag1) 134 | quote1.tags.append(tag2) 135 | quote1.tags.append(tag3) 136 | 137 | try: 138 | session.add(author1) 139 | session.add(quote1) 140 | session.commit() 141 | except: 142 | session.rollback() 143 | raise 144 | 145 | quote2 = Quote() 146 | author2 = Author() 147 | author2.name = "Steven Jobs" 148 | author2.bio = "Steven Jobs was the chairman, chief executive officer, and co-founder of Apple Inc." 149 | quote2.quote_content = "Stay Hungry Stay Foolish." 150 | quote2.author = author2 151 | tag4 = Tag(name="inspiring") 152 | tag5 = Tag(name="simple") # this already exists in the database 153 | 154 | # See difference between filter and filter_by at https://bit.ly/2TLvqeV 155 | 156 | # exist_tag = session.query(Tag).filter(Tag.name == tag5.name).first() 157 | exist_tag = session.query(Tag).filter_by(name = tag5.name).first() 158 | if exist_tag is not None: # the current tag exists 159 | tag5 = exist_tag 160 | 161 | quote2.tags.append(tag4) 162 | quote2.tags.append(tag5) 163 | 164 | try: 165 | 166 | session.add(author2) 167 | session.add(quote2) 168 | session.commit() 169 | except: 170 | session.rollback() 171 | raise 172 | finally: 173 | session.close() 174 | ``` 175 | ### MySQL 176 | 177 | - Install MySQL locally: ``$brew install mysql`, which installs MySQL without password. To start MySQL: `mysql.server start` and then connect: `mysql -u root`. 178 | 179 | - Create a local database and related user: `CREATE SCHEMA scrapy_quotes DEFAULT CHARACTER SET utf8mb4 ;` 180 | 181 | - `mysqlclient` package is required. 
182 | 183 | - Comment out MySQL connection string in `settings.py` to use MySQL to store items: 184 | 185 | ```python3 186 | # SQLite 187 | # CONNECTION_STRING = 'sqlite:///scrapy_quotes.db' 188 | 189 | # MySQL 190 | CONNECTION_STRING = "{drivername}://{user}:{passwd}@{host}:{port}/{db_name}?charset=utf8".format( 191 | drivername="mysql", 192 | user="harrywang", 193 | passwd="tutorial", 194 | host="localhost", 195 | port="3306", 196 | db_name="scrapy_quotes", 197 | ) 198 | ``` 199 | 200 | ### Version 4 (tag v4.0) 201 | Deployment to Scrapinghub and ScrapydWeb. See [deployment.md](deployment.md) for details. 202 | 203 | 204 | ## Other Notes 205 | 206 | ### Scrapy Shell 207 | 208 | 209 | Enter shell: `scrapy shell 'http://quotes.toscrape.com/page/1/'` 210 | 211 | Extract data examples (css and xpath): 212 | 213 | CSS: 214 | ```bash 215 | >>> response.css('title').getall() 216 | ['Quotes to Scrape'] 217 | >>> response.css('title::text').get() 218 | 'Quotes to Scrape' 219 | >>> response.css('title::text')[0].get() 220 | 'Quotes to Scrape' 221 | >>> response.css('title::text').re(r'Quotes.*') 222 | ['Quotes to Scrape'] 223 | >>> response.css('title::text').re(r'Q\w+') 224 | ['Quotes'] 225 | >>> response.css('title::text').re(r'(\w+) to (\w+)') 226 | ['Quotes', 'Scrape'] 227 | ``` 228 | XPath: 229 | 230 | ```bash 231 | >>> response.xpath('//title') 232 | [] 233 | >>> response.xpath('//title/text()').get() 234 | 'Quotes to Scrape' 235 | ``` 236 | 237 | View page in browser from shell: `>>> view(response)` 238 | 239 | ### Extracting quotes and authors 240 | 241 | HTML to parse: 242 | 243 | ```html 244 |
<div class="quote"> 245 |     <span class="text">“The world as we have created it is a process of our 246 |     thinking. It cannot be changed without changing our thinking.”</span> 247 |     <span> 248 |         by <small class="author">Albert Einstein</small> 249 |         <a href="/author/Albert-Einstein">(about)</a> 250 |     </span> 251 |     <div class="tags">
252 |         Tags: 253 |         <a class="tag" href="/tag/change/page/1/">change</a> 254 |         <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a> 255 |         <a class="tag" href="/tag/thinking/page/1/">thinking</a> 256 |         <a class="tag" href="/tag/world/page/1/">world</a> 257 |     </div>
258 | </div>
259 | ``` 260 | 261 | Parse and output to log: 262 | 263 | ```python 264 | import scrapy 265 | 266 | 267 | class QuotesSpider(scrapy.Spider): 268 | name = "quotes" 269 | start_urls = [ 270 | 'http://quotes.toscrape.com/page/1/', 271 | 'http://quotes.toscrape.com/page/2/', 272 | ] 273 | 274 | def parse(self, response): 275 | for quote in response.css('div.quote'): 276 | yield { 277 | 'text': quote.css('span.text::text').get(), 278 | 'author': quote.css('small.author::text').get(), 279 | 'tags': quote.css('div.tags a.tag::text').getall(), 280 | } 281 | ``` 282 | Save the output above to json: `scrapy crawl quotes -o ./local_output/quotes.json` - Note: **this command appends to existing json instead of overwriting it**. 283 | 284 | ### Following links 285 | 286 | Next link html on the page: 287 | 288 | ```html 289 | 294 | ``` 295 | Extract it via shell: 296 | 297 | ```bash 298 | >>> response.css('li.next a::attr(href)').get() 299 | '/page/2/' 300 | >>> response.css('li.next a').attrib['href'] 301 | '/page/2' 302 | ``` 303 | Follow links: 304 | 305 | ```python 306 | for a in response.css('li.next a'): 307 | yield response.follow(a, callback=self.parse) 308 | ``` 309 | 310 | ### Using spider arguments 311 | See https://docs.scrapy.org/en/latest/topics/spiders.html#spiderargs 312 | -------------------------------------------------------------------------------- /deployment.md: -------------------------------------------------------------------------------- 1 | # Deployment Instructions 2 | Check the following sections for deployment instructions for Scrapinghub and Scrapydweb. 3 | 4 | ## Scrapinghub Deployment 5 | 6 | Create an free account and create a project: 7 | ![Screen Shot 2019-08-19 at 11 27 48 AM](https://user-images.githubusercontent.com/595772/63278299-05749780-c275-11e9-9c3f-f750ae5bc6e1.png) 8 | 9 | We will use the `shub` command line to deploy. You can find your API key and deploy number once in your project Code & Deploys page: 10 | ![Screen Shot 2019-08-19 at 11 33 05 AM](https://user-images.githubusercontent.com/595772/63278652-aebb8d80-c275-11e9-8fb1-1945888d6a53.png) 11 | 12 | Go back to the root of Scrapy-tutorial (the root of the Scrapy project) and use the following command to deploy your project to Scrapyinghub. 13 | 14 | ```bash 15 | 16 | (venv) dami:scrapy-tutorial harrywang$ shub login 17 | Enter your API key from https://app.scrapinghub.com/account/apikey 18 | API key: xxxxx 19 | Validating API key... 20 | API key is OK, you are logged in now. 21 | (venv) dami:scrapy-tutorial harrywang$ shub deploy 404937 22 | Messagepack is not available, please ensure that msgpack-python library is properly installed. 23 | Saving project 404937 as default target. You can deploy to it via 'shub deploy' from now on 24 | Saved to /Users/harrywang/xxx/scrapy-tutorial/scrapinghub.yml. 25 | Packing version b6ac860-master 26 | Created setup.py at /Users/harrywang/xxx/scrapy-tutorial 27 | Deploying to Scrapy Cloud project "404937" 28 | {"status": "ok", "project": 4xxx, "version": "b6ac860-master", "spiders": 3} 29 | Run your spiders at: https://app.scrapinghub.com/p/404937/ 30 | ``` 31 | Scrapinghub configuration file is created `scrapinghub.yml` and you need to edit it to specify: 32 | 33 | - scrapy 1.7 running Python 3 34 | - requirements files for other packages 35 | 36 | ```yml 37 | project: 404937 38 | 39 | stacks: 40 | default: scrapy:1.7-py3 41 | 42 | requirements: 43 | file: requirements.txt 44 | ``` 45 | 46 | run `$ shub deploy` to deploy again. 
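Once the project is deployed, you can also trigger runs and fetch scraped items from code rather than the web UI. The snippet below is a minimal sketch using the `python-scrapinghub` client (`pip install scrapinghub` assumed); the API key is a placeholder and the project ID is the one from `scrapinghub.yml`, so substitute your own values.

```python
# Minimal sketch using the python-scrapinghub client; the API key is a placeholder.
from scrapinghub import ScrapinghubClient

client = ScrapinghubClient('YOUR_API_KEY')
project = client.get_project(404937)  # project ID from scrapinghub.yml

# schedule a run of the "quotes" spider
job = project.jobs.run('quotes')
print(job.key)

# once the job has finished, iterate over the scraped items
for item in job.items.iter():
    print(item.get('quote_content'))
```
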
47 | 48 | We have three spiders in the project: 49 | - quotes_spider.py is the main spider 50 | - quotes_spider_v1.py is the version 1 of the spider that writes to files, etc. 51 | - authors_spider.py is the spider to get author page from the official tutorial 52 | 53 | You can see your current deployment on scrapinghub.com: 54 | ![Screen Shot 2019-08-19 at 11 44 31 AM](https://user-images.githubusercontent.com/595772/63279289-bd567480-c276-11e9-8b0d-f24607517652.png) 55 | 56 | Then, you can run your spider: 57 | 58 | ![Screen Shot 2019-08-19 at 12 47 48 PM](https://user-images.githubusercontent.com/595772/63287962-8ccc0600-c289-11e9-9a50-159ccbfb16fe.png) 59 | 60 | ![Screen Shot 2019-08-19 at 12 48 51 PM](https://user-images.githubusercontent.com/595772/63287944-85a4f800-c289-11e9-9edb-f2e32f8b3a35.png) 61 | 62 | Once the job is complete, you can check the results and download the items: 63 | ![Screen Shot 2019-08-19 at 1 57 49 PM](https://user-images.githubusercontent.com/595772/63287923-76be4580-c289-11e9-8269-85f156a19a02.png) 64 | 65 | ![Screen Shot 2019-08-19 at 1 58 22 PM](https://user-images.githubusercontent.com/595772/63288027-b127e280-c289-11e9-858b-7f03b37f721f.png) 66 | 67 | You can schedule periodic jobs if you upgrade your free plan. 68 | 69 | ## Scrapydweb Deployment 70 | 71 | I found this repo https://github.com/my8100/scrapydweb and follow https://github.com/my8100/scrapyd-cluster-on-heroku to setup the server. 72 | 73 | We need a custom deployment because our scrapy project has specific package requirements, e.g., SQLAlchemy, MySQL, etc. if no special package is needed, you can follow the easy setup below. 74 | 75 | ### Custom Setup 76 | 77 | #### Setup repo and Heroku account 78 | fork a copy of https://github.com/my8100/scrapyd-cluster-on-heroku to your account, e.g., https://github.com/harrywang/scrapyd-cluster-on-heroku 79 | 80 | create a free account at heroku.com and install Heroku CLI: `brew tap heroku/brew && brew install heroku` 81 | 82 | clone the repo: 83 | 84 | ```bash 85 | git clone https://github.com/harrywang/scrapyd-cluster-on-heroku 86 | cd scrapyd-cluster-on-heroku/ 87 | ``` 88 | login to Heroku 89 | 90 | ``` 91 | scrapyd-cluster-on-heroku harrywang$ heroku login 92 | heroku: Press any key to open up the browser to login or q to exit: 93 | Opening browser to https://cli-auth.heroku.com/auth/browser/3ba7221b-9c2a-4355-ab3b-d2csda 94 | Logging in... done 95 | Logged in as xxx@gmail.com 96 | ``` 97 | 98 | #### Set up Scrapyd server/app 99 | 100 | In this step, you should update the `runtime.txt` to specify the Python version and `requirements.txt` to include all packages your spider needs. 101 | 102 | After changes, `runtime.txt` is: 103 | ``` 104 | python-3.6 105 | ``` 106 | `requirements.txt` is: 107 | ``` 108 | pip>=19.1 109 | #Twisted==18.9.0 110 | scrapy 111 | scrapyd>=1.2.1 112 | scrapy-redis 113 | logparser>=0.8.2 114 | 115 | mysqlclient>=1.4.4 116 | SQLAlchemy>=1.3.6 117 | ``` 118 | 119 | Setup the repo and commit the changes we just made: 120 | 121 | ```bash 122 | cd scrapyd 123 | git init 124 | git status 125 | git add . 
126 | git commit -a -m "first commit" 127 | git status 128 | ``` 129 | 130 | Deploy Scrapyd app 131 | 132 | ```bash 133 | heroku apps:create scrapy-server1 134 | heroku git:remote -a scrapy-server1 135 | git remote -v 136 | git push heroku master 137 | heroku logs --tail 138 | # Press ctrl+c to stop logs outputting 139 | # Visit https://svr-1.herokuapp.com 140 | ``` 141 | Add environment variables 142 | 143 | Timezone 144 | 145 | ``` 146 | # python -c "import tzlocal; print(tzlocal.get_localzone())" 147 | heroku config:set TZ=US/Eastern 148 | # heroku config:get TZ 149 | ``` 150 | Redis (optional - not in this tutorial) 151 | Redis account (optional, see settings.py in the scrapy_redis_demo_project.zip) 152 | ``` 153 | heroku config:set REDIS_HOST=your-redis-host 154 | heroku config:set REDIS_PORT=your-redis-port 155 | heroku config:set REDIS_PASSWORD=your-redis-password 156 | ``` 157 | Repeat this step if multiple scrapyd server is needed. 158 | 159 | #### Setup ScrapydWeb server/app 160 | 161 | go to scrapydweb subfolder and update `runtime.txt`, `requirements.txt`, and `scrapydweb_settings_v10.py` if needed. 162 | 163 | Let's enable authentication, edit the following section of `scrapydweb_settings_v10.py`: 164 | 165 | ``` 166 | # The default is False, set it to True to enable basic auth for the web UI. 167 | ENABLE_AUTH = True 168 | if os.environ.get('ENABLE_AUTH', 'False') == 'True': 169 | ENABLE_AUTH = True 170 | # In order to enable basic auth, both USERNAME and PASSWORD should be non-empty strings. 171 | USERNAME = 'admin' 172 | PASSWORD = 'scrapydweb' 173 | USERNAME = os.environ.get('USERNAME', 'admin') 174 | PASSWORD = os.environ.get('PASSWORD', 'scrapydweb') 175 | ``` 176 | 177 | Otherwise, proceed as follows: 178 | 179 | ``` 180 | cd .. 181 | cd scrapydweb 182 | git init 183 | git status 184 | git add . 185 | git commit -a -m "first commit" 186 | git status 187 | ``` 188 | 189 | Deploy ScrapydWeb app 190 | ```bash 191 | heroku apps:create scrapyd-web 192 | heroku git:remote -a scrapyd-web 193 | git remote -v 194 | git push heroku master 195 | ``` 196 | 197 | Add environment variables 198 | 199 | Timezone 200 | ``` 201 | heroku config:set TZ=US/Eastern 202 | ``` 203 | 204 | Scrapyd servers - you have to use the scrapyd server address you just setup above (see scrapydweb_settings_vN.py in the scrapydweb directory) 205 | 206 | ``` 207 | heroku config:set SCRAPYD_SERVER_1=scrapy-server1.herokuapp.com:80 208 | # heroku config:set SCRAPYD_SERVER_2=svr-2.herokuapp.com:80#group1 209 | # heroku config:set SCRAPYD_SERVER_3=svr-3.herokuapp.com:80#group1 210 | # heroku config:set SCRAPYD_SERVER_4=svr-4.herokuapp.com:80#group2 211 | ``` 212 | 213 | #### Deploy the scrapy project 214 | 215 | We need to package the project and upload to the server. 
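Before packaging, it can help to confirm that the Scrapyd app is reachable through its JSON API. The sketch below is an illustration only: it assumes the `requests` package, the `scrapy-server1` Heroku app name created above, and the admin credentials used in this setup, so adjust them to match your own deployment.

```python
# Sanity checks against the Scrapyd JSON API; the hostname and credentials are
# assumptions based on the Heroku app and auth settings used in this tutorial.
import requests

SCRAPYD = 'http://scrapy-server1.herokuapp.com'
AUTH = ('admin', 'scrapydweb')  # only needed if basic auth is enabled on the Scrapyd app

# is the Scrapyd service up?
print(requests.get(f'{SCRAPYD}/daemonstatus.json', auth=AUTH).json())

# after `scrapyd-deploy` (next step), the project and its spiders should be listed
print(requests.get(f'{SCRAPYD}/listprojects.json', auth=AUTH).json())

# schedule a crawl of the "quotes" spider through the API
resp = requests.post(f'{SCRAPYD}/schedule.json',
                     data={'project': 'scrapy-tutorial', 'spider': 'quotes'},
                     auth=AUTH)
print(resp.json())  # e.g. {"status": "ok", "jobid": "..."}
```
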
216 | 217 | First, install scrapyd-client using `pip install git+https://github.com/scrapy/scrapyd-client` (note: pip does not work as of writing this document see: https://stackoverflow.com/questions/45750739/scrapyd-client-command-not-found) 218 | 219 | change the deploy setting in scrapy.cfg: 220 | ``` 221 | [deploy] 222 | url = http://scrapyd-server1.herokuapp.com 223 | username = admin 224 | password = scrapydweb 225 | project = scrapy-tutorial 226 | ``` 227 | Then, use `scrapyd-deploy` to package and deploy to scrapyd server: 228 | 229 | ``` 230 | (venv) dami:scrapy-tutorial harrywang$ scrapyd-deploy 231 | /Users/harrywang/sandbox/scrapy-tutorial/venv/lib/python3.6/site-packages/scrapyd_client/deploy.py:23: ScrapyDeprecationWarning: Module `scrapy.utils.http` is deprecated, Please import from `w3lib.http` instead. 232 | from scrapy.utils.http import basic_auth_header 233 | Packing version 1566253506 234 | Deploying to project "scrapy-tutorial" in http://scrapyd-server1.herokuapp.com/addversion.json 235 | Server response (200): 236 | {"node_name": "9177f699-b645-4656-82d1-beef2898fdc1", "status": "ok", "project": "scrapy-tutorial", "version": "1566253506", "spiders": 3} 237 | ``` 238 | go to https://srapyd-web.herokuapp.com, you should see your project deployed: 239 | ![Screen Shot 2019-08-19 at 6 27 32 PM](https://user-images.githubusercontent.com/595772/63303881-2063fd80-c2af-11e9-8ba7-216778176e31.png) 240 | 241 | go to the following page to run the spider: 242 | 243 | ![Screen Shot 2019-08-19 at 8 56 23 PM](https://user-images.githubusercontent.com/595772/63309234-0c76c680-c2c4-11e9-98b9-1ea499bbf61a.png) 244 | 245 | Once the spider finishes, you can check the items in Files menu. 246 | 247 | You can specify Timer Tasks. The following shows a task that runs every 10 minutes. This part is based on APScheduler, see [document](https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html#expression-types) to figure out how to set the values (this could be confusing.) 
248 | ![Screen Shot 2019-08-19 at 10 28 04 PM](https://user-images.githubusercontent.com/595772/63313103-6467fa00-c2d1-11e9-9830-744272ef0c2c.png) 249 | 250 | 251 | ### Easy Setup 252 | - create a free account at heroku.com and login 253 | - go to https://github.com/my8100/scrapyd-cluster-on-heroku-scrapyd-app and click "Deploy to Heroku" button to setup a scrayd server app (scrapyd-server1.herokuapp.com): 254 | ![Screen Shot 2019-08-19 at 5 13 53 PM](https://user-images.githubusercontent.com/595772/63300193-a4b18300-c2a5-11e9-9453-3f42c5004ba9.png) 255 | 256 | Use the following settings (No redis setting) and the app is at scrapyd-server1.herokuapp.com 257 | ![Screen Shot 2019-08-19 at 5 19 26 PM](https://user-images.githubusercontent.com/595772/63300172-9499a380-c2a5-11e9-89d8-29eee56e8a3f.png) 258 | 259 | - go to https://github.com/my8100/scrapyd-cluster-on-heroku-scrapydweb-app-git and click "Deploy to Heroku" button to setup a scrapydweb server app: 260 | ![Screen Shot 2019-08-19 at 5 22 46 PM](https://user-images.githubusercontent.com/595772/63301043-9d8b7480-c2a7-11e9-8d9b-f06e7127f7ba.png) 261 | 262 | Use the following settings (No redis setting) and the app is at scrapyd-server1.herokuapp.com 263 | ![Screen Shot 2019-08-19 at 5 31 15 PM](https://user-images.githubusercontent.com/595772/63301003-877db400-c2a7-11e9-87dd-5b7175729d90.png) 264 | 265 | - go to https://srapyd-web.herokuapp.com and login, you can see that one scrapyd server is ready: 266 | 267 | ![Screen Shot 2019-08-19 at 5 37 25 PM](https://user-images.githubusercontent.com/595772/63301257-17bbf900-c2a8-11e9-8b01-307487310163.png) 268 | 269 | We need to package the project and upload to the server. 270 | 271 | First, install scrapyd-client using `pip install git+https://github.com/scrapy/scrapyd-client` (note: pip does not work as of writing this document see: https://stackoverflow.com/questions/45750739/scrapyd-client-command-not-found) 272 | 273 | change the deploy setting in scrapy.cfg: 274 | ``` 275 | [deploy] 276 | url = http://scrapyd-server1.herokuapp.com 277 | username = admin 278 | password = scrapydweb 279 | project = scrapy-tutorial 280 | ``` 281 | Then, use `scrapyd-deploy` to package and deploy to scrapyd server: 282 | 283 | ``` 284 | (venv) dami:scrapy-tutorial harrywang$ scrapyd-deploy 285 | /Users/harrywang/sandbox/scrapy-tutorial/venv/lib/python3.6/site-packages/scrapyd_client/deploy.py:23: ScrapyDeprecationWarning: Module `scrapy.utils.http` is deprecated, Please import from `w3lib.http` instead. 
286 | from scrapy.utils.http import basic_auth_header 287 | Packing version 1566253506 288 | Deploying to project "scrapy-tutorial" in http://scrapyd-server1.herokuapp.com/addversion.json 289 | Server response (200): 290 | {"node_name": "9177f699-b645-4656-82d1-beef2898fdc1", "status": "ok", "project": "scrapy-tutorial", "version": "1566253506", "spiders": 3} 291 | ``` 292 | go to https://srapyd-web.herokuapp.com, you should see your project deployed: 293 | ![Screen Shot 2019-08-19 at 6 27 32 PM](https://user-images.githubusercontent.com/595772/63303881-2063fd80-c2af-11e9-8ba7-216778176e31.png) 294 | -------------------------------------------------------------------------------- /local_output/readme.md: -------------------------------------------------------------------------------- 1 | This folder is ignored and stores local outputs (data saved to local such as json and html) 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mysqlclient>=1.4.4 2 | Scrapy>=1.7.3 3 | shub>=2.9.0 4 | SQLAlchemy>=1.3.6 5 | -------------------------------------------------------------------------------- /scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: 404937 2 | 3 | stacks: 4 | default: scrapy:1.7-py3 5 | requirements: 6 | file: requirements.txt 7 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | url = http://scrapy-server1.herokuapp.com 11 | username = admin 12 | password = scrapydweb 13 | project = scrapy-tutorial 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = tutorial.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/scrapy-tutorial/1dde391b856491202eded192cd0384be38f01b43/tutorial/__init__.py -------------------------------------------------------------------------------- /tutorial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item, Field 9 | from scrapy.loader.processors import MapCompose, TakeFirst 10 | from datetime import datetime 11 | 12 | 13 | def remove_quotes(text): 14 | # strip the unicode quotes 15 | text = text.strip(u'\u201c'u'\u201d') 16 | return text 17 | 18 | 19 | def convert_date(text): 20 | # convert string March 14, 1879 to Python date 21 | return datetime.strptime(text, '%B %d, %Y') 22 | 23 | 24 | def 
parse_location(text): 25 | # parse location "in Ulm, Germany" 26 | # this simply remove "in ", you can further parse city, state, country, etc. 27 | return text[3:] 28 | 29 | 30 | class QuoteItem(Item): 31 | quote_content = Field( 32 | input_processor=MapCompose(remove_quotes), 33 | # TakeFirst return the first value not the whole list 34 | output_processor=TakeFirst() 35 | ) 36 | author_name = Field( 37 | input_processor=MapCompose(str.strip), 38 | output_processor=TakeFirst() 39 | ) 40 | author_birthday = Field( 41 | input_processor=MapCompose(convert_date), 42 | output_processor=TakeFirst() 43 | ) 44 | author_bornlocation = Field( 45 | input_processor=MapCompose(parse_location), 46 | output_processor=TakeFirst() 47 | ) 48 | author_bio = Field( 49 | input_processor=MapCompose(str.strip), 50 | output_processor=TakeFirst() 51 | ) 52 | tags = Field() 53 | -------------------------------------------------------------------------------- /tutorial/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TutorialSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TutorialDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /tutorial/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData 2 | from sqlalchemy.orm import relationship 3 | from sqlalchemy.ext.declarative import declarative_base 4 | from sqlalchemy import ( 5 | Integer, String, Date, DateTime, Float, Boolean, Text) 6 | from scrapy.utils.project import get_project_settings 7 | 8 | Base = declarative_base() 9 | 10 | 11 | def db_connect(): 12 | """ 13 | Performs database connection using database settings from settings.py. 
14 | Returns sqlalchemy engine instance 15 | """ 16 | return create_engine(get_project_settings().get("CONNECTION_STRING")) 17 | 18 | 19 | def create_table(engine): 20 | Base.metadata.create_all(engine) 21 | 22 | 23 | # Association Table for Many-to-Many relationship between Quote and Tag 24 | # https://docs.sqlalchemy.org/en/13/orm/basic_relationships.html#many-to-many 25 | quote_tag = Table('quote_tag', Base.metadata, 26 | Column('quote_id', Integer, ForeignKey('quote.id')), 27 | Column('tag_id', Integer, ForeignKey('tag.id')) 28 | ) 29 | 30 | 31 | class Quote(Base): 32 | __tablename__ = "quote" 33 | 34 | id = Column(Integer, primary_key=True) 35 | quote_content = Column('quote_content', Text()) 36 | author_id = Column(Integer, ForeignKey('author.id')) # Many quotes to one author 37 | tags = relationship('Tag', secondary='quote_tag', 38 | lazy='dynamic', backref="quote") # M-to-M for quote and tag 39 | 40 | 41 | class Author(Base): 42 | __tablename__ = "author" 43 | 44 | id = Column(Integer, primary_key=True) 45 | name = Column('name', String(50), unique=True) 46 | birthday = Column('birthday', DateTime) 47 | bornlocation = Column('bornlocation', String(150)) 48 | bio = Column('bio', Text()) 49 | quotes = relationship('Quote', backref='author') # One author to many Quotes 50 | 51 | 52 | class Tag(Base): 53 | __tablename__ = "tag" 54 | 55 | id = Column(Integer, primary_key=True) 56 | name = Column('name', String(30), unique=True) 57 | quotes = relationship('Quote', secondary='quote_tag', 58 | lazy='dynamic', backref="tag") # M-to-M for quote and tag 59 | -------------------------------------------------------------------------------- /tutorial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | from sqlalchemy.orm import sessionmaker 10 | from scrapy.exceptions import DropItem 11 | from tutorial.models import Quote, Author, Tag, db_connect, create_table 12 | import logging 13 | 14 | class DuplicatesPipeline(object): 15 | 16 | def __init__(self): 17 | """ 18 | Initializes database connection and sessionmaker. 19 | Creates tables. 
20 | """ 21 | engine = db_connect() 22 | create_table(engine) 23 | self.Session = sessionmaker(bind=engine) 24 | logging.info("****DuplicatesPipeline: database connected****") 25 | 26 | def process_item(self, item, spider): 27 | session = self.Session() 28 | exist_quote = session.query(Quote).filter_by(quote_content = item["quote_content"]).first() 29 | if exist_quote is not None: # the current quote exists 30 | raise DropItem("Duplicate item found: %s" % item["quote_content"]) 31 | session.close() 32 | else: 33 | return item 34 | session.close() 35 | 36 | 37 | class SaveQuotesPipeline(object): 38 | def __init__(self): 39 | """ 40 | Initializes database connection and sessionmaker 41 | Creates tables 42 | """ 43 | engine = db_connect() 44 | create_table(engine) 45 | self.Session = sessionmaker(bind=engine) 46 | logging.info("****SaveQuotePipeline: database connected****") 47 | 48 | 49 | def process_item(self, item, spider): 50 | """Save quotes in the database 51 | This method is called for every item pipeline component 52 | """ 53 | session = self.Session() 54 | quote = Quote() 55 | author = Author() 56 | tag = Tag() 57 | author.name = item["author_name"] 58 | author.birthday = item["author_birthday"] 59 | author.bornlocation = item["author_bornlocation"] 60 | author.bio = item["author_bio"] 61 | quote.quote_content = item["quote_content"] 62 | 63 | # check whether the author exists 64 | exist_author = session.query(Author).filter_by(name = author.name).first() 65 | if exist_author is not None: # the current author exists 66 | quote.author = exist_author 67 | else: 68 | quote.author = author 69 | 70 | # check whether the current quote has tags or not 71 | if "tags" in item: 72 | for tag_name in item["tags"]: 73 | tag = Tag(name=tag_name) 74 | # check whether the current tag already exists in the database 75 | exist_tag = session.query(Tag).filter_by(name = tag.name).first() 76 | if exist_tag is not None: # the current tag exists 77 | tag = exist_tag 78 | quote.tags.append(tag) 79 | 80 | try: 81 | session.add(quote) 82 | session.commit() 83 | 84 | except: 85 | session.rollback() 86 | raise 87 | 88 | finally: 89 | session.close() 90 | 91 | return item 92 | -------------------------------------------------------------------------------- /tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tutorial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tutorial' 13 | 14 | SPIDER_MODULES = ['tutorial.spiders'] 15 | NEWSPIDER_MODULE = 'tutorial.spiders' 16 | 17 | # Database Connection String 18 | 19 | # SQLite 20 | CONNECTION_STRING = 'sqlite:///scrapy_quotes.db' 21 | 22 | # MySQL 23 | # CONNECTION_STRING = "{drivername}://{user}:{passwd}@{host}:{port}/{db_name}?charset=utf8".format( 24 | # drivername="mysql", 25 | # user="harrywang", 26 | # passwd="tutorial", 27 | # host="localhost", 28 | # port="3306", 29 | # db_name="scrapy_quotes", 30 | # ) 31 | 32 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 33 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 34 | 35 | # Obey robots.txt rules 36 | ROBOTSTXT_OBEY = True 37 | 38 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 39 | #CONCURRENT_REQUESTS = 32 40 | 41 | # Configure a delay for requests for the same website (default: 0) 42 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 43 | # See also autothrottle settings and docs 44 | #DOWNLOAD_DELAY = 3 45 | # The download delay setting will honor only one of: 46 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 47 | #CONCURRENT_REQUESTS_PER_IP = 16 48 | 49 | # Disable cookies (enabled by default) 50 | #COOKIES_ENABLED = False 51 | 52 | # Disable Telnet Console (enabled by default) 53 | #TELNETCONSOLE_ENABLED = False 54 | 55 | # Override the default request headers: 56 | #DEFAULT_REQUEST_HEADERS = { 57 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 58 | # 'Accept-Language': 'en', 59 | #} 60 | 61 | # Enable or disable spider middlewares 62 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 63 | #SPIDER_MIDDLEWARES = { 64 | # 'tutorial.middlewares.TutorialSpiderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable downloader middlewares 68 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 69 | #DOWNLOADER_MIDDLEWARES = { 70 | # 'tutorial.middlewares.TutorialDownloaderMiddleware': 543, 71 | #} 72 | 73 | # Enable or disable extensions 74 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 75 | #EXTENSIONS = { 76 | # 'scrapy.extensions.telnet.TelnetConsole': None, 77 | #} 78 | 79 | # Configure item pipelines 80 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 81 | ITEM_PIPELINES = { 82 | 83 | 'tutorial.pipelines.DuplicatesPipeline': 100, 84 | 'tutorial.pipelines.SaveQuotesPipeline': 200, 85 | } 86 | 87 | # Enable and configure the AutoThrottle extension (disabled by default) 88 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 89 | #AUTOTHROTTLE_ENABLED = True 90 | # The initial download delay 91 | #AUTOTHROTTLE_START_DELAY = 5 92 | # The maximum download delay to be set in case of high latencies 93 | #AUTOTHROTTLE_MAX_DELAY = 60 94 | # The average number of requests Scrapy should be sending in parallel to 95 | # each remote server 96 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 97 | # Enable showing throttling stats for every response received: 98 | #AUTOTHROTTLE_DEBUG = False 99 | 100 | # Enable and configure HTTP caching (disabled by default) 101 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 102 | 
#HTTPCACHE_ENABLED = True 103 | #HTTPCACHE_EXPIRATION_SECS = 0 104 | #HTTPCACHE_DIR = 'httpcache' 105 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 106 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 107 | -------------------------------------------------------------------------------- /tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /tutorial/spiders/quotes_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy.loader import ItemLoader 3 | from tutorial.items import QuoteItem 4 | 5 | class QuotesSpider(scrapy.Spider): 6 | name = "quotes" 7 | allowed_domains = ["toscrape.com"] 8 | start_urls = ['http://quotes.toscrape.com/'] 9 | 10 | 11 | def parse(self, response): 12 | self.logger.info('Parse function called on {}'.format(response.url)) 13 | # quotes = response.xpath("//div[@class='quote']") 14 | quotes = response.css('div.quote') 15 | 16 | for quote in quotes: 17 | loader = ItemLoader(item=QuoteItem(), selector=quote) 18 | # pay attention to the dot .// to use relative xpath 19 | # loader.add_xpath('quote_content', ".//span[@class='text']/text()") 20 | loader.add_css('quote_content', '.text::text') 21 | # loader.add_xpath('author', './/small//text()') 22 | loader.add_css('tags', '.tag::text') 23 | quote_item = loader.load_item() 24 | author_url = quote.css('.author + a::attr(href)').get() 25 | # go to the author page and pass the current collected quote info 26 | yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item}) 27 | 28 | # go to Next page 29 | for a in response.css('li.next a'): 30 | yield response.follow(a, self.parse) 31 | 32 | def parse_author(self, response): 33 | quote_item = response.meta['quote_item'] 34 | loader = ItemLoader(item=quote_item, response=response) 35 | loader.add_css('author_name', '.author-title::text') 36 | loader.add_css('author_birthday', '.author-born-date::text') 37 | loader.add_css('author_bornlocation', '.author-born-location::text') 38 | loader.add_css('author_bio', '.author-description::text') 39 | yield loader.load_item() 40 | -------------------------------------------------------------------------------- /tutorial/spiders/quotes_spider_v1.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | class QuotesSpider(scrapy.Spider): 4 | name = "quotes_v1" 5 | 6 | start_urls = [ 7 | 'http://quotes.toscrape.com/page/1/', 8 | #'http://quotes.toscrape.com/page/2/', 9 | ] 10 | # long version to implement start_urls array: 11 | # def start_requests(self): 12 | # urls = [ 13 | # 'http://quotes.toscrape.com/page/1/', 14 | # 'http://quotes.toscrape.com/page/2/', 15 | # ] 16 | # for url in urls: 17 | # yield scrapy.Request(url=url, callback=self.parse) 18 | 19 | def parse(self, response): 20 | page = response.url.split("/")[-2] # getting the page number from the URL 21 | filename = 'local_output/' + 'quotes-%s.html' % page 22 | with open(filename, 'wb') as f: 23 | f.write(response.body) 24 | self.log('Saved file %s' % filename) 25 | 26 | for quote in response.css('div.quote'): 27 | yield { 28 | 'text': quote.css('span.text::text').get().strip(u'\u201c'u'\u201d'), # strip 
the unicode quotes 29 | 'author': quote.css('small.author::text').get(), 30 | 'tags': quote.css('div.tags a.tag::text').getall(), 31 | } 32 | 33 | # next_page = response.css('li.next a::attr(href)').get() 34 | 35 | # if next_page is not None: 36 | # next_page = response.urljoin(next_page) 37 | # yield scrapy.Request(next_page, callback=self.parse) 38 | 39 | # shortcut 1 40 | # if next_page is not None: 41 | # yield response.follow(next_page, callback=self.parse) 42 | 43 | # shortcut 2 44 | # for href in response.css('li.next a::attr(href)'): 45 | # yield response.follow(href, callback=self.parse) 46 | 47 | # shortcut 3 48 | for a in response.css('li.next a'): 49 | yield response.follow(a, callback=self.parse) 50 | -------------------------------------------------------------------------------- /tutorial/spiders/quotes_spider_v2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | class QuotesSpider(scrapy.Spider): 4 | name = "quotes_v2" 5 | 6 | start_urls = ['http://quotes.toscrape.com'] 7 | 8 | def parse(self, response): 9 | # self.logger.info('hello this is my first spider') 10 | quotes = response.css('div.quote') 11 | for quote in quotes: 12 | 13 | yield { 14 | 'text': quote.css('.text::text').get(), 15 | 'author': quote.css('.author::text').get(), 16 | 'tags': quote.css('.tag::text').getall(), 17 | } 18 | 19 | author_url = quote.css('.author + a::attr(href)').get() 20 | self.logger.info('get author page url') 21 | # go to the author page 22 | yield response.follow(author_url, callback=self.parse_author) 23 | 24 | for a in response.css('li.next a'): 25 | yield response.follow(a, callback=self.parse) 26 | 27 | 28 | def parse_author(self, response): 29 | yield { 30 | 'author_name': response.css('.author-title::text').get(), 31 | 'author_birthday': response.css('.author-born-date::text').get(), 32 | 'author_bornlocation': response.css('.author-born-location::text').get(), 33 | 'author_bio': response.css('.author-description::text').get(), 34 | } 35 | --------------------------------------------------------------------------------