├── .gitignore ├── LICENSE ├── README.md ├── deployment.md ├── local_output └── readme.md ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg ├── setup.py └── tutorial ├── __init__.py ├── items.py ├── middlewares.py ├── models.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py ├── quotes_spider.py ├── quotes_spider_v1.py └── quotes_spider_v2.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .DS_Store 107 | 108 | /local_output/*.html 109 | /local_output/*.json 110 | 111 | # sqlite 112 | *.db 113 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Harry Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy Tutorial 2 | 3 | This repo contains the code for my tutorial: A Minimalist End-to-End Scrapy Tutorial (https://medium.com/p/11e350bcdec0). 4 | 5 | The website to crawl is [http://quotes.toscrape.com](http://quotes.toscrape.com). 6 | 7 | ## Setup 8 | Tested with Python 3.6 via virtual environment: 9 | ```shell 10 | $ python3.6 -m venv venv 11 | $ source venv/bin/activate 12 | $ pip install -r requirements.txt 13 | ``` 14 | 15 | ## Run 16 | 17 | Run `scrapy crawl quotes` at the project top level. 18 | 19 | Note that spider name is defined in the spider class, e.g., `quotes_spider.py`: 20 | ```python 21 | class QuotesSpider(scrapy.Spider): 22 | name = "quotes" 23 | ``` 24 | 25 | ## Deployment 26 | 27 | See deployment.md 28 | 29 | ## Versions 30 | 31 | I keep different versions for learning purposes using git tags: 32 | 33 | ### Version 1 (tag v1.0) 34 | 35 | Key Concepts: basic spider setup, project folder structure, saving files as json and html files, using Scrap shell,Following links, etc. 36 | 37 | Local outputs (json and html pages) are stored in "local-output" folder, which is ignored in .gitignore. 38 | 39 | For example: 40 | 41 | scrapy crawl quotes saves a set of html pages to /local_output 42 | scrapy crawl quotes -o ./local_output/quotes.json saves the output to a json file 43 | 44 | 45 | 46 | To create the initial project folder, run `scrapy startproject tutorial` (only need to do this once) I removed the top level `tutorial` folder and add additional files and folders as shown below: 47 | 48 | ``` 49 | tutorial/ 50 | scrapy.cfg # deploy configuration file 51 | 52 | 53 | tutorial/ # project's Python module, you'll import your code from here 54 | __init__.py 55 | 56 | items.py # project items definition file 57 | 58 | middlewares.py # project middlewares file 59 | 60 | pipelines.py # project pipelines file 61 | 62 | settings.py # project settings file 63 | 64 | spiders/ # a directory where you'll later put your spiders 65 | __init__.py 66 | ``` 67 | `self.log('Saved file %s' % filename)` outputs to the log console. `yield` also outputs the DEBUG info in the console, e.g.: 68 | 69 | Screen Shot 2019-08-13 at 3 30 44 PM 70 | 71 | 72 | ### Version 2 (tag v2.0) 73 | 74 | The major change is to use Items. 75 | 76 | Why use Items? 77 | 78 | - clearly specify the structured data to be collected - a central place to look 79 | - leverage pre and post processors for Items via ItemLoaders (you can also define additional custom processors) 80 | - Use item pipelines to save data to databases (Version 3) 81 | - Better code organization - you know where to look for certain processing code 82 | 83 | ### Version 3 (tag v3.0) 84 | 85 | - Add database support via SQLAlchemy and use Item pipeline to save items into database (sqlite and mysql) 86 | - Add instructions on deploying ScrapingHub.com 87 | 88 | Three tables: Authors, Quotes, Tags. 
89 | 90 | - One-to-Many between Authors and Quotes 91 | - Many-to-Many between Tags and Quotes 92 | - Many-to-Many between Tags and Authors 93 | 94 | Database schema is defined in `/tutorial/models.py` file and connection string is specified in `/tutorial/settings.py`. 95 | Add a pipleline file and enable the pipeline in `/tutorial/settings.py` (The number 0-1000 specifies the execution order of the pipelines). 96 | 97 | ``` 98 | ITEM_PIPELINES = { 99 | 'tutorial.pipelines.SaveQuotesPipeline': 300, 100 | } 101 | ``` 102 | 103 | Use the following commands to check local SQLite database. https://sqlitebrowser.org can be used as a GUI tool. 104 | 105 | ``` 106 | $ man sqlite3 107 | $ sqlite3 scrapy_quotes.db 108 | sqlite> .tables 109 | sqlite> .schema quote 110 | sqlite> .quit 111 | ``` 112 | ### Test SQLAlchemy in Shell 113 | 114 | Once you setup models and pipelines, you can run `scrapy shell` to test the database part. Just paste the code block below and open sqlite database to check the results. 115 | 116 | ```python 117 | from sqlalchemy.orm import sessionmaker 118 | from tutorial.models import Quote, Author, Tag, db_connect, create_table 119 | engine = db_connect() 120 | create_table(engine) 121 | Session = sessionmaker(bind=engine) 122 | session = Session() 123 | 124 | quote1 = Quote() 125 | author1 = Author() 126 | author1.name = "Linus Torvalds" 127 | author1.bio = "Linus Torvalds is the creator the Linux kernel and Git." 128 | quote1.quote_content = "Talk is cheap. Show me the code." 129 | quote1.author = author1 130 | tag1 = Tag(name="linux") 131 | tag2 = Tag(name="git") 132 | tag3 = Tag(name="simple") 133 | quote1.tags.append(tag1) 134 | quote1.tags.append(tag2) 135 | quote1.tags.append(tag3) 136 | 137 | try: 138 | session.add(author1) 139 | session.add(quote1) 140 | session.commit() 141 | except: 142 | session.rollback() 143 | raise 144 | 145 | quote2 = Quote() 146 | author2 = Author() 147 | author2.name = "Steven Jobs" 148 | author2.bio = "Steven Jobs was the chairman, chief executive officer, and co-founder of Apple Inc." 149 | quote2.quote_content = "Stay Hungry Stay Foolish." 150 | quote2.author = author2 151 | tag4 = Tag(name="inspiring") 152 | tag5 = Tag(name="simple") # this already exists in the database 153 | 154 | # See difference between filter and filter_by at https://bit.ly/2TLvqeV 155 | 156 | # exist_tag = session.query(Tag).filter(Tag.name == tag5.name).first() 157 | exist_tag = session.query(Tag).filter_by(name = tag5.name).first() 158 | if exist_tag is not None: # the current tag exists 159 | tag5 = exist_tag 160 | 161 | quote2.tags.append(tag4) 162 | quote2.tags.append(tag5) 163 | 164 | try: 165 | 166 | session.add(author2) 167 | session.add(quote2) 168 | session.commit() 169 | except: 170 | session.rollback() 171 | raise 172 | finally: 173 | session.close() 174 | ``` 175 | ### MySQL 176 | 177 | - Install MySQL locally: ``$brew install mysql`, which installs MySQL without password. To start MySQL: `mysql.server start` and then connect: `mysql -u root`. 178 | 179 | - Create a local database and related user: `CREATE SCHEMA scrapy_quotes DEFAULT CHARACTER SET utf8mb4 ;` 180 | 181 | - `mysqlclient` package is required. 
182 | 183 | - Comment out MySQL connection string in `settings.py` to use MySQL to store items: 184 | 185 | ```python3 186 | # SQLite 187 | # CONNECTION_STRING = 'sqlite:///scrapy_quotes.db' 188 | 189 | # MySQL 190 | CONNECTION_STRING = "{drivername}://{user}:{passwd}@{host}:{port}/{db_name}?charset=utf8".format( 191 | drivername="mysql", 192 | user="harrywang", 193 | passwd="tutorial", 194 | host="localhost", 195 | port="3306", 196 | db_name="scrapy_quotes", 197 | ) 198 | ``` 199 | 200 | ### Version 4 (tag v4.0) 201 | Deployment to Scrapinghub and ScrapydWeb. See [deployment.md](deployment.md) for details. 202 | 203 | 204 | ## Other Notes 205 | 206 | ### Scrapy Shell 207 | 208 | 209 | Enter shell: `scrapy shell 'http://quotes.toscrape.com/page/1/'` 210 | 211 | Extract data examples (css and xpath): 212 | 213 | CSS: 214 | ```bash 215 | >>> response.css('title').getall() 216 | ['Quotes to Scrape'] 217 | >>> response.css('title::text').get() 218 | 'Quotes to Scrape' 219 | >>> response.css('title::text')[0].get() 220 | 'Quotes to Scrape' 221 | >>> response.css('title::text').re(r'Quotes.*') 222 | ['Quotes to Scrape'] 223 | >>> response.css('title::text').re(r'Q\w+') 224 | ['Quotes'] 225 | >>> response.css('title::text').re(r'(\w+) to (\w+)') 226 | ['Quotes', 'Scrape'] 227 | ``` 228 | XPath: 229 | 230 | ```bash 231 | >>> response.xpath('//title') 232 | [] 233 | >>> response.xpath('//title/text()').get() 234 | 'Quotes to Scrape' 235 | ``` 236 | 237 | View page in browser from shell: `>>> view(response)` 238 | 239 | ### Extracting quotes and authors 240 | 241 | HTML to parse: 242 | 243 | ```html 244 |
<div class="quote"> 245 |     <span class="text">“The world as we have created it is a process of our 246 |     thinking. It cannot be changed without changing our thinking.”</span> 247 |     <span> 248 |         by <small class="author">Albert Einstein</small> 249 |         <a href="/author/Albert-Einstein">(about)</a> 250 |     </span> 251 |     <div class="tags">
252 |         Tags: 253 |         <a class="tag" href="/tag/change/page/1/">change</a> 254 |         <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a> 255 |         <a class="tag" href="/tag/thinking/page/1/">thinking</a> 256 |         <a class="tag" href="/tag/world/page/1/">world</a> 257 |     </div>
258 | </div>
259 | ``` 260 | 261 | Parse and output to log: 262 | 263 | ```python 264 | import scrapy 265 | 266 | 267 | class QuotesSpider(scrapy.Spider): 268 | name = "quotes" 269 | start_urls = [ 270 | 'http://quotes.toscrape.com/page/1/', 271 | 'http://quotes.toscrape.com/page/2/', 272 | ] 273 | 274 | def parse(self, response): 275 | for quote in response.css('div.quote'): 276 | yield { 277 | 'text': quote.css('span.text::text').get(), 278 | 'author': quote.css('small.author::text').get(), 279 | 'tags': quote.css('div.tags a.tag::text').getall(), 280 | } 281 | ``` 282 | Save the output above to json: `scrapy crawl quotes -o ./local_output/quotes.json` - Note: **this command appends to existing json instead of overwriting it**. 283 | 284 | ### Following links 285 | 286 | Next link html on the page: 287 | 288 | ```html 289 | 294 | ``` 295 | Extract it via shell: 296 | 297 | ```bash 298 | >>> response.css('li.next a::attr(href)').get() 299 | '/page/2/' 300 | >>> response.css('li.next a').attrib['href'] 301 | '/page/2' 302 | ``` 303 | Follow links: 304 | 305 | ```python 306 | for a in response.css('li.next a'): 307 | yield response.follow(a, callback=self.parse) 308 | ``` 309 | 310 | ### Using spider arguments 311 | See https://docs.scrapy.org/en/latest/topics/spiders.html#spiderargs 312 | -------------------------------------------------------------------------------- /deployment.md: -------------------------------------------------------------------------------- 1 | # Deployment Instructions 2 | Check the following sections for deployment instructions for Scrapinghub and Scrapydweb. 3 | 4 | ## Scrapinghub Deployment 5 | 6 | Create an free account and create a project: 7 | ![Screen Shot 2019-08-19 at 11 27 48 AM](https://user-images.githubusercontent.com/595772/63278299-05749780-c275-11e9-9c3f-f750ae5bc6e1.png) 8 | 9 | We will use the `shub` command line to deploy. You can find your API key and deploy number once in your project Code & Deploys page: 10 | ![Screen Shot 2019-08-19 at 11 33 05 AM](https://user-images.githubusercontent.com/595772/63278652-aebb8d80-c275-11e9-8fb1-1945888d6a53.png) 11 | 12 | Go back to the root of Scrapy-tutorial (the root of the Scrapy project) and use the following command to deploy your project to Scrapyinghub. 13 | 14 | ```bash 15 | 16 | (venv) dami:scrapy-tutorial harrywang$ shub login 17 | Enter your API key from https://app.scrapinghub.com/account/apikey 18 | API key: xxxxx 19 | Validating API key... 20 | API key is OK, you are logged in now. 21 | (venv) dami:scrapy-tutorial harrywang$ shub deploy 404937 22 | Messagepack is not available, please ensure that msgpack-python library is properly installed. 23 | Saving project 404937 as default target. You can deploy to it via 'shub deploy' from now on 24 | Saved to /Users/harrywang/xxx/scrapy-tutorial/scrapinghub.yml. 25 | Packing version b6ac860-master 26 | Created setup.py at /Users/harrywang/xxx/scrapy-tutorial 27 | Deploying to Scrapy Cloud project "404937" 28 | {"status": "ok", "project": 4xxx, "version": "b6ac860-master", "spiders": 3} 29 | Run your spiders at: https://app.scrapinghub.com/p/404937/ 30 | ``` 31 | Scrapinghub configuration file is created `scrapinghub.yml` and you need to edit it to specify: 32 | 33 | - scrapy 1.7 running Python 3 34 | - requirements files for other packages 35 | 36 | ```yml 37 | project: 404937 38 | 39 | stacks: 40 | default: scrapy:1.7-py3 41 | 42 | requirements: 43 | file: requirements.txt 44 | ``` 45 | 46 | run `$ shub deploy` to deploy again. 
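Once the project is deployed, you can also trigger runs and fetch scraped items from code rather than the web UI. The snippet below is a minimal sketch using the `python-scrapinghub` client (`pip install scrapinghub` assumed); the API key is a placeholder and the project ID is the one from `scrapinghub.yml`, so substitute your own values.

```python
# Minimal sketch using the python-scrapinghub client; the API key is a placeholder.
from scrapinghub import ScrapinghubClient

client = ScrapinghubClient('YOUR_API_KEY')
project = client.get_project(404937)  # project ID from scrapinghub.yml

# schedule a run of the "quotes" spider
job = project.jobs.run('quotes')
print(job.key)

# once the job has finished, iterate over the scraped items
for item in job.items.iter():
    print(item.get('quote_content'))
```
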
47 | 48 | We have three spiders in the project: 49 | - quotes_spider.py is the main spider 50 | - quotes_spider_v1.py is the version 1 of the spider that writes to files, etc. 51 | - authors_spider.py is the spider to get author page from the official tutorial 52 | 53 | You can see your current deployment on scrapinghub.com: 54 | ![Screen Shot 2019-08-19 at 11 44 31 AM](https://user-images.githubusercontent.com/595772/63279289-bd567480-c276-11e9-8b0d-f24607517652.png) 55 | 56 | Then, you can run your spider: 57 | 58 | ![Screen Shot 2019-08-19 at 12 47 48 PM](https://user-images.githubusercontent.com/595772/63287962-8ccc0600-c289-11e9-9a50-159ccbfb16fe.png) 59 | 60 | ![Screen Shot 2019-08-19 at 12 48 51 PM](https://user-images.githubusercontent.com/595772/63287944-85a4f800-c289-11e9-9edb-f2e32f8b3a35.png) 61 | 62 | Once the job is complete, you can check the results and download the items: 63 | ![Screen Shot 2019-08-19 at 1 57 49 PM](https://user-images.githubusercontent.com/595772/63287923-76be4580-c289-11e9-8269-85f156a19a02.png) 64 | 65 | ![Screen Shot 2019-08-19 at 1 58 22 PM](https://user-images.githubusercontent.com/595772/63288027-b127e280-c289-11e9-858b-7f03b37f721f.png) 66 | 67 | You can schedule periodic jobs if you upgrade your free plan. 68 | 69 | ## Scrapydweb Deployment 70 | 71 | I found this repo https://github.com/my8100/scrapydweb and follow https://github.com/my8100/scrapyd-cluster-on-heroku to setup the server. 72 | 73 | We need a custom deployment because our scrapy project has specific package requirements, e.g., SQLAlchemy, MySQL, etc. if no special package is needed, you can follow the easy setup below. 74 | 75 | ### Custom Setup 76 | 77 | #### Setup repo and Heroku account 78 | fork a copy of https://github.com/my8100/scrapyd-cluster-on-heroku to your account, e.g., https://github.com/harrywang/scrapyd-cluster-on-heroku 79 | 80 | create a free account at heroku.com and install Heroku CLI: `brew tap heroku/brew && brew install heroku` 81 | 82 | clone the repo: 83 | 84 | ```bash 85 | git clone https://github.com/harrywang/scrapyd-cluster-on-heroku 86 | cd scrapyd-cluster-on-heroku/ 87 | ``` 88 | login to Heroku 89 | 90 | ``` 91 | scrapyd-cluster-on-heroku harrywang$ heroku login 92 | heroku: Press any key to open up the browser to login or q to exit: 93 | Opening browser to https://cli-auth.heroku.com/auth/browser/3ba7221b-9c2a-4355-ab3b-d2csda 94 | Logging in... done 95 | Logged in as xxx@gmail.com 96 | ``` 97 | 98 | #### Set up Scrapyd server/app 99 | 100 | In this step, you should update the `runtime.txt` to specify the Python version and `requirements.txt` to include all packages your spider needs. 101 | 102 | After changes, `runtime.txt` is: 103 | ``` 104 | python-3.6 105 | ``` 106 | `requirements.txt` is: 107 | ``` 108 | pip>=19.1 109 | #Twisted==18.9.0 110 | scrapy 111 | scrapyd>=1.2.1 112 | scrapy-redis 113 | logparser>=0.8.2 114 | 115 | mysqlclient>=1.4.4 116 | SQLAlchemy>=1.3.6 117 | ``` 118 | 119 | Setup the repo and commit the changes we just made: 120 | 121 | ```bash 122 | cd scrapyd 123 | git init 124 | git status 125 | git add . 
126 | git commit -a -m "first commit" 127 | git status 128 | ``` 129 | 130 | Deploy Scrapyd app 131 | 132 | ```bash 133 | heroku apps:create scrapy-server1 134 | heroku git:remote -a scrapy-server1 135 | git remote -v 136 | git push heroku master 137 | heroku logs --tail 138 | # Press ctrl+c to stop logs outputting 139 | # Visit https://svr-1.herokuapp.com 140 | ``` 141 | Add environment variables 142 | 143 | Timezone 144 | 145 | ``` 146 | # python -c "import tzlocal; print(tzlocal.get_localzone())" 147 | heroku config:set TZ=US/Eastern 148 | # heroku config:get TZ 149 | ``` 150 | Redis (optional - not in this tutorial) 151 | Redis account (optional, see settings.py in the scrapy_redis_demo_project.zip) 152 | ``` 153 | heroku config:set REDIS_HOST=your-redis-host 154 | heroku config:set REDIS_PORT=your-redis-port 155 | heroku config:set REDIS_PASSWORD=your-redis-password 156 | ``` 157 | Repeat this step if multiple scrapyd server is needed. 158 | 159 | #### Setup ScrapydWeb server/app 160 | 161 | go to scrapydweb subfolder and update `runtime.txt`, `requirements.txt`, and `scrapydweb_settings_v10.py` if needed. 162 | 163 | Let's enable authentication, edit the following section of `scrapydweb_settings_v10.py`: 164 | 165 | ``` 166 | # The default is False, set it to True to enable basic auth for the web UI. 167 | ENABLE_AUTH = True 168 | if os.environ.get('ENABLE_AUTH', 'False') == 'True': 169 | ENABLE_AUTH = True 170 | # In order to enable basic auth, both USERNAME and PASSWORD should be non-empty strings. 171 | USERNAME = 'admin' 172 | PASSWORD = 'scrapydweb' 173 | USERNAME = os.environ.get('USERNAME', 'admin') 174 | PASSWORD = os.environ.get('PASSWORD', 'scrapydweb') 175 | ``` 176 | 177 | Otherwise, proceed as follows: 178 | 179 | ``` 180 | cd .. 181 | cd scrapydweb 182 | git init 183 | git status 184 | git add . 185 | git commit -a -m "first commit" 186 | git status 187 | ``` 188 | 189 | Deploy ScrapydWeb app 190 | ```bash 191 | heroku apps:create scrapyd-web 192 | heroku git:remote -a scrapyd-web 193 | git remote -v 194 | git push heroku master 195 | ``` 196 | 197 | Add environment variables 198 | 199 | Timezone 200 | ``` 201 | heroku config:set TZ=US/Eastern 202 | ``` 203 | 204 | Scrapyd servers - you have to use the scrapyd server address you just setup above (see scrapydweb_settings_vN.py in the scrapydweb directory) 205 | 206 | ``` 207 | heroku config:set SCRAPYD_SERVER_1=scrapy-server1.herokuapp.com:80 208 | # heroku config:set SCRAPYD_SERVER_2=svr-2.herokuapp.com:80#group1 209 | # heroku config:set SCRAPYD_SERVER_3=svr-3.herokuapp.com:80#group1 210 | # heroku config:set SCRAPYD_SERVER_4=svr-4.herokuapp.com:80#group2 211 | ``` 212 | 213 | #### Deploy the scrapy project 214 | 215 | We need to package the project and upload to the server. 
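Before packaging, it can help to confirm that the Scrapyd app is reachable through its JSON API. The sketch below is an illustration only: it assumes the `requests` package, the `scrapy-server1` Heroku app name created above, and the admin credentials used in this setup, so adjust them to match your own deployment.

```python
# Sanity checks against the Scrapyd JSON API; the hostname and credentials are
# assumptions based on the Heroku app and auth settings used in this tutorial.
import requests

SCRAPYD = 'http://scrapy-server1.herokuapp.com'
AUTH = ('admin', 'scrapydweb')  # only needed if basic auth is enabled on the Scrapyd app

# is the Scrapyd service up?
print(requests.get(f'{SCRAPYD}/daemonstatus.json', auth=AUTH).json())

# after `scrapyd-deploy` (next step), the project and its spiders should be listed
print(requests.get(f'{SCRAPYD}/listprojects.json', auth=AUTH).json())

# schedule a crawl of the "quotes" spider through the API
resp = requests.post(f'{SCRAPYD}/schedule.json',
                     data={'project': 'scrapy-tutorial', 'spider': 'quotes'},
                     auth=AUTH)
print(resp.json())  # e.g. {"status": "ok", "jobid": "..."}
```
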
216 | 217 | First, install scrapyd-client using `pip install git+https://github.com/scrapy/scrapyd-client` (note: pip does not work as of writing this document see: https://stackoverflow.com/questions/45750739/scrapyd-client-command-not-found) 218 | 219 | change the deploy setting in scrapy.cfg: 220 | ``` 221 | [deploy] 222 | url = http://scrapyd-server1.herokuapp.com 223 | username = admin 224 | password = scrapydweb 225 | project = scrapy-tutorial 226 | ``` 227 | Then, use `scrapyd-deploy` to package and deploy to scrapyd server: 228 | 229 | ``` 230 | (venv) dami:scrapy-tutorial harrywang$ scrapyd-deploy 231 | /Users/harrywang/sandbox/scrapy-tutorial/venv/lib/python3.6/site-packages/scrapyd_client/deploy.py:23: ScrapyDeprecationWarning: Module `scrapy.utils.http` is deprecated, Please import from `w3lib.http` instead. 232 | from scrapy.utils.http import basic_auth_header 233 | Packing version 1566253506 234 | Deploying to project "scrapy-tutorial" in http://scrapyd-server1.herokuapp.com/addversion.json 235 | Server response (200): 236 | {"node_name": "9177f699-b645-4656-82d1-beef2898fdc1", "status": "ok", "project": "scrapy-tutorial", "version": "1566253506", "spiders": 3} 237 | ``` 238 | go to https://srapyd-web.herokuapp.com, you should see your project deployed: 239 | ![Screen Shot 2019-08-19 at 6 27 32 PM](https://user-images.githubusercontent.com/595772/63303881-2063fd80-c2af-11e9-8ba7-216778176e31.png) 240 | 241 | go to the following page to run the spider: 242 | 243 | ![Screen Shot 2019-08-19 at 8 56 23 PM](https://user-images.githubusercontent.com/595772/63309234-0c76c680-c2c4-11e9-98b9-1ea499bbf61a.png) 244 | 245 | Once the spider finishes, you can check the items in Files menu. 246 | 247 | You can specify Timer Tasks. The following shows a task that runs every 10 minutes. This part is based on APScheduler, see [document](https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html#expression-types) to figure out how to set the values (this could be confusing.) 
248 | ![Screen Shot 2019-08-19 at 10 28 04 PM](https://user-images.githubusercontent.com/595772/63313103-6467fa00-c2d1-11e9-9830-744272ef0c2c.png) 249 | 250 | 251 | ### Easy Setup 252 | - create a free account at heroku.com and login 253 | - go to https://github.com/my8100/scrapyd-cluster-on-heroku-scrapyd-app and click "Deploy to Heroku" button to setup a scrayd server app (scrapyd-server1.herokuapp.com): 254 | ![Screen Shot 2019-08-19 at 5 13 53 PM](https://user-images.githubusercontent.com/595772/63300193-a4b18300-c2a5-11e9-9453-3f42c5004ba9.png) 255 | 256 | Use the following settings (No redis setting) and the app is at scrapyd-server1.herokuapp.com 257 | ![Screen Shot 2019-08-19 at 5 19 26 PM](https://user-images.githubusercontent.com/595772/63300172-9499a380-c2a5-11e9-89d8-29eee56e8a3f.png) 258 | 259 | - go to https://github.com/my8100/scrapyd-cluster-on-heroku-scrapydweb-app-git and click "Deploy to Heroku" button to setup a scrapydweb server app: 260 | ![Screen Shot 2019-08-19 at 5 22 46 PM](https://user-images.githubusercontent.com/595772/63301043-9d8b7480-c2a7-11e9-8d9b-f06e7127f7ba.png) 261 | 262 | Use the following settings (No redis setting) and the app is at scrapyd-server1.herokuapp.com 263 | ![Screen Shot 2019-08-19 at 5 31 15 PM](https://user-images.githubusercontent.com/595772/63301003-877db400-c2a7-11e9-87dd-5b7175729d90.png) 264 | 265 | - go to https://srapyd-web.herokuapp.com and login, you can see that one scrapyd server is ready: 266 | 267 | ![Screen Shot 2019-08-19 at 5 37 25 PM](https://user-images.githubusercontent.com/595772/63301257-17bbf900-c2a8-11e9-8b01-307487310163.png) 268 | 269 | We need to package the project and upload to the server. 270 | 271 | First, install scrapyd-client using `pip install git+https://github.com/scrapy/scrapyd-client` (note: pip does not work as of writing this document see: https://stackoverflow.com/questions/45750739/scrapyd-client-command-not-found) 272 | 273 | change the deploy setting in scrapy.cfg: 274 | ``` 275 | [deploy] 276 | url = http://scrapyd-server1.herokuapp.com 277 | username = admin 278 | password = scrapydweb 279 | project = scrapy-tutorial 280 | ``` 281 | Then, use `scrapyd-deploy` to package and deploy to scrapyd server: 282 | 283 | ``` 284 | (venv) dami:scrapy-tutorial harrywang$ scrapyd-deploy 285 | /Users/harrywang/sandbox/scrapy-tutorial/venv/lib/python3.6/site-packages/scrapyd_client/deploy.py:23: ScrapyDeprecationWarning: Module `scrapy.utils.http` is deprecated, Please import from `w3lib.http` instead. 
286 | from scrapy.utils.http import basic_auth_header 287 | Packing version 1566253506 288 | Deploying to project "scrapy-tutorial" in http://scrapyd-server1.herokuapp.com/addversion.json 289 | Server response (200): 290 | {"node_name": "9177f699-b645-4656-82d1-beef2898fdc1", "status": "ok", "project": "scrapy-tutorial", "version": "1566253506", "spiders": 3} 291 | ``` 292 | go to https://srapyd-web.herokuapp.com, you should see your project deployed: 293 | ![Screen Shot 2019-08-19 at 6 27 32 PM](https://user-images.githubusercontent.com/595772/63303881-2063fd80-c2af-11e9-8ba7-216778176e31.png) 294 | -------------------------------------------------------------------------------- /local_output/readme.md: -------------------------------------------------------------------------------- 1 | This folder is ignored and stores local outputs (data saved to local such as json and html) 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mysqlclient>=1.4.4 2 | Scrapy>=1.7.3 3 | shub>=2.9.0 4 | SQLAlchemy>=1.3.6 5 | -------------------------------------------------------------------------------- /scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: 404937 2 | 3 | stacks: 4 | default: scrapy:1.7-py3 5 | requirements: 6 | file: requirements.txt 7 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | url = http://scrapy-server1.herokuapp.com 11 | username = admin 12 | password = scrapydweb 13 | project = scrapy-tutorial 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = tutorial.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/scrapy-tutorial/1dde391b856491202eded192cd0384be38f01b43/tutorial/__init__.py -------------------------------------------------------------------------------- /tutorial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy.item import Item, Field 9 | from scrapy.loader.processors import MapCompose, TakeFirst 10 | from datetime import datetime 11 | 12 | 13 | def remove_quotes(text): 14 | # strip the unicode quotes 15 | text = text.strip(u'\u201c'u'\u201d') 16 | return text 17 | 18 | 19 | def convert_date(text): 20 | # convert string March 14, 1879 to Python date 21 | return datetime.strptime(text, '%B %d, %Y') 22 | 23 | 24 | def 
parse_location(text): 25 | # parse location "in Ulm, Germany" 26 | # this simply remove "in ", you can further parse city, state, country, etc. 27 | return text[3:] 28 | 29 | 30 | class QuoteItem(Item): 31 | quote_content = Field( 32 | input_processor=MapCompose(remove_quotes), 33 | # TakeFirst return the first value not the whole list 34 | output_processor=TakeFirst() 35 | ) 36 | author_name = Field( 37 | input_processor=MapCompose(str.strip), 38 | output_processor=TakeFirst() 39 | ) 40 | author_birthday = Field( 41 | input_processor=MapCompose(convert_date), 42 | output_processor=TakeFirst() 43 | ) 44 | author_bornlocation = Field( 45 | input_processor=MapCompose(parse_location), 46 | output_processor=TakeFirst() 47 | ) 48 | author_bio = Field( 49 | input_processor=MapCompose(str.strip), 50 | output_processor=TakeFirst() 51 | ) 52 | tags = Field() 53 | -------------------------------------------------------------------------------- /tutorial/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TutorialSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class TutorialDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /tutorial/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, Column, Table, ForeignKey, MetaData 2 | from sqlalchemy.orm import relationship 3 | from sqlalchemy.ext.declarative import declarative_base 4 | from sqlalchemy import ( 5 | Integer, String, Date, DateTime, Float, Boolean, Text) 6 | from scrapy.utils.project import get_project_settings 7 | 8 | Base = declarative_base() 9 | 10 | 11 | def db_connect(): 12 | """ 13 | Performs database connection using database settings from settings.py. 
14 | Returns sqlalchemy engine instance 15 | """ 16 | return create_engine(get_project_settings().get("CONNECTION_STRING")) 17 | 18 | 19 | def create_table(engine): 20 | Base.metadata.create_all(engine) 21 | 22 | 23 | # Association Table for Many-to-Many relationship between Quote and Tag 24 | # https://docs.sqlalchemy.org/en/13/orm/basic_relationships.html#many-to-many 25 | quote_tag = Table('quote_tag', Base.metadata, 26 | Column('quote_id', Integer, ForeignKey('quote.id')), 27 | Column('tag_id', Integer, ForeignKey('tag.id')) 28 | ) 29 | 30 | 31 | class Quote(Base): 32 | __tablename__ = "quote" 33 | 34 | id = Column(Integer, primary_key=True) 35 | quote_content = Column('quote_content', Text()) 36 | author_id = Column(Integer, ForeignKey('author.id')) # Many quotes to one author 37 | tags = relationship('Tag', secondary='quote_tag', 38 | lazy='dynamic', backref="quote") # M-to-M for quote and tag 39 | 40 | 41 | class Author(Base): 42 | __tablename__ = "author" 43 | 44 | id = Column(Integer, primary_key=True) 45 | name = Column('name', String(50), unique=True) 46 | birthday = Column('birthday', DateTime) 47 | bornlocation = Column('bornlocation', String(150)) 48 | bio = Column('bio', Text()) 49 | quotes = relationship('Quote', backref='author') # One author to many Quotes 50 | 51 | 52 | class Tag(Base): 53 | __tablename__ = "tag" 54 | 55 | id = Column(Integer, primary_key=True) 56 | name = Column('name', String(30), unique=True) 57 | quotes = relationship('Quote', secondary='quote_tag', 58 | lazy='dynamic', backref="tag") # M-to-M for quote and tag 59 | -------------------------------------------------------------------------------- /tutorial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | from sqlalchemy.orm import sessionmaker 10 | from scrapy.exceptions import DropItem 11 | from tutorial.models import Quote, Author, Tag, db_connect, create_table 12 | import logging 13 | 14 | class DuplicatesPipeline(object): 15 | 16 | def __init__(self): 17 | """ 18 | Initializes database connection and sessionmaker. 19 | Creates tables. 
20 | """ 21 | engine = db_connect() 22 | create_table(engine) 23 | self.Session = sessionmaker(bind=engine) 24 | logging.info("****DuplicatesPipeline: database connected****") 25 | 26 | def process_item(self, item, spider): 27 | session = self.Session() 28 | exist_quote = session.query(Quote).filter_by(quote_content = item["quote_content"]).first() 29 | if exist_quote is not None: # the current quote exists 30 | raise DropItem("Duplicate item found: %s" % item["quote_content"]) 31 | session.close() 32 | else: 33 | return item 34 | session.close() 35 | 36 | 37 | class SaveQuotesPipeline(object): 38 | def __init__(self): 39 | """ 40 | Initializes database connection and sessionmaker 41 | Creates tables 42 | """ 43 | engine = db_connect() 44 | create_table(engine) 45 | self.Session = sessionmaker(bind=engine) 46 | logging.info("****SaveQuotePipeline: database connected****") 47 | 48 | 49 | def process_item(self, item, spider): 50 | """Save quotes in the database 51 | This method is called for every item pipeline component 52 | """ 53 | session = self.Session() 54 | quote = Quote() 55 | author = Author() 56 | tag = Tag() 57 | author.name = item["author_name"] 58 | author.birthday = item["author_birthday"] 59 | author.bornlocation = item["author_bornlocation"] 60 | author.bio = item["author_bio"] 61 | quote.quote_content = item["quote_content"] 62 | 63 | # check whether the author exists 64 | exist_author = session.query(Author).filter_by(name = author.name).first() 65 | if exist_author is not None: # the current author exists 66 | quote.author = exist_author 67 | else: 68 | quote.author = author 69 | 70 | # check whether the current quote has tags or not 71 | if "tags" in item: 72 | for tag_name in item["tags"]: 73 | tag = Tag(name=tag_name) 74 | # check whether the current tag already exists in the database 75 | exist_tag = session.query(Tag).filter_by(name = tag.name).first() 76 | if exist_tag is not None: # the current tag exists 77 | tag = exist_tag 78 | quote.tags.append(tag) 79 | 80 | try: 81 | session.add(quote) 82 | session.commit() 83 | 84 | except: 85 | session.rollback() 86 | raise 87 | 88 | finally: 89 | session.close() 90 | 91 | return item 92 | -------------------------------------------------------------------------------- /tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tutorial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tutorial' 13 | 14 | SPIDER_MODULES = ['tutorial.spiders'] 15 | NEWSPIDER_MODULE = 'tutorial.spiders' 16 | 17 | # Database Connection String 18 | 19 | # SQLite 20 | CONNECTION_STRING = 'sqlite:///scrapy_quotes.db' 21 | 22 | # MySQL 23 | # CONNECTION_STRING = "{drivername}://{user}:{passwd}@{host}:{port}/{db_name}?charset=utf8".format( 24 | # drivername="mysql", 25 | # user="harrywang", 26 | # passwd="tutorial", 27 | # host="localhost", 28 | # port="3306", 29 | # db_name="scrapy_quotes", 30 | # ) 31 | 32 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 33 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 34 | 35 | # Obey robots.txt rules 36 | ROBOTSTXT_OBEY = True 37 | 38 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 39 | #CONCURRENT_REQUESTS = 32 40 | 41 | # Configure a delay for requests for the same website (default: 0) 42 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 43 | # See also autothrottle settings and docs 44 | #DOWNLOAD_DELAY = 3 45 | # The download delay setting will honor only one of: 46 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 47 | #CONCURRENT_REQUESTS_PER_IP = 16 48 | 49 | # Disable cookies (enabled by default) 50 | #COOKIES_ENABLED = False 51 | 52 | # Disable Telnet Console (enabled by default) 53 | #TELNETCONSOLE_ENABLED = False 54 | 55 | # Override the default request headers: 56 | #DEFAULT_REQUEST_HEADERS = { 57 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 58 | # 'Accept-Language': 'en', 59 | #} 60 | 61 | # Enable or disable spider middlewares 62 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 63 | #SPIDER_MIDDLEWARES = { 64 | # 'tutorial.middlewares.TutorialSpiderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable downloader middlewares 68 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 69 | #DOWNLOADER_MIDDLEWARES = { 70 | # 'tutorial.middlewares.TutorialDownloaderMiddleware': 543, 71 | #} 72 | 73 | # Enable or disable extensions 74 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 75 | #EXTENSIONS = { 76 | # 'scrapy.extensions.telnet.TelnetConsole': None, 77 | #} 78 | 79 | # Configure item pipelines 80 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 81 | ITEM_PIPELINES = { 82 | 83 | 'tutorial.pipelines.DuplicatesPipeline': 100, 84 | 'tutorial.pipelines.SaveQuotesPipeline': 200, 85 | } 86 | 87 | # Enable and configure the AutoThrottle extension (disabled by default) 88 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 89 | #AUTOTHROTTLE_ENABLED = True 90 | # The initial download delay 91 | #AUTOTHROTTLE_START_DELAY = 5 92 | # The maximum download delay to be set in case of high latencies 93 | #AUTOTHROTTLE_MAX_DELAY = 60 94 | # The average number of requests Scrapy should be sending in parallel to 95 | # each remote server 96 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 97 | # Enable showing throttling stats for every response received: 98 | #AUTOTHROTTLE_DEBUG = False 99 | 100 | # Enable and configure HTTP caching (disabled by default) 101 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 102 | 
#HTTPCACHE_ENABLED = True 103 | #HTTPCACHE_EXPIRATION_SECS = 0 104 | #HTTPCACHE_DIR = 'httpcache' 105 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 106 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 107 | -------------------------------------------------------------------------------- /tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /tutorial/spiders/quotes_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy.loader import ItemLoader 3 | from tutorial.items import QuoteItem 4 | 5 | class QuotesSpider(scrapy.Spider): 6 | name = "quotes" 7 | allowed_domains = ["toscrape.com"] 8 | start_urls = ['http://quotes.toscrape.com/'] 9 | 10 | 11 | def parse(self, response): 12 | self.logger.info('Parse function called on {}'.format(response.url)) 13 | # quotes = response.xpath("//div[@class='quote']") 14 | quotes = response.css('div.quote') 15 | 16 | for quote in quotes: 17 | loader = ItemLoader(item=QuoteItem(), selector=quote) 18 | # pay attention to the dot .// to use relative xpath 19 | # loader.add_xpath('quote_content', ".//span[@class='text']/text()") 20 | loader.add_css('quote_content', '.text::text') 21 | # loader.add_xpath('author', './/small//text()') 22 | loader.add_css('tags', '.tag::text') 23 | quote_item = loader.load_item() 24 | author_url = quote.css('.author + a::attr(href)').get() 25 | # go to the author page and pass the current collected quote info 26 | yield response.follow(author_url, self.parse_author, meta={'quote_item': quote_item}) 27 | 28 | # go to Next page 29 | for a in response.css('li.next a'): 30 | yield response.follow(a, self.parse) 31 | 32 | def parse_author(self, response): 33 | quote_item = response.meta['quote_item'] 34 | loader = ItemLoader(item=quote_item, response=response) 35 | loader.add_css('author_name', '.author-title::text') 36 | loader.add_css('author_birthday', '.author-born-date::text') 37 | loader.add_css('author_bornlocation', '.author-born-location::text') 38 | loader.add_css('author_bio', '.author-description::text') 39 | yield loader.load_item() 40 | -------------------------------------------------------------------------------- /tutorial/spiders/quotes_spider_v1.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | class QuotesSpider(scrapy.Spider): 4 | name = "quotes_v1" 5 | 6 | start_urls = [ 7 | 'http://quotes.toscrape.com/page/1/', 8 | #'http://quotes.toscrape.com/page/2/', 9 | ] 10 | # long version to implement start_urls array: 11 | # def start_requests(self): 12 | # urls = [ 13 | # 'http://quotes.toscrape.com/page/1/', 14 | # 'http://quotes.toscrape.com/page/2/', 15 | # ] 16 | # for url in urls: 17 | # yield scrapy.Request(url=url, callback=self.parse) 18 | 19 | def parse(self, response): 20 | page = response.url.split("/")[-2] # getting the page number from the URL 21 | filename = 'local_output/' + 'quotes-%s.html' % page 22 | with open(filename, 'wb') as f: 23 | f.write(response.body) 24 | self.log('Saved file %s' % filename) 25 | 26 | for quote in response.css('div.quote'): 27 | yield { 28 | 'text': quote.css('span.text::text').get().strip(u'\u201c'u'\u201d'), # strip 
the unicode quotes 29 | 'author': quote.css('small.author::text').get(), 30 | 'tags': quote.css('div.tags a.tag::text').getall(), 31 | } 32 | 33 | # next_page = response.css('li.next a::attr(href)').get() 34 | 35 | # if next_page is not None: 36 | # next_page = response.urljoin(next_page) 37 | # yield scrapy.Request(next_page, callback=self.parse) 38 | 39 | # shortcut 1 40 | # if next_page is not None: 41 | # yield response.follow(next_page, callback=self.parse) 42 | 43 | # shortcut 2 44 | # for href in response.css('li.next a::attr(href)'): 45 | # yield response.follow(href, callback=self.parse) 46 | 47 | # shortcut 3 48 | for a in response.css('li.next a'): 49 | yield response.follow(a, callback=self.parse) 50 | -------------------------------------------------------------------------------- /tutorial/spiders/quotes_spider_v2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | class QuotesSpider(scrapy.Spider): 4 | name = "quotes_v2" 5 | 6 | start_urls = ['http://quotes.toscrape.com'] 7 | 8 | def parse(self, response): 9 | # self.logger.info('hello this is my first spider') 10 | quotes = response.css('div.quote') 11 | for quote in quotes: 12 | 13 | yield { 14 | 'text': quote.css('.text::text').get(), 15 | 'author': quote.css('.author::text').get(), 16 | 'tags': quote.css('.tag::text').getall(), 17 | } 18 | 19 | author_url = quote.css('.author + a::attr(href)').get() 20 | self.logger.info('get author page url') 21 | # go to the author page 22 | yield response.follow(author_url, callback=self.parse_author) 23 | 24 | for a in response.css('li.next a'): 25 | yield response.follow(a, callback=self.parse) 26 | 27 | 28 | def parse_author(self, response): 29 | yield { 30 | 'author_name': response.css('.author-title::text').get(), 31 | 'author_birthday': response.css('.author-born-date::text').get(), 32 | 'author_bornlocation': response.css('.author-born-location::text').get(), 33 | 'author_bio': response.css('.author-description::text').get(), 34 | } 35 | --------------------------------------------------------------------------------