├── .gitignore
├── README.md
└── books
    ├── books
    │   ├── __init__.py
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── spider.py
    └── scrapy.cfg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
Pipfile

*.json
*.csv
*.xml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scrapy Tutorial
A Scrapy bot that I'm using to scrape http://books.toscrape.com/ for my tutorial on http://letslearnabout.net/


## Third lesson - Your first Scrapy spider | Scraping next pages
- How to check if there is a 'next' page
- How to build the 'next' page URL
--------------------------------------------------------------------------------
/books/books/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/david1707/scrapy_tutorial/656c12d671e0be599782a005078b73dff3a6549f/books/books/__init__.py
--------------------------------------------------------------------------------
/books/books/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BooksItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
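Note: items.py is still the unmodified startproject stub, and the spider further down yields plain dicts instead of items. If the fields were declared on BooksItem, a minimal sketch (hypothetical, not part of this repo) could look like this:

import scrapy


class BooksItem(scrapy.Item):
    # One Field per value the spider currently yields as a dict key.
    title = scrapy.Field()
    image = scrapy.Field()
    price = scrapy.Field()
    stock = scrapy.Field()
    stars = scrapy.Field()
    description = scrapy.Field()
    upc = scrapy.Field()
    price_excl_tax = scrapy.Field()
    price_incl_tax = scrapy.Field()
    tax = scrapy.Field()
    number_of_reviews = scrapy.Field()

The spider's parse_book would then populate a BooksItem instead of a dict; either form works with Scrapy's feed exports.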
--------------------------------------------------------------------------------
/books/books/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class BooksSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class BooksDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/books/books/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class BooksPipeline(object):
    def process_item(self, item, spider):
        return item
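Note: BooksPipeline passes every item through unchanged and is not enabled in settings.py. Purely as an illustration (hypothetical, not part of this repo), a pipeline that normalises the scraped price string might look like this, together with the ITEM_PIPELINES entry needed to activate it:

class PriceCleanupPipeline(object):
    # Hypothetical example: strip the leading pound sign and store the price as a float.
    def process_item(self, item, spider):
        price = item.get('Price')
        if price:
            item['Price'] = float(price.replace('£', ''))
        return item

# In settings.py:
# ITEM_PIPELINES = {
#     'books.pipelines.PriceCleanupPipeline': 300,
# }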
--------------------------------------------------------------------------------
/books/books/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for books project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'books'

SPIDER_MODULES = ['books.spiders']
NEWSPIDER_MODULE = 'books.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'books (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'books.middlewares.BooksSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'books.middlewares.BooksDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'books.pipelines.BooksPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
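Note: apart from ROBOTSTXT_OBEY, every throttling option above is left commented out. If the crawl of books.toscrape.com needed to be slowed down, the AutoThrottle block the comments describe could be enabled roughly like this (illustrative values only, not what this repo uses):

# Example only - these lines are commented out in the repository's settings.py.
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0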
--------------------------------------------------------------------------------
/books/books/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/books/books/spiders/spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


class SpiderSpider(scrapy.Spider):
    name = 'spider'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']
    base_url = 'http://books.toscrape.com/'

    def parse(self, response):
        all_books = response.xpath('//article[@class="product_pod"]')

        for book in all_books:
            book_url = book.xpath('.//h3/a/@href').extract_first()

            if 'catalogue/' not in book_url:
                book_url = 'catalogue/' + book_url

            book_url = self.base_url + book_url

            yield scrapy.Request(book_url, callback=self.parse_book)

        next_page_partial_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()

        if next_page_partial_url:
            if 'catalogue/' not in next_page_partial_url:
                next_page_partial_url = "catalogue/" + next_page_partial_url

            next_page_url = self.base_url + next_page_partial_url
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_book(self, response):
        title = response.xpath('//div/h1/text()').extract_first()

        # The image src is relative ('../../media/...'); drop the '../../' prefix
        # before joining it onto base_url.
        relative_image = response.xpath(
            '//div[@class="item active"]/img/@src').extract_first().replace('../../', '')
        final_image = self.base_url + relative_image

        price = response.xpath(
            '//div[contains(@class, "product_main")]/p[@class="price_color"]/text()').extract_first()
        stock = response.xpath(
            '//div[contains(@class, "product_main")]/p[contains(@class, "instock")]/text()').extract()[1].strip()
        stars = response.xpath(
            '//div/p[contains(@class, "star-rating")]/@class').extract_first().replace('star-rating ', '')
        description = response.xpath(
            '//div[@id="product_description"]/following-sibling::p/text()').extract_first()
        # Product information table rows: 1 UPC, 3 price excl. tax, 4 price incl. tax,
        # 5 tax, 7 number of reviews.
        upc = response.xpath(
            '//table[@class="table table-striped"]/tr[1]/td/text()').extract_first()
        price_excl_tax = response.xpath(
            '//table[@class="table table-striped"]/tr[3]/td/text()').extract_first()
        price_inc_tax = response.xpath(
            '//table[@class="table table-striped"]/tr[4]/td/text()').extract_first()
        tax = response.xpath(
            '//table[@class="table table-striped"]/tr[5]/td/text()').extract_first()
        number_of_reviews = response.xpath(
            '//table[@class="table table-striped"]/tr[7]/td/text()').extract_first()

        yield {
            'Title': title,
            'Image': final_image,
            'Price': price,
            'Stock': stock,
            'Stars': stars,
            'Description': description,
            'Upc': upc,
            'Price excl tax': price_excl_tax,
            'Price incl tax': price_inc_tax,
            'Tax': tax,
            'Number of reviews': number_of_reviews,
        }
--------------------------------------------------------------------------------
/books/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = books.settings

[deploy]
#url = http://localhost:6800/
project = books

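Note: spider.py builds absolute URLs by prefixing base_url and patching in 'catalogue/' by hand, which is the lesson covered in the README. An equivalent approach, shown here only as a sketch of an alternative parse method (not the repo's code), is to let Scrapy resolve relative links against the current page with response.urljoin:

    # Sketch of a drop-in replacement for SpiderSpider.parse.
    def parse(self, response):
        for book in response.xpath('//article[@class="product_pod"]'):
            book_url = book.xpath('.//h3/a/@href').extract_first()
            # urljoin resolves the relative href against response.url,
            # so the 'catalogue/' prefix no longer needs special-casing.
            yield scrapy.Request(response.urljoin(book_url), callback=self.parse_book)

        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

Either version can be run from the books/ project directory with scrapy crawl spider -o books.json; the .gitignore already excludes the exported .json/.csv/.xml files.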
--------------------------------------------------------------------------------