├── .gitignore
├── README.md
└── books
    ├── books
    │   ├── __init__.py
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── spider.py
    └── scrapy.cfg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
Pipfile

*.json
*.csv
*.xml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scrapy Tutorial
A Scrapy bot that I'm using to scrape http://books.toscrape.com/ for my tutorial on http://letslearnabout.net/


## Third lesson - Your first Scrapy spider | Scraping next pages
- How to check if there is a 'next' page
- How to build the 'next' page URL
--------------------------------------------------------------------------------
/books/books/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/david1707/scrapy_tutorial/656c12d671e0be599782a005078b73dff3a6549f/books/books/__init__.py
--------------------------------------------------------------------------------
/books/books/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BooksItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
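Note: items.py is still the unmodified startproject stub, and the spider further down yields plain dicts instead of items. If the fields were declared on BooksItem, a minimal sketch (hypothetical, not part of this repo) could look like this:

import scrapy


class BooksItem(scrapy.Item):
    # One Field per value the spider currently yields as a dict key.
    title = scrapy.Field()
    image = scrapy.Field()
    price = scrapy.Field()
    stock = scrapy.Field()
    stars = scrapy.Field()
    description = scrapy.Field()
    upc = scrapy.Field()
    price_excl_tax = scrapy.Field()
    price_incl_tax = scrapy.Field()
    tax = scrapy.Field()
    number_of_reviews = scrapy.Field()

The spider's parse_book would then populate a BooksItem instead of a dict; either form works with Scrapy's feed exports.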
--------------------------------------------------------------------------------
/books/books/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class BooksSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class BooksDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/books/books/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class BooksPipeline(object):
    def process_item(self, item, spider):
        return item
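Note: BooksPipeline passes every item through unchanged and is not enabled in settings.py. Purely as an illustration (hypothetical, not part of this repo), a pipeline that normalises the scraped price string might look like this, together with the ITEM_PIPELINES entry needed to activate it:

class PriceCleanupPipeline(object):
    # Hypothetical example: strip the leading pound sign and store the price as a float.
    def process_item(self, item, spider):
        price = item.get('Price')
        if price:
            item['Price'] = float(price.replace('£', ''))
        return item

# In settings.py:
# ITEM_PIPELINES = {
#     'books.pipelines.PriceCleanupPipeline': 300,
# }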
--------------------------------------------------------------------------------
/books/books/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for books project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'books'

SPIDER_MODULES = ['books.spiders']
NEWSPIDER_MODULE = 'books.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'books (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'books.middlewares.BooksSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'books.middlewares.BooksDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'books.pipelines.BooksPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
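Note: apart from ROBOTSTXT_OBEY, every throttling option above is left commented out. If the crawl of books.toscrape.com needed to be slowed down, the AutoThrottle block the comments describe could be enabled roughly like this (illustrative values only, not what this repo uses):

# Example only - these lines are commented out in the repository's settings.py.
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0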
--------------------------------------------------------------------------------
/books/books/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/books/books/spiders/spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


class SpiderSpider(scrapy.Spider):
    name = 'spider'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']
    base_url = 'http://books.toscrape.com/'

    def parse(self, response):
        all_books = response.xpath('//article[@class="product_pod"]')

        for book in all_books:
            book_url = book.xpath('.//h3/a/@href').extract_first()

            if 'catalogue/' not in book_url:
                book_url = 'catalogue/' + book_url

            book_url = self.base_url + book_url

            yield scrapy.Request(book_url, callback=self.parse_book)

        next_page_partial_url = response.xpath(
            '//li[@class="next"]/a/@href').extract_first()

        if next_page_partial_url:
            if 'catalogue/' not in next_page_partial_url:
                next_page_partial_url = "catalogue/" + next_page_partial_url

            next_page_url = self.base_url + next_page_partial_url
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_book(self, response):
        title = response.xpath('//div/h1/text()').extract_first()

        # The image src is relative ('../../media/...'); drop the '../../' prefix
        # before joining it onto base_url.
        relative_image = response.xpath(
            '//div[@class="item active"]/img/@src').extract_first().replace('../../', '')
        final_image = self.base_url + relative_image

        price = response.xpath(
            '//div[contains(@class, "product_main")]/p[@class="price_color"]/text()').extract_first()
        stock = response.xpath(
            '//div[contains(@class, "product_main")]/p[contains(@class, "instock")]/text()').extract()[1].strip()
        stars = response.xpath(
            '//div/p[contains(@class, "star-rating")]/@class').extract_first().replace('star-rating ', '')
        description = response.xpath(
            '//div[@id="product_description"]/following-sibling::p/text()').extract_first()
        # Product information table rows: 1 UPC, 3 price excl. tax, 4 price incl. tax,
        # 5 tax, 7 number of reviews.
        upc = response.xpath(
            '//table[@class="table table-striped"]/tr[1]/td/text()').extract_first()
        price_excl_tax = response.xpath(
            '//table[@class="table table-striped"]/tr[3]/td/text()').extract_first()
        price_inc_tax = response.xpath(
            '//table[@class="table table-striped"]/tr[4]/td/text()').extract_first()
        tax = response.xpath(
            '//table[@class="table table-striped"]/tr[5]/td/text()').extract_first()
        number_of_reviews = response.xpath(
            '//table[@class="table table-striped"]/tr[7]/td/text()').extract_first()

        yield {
            'Title': title,
            'Image': final_image,
            'Price': price,
            'Stock': stock,
            'Stars': stars,
            'Description': description,
            'Upc': upc,
            'Price excl tax': price_excl_tax,
            'Price incl tax': price_inc_tax,
            'Tax': tax,
            'Number of reviews': number_of_reviews,
        }
--------------------------------------------------------------------------------
/books/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = books.settings

[deploy]
#url = http://localhost:6800/
project = books

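Note: spider.py builds absolute URLs by prefixing base_url and patching in 'catalogue/' by hand, which is the lesson covered in the README. An equivalent approach, shown here only as a sketch of an alternative parse method (not the repo's code), is to let Scrapy resolve relative links against the current page with response.urljoin:

    # Sketch of a drop-in replacement for SpiderSpider.parse.
    def parse(self, response):
        for book in response.xpath('//article[@class="product_pod"]'):
            book_url = book.xpath('.//h3/a/@href').extract_first()
            # urljoin resolves the relative href against response.url,
            # so the 'catalogue/' prefix no longer needs special-casing.
            yield scrapy.Request(response.urljoin(book_url), callback=self.parse_book)

        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

Either version can be run from the books/ project directory with scrapy crawl spider -o books.json; the .gitignore already excludes the exported .json/.csv/.xml files.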
--------------------------------------------------------------------------------