├── .gitignore
├── README.md
├── books
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── toscrape.py
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Books Crawler

A Scrapy crawler for http://books.toscrape.com
--------------------------------------------------------------------------------
/books/__init__.py:
--------------------------------------------------------------------------------
(empty file)
--------------------------------------------------------------------------------
/books/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BooksItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/books/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BooksPipeline(object):
    def process_item(self, item, spider):
        # Pass-through stub: items are returned unchanged.
        return item
--------------------------------------------------------------------------------
/books/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

BOT_NAME = 'books'

SPIDER_MODULES = ['books.spiders']
NEWSPIDER_MODULE = 'books.spiders'

ROBOTSTXT_OBEY = True
HTTPCACHE_ENABLED = True

SPIDER_MIDDLEWARES = {
    'scrapy_deltafetch.DeltaFetch': 100,
    'scrapy_magicfields.MagicFieldsMiddleware': 200,
}

DELTAFETCH_ENABLED = True

MAGICFIELDS_ENABLED = True
MAGIC_FIELDS = {
    "timestamp": "$time",
    "spider": "$spider:name",
    "url": "scraped from $response:url",
    "domain": "$response:url,r'https?://([\w\.]+)/'",
}
--------------------------------------------------------------------------------
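Note: the BooksPipeline stub defined in books/pipelines.py is not registered in the settings above, so scraped items currently bypass it. Enabling it would mean adding an ITEM_PIPELINES entry to books/settings.py. A minimal sketch (the priority value 300 is an arbitrary choice, not something the project defines):

# books/settings.py (sketch; not part of the repository)
ITEM_PIPELINES = {
    'books.pipelines.BooksPipeline': 300,
}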
/books/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/books/spiders/toscrape.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


class ToscrapeSpider(scrapy.Spider):
    name = "toscrape"
    allowed_domains = ["books.toscrape.com"]
    start_urls = [
        'http://books.toscrape.com/',
    ]

    def parse(self, response):
        # Follow the link to each book's detail page on the current listing page.
        for book_url in response.css("article.product_pod > h3 > a ::attr(href)").extract():
            yield scrapy.Request(response.urljoin(book_url), callback=self.parse_book_page)
        # Follow the pagination link, if there is a next page.
        next_page = response.css("li.next > a ::attr(href)").extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_book_page(self, response):
        # Extract the book's title, category (from the breadcrumb) and description.
        item = {}
        product = response.css("div.product_main")
        item["title"] = product.css("h1 ::text").extract_first()
        item['category'] = response.xpath(
            "//ul[@class='breadcrumb']/li[@class='active']/preceding-sibling::li[1]/a/text()"
        ).extract_first()
        item['description'] = response.xpath(
            "//div[@id='product_description']/following-sibling::p/text()"
        ).extract_first()
        yield item
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = books.settings

[deploy]
#url = http://localhost:6800/
project = books
--------------------------------------------------------------------------------
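Running the crawler is not documented in the repository itself; a minimal sketch, assuming Scrapy plus the scrapy-deltafetch and scrapy-magicfields packages referenced in settings.py are installed from PyPI under those names:

# from the project root (sketch; the repository pins no versions)
pip install scrapy scrapy-deltafetch scrapy-magicfields
scrapy crawl toscrape -o books.json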