├── .gitignore
├── README.md
├── books
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── toscrape.py
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Books Crawler

A Scrapy crawler for http://books.toscrape.com
--------------------------------------------------------------------------------
/books/__init__.py:
--------------------------------------------------------------------------------
(empty file)
--------------------------------------------------------------------------------
/books/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BooksItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/books/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BooksPipeline(object):
    def process_item(self, item, spider):
        # Pass-through stub: items are returned unchanged.
        return item
--------------------------------------------------------------------------------
/books/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

BOT_NAME = 'books'

SPIDER_MODULES = ['books.spiders']
NEWSPIDER_MODULE = 'books.spiders'

ROBOTSTXT_OBEY = True
HTTPCACHE_ENABLED = True

SPIDER_MIDDLEWARES = {
    'scrapy_deltafetch.DeltaFetch': 100,
    'scrapy_magicfields.MagicFieldsMiddleware': 200,
}

DELTAFETCH_ENABLED = True

MAGICFIELDS_ENABLED = True
MAGIC_FIELDS = {
    "timestamp": "$time",
    "spider": "$spider:name",
    "url": "scraped from $response:url",
    "domain": "$response:url,r'https?://([\w\.]+)/'",
}
--------------------------------------------------------------------------------
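Note: the BooksPipeline stub defined in books/pipelines.py is not registered in the settings above, so scraped items currently bypass it. Enabling it would mean adding an ITEM_PIPELINES entry to books/settings.py. A minimal sketch (the priority value 300 is an arbitrary choice, not something the project defines):

# books/settings.py (sketch; not part of the repository)
ITEM_PIPELINES = {
    'books.pipelines.BooksPipeline': 300,
}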
/books/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/books/spiders/toscrape.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


class ToscrapeSpider(scrapy.Spider):
    name = "toscrape"
    allowed_domains = ["books.toscrape.com"]
    start_urls = [
        'http://books.toscrape.com/',
    ]

    def parse(self, response):
        # Follow the link to each book's detail page on the current listing page.
        for book_url in response.css("article.product_pod > h3 > a ::attr(href)").extract():
            yield scrapy.Request(response.urljoin(book_url), callback=self.parse_book_page)
        # Follow the pagination link, if there is a next page.
        next_page = response.css("li.next > a ::attr(href)").extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_book_page(self, response):
        # Extract the book's title, category (from the breadcrumb) and description.
        item = {}
        product = response.css("div.product_main")
        item["title"] = product.css("h1 ::text").extract_first()
        item['category'] = response.xpath(
            "//ul[@class='breadcrumb']/li[@class='active']/preceding-sibling::li[1]/a/text()"
        ).extract_first()
        item['description'] = response.xpath(
            "//div[@id='product_description']/following-sibling::p/text()"
        ).extract_first()
        yield item
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = books.settings

[deploy]
#url = http://localhost:6800/
project = books
--------------------------------------------------------------------------------
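Running the crawler is not documented in the repository itself; a minimal sketch, assuming Scrapy plus the scrapy-deltafetch and scrapy-magicfields packages referenced in settings.py are installed from PyPI under those names:

# from the project root (sketch; the repository pins no versions)
pip install scrapy scrapy-deltafetch scrapy-magicfields
scrapy crawl toscrape -o books.json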