├── .gitignore
├── README.md
├── isbullshit
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── isbullshit_spiders.py
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------
*~
*.pyc
*.py#
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This project contains the code of the spider described in my blogpost [Crawl a website with Scrapy](http://isbullsh.it/2012/04/Web-crawling-with-scrapy/).

This spider crawls the website [http://isbullsh.it](http://isbullsh.it) and extracts information about each blogpost:

* title
* author
* tag(s)
* release date
* url
* HTML-formatted text
* location

We implement the spider using [Scrapy](http://scrapy.org).

# Requirements

* Scrapy: `pip install Scrapy`
* pymongo: `pip install pymongo`
* An installed and running MongoDB server

# How do I test it?
Release the spider by running

    scrapy crawl isbullshit

--------------------------------------------------------------------------------
/isbullshit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brouberol/isbullshit-crawler/ef288e4fe7eb6a17f5eff9a9a0aebf6789210b67/isbullshit/__init__.py
--------------------------------------------------------------------------------
/isbullshit/items.py:
--------------------------------------------------------------------------------
from scrapy.item import Item, Field

class IsBullshitItem(Item):
    """ Definition of all the fields we want to extract from a scraped webpage. """
    title = Field()
    author = Field()
    tag = Field()
    date = Field()
    url = Field()
    location = Field()
    article_html = Field()
--------------------------------------------------------------------------------
/isbullshit/pipelines.py:
--------------------------------------------------------------------------------
import pymongo

from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log

class MongoDBStorage(object):
    def __init__(self):
        """ Initiate a MongoDB connection and create the settings['MONGODB_COLLECTION'] collection. """
        connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        """ This method is called each time an item is scraped from a webpage.
            If the item validates, we store it in the MongoDB collection. If not,
            we drop it.
        """
        # Validate article
        if not item['article_html']:
            raise DropItem("Missing article text of article from %s" % item['url'])
        elif not item['title']:
            raise DropItem("Missing title of article from %s" % item['url'])
        else:
            # If the article is valid, insert it in the MongoDB collection
            # and log the insertion
            self.collection.insert(dict(item))
            log.msg("Item written to MongoDB database %s/%s" %
                    (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                    level=log.DEBUG, spider=spider)
        return item
--------------------------------------------------------------------------------
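A quick way to check that the pipeline actually stored something after a crawl is to query the MongoDB collection directly. The snippet below is only a sketch: it reuses the database and collection names defined in settings.py and the same legacy pymongo Connection API as the pipeline (recent pymongo releases replace Connection with MongoClient), and it follows the project's Python 2 style.

    import pymongo

    # Connect to the database/collection the pipeline writes to (see settings.py)
    connection = pymongo.Connection("localhost", 27017)
    collection = connection["isbullshit"]["articles"]

    # Count the stored articles and peek at one of them
    print "%d articles stored" % collection.count()
    print collection.find_one({}, {"title": 1, "url": 1})
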
18 | """ 19 | # Validate article 20 | if not item['article_html']: 21 | raise DropItem("Missing article text of article from %s" %item['url']) 22 | elif not item['title']: 23 | raise DropItem("Missing title of object from %s" %item['url']) 24 | else: 25 | 26 | # If valid article, insert it in MongoDB collection 27 | # Log this insertion 28 | self.collection.insert(dict(item)) 29 | log.msg("Item wrote to MongoDB database %s/%s" % 30 | (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']), 31 | level=log.DEBUG, spider=spider) 32 | return item 33 | -------------------------------------------------------------------------------- /isbullshit/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for isbullshit project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'isbullshit' 10 | BOT_VERSION = '1.0' 11 | 12 | SPIDER_MODULES = ['isbullshit.spiders'] 13 | NEWSPIDER_MODULE = 'isbullshit.spiders' 14 | USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION) 15 | 16 | ITEM_PIPELINES = [ 17 | 'isbullshit.pipelines.MongoDBStorage', 18 | ] 19 | 20 | MONGODB_SERVER = "localhost" 21 | MONGODB_PORT = 27017 22 | MONGODB_DB = "isbullshit" 23 | MONGODB_COLLECTION = "articles" 24 | -------------------------------------------------------------------------------- /isbullshit/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
/isbullshit/spiders/isbullshit_spiders.py:
--------------------------------------------------------------------------------
import urlparse

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from isbullshit.items import IsBullshitItem


class IsBullshitSpider(CrawlSpider):
    """ General configuration of the Crawl Spider """
    name = 'isbullshit'
    start_urls = ['http://isbullsh.it']  # urls from which the spider will start crawling
    rules = [Rule(SgmlLinkExtractor(allow=[r'page/\d+']), follow=True),
             # r'page/\d+': regular expression for http://isbullsh.it/page/X URLs
             Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\w+']), callback='parse_blogpost')]
             # r'\d{4}/\d{2}/\w+': regular expression for http://isbullsh.it/YYYY/MM/title URLs

    def parse_blogpost(self, response):
        """ Extract title, author, tag(s), date, location, url and the html text of a blogpost,
            using XPath selectors
        """
        hxs = HtmlXPathSelector(response)
        item = IsBullshitItem()
        # Extract title
        item['title'] = hxs.select('//header/h1/text()').extract()[0]
        # Extract author
        item['author'] = hxs.select('//header/p/a/text()').extract()[0]
        # Extract tag(s)
        item['tag'] = hxs.select("//header/div[@class='post-data']/p/a/text()").extract()
        # Extract date
        item['date'] = hxs.select("//header/div[@class='post-data']/p[contains(text(), '20')]/text()").extract()[0]
        # Extract location
        item['location'] = hxs.select("//header/div[@class='post-data']/p[contains(text(), 'From')]/text()").extract()[0].replace('From', '')
        # Extract article url
        urls = hxs.select("//div[@class='breadcrumb-container']/ul[@class='breadcrumb']/li/a/@href").extract()
        item['url'] = urlparse.urljoin(urls[1], urls[2])
        # Extract article text, with html tags
        item['article_html'] = hxs.select("//div[@role='main']/article").extract()[0]

        return item
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = isbullshit.settings

[deploy]
#url = http://localhost:6800/
project = isbullshit
--------------------------------------------------------------------------------
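
If the blog's markup changes, the XPath expressions in parse_blogpost are the first thing to break. A convenient way to tweak them is Scrapy's interactive shell, sketched below against the blogpost URL from the README; depending on the Scrapy version, the shell exposes the selector as hxs (as used in this project) or as sel/response.

    scrapy shell http://isbullsh.it/2012/04/Web-crawling-with-scrapy/
    >>> hxs.select('//header/h1/text()').extract()
    >>> hxs.select("//div[@role='main']/article").extract()[0][:100]
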