├── .gitignore
├── README.md
├── isbullshit
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── isbullshit_spiders.py
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------
*~
*.pyc
*.py#
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This project contains the code of the spider described in my blogpost [Crawl a website with Scrapy](http://isbullsh.it/2012/04/Web-crawling-with-scrapy/).

This spider crawls the website [http://isbullsh.it](http://isbullsh.it) and extracts information about each blogpost:

* title
* author
* tag(s)
* release date
* url
* HTML-formatted text
* location

We implement the spider using [Scrapy](http://scrapy.org).

# Requirements

* Scrapy: `pip install Scrapy`
* pymongo: `pip install pymongo`
* An installed and running MongoDB server

# How do I test it?
Release the spider by running

    scrapy crawl isbullshit

--------------------------------------------------------------------------------
/isbullshit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brouberol/isbullshit-crawler/ef288e4fe7eb6a17f5eff9a9a0aebf6789210b67/isbullshit/__init__.py
--------------------------------------------------------------------------------
/isbullshit/items.py:
--------------------------------------------------------------------------------
from scrapy.item import Item, Field

class IsBullshitItem(Item):
    """ Definition of all the fields we want to extract from a scraped webpage. """
    title = Field()
    author = Field()
    tag = Field()
    date = Field()
    url = Field()
    location = Field()
    article_html = Field()
--------------------------------------------------------------------------------
/isbullshit/pipelines.py:
--------------------------------------------------------------------------------
import pymongo

from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log

class MongoDBStorage(object):
    def __init__(self):
        """ Initiate a MongoDB connection and create the settings['MONGODB_COLLECTION'] collection. """
        connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        """ This method is called each time an item is scraped from a webpage.
            If the item validates, we store it in the MongoDB collection. If not,
            we drop it.
        """
        # Validate article
        if not item['article_html']:
            raise DropItem("Missing article text of article from %s" % item['url'])
        elif not item['title']:
            raise DropItem("Missing title of article from %s" % item['url'])
        else:
            # If the article is valid, insert it in the MongoDB collection
            # and log the insertion
            self.collection.insert(dict(item))
            log.msg("Item written to MongoDB database %s/%s" %
                    (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
                    level=log.DEBUG, spider=spider)
        return item
--------------------------------------------------------------------------------
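A quick way to check that the pipeline actually stored something after a crawl is to query the MongoDB collection directly. The snippet below is only a sketch: it reuses the database and collection names defined in settings.py and the same legacy pymongo Connection API as the pipeline (recent pymongo releases replace Connection with MongoClient), and it follows the project's Python 2 style.

    import pymongo

    # Connect to the database/collection the pipeline writes to (see settings.py)
    connection = pymongo.Connection("localhost", 27017)
    collection = connection["isbullshit"]["articles"]

    # Count the stored articles and peek at one of them
    print "%d articles stored" % collection.count()
    print collection.find_one({}, {"title": 1, "url": 1})
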
18 | """ 19 | # Validate article 20 | if not item['article_html']: 21 | raise DropItem("Missing article text of article from %s" %item['url']) 22 | elif not item['title']: 23 | raise DropItem("Missing title of object from %s" %item['url']) 24 | else: 25 | 26 | # If valid article, insert it in MongoDB collection 27 | # Log this insertion 28 | self.collection.insert(dict(item)) 29 | log.msg("Item wrote to MongoDB database %s/%s" % 30 | (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']), 31 | level=log.DEBUG, spider=spider) 32 | return item 33 | -------------------------------------------------------------------------------- /isbullshit/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for isbullshit project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'isbullshit' 10 | BOT_VERSION = '1.0' 11 | 12 | SPIDER_MODULES = ['isbullshit.spiders'] 13 | NEWSPIDER_MODULE = 'isbullshit.spiders' 14 | USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION) 15 | 16 | ITEM_PIPELINES = [ 17 | 'isbullshit.pipelines.MongoDBStorage', 18 | ] 19 | 20 | MONGODB_SERVER = "localhost" 21 | MONGODB_PORT = 27017 22 | MONGODB_DB = "isbullshit" 23 | MONGODB_COLLECTION = "articles" 24 | -------------------------------------------------------------------------------- /isbullshit/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
/isbullshit/spiders/isbullshit_spiders.py:
--------------------------------------------------------------------------------
import urlparse

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from isbullshit.items import IsBullshitItem


class IsBullshitSpider(CrawlSpider):
    """ General configuration of the Crawl Spider """
    name = 'isbullshit'
    start_urls = ['http://isbullsh.it']  # urls from which the spider will start crawling
    rules = [Rule(SgmlLinkExtractor(allow=[r'page/\d+']), follow=True),
             # r'page/\d+': regular expression for http://isbullsh.it/page/X URLs
             Rule(SgmlLinkExtractor(allow=[r'\d{4}/\d{2}/\w+']), callback='parse_blogpost')]
             # r'\d{4}/\d{2}/\w+': regular expression for http://isbullsh.it/YYYY/MM/title URLs

    def parse_blogpost(self, response):
        """ Extract title, author, tag(s), date, location, url and the html text of a blogpost,
            using XPath selectors
        """
        hxs = HtmlXPathSelector(response)
        item = IsBullshitItem()
        # Extract title
        item['title'] = hxs.select('//header/h1/text()').extract()[0]
        # Extract author
        item['author'] = hxs.select('//header/p/a/text()').extract()[0]
        # Extract tag(s)
        item['tag'] = hxs.select("//header/div[@class='post-data']/p/a/text()").extract()
        # Extract date
        item['date'] = hxs.select("//header/div[@class='post-data']/p[contains(text(), '20')]/text()").extract()[0]
        # Extract location
        item['location'] = hxs.select("//header/div[@class='post-data']/p[contains(text(), 'From')]/text()").extract()[0].replace('From', '')
        # Extract article url
        urls = hxs.select("//div[@class='breadcrumb-container']/ul[@class='breadcrumb']/li/a/@href").extract()
        item['url'] = urlparse.urljoin(urls[1], urls[2])
        # Extract article text, with html tags
        item['article_html'] = hxs.select("//div[@role='main']/article").extract()[0]

        return item
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = isbullshit.settings

[deploy]
#url = http://localhost:6800/
project = isbullshit
--------------------------------------------------------------------------------
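
If the blog's markup changes, the XPath expressions in parse_blogpost are the first thing to break. A convenient way to tweak them is Scrapy's interactive shell, sketched below against the blogpost URL from the README; depending on the Scrapy version, the shell exposes the selector as hxs (as used in this project) or as sel/response.

    scrapy shell http://isbullsh.it/2012/04/Web-crawling-with-scrapy/
    >>> hxs.select('//header/h1/text()').extract()
    >>> hxs.select("//div[@role='main']/article").extract()[0][:100]
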