├── __init__.py
├── items.pyc
├── __init__.pyc
├── settings.pyc
├── dbs
│   └── default.db
├── spiders
│   ├── __init__.pyc
│   ├── leboncoin_spider.pyc
│   ├── __init__.py
│   └── leboncoin_spider.py
├── pipelines.py
├── .gitignore
├── items.py
├── README.md
├── settings.py
└── leboncoin.py

/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/items.pyc
--------------------------------------------------------------------------------
/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/__init__.pyc
--------------------------------------------------------------------------------
/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/settings.pyc
--------------------------------------------------------------------------------
/dbs/default.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/dbs/default.db
--------------------------------------------------------------------------------
/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/spiders/__init__.pyc
--------------------------------------------------------------------------------
/spiders/leboncoin_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/spiders/leboncoin_spider.pyc
--------------------------------------------------------------------------------
/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class LeboncoinPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
#     scrapy genspider myspider myspider-domain.com
#
# For more info see:
# http://doc.scrapy.org/topics/spiders.html
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
--------------------------------------------------------------------------------
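Note that the LeboncoinPipeline stub above does nothing until it is listed in ITEM_PIPELINES, which the project's settings.py (further down) does not yet do. A minimal sketch of that registration, assuming the list-style syntax of the scrapy 0.12 era this project targets:

    # hypothetical addition to settings.py: activate the pass-through pipeline
    ITEM_PIPELINES = ['leboncoin.pipelines.LeboncoinPipeline']

With that line in place, every item yielded by a spider is routed through LeboncoinPipeline.process_item before being collected.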
/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class LeboncoinItem(Item):
    # define the fields for your item here like:
    # name = Field()

    name = Field()
    photo = Field()
    url = Field()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scrapy-lbc
==========

Scraper for leboncoin.fr.

New: all-in-one script
======================

Run `python leboncoin.py`;
it will create ameublement.html and decoration.html.

How to use?
===========

1. Install scrapy (0.12 or later).

2. git clone git@github.com:baqs/scrapy-lbc.git

3. mkdir /tmp/lbc

4. cd scrapy-lbc

5. scrapy crawl leboncoin

That's it, take a look at /tmp/lbc/ ;)
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for leboncoin project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'leboncoin'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['leboncoin.spiders']
NEWSPIDER_MODULE = 'leboncoin.spiders'
DEFAULT_ITEM_CLASS = 'leboncoin.items.LeboncoinItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
--------------------------------------------------------------------------------
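Since DEFAULT_ITEM_CLASS points at leboncoin.items.LeboncoinItem, every scraped record is an instance of that class. Scrapy items behave like dicts restricted to their declared fields; a small illustrative sketch (the values are made up, and this snippet is not part of the repo):

    from scrapy.item import Item, Field

    class LeboncoinItem(Item):
        name = Field()
        photo = Field()
        url = Field()

    item = LeboncoinItem()
    item['name'] = [u'Table basse']   # XPath extraction returns lists of unicode strings
    print item['name']
    # item['price'] = 10              # would raise KeyError: 'price' is not a declared field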
/spiders/leboncoin_spider.py:
--------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from leboncoin.items import LeboncoinItem

#class LeBonCoinSpider(CrawlSpider):
class LeBonCoinSpider(BaseSpider):
    name = "leboncoin"
    allowed_domains = ["www.leboncoin.fr"]
    # we could use CrawlSpider, but the result is not digestible... better to generate the last X URLs
    categories = ['ameublement', 'decoration']
    start_urls = []
    for category in categories:
        for i in range(1, 30):
            start_urls.append('http://www.leboncoin.fr/' + category + '/offres/nord_pas_de_calais/?o=' + str(i))

    # CrawlSpider rules kept for reference:
    # rules = (
    #     # Extract links matching '?o=' (pagination) and follow them
    #     # (no callback would mean follow=True by default).
    #     #Rule(SgmlLinkExtractor(allow=('\?o=', ), deny=('subsection\.php', ))),
    #     #Rule(SgmlLinkExtractor(allow=('\?o=', )), callback='parse_item', follow=True),
    #     Rule(SgmlLinkExtractor(allow=('\?o=', )), callback='parse_item', follow=False),
    #
    #     # Extract links matching 'item.php' and parse them with the spider's method parse_item
    #     #Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    # )

    # def parse_item(self, response):
    def parse(self, response):
        # debug alternative: dump the raw page to a file
        # filename = response.url.split("/")[-2]
        # open(filename, 'wb').write(response.body)
        hxs = HtmlXPathSelector(response)
        ads = hxs.select('//div[@class="list-ads"]/a')
        items = []
        for ad in ads:
            item = LeboncoinItem()
            item['name'] = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
            item['photo'] = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
            item['url'] = ad.select('@href').extract()
            self.log(str(item['name']))
            # build a small HTML snippet for this ad: thumbnail plus a link wrapping the title
            html = '<div>\
<img src="%s" />\
<a href="%s">%s</a>\
</div>' % (''.join(item['photo']), ''.join(item['url']), ''.join(item['name']))
            items.append(item)
            # append the snippet to the per-category file (category is segment -4 of the URL)
            filename = response.url.split("/")[-4]
            open('/tmp/lbc/' + filename + '.html', 'a').write(html)
        return items
--------------------------------------------------------------------------------
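Before the all-in-one variant in leboncoin.py below, it may help to see the spider's extraction idiom in isolation. A minimal sketch against the scrapy 0.12-era selector API used above; the HTML fragment is invented and far simpler than the real leboncoin.fr markup:

    from scrapy.http import HtmlResponse
    from scrapy.selector import HtmlXPathSelector

    body = ('<div class="list-ads">'
            '<a href="http://www.leboncoin.fr/ameublement/12345.htm">'
            '<div class="ad-lbc"><div class="detail">'
            '<div class="title"> Table basse </div>'
            '</div></div></a></div>')
    response = HtmlResponse(url='http://www.leboncoin.fr/ameublement/offres/nord_pas_de_calais/?o=1',
                            body=body)

    hxs = HtmlXPathSelector(response)
    for ad in hxs.select('//div[@class="list-ads"]/a'):
        # .re() extracts the text and returns the regex capture groups,
        # which is how the spider strips the whitespace padding around titles
        print ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
        print ad.select('@href').extract()

This prints [u'Table basse'] followed by the ad URL.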
/leboncoin.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: Rolando Espinoza La fuente
#
# Changelog:
# 24/07/2011 - updated to work with scrapy 13.0dev
# 25/08/2010 - initial version. works with scrapy 0.9

from scrapy.contrib.loader import XPathItemLoader
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider


class LeboncoinItem(Item):
    # define the fields for your item here like:
    # name = Field()

    name = Field()
    photo = Field()
    url = Field()
    category = Field()


class LeboncoinSpider(BaseSpider):
    name = "leboncoin"
    allowed_domains = ["www.leboncoin.fr"]
    # we could use CrawlSpider, but the result is not digestible... better to generate the last X URLs
    categories = ['ameublement', 'decoration']
    start_urls = []

    # truncate the per-category output files left over from a previous run
    for category in categories:
        open(category + '.html', 'w').write('')
        for i in range(1, 30):
            start_urls.append('http://www.leboncoin.fr/' + category + '/offres/nord_pas_de_calais/?o=' + str(i))

    def parse(self, response):
        # previous version, kept for reference (superseded by the loader-based code below):
        # hxs = HtmlXPathSelector(response)
        # ads = hxs.select('//div[@class="list-ads"]/a')
        # items = []
        # for ad in ads:
        #     item = LeboncoinItem()
        #     item['name'] = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
        #     item['photo'] = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
        #     item['url'] = ad.select('@href').extract()
        #     self.log(str(item['name']))
        #     html = '<div>\
        #            <img src="%s" />\
        #            <a href="%s">%s</a>\
        #            </div>' % (''.join(item['photo']), ''.join(item['url']), ''.join(item['name']))
        #     items.append(item)
        #     filename = response.url.split("/")[-4]
        #     open('/tmp/lbc/' + filename + '.html', 'a').write(html)
        # return items
        hxs = HtmlXPathSelector(response)
        for qxs in hxs.select('//div[@class="list-ads"]/a'):
            loader = XPathItemLoader(LeboncoinItem(), selector=qxs)
            loader.add_xpath('name', 'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()', re='^\s*([\w\s]+\w)\s*')
            loader.add_xpath('photo', 'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src')
            loader.add_xpath('url', '@href')
            loader.add_value('category', response.url.split("/")[-4])

            yield loader.load_item()


def printItem(item):
    filename = ''.join(item['category'])
    # we don't care whether the file already exists or has a header
    #try:
    #    open(filename + '.html')
    #except IOError as e:
    #    # need to write header in file

    # make sure every key used in the format string exists, even when extraction found nothing
    keys = ['photo', 'url', 'name']
    for key in keys:
        if key not in item:
            item[key] = ''
    html = '<div>\
<a href="%s">%s</a>\
<img src="%s" />\
</div>' % (''.join(item['url']), ''.join(item['name']), ''.join(item['photo']))
    open(filename + '.html', 'a').write(html)


def main():
    """Set up the item-passed signal and run the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    # items = []

    def catch_item(sender, item, **kwargs):
        printItem(item)
        # print "Got:", item
        # items.append(item)

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off logging
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up the crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule the spider
    crawler.crawl(LeboncoinSpider())

    # start the scrapy/twisted engine
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"

    #printHTML(items)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
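One design note on main(): the dispatcher/item_passed hookup is what lets a plain `python leboncoin.py` receive items without a Scrapy project or pipeline. The same hook can just as easily accumulate items in memory instead of writing HTML on the fly; a sketch using the same 0.12-era APIs as leboncoin.py above (these modules have since moved in modern Scrapy):

    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    collected = []

    def collect_item(sender, item, **kwargs):
        # called once per item that the engine passes through
        collected.append(item)

    dispatcher.connect(collect_item, signal=signals.item_passed)
    # then install/configure/crawl/start the CrawlerProcess exactly as in main(),
    # and post-process `collected` after the engine stops.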