├── __init__.py
├── items.pyc
├── __init__.pyc
├── settings.pyc
├── dbs
│   └── default.db
├── spiders
│   ├── __init__.pyc
│   ├── leboncoin_spider.pyc
│   ├── __init__.py
│   └── leboncoin_spider.py
├── pipelines.py
├── .gitignore
├── items.py
├── README.md
├── settings.py
└── leboncoin.py

/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class LeboncoinPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
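The pipeline above is a no-op until it is registered. As a hedged sketch (not
part of this repo), the registration line would go in settings.py; Scrapy
releases from this project's era (0.x) expect a list of class paths, while
Scrapy 1.0+ expects a dict mapping each class path to an integer priority:

    # Scrapy 0.x style, matching the version this project targets:
    ITEM_PIPELINES = ['leboncoin.pipelines.LeboncoinPipeline']
    # Scrapy 1.0+ style (class path -> priority between 0 and 1000):
    # ITEM_PIPELINES = {'leboncoin.pipelines.LeboncoinPipeline': 300}
--------------------------------------------------------------------------------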
/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
#   scrapy genspider myspider myspider-domain.com
#
# For more info see:
# http://doc.scrapy.org/topics/spiders.html
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
--------------------------------------------------------------------------------
/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class LeboncoinItem(Item):
    # Fields scraped for each ad.
    name = Field()
    photo = Field()
    url = Field()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scrapy-lbc
==========

A scraper for leboncoin.fr.

New: all-in-one script
======================

    python leboncoin.py

It will create ameublement.html and decoration.html.

How to use?
===========

1. Install Scrapy (min. v0.12)

2. git clone git@github.com:baqs/scrapy-lbc.git

3. mkdir /tmp/lbc

4. cd scrapy-lbc

5. scrapy crawl leboncoin

That's it, look at /tmp/lbc/ ;)
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for the leboncoin project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#   http://doc.scrapy.org/topics/settings.html

BOT_NAME = 'leboncoin'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['leboncoin.spiders']
NEWSPIDER_MODULE = 'leboncoin.spiders'
DEFAULT_ITEM_CLASS = 'leboncoin.items.LeboncoinItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
--------------------------------------------------------------------------------
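DEFAULT_ITEM_CLASS above points at LeboncoinItem from items.py. As a hedged
illustration (not part of this repo), a Scrapy Item behaves like a dict
restricted to its declared Fields, which is how the spider below fills it:

    from leboncoin.items import LeboncoinItem

    item = LeboncoinItem()
    item['name'] = ['Table basse']             # selector .re()/.extract() return lists
    item['photo'] = ['http://example.invalid/1.jpg']
    item['url'] = ['/ameublement/12345.htm']
    # item['price'] = 100  # would raise KeyError: 'price' is not a declared Field
--------------------------------------------------------------------------------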
/spiders/leboncoin_spider.py:
--------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from leboncoin.items import LeboncoinItem


# We could use a CrawlSpider with a Rule(SgmlLinkExtractor(allow=('\?o=',)),
# callback='parse_item', follow=False) to discover the pagination links, but
# the result is not digestible; generating the last X page URLs up front is
# simpler.
class LeBonCoinSpider(BaseSpider):
    name = "leboncoin"
    allowed_domains = ["www.leboncoin.fr"]
    categories = ['ameublement', 'decoration']
    # Build the first 29 result pages for each category.
    start_urls = []
    for category in categories:
        for i in range(1, 30):
            start_urls.append('http://www.leboncoin.fr/' + category
                              + '/offres/nord_pas_de_calais/?o=' + str(i))

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        ads = hxs.select('//div[@class="list-ads"]/a')
        items = []
        for ad in ads:
            item = LeboncoinItem()
            # Ad title, trimmed of surrounding whitespace.
            item['name'] = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re(r'^\s*([\w\s]+\w)\s*')
            # Thumbnail image URL.
            item['photo'] = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
            # Link to the ad's detail page.
            item['url'] = ad.select('@href').extract()
            self.log(str(item['name']))
            html = '
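The spider targets Scrapy 0.x APIs (BaseSpider, HtmlXPathSelector) that were
removed long ago. As a hedged sketch (not part of this repo, and the site's
markup has certainly changed since), the same extraction logic on modern
Scrapy (>= 2.x) would look roughly like this, with scrapy.Spider replacing
BaseSpider and response.xpath() replacing HtmlXPathSelector:

    import scrapy

    from leboncoin.items import LeboncoinItem


    class LeBonCoinSpider(scrapy.Spider):
        name = "leboncoin"
        allowed_domains = ["www.leboncoin.fr"]
        start_urls = [
            "http://www.leboncoin.fr/%s/offres/nord_pas_de_calais/?o=%d" % (c, i)
            for c in ("ameublement", "decoration")
            for i in range(1, 30)
        ]

        def parse(self, response):
            for ad in response.xpath('//div[@class="list-ads"]/a'):
                item = LeboncoinItem()
                # Same XPaths as the original spider, kept for illustration.
                item["name"] = ad.xpath('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re(r"^\s*([\w\s]+\w)\s*")
                item["photo"] = ad.xpath('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').getall()
                item["url"] = ad.xpath("@href").getall()
                # Yielding sends each item through ITEM_PIPELINES instead of
                # collecting everything into a list.
                yield item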