├── __init__.py
├── items.pyc
├── __init__.pyc
├── settings.pyc
├── dbs
│   └── default.db
├── spiders
│   ├── __init__.pyc
│   ├── leboncoin_spider.pyc
│   ├── __init__.py
│   └── leboncoin_spider.py
├── pipelines.py
├── .gitignore
├── items.py
├── README.md
├── settings.py
└── leboncoin.py

/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/items.pyc
--------------------------------------------------------------------------------
/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/__init__.pyc
--------------------------------------------------------------------------------
/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/settings.pyc
--------------------------------------------------------------------------------
/dbs/default.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/dbs/default.db
--------------------------------------------------------------------------------
/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/spiders/__init__.pyc
--------------------------------------------------------------------------------
/spiders/leboncoin_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierreatovh/scrapy-lbc/HEAD/spiders/leboncoin_spider.pyc
--------------------------------------------------------------------------------
/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class LeboncoinPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
#     scrapy genspider myspider myspider-domain.com
#
# For more info see:
# http://doc.scrapy.org/topics/spiders.html
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
--------------------------------------------------------------------------------
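Note that the LeboncoinPipeline stub above does nothing until it is listed in ITEM_PIPELINES, which the project's settings.py (further down) does not yet do. A minimal sketch of that registration, assuming the list-style syntax of the scrapy 0.12 era this project targets:

    # hypothetical addition to settings.py: activate the pass-through pipeline
    ITEM_PIPELINES = ['leboncoin.pipelines.LeboncoinPipeline']

With that line in place, every item yielded by a spider is routed through LeboncoinPipeline.process_item before being collected.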
/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class LeboncoinItem(Item):
    # define the fields for your item here like:
    # name = Field()

    name = Field()
    photo = Field()
    url = Field()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scrapy-lbc
==========

Scraper for leboncoin.fr.

New: all-in-one script
======================

Run `python leboncoin.py`;
it will create ameublement.html and decoration.html.

How to use?
===========

1. Install scrapy (0.12 or later).

2. git clone git@github.com:baqs/scrapy-lbc.git

3. mkdir /tmp/lbc

4. cd scrapy-lbc

5. scrapy crawl leboncoin

That's it, take a look at /tmp/lbc/ ;)
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for leboncoin project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'leboncoin'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['leboncoin.spiders']
NEWSPIDER_MODULE = 'leboncoin.spiders'
DEFAULT_ITEM_CLASS = 'leboncoin.items.LeboncoinItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
--------------------------------------------------------------------------------
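Since DEFAULT_ITEM_CLASS points at leboncoin.items.LeboncoinItem, every scraped record is an instance of that class. Scrapy items behave like dicts restricted to their declared fields; a small illustrative sketch (the values are made up, and this snippet is not part of the repo):

    from scrapy.item import Item, Field

    class LeboncoinItem(Item):
        name = Field()
        photo = Field()
        url = Field()

    item = LeboncoinItem()
    item['name'] = [u'Table basse']   # XPath extraction returns lists of unicode strings
    print item['name']
    # item['price'] = 10              # would raise KeyError: 'price' is not a declared field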
/spiders/leboncoin_spider.py:
--------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from leboncoin.items import LeboncoinItem

#class LeBonCoinSpider(CrawlSpider):
class LeBonCoinSpider(BaseSpider):
    name = "leboncoin"
    allowed_domains = ["www.leboncoin.fr"]
    # we could use CrawlSpider, but the result is not digestible... better to generate the last X URLs
    categories = ['ameublement', 'decoration']
    start_urls = []
    for category in categories:
        for i in range(1, 30):
            start_urls.append('http://www.leboncoin.fr/' + category + '/offres/nord_pas_de_calais/?o=' + str(i))

    # CrawlSpider rules kept for reference:
    # rules = (
    #     # Extract links matching '?o=' (pagination) and follow them
    #     # (no callback would mean follow=True by default).
    #     #Rule(SgmlLinkExtractor(allow=('\?o=', ), deny=('subsection\.php', ))),
    #     #Rule(SgmlLinkExtractor(allow=('\?o=', )), callback='parse_item', follow=True),
    #     Rule(SgmlLinkExtractor(allow=('\?o=', )), callback='parse_item', follow=False),
    #
    #     # Extract links matching 'item.php' and parse them with the spider's method parse_item
    #     #Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    # )

    # def parse_item(self, response):
    def parse(self, response):
        # debug alternative: dump the raw page to a file
        # filename = response.url.split("/")[-2]
        # open(filename, 'wb').write(response.body)
        hxs = HtmlXPathSelector(response)
        ads = hxs.select('//div[@class="list-ads"]/a')
        items = []
        for ad in ads:
            item = LeboncoinItem()
            item['name'] = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
            item['photo'] = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
            item['url'] = ad.select('@href').extract()
            self.log(str(item['name']))
            # build a small HTML snippet for this ad: thumbnail plus a link wrapping the title
            html = '<div>\
<img src="%s" />\
<a href="%s">%s</a>\
</div>' % (''.join(item['photo']), ''.join(item['url']), ''.join(item['name']))
            items.append(item)
            # append the snippet to the per-category file (category is segment -4 of the URL)
            filename = response.url.split("/")[-4]
            open('/tmp/lbc/' + filename + '.html', 'a').write(html)
        return items
--------------------------------------------------------------------------------
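Before the all-in-one variant in leboncoin.py below, it may help to see the spider's extraction idiom in isolation. A minimal sketch against the scrapy 0.12-era selector API used above; the HTML fragment is invented and far simpler than the real leboncoin.fr markup:

    from scrapy.http import HtmlResponse
    from scrapy.selector import HtmlXPathSelector

    body = ('<div class="list-ads">'
            '<a href="http://www.leboncoin.fr/ameublement/12345.htm">'
            '<div class="ad-lbc"><div class="detail">'
            '<div class="title"> Table basse </div>'
            '</div></div></a></div>')
    response = HtmlResponse(url='http://www.leboncoin.fr/ameublement/offres/nord_pas_de_calais/?o=1',
                            body=body)

    hxs = HtmlXPathSelector(response)
    for ad in hxs.select('//div[@class="list-ads"]/a'):
        # .re() extracts the text and returns the regex capture groups,
        # which is how the spider strips the whitespace padding around titles
        print ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
        print ad.select('@href').extract()

This prints [u'Table basse'] followed by the ad URL.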
/leboncoin.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: Rolando Espinoza La fuente
#
# Changelog:
# 24/07/2011 - updated to work with scrapy 13.0dev
# 25/08/2010 - initial version. works with scrapy 0.9

from scrapy.contrib.loader import XPathItemLoader
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider


class LeboncoinItem(Item):
    # define the fields for your item here like:
    # name = Field()

    name = Field()
    photo = Field()
    url = Field()
    category = Field()


class LeboncoinSpider(BaseSpider):
    name = "leboncoin"
    allowed_domains = ["www.leboncoin.fr"]
    # we could use CrawlSpider, but the result is not digestible... better to generate the last X URLs
    categories = ['ameublement', 'decoration']
    start_urls = []

    # truncate the per-category output files left over from a previous run
    for category in categories:
        open(category + '.html', 'w').write('')
        for i in range(1, 30):
            start_urls.append('http://www.leboncoin.fr/' + category + '/offres/nord_pas_de_calais/?o=' + str(i))

    def parse(self, response):
        # previous version, kept for reference (superseded by the loader-based code below):
        # hxs = HtmlXPathSelector(response)
        # ads = hxs.select('//div[@class="list-ads"]/a')
        # items = []
        # for ad in ads:
        #     item = LeboncoinItem()
        #     item['name'] = ad.select('div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()').re('^\s*([\w\s]+\w)\s*')
        #     item['photo'] = ad.select('div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src').extract()
        #     item['url'] = ad.select('@href').extract()
        #     self.log(str(item['name']))
        #     html = '<div>\
        #            <img src="%s" />\
        #            <a href="%s">%s</a>\
        #            </div>' % (''.join(item['photo']), ''.join(item['url']), ''.join(item['name']))
        #     items.append(item)
        #     filename = response.url.split("/")[-4]
        #     open('/tmp/lbc/' + filename + '.html', 'a').write(html)
        # return items
        hxs = HtmlXPathSelector(response)
        for qxs in hxs.select('//div[@class="list-ads"]/a'):
            loader = XPathItemLoader(LeboncoinItem(), selector=qxs)
            loader.add_xpath('name', 'div[@class="ad-lbc"]/div[@class="detail"]/div[@class="title"]/text()', re='^\s*([\w\s]+\w)\s*')
            loader.add_xpath('photo', 'div[@class="ad-lbc"]/div[@class="image"]/div[@class="image-and-nb"]/img/@src')
            loader.add_xpath('url', '@href')
            loader.add_value('category', response.url.split("/")[-4])

            yield loader.load_item()


def printItem(item):
    filename = ''.join(item['category'])
    # we don't care whether the file already exists or has a header
    #try:
    #    open(filename + '.html')
    #except IOError as e:
    #    # need to write header in file

    # make sure every key used in the format string exists, even when extraction found nothing
    keys = ['photo', 'url', 'name']
    for key in keys:
        if key not in item:
            item[key] = ''
    html = '<div>\
<a href="%s">%s</a>\
<img src="%s" />\
</div>' % (''.join(item['url']), ''.join(item['name']), ''.join(item['photo']))
    open(filename + '.html', 'a').write(html)


def main():
    """Set up the item-passed signal and run the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    # items = []

    def catch_item(sender, item, **kwargs):
        printItem(item)
        # print "Got:", item
        # items.append(item)

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off logging
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up the crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule the spider
    crawler.crawl(LeboncoinSpider())

    # start the scrapy/twisted engine
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"

    #printHTML(items)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
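One design note on main(): the dispatcher/item_passed hookup is what lets a plain `python leboncoin.py` receive items without a Scrapy project or pipeline. The same hook can just as easily accumulate items in memory instead of writing HTML on the fly; a sketch using the same 0.12-era APIs as leboncoin.py above (these modules have since moved in modern Scrapy):

    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    collected = []

    def collect_item(sender, item, **kwargs):
        # called once per item that the engine passes through
        collected.append(item)

    dispatcher.connect(collect_item, signal=signals.item_passed)
    # then install/configure/crawl/start the CrawlerProcess exactly as in main(),
    # and post-process `collected` after the engine stops.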