├── mercado ├── __init__.py ├── spiders │ ├── __init__.py │ └── spider.py ├── items.py ├── pipelines.py ├── middlewares.py └── settings.py ├── .gitignore ├── scrapy.cfg └── README.md /mercado/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.csv 3 | *.jpg 4 | -------------------------------------------------------------------------------- /mercado/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = mercado.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = mercado 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tutorial Scrapy; Extraer información de Mercado Libre 2 | 3 | Si quieres ver cómo se realizó este proyecto puedes verlo en video: https://goo.gl/73crqm 4 | 5 | Si tienes alguna duda, puedes crear una pregunta en: [Issues](https://github.com/luisramirez-m/mercadolibre-scrapy/issues "Issues") 6 | 7 | ## Modificaciones: 8 | 9 | 10 | ### [0.0.3] - 2017-09-13 11 | - Cambios para el nuevo diseño de Mercado Libre 12 | - Se removieron muchos campos/variables que 
Mercado Libre ya no usa 13 | 14 | 15 | ### [0.0.2] - 2017-03-12 16 | - Bajar imágenes del producto 17 |  - Recuerda cambiar en el archivo settings.py tu ruta donde quieres que se descargen las imágenes: 18 | 19 | ``` 20 | IMAGES_STORE = '/URL/DE/TU/DIRECTORIO/imagenes' 21 | ``` 22 | 23 | ### [0.0.1] - 2017-03-11 24 | - Primera version 25 | - Tutorial en video 26 | -------------------------------------------------------------------------------- /mercado/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MercadoItem(scrapy.Item): 12 | # One scraped Mercado Libre product listing. 13 | # Field names are in Spanish on purpose: they become the CSV column names in MercadoPipeline. 14 | 15 | # Product information taken from the listing detail page 16 | titulo = scrapy.Field() 17 | modelo = scrapy.Field() 18 | marca = scrapy.Field() 19 | tecnologia = scrapy.Field() 20 | tipo = scrapy.Field() 21 | precio = scrapy.Field() 22 | condicion = scrapy.Field() 23 | envio = scrapy.Field() 24 | ubicacion = scrapy.Field() 25 | opiniones = scrapy.Field() 26 | 27 | # Image fields; 'image_urls' and 'images' are the field names the images pipeline reads/populates 28 | image_urls = scrapy.Field() 29 | images = scrapy.Field() 30 | image_name = scrapy.Field() 31 | 32 | 33 | # Store / seller information 34 | vendedor_url = scrapy.Field() 35 | tipo_vendedor = scrapy.Field() 36 | ventas_vendedor = scrapy.Field() 37 | -------------------------------------------------------------------------------- /mercado/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | from scrapy import signals 10 | from scrapy.exporters import CsvItemExporter 11 | from 
scrapy.pipelines.images import ImagesPipeline 12 | from scrapy.exceptions import DropItem 13 | from scrapy import Request 14 | import csv 15 | 16 | class MercadoPipeline(object): # Exports every scraped item to '<spider name>_items.csv'. 17 | def __init__(self): 18 | self.files = {} # one open CSV file handle per running spider 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): # Wire the pipeline to spider open/close signals. 22 | pipeline = cls() 23 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 24 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 25 | return pipeline 26 | 27 | def spider_opened(self, spider): # Open the CSV file and start the exporter with a fixed column order. 28 | file = open('%s_items.csv' % spider.name, 'w+b') 29 | self.files[spider] = file 30 | self.exporter = CsvItemExporter(file) 31 | self.exporter.fields_to_export = ['titulo', 'modelo', 'marca', 'tecnologia', 'tipo', 'precio', 'condicion', 'envio', 'ubicacion','opiniones', 32 | 'vendedor_url', 'tipo_vendedor', 'ventas_vendedor', 'image_name', 'image_urls'] 33 | self.exporter.start_exporting() 34 | 35 | def spider_closed(self, spider): # Flush the exporter and close this spider's CSV file. 36 | self.exporter.finish_exporting() 37 | file = self.files.pop(spider) 38 | file.close() 39 | 40 | def process_item(self, item, spider): # Write one item as a CSV row and pass the item on unchanged. 41 | self.exporter.export_item(item) 42 | return item 43 | 44 | class MercadoImagenesPipeline(ImagesPipeline): # Downloads product images, naming each file after the item title. 45 | 46 | def get_media_requests(self, item, info): # forward the item's 'image_name' via request.meta so file_path can use it 47 | return [Request(x, meta={'image_name': item["image_name"]}) 48 | for x in item.get('image_urls', [])] 49 | 50 | def file_path(self, request, response=None, info=None): # Save each downloaded image as '<image_name>.jpg'. 51 | return '%s.jpg' % request.meta['image_name'] 52 | -------------------------------------------------------------------------------- /mercado/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MercadoSpiderMiddleware(object): 12 | # Not all methods need to be 
defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /mercado/spiders/spider.py: -------------------------------------------------------------------------------- 1 | # import sys 2 | # reload(sys) 3 | # sys.setdefaultencoding('utf8') 4 | 5 | import scrapy 6 | from scrapy.spider import CrawlSpider, Rule 7 | from scrapy.linkextractors import LinkExtractor 8 | from scrapy.exceptions import CloseSpider 9 | from mercado.items import MercadoItem 10 | 11 | class MercadoSpider(CrawlSpider): 12 | name = 'mercado' 13 | item_count = 0 14 | allowed_domain = ['www.mercadolibre.com.mx'] 15 | start_urls = ['http://listado.mercadolibre.com.mx/impresoras#D[A:impresoras,L:1]'] 16 | 17 | rules = { 18 | # Para cada item 19 | Rule(LinkExtractor(allow = (), restrict_xpaths = ('//li[@class="pagination__next"]/a'))), 20 | Rule(LinkExtractor(allow =(), restrict_xpaths = ('//h2[contains(@class,"item__title")]/a')), 21 | callback = 'parse_item', follow = False) 22 | } 23 | 24 | def parse_item(self, response): 25 | ml_item = MercadoItem() 26 | #info de producto 27 | ml_item['titulo'] = response.xpath('normalize-space(//h1[@class="item-title__primary "]/text())').extract_first() 28 | ml_item['modelo'] = response.xpath('normalize-space(/html/body/main/div/div/div[1]/div[1]/section[1]/div/section[1]/ul/li[1]/span)').extract() 29 | ml_item['marca'] = response.xpath('normalize-space(/html/body/main/div/div/div[1]/div[1]/section[1]/div/section[1]/ul/li[2]/span)').extract() 30 | ml_item['tecnologia'] = response.xpath('normalize-space(/html/body/main/div/div/div[1]/div[1]/section[1]/div/section[2]/ul/li[1]/span)').extract() 31 | ml_item['tipo'] = response.xpath('normalize-space(/html/body/main/div/div/div[1]/div[1]/section[1]/div/section[2]/ul/li[2]/span)').extract() 32 | ml_item['precio'] = 
response.xpath('normalize-space(//span[@class="price-tag-fraction"]/text())').extract() 33 | ml_item['condicion'] = response.xpath('normalize-space(//div[@class="item-conditions"]/text())').extract() 34 | ml_item['envio'] = response.xpath('normalize-space(//p[contains(@class, "shipping-method-title")]/text())').extract() 35 | ml_item['ubicacion'] = response.xpath('normalize-space(//p[contains(@class, "card-description")])').extract() 36 | ml_item['opiniones'] = response.xpath('normalize-space(//span[@class="review-summary-average"]/text())').extract() 37 | # Product images; image_name doubles as the saved file name in the images pipeline 38 | ml_item['image_urls'] = response.xpath('//figure[contains(@class, "gallery-image-container")]/a/img/@src').extract() 39 | ml_item['image_name'] = response.xpath('normalize-space(//h1[@class="item-title__primary "]/text())').extract_first() 40 | # Store / seller info 41 | ml_item['vendedor_url'] = response.xpath('//*[contains(@class, "reputation-view-more")]/@href').extract() 42 | ml_item['tipo_vendedor'] = response.xpath('normalize-space(//p[contains(@class, "power-seller")]/text())').extract() 43 | ml_item['ventas_vendedor'] = response.xpath('normalize-space(//div[@class="feedback-title"]/text())').extract() 44 | self.item_count += 1 45 | if self.item_count > 5: 46 | raise CloseSpider('item_exceeded') 47 | yield ml_item 48 | -------------------------------------------------------------------------------- /mercado/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for mercado project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'mercado' 13 | 14 | SPIDER_MODULES = ['mercado.spiders'] 15 | NEWSPIDER_MODULE = 'mercado.spiders' 16 | 17 | # Item pipelines: CSV export (500) runs before image download (600); lower order = earlier 18 | ITEM_PIPELINES = {'mercado.pipelines.MercadoPipeline': 500, 19 | 'mercado.pipelines.MercadoImagenesPipeline': 600, } 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'mercado (+http://www.yourdomain.com)' 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = True 26 | 27 | # Image download settings; set IMAGES_STORE to a real local directory before running 28 | IMAGES_STORE = '/URL/DE/TU/DIRECTORIO/imagenes' 29 | DOWNLOAD_DELAY = 2 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | #CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | #DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | #CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | #COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | #TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | #DEFAULT_REQUEST_HEADERS = { 50 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | # 'Accept-Language': 'en', 52 | #} 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'mercado.middlewares.mercadoSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader 
middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'mercado.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | #ITEM_PIPELINES = { 75 | # 'mercado.pipelines.MercadoPipeline': 300, 76 | #} 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | --------------------------------------------------------------------------------