├── mercado ├── __init__.py ├── spiders │ ├── __init__.py │ └── spider.py ├── items.py ├── pipelines.py ├── middlewares.py └── settings.py ├── .gitignore ├── scrapy.cfg └── README.md /mercado/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.csv 3 | *.jpg 4 | -------------------------------------------------------------------------------- /mercado/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = mercado.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = mercado 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tutorial Scrapy; Extraer información de Mercado Libre 2 | 3 | Si quieres ver cómo se realizó este proyecto puedes verlo en video: https://goo.gl/73crqm 4 | 5 | Si tienes alguna duda, puedes crear una pregunta en: [Issues](https://github.com/luisramirez-m/mercadolibre-scrapy/issues "Issues") 6 | 7 | ## Modificaciones: 8 | 9 | 10 | ### [0.0.3] - 2017-09-13 11 | - Cambios para el nuevo diseño de Mercado Libre 12 | - Se removieron muchos campos/variables que 
Mercado Libre ya no usa 13 | 14 | 15 | ### [0.0.2] - 2017-03-12 16 | - Bajar imágenes del producto 17 |  - Recuerda cambiar en el archivo settings.py tu ruta donde quieres que se descargen las imágenes: 18 | 19 | ``` 20 | IMAGES_STORE = '/URL/DE/TU/DIRECTORIO/imagenes' 21 | ``` 22 | 23 | ### [0.0.1] - 2017-03-11 24 | - Primera version 25 | - Tutorial en video 26 | -------------------------------------------------------------------------------- /mercado/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MercadoItem(scrapy.Item): 12 | # One scraped Mercado Libre product listing. 13 | # Field names are in Spanish on purpose: they become the CSV column names in MercadoPipeline. 14 | 15 | # Product information taken from the listing detail page 16 | titulo = scrapy.Field() 17 | modelo = scrapy.Field() 18 | marca = scrapy.Field() 19 | tecnologia = scrapy.Field() 20 | tipo = scrapy.Field() 21 | precio = scrapy.Field() 22 | condicion = scrapy.Field() 23 | envio = scrapy.Field() 24 | ubicacion = scrapy.Field() 25 | opiniones = scrapy.Field() 26 | 27 | # Image fields; 'image_urls' and 'images' are the field names the images pipeline reads/populates 28 | image_urls = scrapy.Field() 29 | images = scrapy.Field() 30 | image_name = scrapy.Field() 31 | 32 | 33 | # Store / seller information 34 | vendedor_url = scrapy.Field() 35 | tipo_vendedor = scrapy.Field() 36 | ventas_vendedor = scrapy.Field() 37 | -------------------------------------------------------------------------------- /mercado/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | from scrapy import signals 10 | from scrapy.exporters import CsvItemExporter 11 | from 
scrapy.pipelines.images import ImagesPipeline 12 | from scrapy.exceptions import DropItem 13 | from scrapy import Request 14 | import csv 15 | 16 | class MercadoPipeline(object): # Exports every scraped item to '<spider name>_items.csv'. 17 | def __init__(self): 18 | self.files = {} # one open CSV file handle per running spider 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): # Wire the pipeline to spider open/close signals. 22 | pipeline = cls() 23 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 24 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 25 | return pipeline 26 | 27 | def spider_opened(self, spider): # Open the CSV file and start the exporter with a fixed column order. 28 | file = open('%s_items.csv' % spider.name, 'w+b') 29 | self.files[spider] = file 30 | self.exporter = CsvItemExporter(file) 31 | self.exporter.fields_to_export = ['titulo', 'modelo', 'marca', 'tecnologia', 'tipo', 'precio', 'condicion', 'envio', 'ubicacion','opiniones', 32 | 'vendedor_url', 'tipo_vendedor', 'ventas_vendedor', 'image_name', 'image_urls'] 33 | self.exporter.start_exporting() 34 | 35 | def spider_closed(self, spider): # Flush the exporter and close this spider's CSV file. 36 | self.exporter.finish_exporting() 37 | file = self.files.pop(spider) 38 | file.close() 39 | 40 | def process_item(self, item, spider): # Write one item as a CSV row and pass the item on unchanged. 41 | self.exporter.export_item(item) 42 | return item 43 | 44 | class MercadoImagenesPipeline(ImagesPipeline): # Downloads product images, naming each file after the item title. 45 | 46 | def get_media_requests(self, item, info): # forward the item's 'image_name' via request.meta so file_path can use it 47 | return [Request(x, meta={'image_name': item["image_name"]}) 48 | for x in item.get('image_urls', [])] 49 | 50 | def file_path(self, request, response=None, info=None): # Save each downloaded image as '<image_name>.jpg'. 51 | return '%s.jpg' % request.meta['image_name'] 52 | -------------------------------------------------------------------------------- /mercado/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MercadoSpiderMiddleware(object): 12 | # Not all methods need to be 
defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /mercado/spiders/spider.py: -------------------------------------------------------------------------------- 1 | # import sys 2 | # reload(sys) 3 | # sys.setdefaultencoding('utf8') 4 | 5 | import scrapy 6 | from scrapy.spider import CrawlSpider, Rule 7 | from scrapy.linkextractors import LinkExtractor 8 | from scrapy.exceptions import CloseSpider 9 | from mercado.items import MercadoItem 10 | 11 | class MercadoSpider(CrawlSpider): 12 | name = 'mercado' 13 | item_count = 0 14 | allowed_domain = ['www.mercadolibre.com.mx'] 15 | start_urls = ['http://listado.mercadolibre.com.mx/impresoras#D[A:impresoras,L:1]'] 16 | 17 | rules = { 18 | # Para cada item 19 | Rule(LinkExtractor(allow = (), restrict_xpaths = ('//li[@class="pagination__next"]/a'))), 20 | Rule(LinkExtractor(allow =(), restrict_xpaths = ('//h2[contains(@class,"item__title")]/a')), 21 | callback = 'parse_item', follow = False) 22 | } 23 | 24 | def parse_item(self, response): 25 | ml_item = MercadoItem() 26 | #info de producto 27 | ml_item['titulo'] = response.xpath('normalize-space(//h1[@class="item-title__primary "]/text())').extract_first() 28 | ml_item['modelo'] = response.xpath('normalize-space(/html/body/main/div/div/div[1]/div[1]/section[1]/div/section[1]/ul/li[1]/span)').extract() 29 | ml_item['marca'] = response.xpath('normalize-space(/html/body/main/div/div/div[1]/div[1]/section[1]/div/section[1]/ul/li[2]/span)').extract() 30 | ml_item['tecnologia'] = response.xpath('normalize-space(/html/body/main/div/div/div[1]/div[1]/section[1]/div/section[2]/ul/li[1]/span)').extract() 31 | ml_item['tipo'] = response.xpath('normalize-space(/html/body/main/div/div/div[1]/div[1]/section[1]/div/section[2]/ul/li[2]/span)').extract() 32 | ml_item['precio'] = 
response.xpath('normalize-space(//span[@class="price-tag-fraction"]/text())').extract() 33 | ml_item['condicion'] = response.xpath('normalize-space(//div[@class="item-conditions"]/text())').extract() 34 | ml_item['envio'] = response.xpath('normalize-space(//p[contains(@class, "shipping-method-title")]/text())').extract() 35 | ml_item['ubicacion'] = response.xpath('normalize-space(//p[contains(@class, "card-description")])').extract() 36 | ml_item['opiniones'] = response.xpath('normalize-space(//span[@class="review-summary-average"]/text())').extract() 37 | # Product images; image_name doubles as the saved file name in the images pipeline 38 | ml_item['image_urls'] = response.xpath('//figure[contains(@class, "gallery-image-container")]/a/img/@src').extract() 39 | ml_item['image_name'] = response.xpath('normalize-space(//h1[@class="item-title__primary "]/text())').extract_first() 40 | # Store / seller info 41 | ml_item['vendedor_url'] = response.xpath('//*[contains(@class, "reputation-view-more")]/@href').extract() 42 | ml_item['tipo_vendedor'] = response.xpath('normalize-space(//p[contains(@class, "power-seller")]/text())').extract() 43 | ml_item['ventas_vendedor'] = response.xpath('normalize-space(//div[@class="feedback-title"]/text())').extract() 44 | self.item_count += 1 45 | if self.item_count > 5: 46 | raise CloseSpider('item_exceeded') 47 | yield ml_item 48 | -------------------------------------------------------------------------------- /mercado/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for mercado project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'mercado' 13 | 14 | SPIDER_MODULES = ['mercado.spiders'] 15 | NEWSPIDER_MODULE = 'mercado.spiders' 16 | 17 | # Item pipelines: CSV export (500) runs before image download (600); lower order = earlier 18 | ITEM_PIPELINES = {'mercado.pipelines.MercadoPipeline': 500, 19 | 'mercado.pipelines.MercadoImagenesPipeline': 600, } 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'mercado (+http://www.yourdomain.com)' 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = True 26 | 27 | # Image download settings; set IMAGES_STORE to a real local directory before running 28 | IMAGES_STORE = '/URL/DE/TU/DIRECTORIO/imagenes' 29 | DOWNLOAD_DELAY = 2 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | #CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | #DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | #CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | #COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | #TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | #DEFAULT_REQUEST_HEADERS = { 50 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | # 'Accept-Language': 'en', 52 | #} 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'mercado.middlewares.mercadoSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader 
middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'mercado.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | #ITEM_PIPELINES = { 75 | # 'mercado.pipelines.MercadoPipeline': 300, 76 | #} 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | --------------------------------------------------------------------------------