├── ZHXiezi
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── xiezi.py
│   ├── pipelines.py
│   ├── items.py
│   └── settings.py
├── README.md
└── scrapy.cfg
--------------------------------------------------------------------------------
/ZHXiezi/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ZHxieziDownloader

Downloads sneaker pictures from Yupoo collection pages.

Requirements:

- Python 2.7

- Scrapy

- wget (Python package)

Usage: scrapy crawl xiezi
--------------------------------------------------------------------------------
/ZHXiezi/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = ZHXiezi.settings

[deploy]
#url = http://localhost:6800/
project = ZHXiezi
--------------------------------------------------------------------------------
/ZHXiezi/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ZhxieziPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/ZHXiezi/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZhxieziItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/ZHXiezi/spiders/xiezi.py:
--------------------------------------------------------------------------------
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import wget
import os


class ZHxiezidownloader(CrawlSpider):

    name = "xiezi"
    allowed_domains = ["v.yupoo.com"]
    start_urls = [
        "http://v.yupoo.com/photos/xy0594xy/collections/"
    ]

    # Follow each photo-set link inside the "Sets" block and parse it.
    rules = [Rule(SgmlLinkExtractor(
        restrict_xpaths='//fieldset//div[@class="Sets"]//a'),
        callback='parse_item')]

    def parse_item(self, response):

        mainpic = response.xpath(
            "//td[@valign='top']/a/img/@src").extract()[0]  # main picture

        pics = response.xpath("//li/div/a/img/@src").extract()  # list of pictures

        folder = response.url.split("/")[6]  # last part of the URL

        if not os.path.exists(folder):
            os.makedirs(folder)

        wget.download(mainpic, out=folder)

        for pic in pics:
            wget.download(pic, out=folder)
--------------------------------------------------------------------------------
/ZHXiezi/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for ZHXiezi project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ZHXiezi'

SPIDER_MODULES = ['ZHXiezi.spiders']
NEWSPIDER_MODULE = 'ZHXiezi.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ZHXiezi (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS=1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ZHXiezi.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ZHXiezi.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'ZHXiezi.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'

# Crawl breadth-first: prioritise shallow requests and use FIFO queues.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
--------------------------------------------------------------------------------
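
Note: the spider above relies on the pre-1.0 scrapy.contrib API, which was removed in Scrapy 1.x. As a rough, untested sketch only -- assuming the Yupoo page structure targeted by the original XPaths is unchanged -- the same crawl could be written against the current API roughly as follows (this file is not part of the project above):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import wget
import os


class ZHxiezidownloader(CrawlSpider):

    name = "xiezi"
    allowed_domains = ["v.yupoo.com"]
    start_urls = ["http://v.yupoo.com/photos/xy0594xy/collections/"]

    # LinkExtractor replaces the removed SgmlLinkExtractor; the XPath is
    # copied from the original spider.
    rules = [Rule(LinkExtractor(
        restrict_xpaths='//fieldset//div[@class="Sets"]//a'),
        callback='parse_item')]

    def parse_item(self, response):
        mainpic = response.xpath("//td[@valign='top']/a/img/@src").extract_first()
        pics = response.xpath("//li/div/a/img/@src").extract()

        # Name the output folder after the album segment of the URL,
        # as in the original spider.
        folder = response.url.split("/")[6]
        if not os.path.exists(folder):
            os.makedirs(folder)

        if mainpic:
            wget.download(mainpic, out=folder)
        for pic in pics:
            wget.download(pic, out=folder)

Either version is run the same way, with scrapy crawl xiezi from the project root.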