├── ZHXiezi
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── xiezi.py
│   ├── pipelines.py
│   ├── items.py
│   └── settings.py
├── README.md
└── scrapy.cfg
--------------------------------------------------------------------------------
/ZHXiezi/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ZHxieziDownloader

Downloads sneaker pictures from Yupoo collection pages.

Requirements:

- Python 2.7

- Scrapy

- wget (Python package)

Usage: scrapy crawl xiezi
--------------------------------------------------------------------------------
/ZHXiezi/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = ZHXiezi.settings

[deploy]
#url = http://localhost:6800/
project = ZHXiezi
--------------------------------------------------------------------------------
/ZHXiezi/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ZhxieziPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/ZHXiezi/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ZhxieziItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/ZHXiezi/spiders/xiezi.py:
--------------------------------------------------------------------------------
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
import wget
import os


class ZHxiezidownloader(CrawlSpider):

    name = "xiezi"
    allowed_domains = ["v.yupoo.com"]
    start_urls = [
        "http://v.yupoo.com/photos/xy0594xy/collections/"
    ]

    # Follow each photo-set link inside the "Sets" block and parse it.
    rules = [Rule(SgmlLinkExtractor(
        restrict_xpaths='//fieldset//div[@class="Sets"]//a'),
        callback='parse_item')]

    def parse_item(self, response):

        mainpic = response.xpath(
            "//td[@valign='top']/a/img/@src").extract()[0]  # main picture

        pics = response.xpath("//li/div/a/img/@src").extract()  # list of pictures

        folder = response.url.split("/")[6]  # last part of the URL

        if not os.path.exists(folder):
            os.makedirs(folder)

        wget.download(mainpic, out=folder)

        for pic in pics:
            wget.download(pic, out=folder)
--------------------------------------------------------------------------------
/ZHXiezi/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for ZHXiezi project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ZHXiezi'

SPIDER_MODULES = ['ZHXiezi.spiders']
NEWSPIDER_MODULE = 'ZHXiezi.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ZHXiezi (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS=1

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ZHXiezi.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ZHXiezi.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'ZHXiezi.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'

# Crawl breadth-first: prioritise shallow requests and use FIFO queues.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'
--------------------------------------------------------------------------------
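
Note: the spider above relies on the pre-1.0 scrapy.contrib API, which was removed in Scrapy 1.x. As a rough, untested sketch only -- assuming the Yupoo page structure targeted by the original XPaths is unchanged -- the same crawl could be written against the current API roughly as follows (this file is not part of the project above):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import wget
import os


class ZHxiezidownloader(CrawlSpider):

    name = "xiezi"
    allowed_domains = ["v.yupoo.com"]
    start_urls = ["http://v.yupoo.com/photos/xy0594xy/collections/"]

    # LinkExtractor replaces the removed SgmlLinkExtractor; the XPath is
    # copied from the original spider.
    rules = [Rule(LinkExtractor(
        restrict_xpaths='//fieldset//div[@class="Sets"]//a'),
        callback='parse_item')]

    def parse_item(self, response):
        mainpic = response.xpath("//td[@valign='top']/a/img/@src").extract_first()
        pics = response.xpath("//li/div/a/img/@src").extract()

        # Name the output folder after the album segment of the URL,
        # as in the original spider.
        folder = response.url.split("/")[6]
        if not os.path.exists(folder):
            os.makedirs(folder)

        if mainpic:
            wget.download(mainpic, out=folder)
        for pic in pics:
            wget.download(pic, out=folder)

Either version is run the same way, with scrapy crawl xiezi from the project root.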