├── spidermeizi
│   ├── __init__.py
│   ├── main.py
│   ├── __pycache__
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   └── settings.cpython-36.pyc
│   ├── spiders
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── meizispider.cpython-36.pyc
│   │   ├── __init__.py
│   │   └── meizispider.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── .gitignore
├── requirements.txt
├── project.egg-info
│   ├── dependency_links.txt
│   ├── top_level.txt
│   ├── entry_points.txt
│   ├── PKG-INFO
│   └── SOURCES.txt
├── meizi.egg
├── bmp
│   ├── show-1.jpg
│   ├── show-2.jpg
│   ├── show-3.jpg
│   ├── show-4.jpg
│   ├── show-5.jpg
│   └── show-6.jpg
├── setup.py
├── scrapy.cfg
└── README.md
/spidermeizi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.5.1 2 | -------------------------------------------------------------------------------- /project.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /project.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | spidermeizi 2 | -------------------------------------------------------------------------------- /meizi.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/meizi.egg -------------------------------------------------------------------------------- /bmp/show-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-1.jpg -------------------------------------------------------------------------------- /bmp/show-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-2.jpg -------------------------------------------------------------------------------- /bmp/show-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-3.jpg -------------------------------------------------------------------------------- /bmp/show-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-4.jpg -------------------------------------------------------------------------------- /bmp/show-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-5.jpg -------------------------------------------------------------------------------- /bmp/show-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-6.jpg -------------------------------------------------------------------------------- /project.egg-info/entry_points.txt:
-------------------------------------------------------------------------------- 1 | [scrapy] 2 | settings = spidermeizi.settings 3 | 4 | -------------------------------------------------------------------------------- /spidermeizi/main.py: -------------------------------------------------------------------------------- 1 | import scrapy.cmdline as cmd 2 | cmd.execute('scrapy crawl meizi'.split()) 3 | 4 | -------------------------------------------------------------------------------- /spidermeizi/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/spiders/__pycache__/meizispider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/spiders/__pycache__/meizispider.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /project.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: project 3 | Version: 1.0 4 | Summary: UNKNOWN 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapyd-deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = spidermeizi.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = spidermeizi.settings 8 | 9 | [deploy] 10 | url = http://localhost:6800/ 11 | project = spidermeizi 12 | -------------------------------------------------------------------------------- /spidermeizi/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SpidermeiziItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | name = scrapy.Field() # gallery name 14 | imgurls = scrapy.Field() # image urls 15 | pass 16 | -------------------------------------------------------------------------------- /project.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | project.egg-info/PKG-INFO 3 | project.egg-info/SOURCES.txt 4 | project.egg-info/dependency_links.txt 5 | project.egg-info/entry_points.txt 6 | project.egg-info/top_level.txt 7 | spidermeizi/__init__.py 8 | spidermeizi/items.py 9 | spidermeizi/main.py 10 | spidermeizi/middlewares.py 11 | spidermeizi/pipelines.py 12 | spidermeizi/settings.py 13 | spidermeizi/spiders/__init__.py 14 | spidermeizi/spiders/meizispider.py -------------------------------------------------------------------------------- /spidermeizi/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.pipelines.images import ImagesPipeline 9 | from scrapy.exceptions import DropItem 10 | from scrapy import Request 11 | 12 | 13 | class SpidermeiziPipeline(ImagesPipeline): 14 | def get_media_requests(self, item, info): 15 | headers = { 16 | "Referer": "https://www.mzitu.com/" 17 | } 18 | for url in item["imgurls"]: 19 | yield Request(url, headers=headers, meta={"name": item["name"]}) 20 | 21 | def item_completed(self, results, item, info): 22 | image_paths = [x['path'] for ok, x in results if ok] 23 | if not image_paths: 24 | raise DropItem("Item contains no images") 25 |
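# results is a list of (success, info) 2-tuples produced by ImagesPipeline; each info['path'] is relative to IMAGES_STORE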
print("下载完成: ", image_paths) 26 | return item 27 | 28 | def file_path(self, request, response=None, info=None): 29 | name = request.meta['name'] 30 | image_guid = request.url.split('/')[-1] 31 | filenames = "zujun/%s/%s" % (name, image_guid) 32 | return filenames 33 | -------------------------------------------------------------------------------- /spidermeizi/spiders/meizispider.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | import scrapy 4 | 5 | from spidermeizi.items import SpidermeiziItem 6 | 7 | 8 | class MeiziSpider(scrapy.Spider): 9 | name = "meizi" 10 | host = "http://www.mzitu.com/" 11 | start_urls = [ 12 | "http://www.mzitu.com/" 13 | ] 14 | 15 | def parse(self, response): 16 | urls = response.xpath('//div[@class="postlist"]/ul/li/a/@href').extract() 17 | print("urls", urls) 18 | for url in urls: 19 | send = url + "/2" 20 | yield scrapy.Request(send, callback=self.parse_item) 21 | 22 | def parse_item(self, response): 23 | item = SpidermeiziItem() 24 | imgurl = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract()[0] 25 | name = response.xpath('//div[@class="main-image"]/p/a/img/@alt').extract()[0] 26 | total = int(response.xpath('//div[@class="pagenavi"]/a/span/text()').extract()[-2]) 27 | baseimg = imgurl[0:-6] 28 | urllist = [] 29 | for i in range(1, total): 30 | urllist.append(baseimg + ("%02d" % i) + ".jpg") 31 | item["imgurls"] = urllist 32 | item["name"] = name 33 | yield item 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spidermeizi 2 | spider可视化管理 3 | 4 | 5 | ## 搭建环境 6 | #### 软件安装 7 | ``` 8 | pip install scrapy #安装scrapy 9 | 10 | pip install scrapyd #安装scrapy服务器 11 | 12 | pip install scrapyd-client #安装scrapy客户端 13 | ``` 14 | 15 | #### 运行scrapyd 16 | 17 | ###### 如果安装正常,则能正常运行 18 | 19 | #### 对scrapy包生成egg包 20 | ##### 在项目工程下 scrapy.cfg 所在目录运行命令 21 | 22 | ``` 23 | scrapyd-deploy -p spidermeizi -v 0.0.01 --build-egg=meizi.egg 24 | # 格式 ----> scrapyd-deploy -p 项目名 -v 版本号 --build-egg=eggName.egg 25 | ``` 26 | 27 | #### 如运行scrapyd-deploy报错,原因是windows下 scrapyd-deploy是一个没有后缀的文件,需要如下操作 28 | ##### 在python安装目录Scripts所在目录下新建 scrapyd-deploy.bat文件,并如填写内容,并运行 29 | ``` 30 | @echo off 31 | "C:\Python36\python.exe" "C:\Python36\Scripts\scrapyd-deploy" %1 %2 %3 %4 %5 %6 %7 %8 %9 32 | # 相关路劲根据自己的安装环境配置 33 | ``` 34 | ### 启动服务: 35 | #### 在cmd端运行 scrapyd 36 | #### 本地启动scrapykeeper (相关代码可以在github上搜索) 37 | 38 | ### 首页 39 | ![image](./bmp/show-1.jpg) 40 | 41 | ### 添加egg项目文件 42 | ![image](./bmp/show-2.jpg) 43 | 44 | ### 添加爬虫任务 45 | ![image](./bmp/show-3.jpg) 46 | 47 | ### 项目运行显示 48 | ![image](./bmp/show-4.jpg) 49 | 50 | ### 项目运行结果 51 | ![image](./bmp/show-5.jpg) 52 | 53 | ![image](./bmp/show-6.jpg) 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /spidermeizi/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for spidermeizi project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'spidermeizi' 13 | 14 | SPIDER_MODULES = ['spidermeizi.spiders'] 15 | NEWSPIDER_MODULE = 'spidermeizi.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'spidermeizi (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | LOG_LEVEL = 'INFO' # log level 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | # CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | 31 | # The download delay setting will honor only one of: 32 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | # CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | # COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | # TELNETCONSOLE_ENABLED = False 40 | 41 | 42 | 43 | 44 | # Override the default request headers: 45 | # DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | # } 49 | 50 | # Enable or disable spider middlewares 51 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 52 | # SPIDER_MIDDLEWARES = { 53 | # 'spidermeizi.middlewares.SpidermeiziSpiderMiddleware': 543, 54 | # } 55 | 56 | # Enable or disable downloader middlewares 57 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 58 | # DOWNLOADER_MIDDLEWARES = { 59 | # 'spidermeizi.middlewares.SpidermeiziDownloaderMiddleware': 543, 60 | # } 61 | 62 | # Enable or disable extensions 63 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 64 | # EXTENSIONS = { 65 | # 'scrapy.extensions.telnet.TelnetConsole': None, 66 | # } 67 | 68 | # Configure item pipelines 69 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 70 | ITEM_PIPELINES = { 71 | 'spidermeizi.pipelines.SpidermeiziPipeline': 300, 72 | } 73 | 74 | # IMAGES_STORE = "."
# save path 75 | IMAGES_STORE = "E:\\save\\" # directory where downloaded images are stored 76 | DOWNLOAD_DELAY = 0 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | # AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | # AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | # AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | # AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | # HTTPCACHE_ENABLED = True 94 | # HTTPCACHE_EXPIRATION_SECS = 0 95 | # HTTPCACHE_DIR = 'httpcache' 96 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | 99 | # Concurrency settings to speed up crawling 100 | CONCURRENT_REQUESTS = 5 101 | 102 | CONCURRENT_REQUESTS_PER_DOMAIN = 5 103 | 104 | CONCURRENT_REQUESTS_PER_IP = 5 105 | -------------------------------------------------------------------------------- /spidermeizi/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SpidermeiziSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | # Should return None or raise an exception. 27 | return None 28 | 29 | def process_spider_output(self, response, result, spider): 30 | # Called with the results returned from the Spider, after 31 | # it has processed the response. 32 | # Must return an iterable of Request, dict or Item objects. 33 | for i in result: 34 | yield i 35 | 36 | def process_spider_exception(self, response, exception, spider): 37 | # Called when a spider or process_spider_input() method 38 | # (from other spider middleware) raises an exception. 39 | # Should return either None or an iterable of Response, dict 40 | # or Item objects. 41 | pass 42 | 43 | def process_start_requests(self, start_requests, spider): 44 | # Called with the start requests of the spider, and works 45 | # similarly to the process_spider_output() method, except 46 | # that it doesn’t have a response associated. 47 | # Must return only requests (not items). 48 | for r in start_requests: 49 | yield r 50 | 51 | def spider_opened(self, spider): 52 | spider.logger.info('Spider opened: %s' % spider.name) 53 | 54 | 55 | class SpidermeiziDownloaderMiddleware(object): 56 | # Not all methods need to be defined. If a method is not defined, 57 | # scrapy acts as if the downloader middleware does not modify the 58 | # passed objects.
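# Note: this class (like SpidermeiziSpiderMiddleware above) is the unmodified 'scrapy startproject' template; neither is active, since SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES are left commented out in settings.py.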
59 | 60 | @classmethod 61 | def from_crawler(cls, crawler): 62 | # This method is used by Scrapy to create your spiders. 63 | s = cls() 64 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 65 | return s 66 | 67 | def process_request(self, request, spider): 68 | # Called for each request that goes through the downloader 69 | # middleware. 70 | 71 | # Must either: 72 | # - return None: continue processing this request 73 | # - or return a Response object 74 | # - or return a Request object 75 | # - or raise IgnoreRequest: process_exception() methods of 76 | # installed downloader middleware will be called 77 | return None 78 | 79 | def process_response(self, request, response, spider): 80 | # Called with the response returned from the downloader. 81 | 82 | # Must either: 83 | # - return a Response object 84 | # - return a Request object 85 | # - or raise IgnoreRequest 86 | return response 87 | 88 | def process_exception(self, request, exception, spider): 89 | # Called when a download handler or a process_request() 90 | # (from other downloader middleware) raises an exception. 91 | 92 | # Must either: 93 | # - return None: continue processing this exception 94 | # - return a Response object: stops process_exception() chain 95 | # - return a Request object: stops process_exception() chain 96 | pass 97 | 98 | def spider_opened(self, spider): 99 | spider.logger.info('Spider opened: %s' % spider.name) 100 | --------------------------------------------------------------------------------