├── spidermeizi
│   ├── __init__.py
│   ├── main.py
│   ├── __pycache__
│   │   ├── items.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── pipelines.cpython-36.pyc
│   │   └── settings.cpython-36.pyc
│   ├── spiders
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── meizispider.cpython-36.pyc
│   │   ├── __init__.py
│   │   └── meizispider.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── middlewares.py
├── .gitignore
├── requirements.txt
├── project.egg-info
│   ├── dependency_links.txt
│   ├── top_level.txt
│   ├── entry_points.txt
│   ├── PKG-INFO
│   └── SOURCES.txt
├── meizi.egg
├── bmp
│   ├── show-1.jpg
│   ├── show-2.jpg
│   ├── show-3.jpg
│   ├── show-4.jpg
│   ├── show-5.jpg
│   └── show-6.jpg
├── setup.py
├── scrapy.cfg
└── README.md
/spidermeizi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.5.1 2 | -------------------------------------------------------------------------------- /project.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /project.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | spidermeizi 2 | -------------------------------------------------------------------------------- /meizi.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/meizi.egg -------------------------------------------------------------------------------- /bmp/show-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-1.jpg -------------------------------------------------------------------------------- /bmp/show-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-2.jpg -------------------------------------------------------------------------------- /bmp/show-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-3.jpg -------------------------------------------------------------------------------- /bmp/show-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-4.jpg -------------------------------------------------------------------------------- /bmp/show-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-5.jpg -------------------------------------------------------------------------------- /bmp/show-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/bmp/show-6.jpg -------------------------------------------------------------------------------- /project.egg-info/entry_points.txt:
-------------------------------------------------------------------------------- 1 | [scrapy] 2 | settings = spidermeizi.settings 3 | 4 | -------------------------------------------------------------------------------- /spidermeizi/main.py: -------------------------------------------------------------------------------- 1 | import scrapy.cmdline as cmd 2 | cmd.execute('scrapy crawl meizi'.split()) 3 | 4 | -------------------------------------------------------------------------------- /spidermeizi/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/spiders/__pycache__/meizispider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itzujun/spidermeizi/HEAD/spidermeizi/spiders/__pycache__/meizispider.cpython-36.pyc -------------------------------------------------------------------------------- /spidermeizi/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /project.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.0 2 | Name: project 3 | Version: 1.0 4 | Summary: UNKNOWN 5 | Home-page: UNKNOWN 6 | Author: UNKNOWN 7 | Author-email: UNKNOWN 8 | License: UNKNOWN 9 | Description: UNKNOWN 10 | Platform: UNKNOWN 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapyd-deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = spidermeizi.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = spidermeizi.settings 8 | 9 | [deploy] 10 | url = http://localhost:6800/ 11 | project = spidermeizi 12 | -------------------------------------------------------------------------------- /spidermeizi/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SpidermeiziItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | name = scrapy.Field() # gallery name 14 | imgurls = scrapy.Field() # image urls 15 | pass 16 | -------------------------------------------------------------------------------- /project.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | project.egg-info/PKG-INFO 3 | project.egg-info/SOURCES.txt 4 | project.egg-info/dependency_links.txt 5 | project.egg-info/entry_points.txt 6 | project.egg-info/top_level.txt 7 | spidermeizi/__init__.py 8 | spidermeizi/items.py 9 | spidermeizi/main.py 10 | spidermeizi/middlewares.py 11 | spidermeizi/pipelines.py 12 | spidermeizi/settings.py 13 | spidermeizi/spiders/__init__.py 14 | spidermeizi/spiders/meizispider.py -------------------------------------------------------------------------------- /spidermeizi/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.pipelines.images import ImagesPipeline 9 | from scrapy.exceptions import DropItem 10 | from scrapy import Request 11 | 12 | 13 | class SpidermeiziPipeline(ImagesPipeline): 14 | def get_media_requests(self, item, info): 15 | headers = { 16 | "Referer": "https://www.mzitu.com/" 17 | } 18 | for url in item["imgurls"]: 19 | yield Request(url, headers=headers, meta={"name": item["name"]}) 20 | 21 | def item_completed(self, results, item, info): 22 | image_paths = [x['path'] for ok, x in results if ok] 23 | if not image_paths: 24 | raise DropItem("Item contains no images") 25 |
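# results is a list of (success, info) 2-tuples produced by ImagesPipeline; each info['path'] is relative to IMAGES_STORE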
print("下载完成: ", image_paths) 26 | return item 27 | 28 | def file_path(self, request, response=None, info=None): 29 | name = request.meta['name'] 30 | image_guid = request.url.split('/')[-1] 31 | filenames = "zujun/%s/%s" % (name, image_guid) 32 | return filenames 33 | -------------------------------------------------------------------------------- /spidermeizi/spiders/meizispider.py: -------------------------------------------------------------------------------- 1 | # _*_ coding:utf-8 _*_ 2 | 3 | import scrapy 4 | 5 | from spidermeizi.items import SpidermeiziItem 6 | 7 | 8 | class MeiziSpider(scrapy.Spider): 9 | name = "meizi" 10 | host = "http://www.mzitu.com/" 11 | start_urls = [ 12 | "http://www.mzitu.com/" 13 | ] 14 | 15 | def parse(self, response): 16 | urls = response.xpath('//div[@class="postlist"]/ul/li/a/@href').extract() 17 | print("urls", urls) 18 | for url in urls: 19 | send = url + "/2" 20 | yield scrapy.Request(send, callback=self.parse_item) 21 | 22 | def parse_item(self, response): 23 | item = SpidermeiziItem() 24 | imgurl = response.xpath('//div[@class="main-image"]/p/a/img/@src').extract()[0] 25 | name = response.xpath('//div[@class="main-image"]/p/a/img/@alt').extract()[0] 26 | total = int(response.xpath('//div[@class="pagenavi"]/a/span/text()').extract()[-2]) 27 | baseimg = imgurl[0:-6] 28 | urllist = [] 29 | for i in range(1, total): 30 | urllist.append(baseimg + ("%02d" % i) + ".jpg") 31 | item["imgurls"] = urllist 32 | item["name"] = name 33 | yield item 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spidermeizi 2 | spider可视化管理 3 | 4 | 5 | ## 搭建环境 6 | #### 软件安装 7 | ``` 8 | pip install scrapy #安装scrapy 9 | 10 | pip install scrapyd #安装scrapy服务器 11 | 12 | pip install scrapyd-client #安装scrapy客户端 13 | ``` 14 | 15 | #### 运行scrapyd 16 | 17 | ###### 如果安装正常,则能正常运行 18 | 19 | #### 对scrapy包生成egg包 20 | ##### 在项目工程下 scrapy.cfg 所在目录运行命令 21 | 22 | ``` 23 | scrapyd-deploy -p spidermeizi -v 0.0.01 --build-egg=meizi.egg 24 | # 格式 ----> scrapyd-deploy -p 项目名 -v 版本号 --build-egg=eggName.egg 25 | ``` 26 | 27 | #### 如运行scrapyd-deploy报错,原因是windows下 scrapyd-deploy是一个没有后缀的文件,需要如下操作 28 | ##### 在python安装目录Scripts所在目录下新建 scrapyd-deploy.bat文件,并如填写内容,并运行 29 | ``` 30 | @echo off 31 | "C:\Python36\python.exe" "C:\Python36\Scripts\scrapyd-deploy" %1 %2 %3 %4 %5 %6 %7 %8 %9 32 | # 相关路劲根据自己的安装环境配置 33 | ``` 34 | ### 启动服务: 35 | #### 在cmd端运行 scrapyd 36 | #### 本地启动scrapykeeper (相关代码可以在github上搜索) 37 | 38 | ### 首页 39 | ![image](./bmp/show-1.jpg) 40 | 41 | ### 添加egg项目文件 42 | ![image](./bmp/show-2.jpg) 43 | 44 | ### 添加爬虫任务 45 | ![image](./bmp/show-3.jpg) 46 | 47 | ### 项目运行显示 48 | ![image](./bmp/show-4.jpg) 49 | 50 | ### 项目运行结果 51 | ![image](./bmp/show-5.jpg) 52 | 53 | ![image](./bmp/show-6.jpg) 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /spidermeizi/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for spidermeizi project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'spidermeizi' 13 | 14 | SPIDER_MODULES = ['spidermeizi.spiders'] 15 | NEWSPIDER_MODULE = 'spidermeizi.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'spidermeizi (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | LOG_LEVEL = 'INFO' # log level 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | # CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | 31 | # The download delay setting will honor only one of: 32 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | # CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | # COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | # TELNETCONSOLE_ENABLED = False 40 | 41 | 42 | 43 | 44 | # Override the default request headers: 45 | # DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | # } 49 | 50 | # Enable or disable spider middlewares 51 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 52 | # SPIDER_MIDDLEWARES = { 53 | # 'spidermeizi.middlewares.SpidermeiziSpiderMiddleware': 543, 54 | # } 55 | 56 | # Enable or disable downloader middlewares 57 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 58 | # DOWNLOADER_MIDDLEWARES = { 59 | # 'spidermeizi.middlewares.SpidermeiziDownloaderMiddleware': 543, 60 | # } 61 | 62 | # Enable or disable extensions 63 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 64 | # EXTENSIONS = { 65 | # 'scrapy.extensions.telnet.TelnetConsole': None, 66 | # } 67 | 68 | # Configure item pipelines 69 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 70 | ITEM_PIPELINES = { 71 | 'spidermeizi.pipelines.SpidermeiziPipeline': 300, 72 | } 73 | 74 | # IMAGES_STORE = "."
# save path 75 | IMAGES_STORE = "E:\\save\\" # directory where downloaded images are stored 76 | DOWNLOAD_DELAY = 0 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | # AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | # AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | # AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | # AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | # HTTPCACHE_ENABLED = True 94 | # HTTPCACHE_EXPIRATION_SECS = 0 95 | # HTTPCACHE_DIR = 'httpcache' 96 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | 99 | # Concurrency settings to speed up crawling 100 | CONCURRENT_REQUESTS = 5 101 | 102 | CONCURRENT_REQUESTS_PER_DOMAIN = 5 103 | 104 | CONCURRENT_REQUESTS_PER_IP = 5 105 | -------------------------------------------------------------------------------- /spidermeizi/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SpidermeiziSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | # Should return None or raise an exception. 27 | return None 28 | 29 | def process_spider_output(self, response, result, spider): 30 | # Called with the results returned from the Spider, after 31 | # it has processed the response. 32 | # Must return an iterable of Request, dict or Item objects. 33 | for i in result: 34 | yield i 35 | 36 | def process_spider_exception(self, response, exception, spider): 37 | # Called when a spider or process_spider_input() method 38 | # (from other spider middleware) raises an exception. 39 | # Should return either None or an iterable of Response, dict 40 | # or Item objects. 41 | pass 42 | 43 | def process_start_requests(self, start_requests, spider): 44 | # Called with the start requests of the spider, and works 45 | # similarly to the process_spider_output() method, except 46 | # that it doesn’t have a response associated. 47 | # Must return only requests (not items). 48 | for r in start_requests: 49 | yield r 50 | 51 | def spider_opened(self, spider): 52 | spider.logger.info('Spider opened: %s' % spider.name) 53 | 54 | 55 | class SpidermeiziDownloaderMiddleware(object): 56 | # Not all methods need to be defined. If a method is not defined, 57 | # scrapy acts as if the downloader middleware does not modify the 58 | # passed objects.
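# Note: this class (like SpidermeiziSpiderMiddleware above) is the unmodified 'scrapy startproject' template; neither is active, since SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES are left commented out in settings.py.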
59 | 60 | @classmethod 61 | def from_crawler(cls, crawler): 62 | # This method is used by Scrapy to create your spiders. 63 | s = cls() 64 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 65 | return s 66 | 67 | def process_request(self, request, spider): 68 | # Called for each request that goes through the downloader 69 | # middleware. 70 | 71 | # Must either: 72 | # - return None: continue processing this request 73 | # - or return a Response object 74 | # - or return a Request object 75 | # - or raise IgnoreRequest: process_exception() methods of 76 | # installed downloader middleware will be called 77 | return None 78 | 79 | def process_response(self, request, response, spider): 80 | # Called with the response returned from the downloader. 81 | 82 | # Must either: 83 | # - return a Response object 84 | # - return a Request object 85 | # - or raise IgnoreRequest 86 | return response 87 | 88 | def process_exception(self, request, exception, spider): 89 | # Called when a download handler or a process_request() 90 | # (from other downloader middleware) raises an exception. 91 | 92 | # Must either: 93 | # - return None: continue processing this exception 94 | # - return a Response object: stops process_exception() chain 95 | # - return a Request object: stops process_exception() chain 96 | pass 97 | 98 | def spider_opened(self, spider): 99 | spider.logger.info('Spider opened: %s' % spider.name) 100 | --------------------------------------------------------------------------------