├── README.md
├── dmzj_scrapy
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── dmzj_spider.py
├── dmzj_start.py
└── scrapy.cfg

/README.md:
--------------------------------------------------------------------------------
# dmzj_scrapy
Downloads comics from 动漫之家 (dmzj).
Run dmzj_start.py to start downloading a comic.
--------------------------------------------------------------------------------
/dmzj_scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weakmaple/dmzj_scrapy/a879f067429dc3f526df9746141fcd30986572a0/dmzj_scrapy/__init__.py
--------------------------------------------------------------------------------
/dmzj_scrapy/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DmzjScrapyItem(scrapy.Item):
    # pic_urls: the URLs of every image in one chapter
    # title: the name of that chapter ("Chapter 1", "Chapter 2", ...)
    # big_title: the name of the comic itself
    pic_urls = scrapy.Field()
    title = scrapy.Field()
    big_title = scrapy.Field()
--------------------------------------------------------------------------------
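For orientation, a filled-in DmzjScrapyItem handed to the pipeline should look roughly like the sketch below; the URLs and names are illustrative placeholders, not values taken from a real response:

item = DmzjScrapyItem(
    pic_urls=['https://example.invalid/comic/ch01/0001.jpg',
              'https://example.invalid/comic/ch01/0002.jpg'],  # one chapter's images
    title='Chapter 01',           # chapter title scraped from the page
    big_title='Some Comic Name',  # comic name supplied on the command line
)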
/dmzj_scrapy/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Optional Selenium-based downloader middleware, left disabled. Enable it together
# with the commented DOWNLOADER_MIDDLEWARES entry in settings.py if plain requests
# stop returning the inline JavaScript that the spider parses.

# from selenium import webdriver
# import scrapy
# import time
#
# class DmzjScrapyDownloaderMiddleware(object):
#     def __init__(self):
#         self.driver = webdriver.Chrome()
#
#     def process_request(self, request, spider):
#         self.driver.get(request.url)
#         time.sleep(2)
#         source = self.driver.page_source
#         # source = b'%s' % (source)
#         return scrapy.http.HtmlResponse(url=self.driver.current_url, body=source,
#                                         request=request, encoding='utf-8')
--------------------------------------------------------------------------------
/dmzj_scrapy/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os

import requests


class DmzjScrapyPipeline(object):

    def process_item(self, item, spider):
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'cookie': 'UM_distinctid=16706eb86dcd8-02bc2f02cd5057-333b5602-100200-16706eb86dd6e5; bdshare_firstime=1542009358470; laravel_session=eyJpdiI6InJZMTQrZFlRVFphOVlXU2R0dFwvRnl3PT0iLCJ2YWx1ZSI6IjFJZGdacHA5YkJSRzhHSWJoVklwaTNiaTl0dHRCMzFwSzRGT0oxWm81MXl5aGtkT0lDVHFQRTlBcUJkN3hEQ2xWXC9yZWRxbjJzNSthVWg2VFBVbXRUZz09IiwibWFjIjoiZmZiY2Q0YjkyZTAwMzBjNDk4YjAwZmVkYTg1NzY3NmY4MzU5YjM2NjQzZTdlNTExMWI3ZmJiYTMyNjhlN2YwMSJ9; CNZZDATA1255781707=1633799429-1542006327-%7C1542011727; CNZZDATA1000465408=1899900830-1542005345-%7C1542012974',
            'referer': 'https://m.dmzj.com/info/zuizhongwochengleni.html'
        }

        # path = os.path.dirname(os.path.dirname(__file__))
        # path is the download directory; change it if needed (default: current directory)
        path = './'

        # make sure the comic's own directory exists
        manhua_name = os.path.join(path, item['big_title'])
        if not os.path.exists(manhua_name):
            os.mkdir(manhua_name)
        # make sure the chapter's directory exists
        catapot_name = os.path.join(manhua_name, item['title'])
        if not os.path.exists(catapot_name):
            os.mkdir(catapot_name)
        # download every image of this chapter one by one;
        # capter is the file name of the individual image
        for pic_url in item['pic_urls']:
            capter = pic_url.split('/')[-1]
            response = requests.get(pic_url, headers=headers).content
            with open(os.path.join(catapot_name, capter), 'wb') as fp:
                fp.write(response)
            print("Downloading " + item['big_title'] + " " + item['title'] + " " + capter)
        return item
--------------------------------------------------------------------------------
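With the default path = './', the pipeline above should therefore produce a layout like the following when run from the repository root; the comic, chapter, and file names are illustrative placeholders:

./
└── <big_title>
    └── <title>
        ├── 0001.jpg
        └── 0002.jpg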
/dmzj_scrapy/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for dmzj_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dmzj_scrapy'

SPIDER_MODULES = ['dmzj_scrapy.spiders']
NEWSPIDER_MODULE = 'dmzj_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dmzj_scrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dmzj_scrapy.middlewares.DmzjScrapySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'dmzj_scrapy.middlewares.DmzjScrapyDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dmzj_scrapy.pipelines.DmzjScrapyPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/dmzj_scrapy/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/dmzj_scrapy/spiders/dmzj_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json
import re

import scrapy

from dmzj_scrapy.items import DmzjScrapyItem


class DmzjSpiderSpider(scrapy.Spider):
    name = 'dmzj_spider'
    allowed_domains = ['m.dmzj.com']
    start_urls = ['http://m.dmzj.com/']

    def __init__(self, manhua_url=None, manhua_names=None, *args, **kwargs):
        super(DmzjSpiderSpider, self).__init__(*args, **kwargs)
        self.url = manhua_url
        self.manhua_name = manhua_names

    def start_requests(self):
        yield scrapy.http.Request(url=self.url, callback=self.first_url_parse)

    # Convert the desktop-site URL into the mobile-site info-page URL
    def first_url_parse(self, response):
        url = 'https://m.dmzj.com/info/%s.html'
        if 'info' in self.url:
            manhua_id_1 = re.split('/', self.url)[-1]
            manhua_id = re.split(r'\.', manhua_id_1)[0]
        else:
            manhua_id_1 = response.xpath('//head/script/text()').get()
            manhua_id = re.search(r'.*?g_current_id = "(.*?)";.*', manhua_id_1).group(1)
        main_url = url % manhua_id
        yield scrapy.http.Request(url=main_url, callback=self.parse_total)

    # Get the comic's chapter catalogue
    def parse_total(self, response):
        catalog = response.xpath('//body/script[@type="text/javascript"]/text()').getall()[1]
        catalog = re.sub(r'},{"title":.*?,"data":\[(.*?)\]', '', catalog)
        catalog_list = json.loads(re.search(r'"data":(.*?)}]\);.*', catalog).group(1))
        for ls in catalog_list:
            # detail_url is the page of a single chapter
            detail_url = 'https://m.dmzj.com/view/%s/%s.html' % (ls['comic_id'], ls['id'])
            yield scrapy.http.Request(url=detail_url, callback=self.parse_detail)

    # Collect every image URL of the chapter and hand them to the pipeline
    def parse_detail(self, response):
        title = response.xpath('//a[@class="BarTit"]/text()').get()
        pic_urls = response.xpath('//body/script[@type="text/javascript"]/text()').getall()[1]
        pic_urls = json.loads(re.search(r'.*?"page_url":(.*?),"chapter_type".*', pic_urls).group(1))
        item = DmzjScrapyItem(pic_urls=pic_urls, title=title, big_title=self.manhua_name)
        yield item
--------------------------------------------------------------------------------
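To make the extraction in parse_detail() concrete, the snippet below is a self-contained sketch; the script fragment is fabricated to match the shape the regular expression expects, not copied from a real dmzj page:

import json
import re

# A made-up stand-in for the inline <script> text the spider pulls out of the page.
sample_script = 'var data = {"page_url":["https://example.invalid/ch01/0001.jpg","https://example.invalid/ch01/0002.jpg"],"chapter_type":0};'
pic_urls = json.loads(re.search(r'.*?"page_url":(.*?),"chapter_type".*', sample_script).group(1))
print(pic_urls)
# ['https://example.invalid/ch01/0001.jpg', 'https://example.invalid/ch01/0002.jpg']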
/dmzj_start.py:
--------------------------------------------------------------------------------
from scrapy import cmdline
import sys

if __name__ == '__main__':
    # Note: the crawl command is built by splitting on whitespace, so a comic name
    # containing spaces would be broken into several arguments.
    manhua_url = input("Enter the comic's URL: ").strip()
    manhua_names = input("Enter the comic's name: ").strip()
    cmdline.execute(str("scrapy crawl dmzj_spider -a manhua_url=%s -a manhua_names=%s" % (manhua_url, manhua_names)).split())
    # cmdline.execute(str("scrapy crawl dmzj_spider -a manhua_url=%s -a manhua_names=%s" % (sys.argv[1], sys.argv[2])).split())
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = dmzj_scrapy.settings

[deploy]
#url = http://localhost:6800/
project = dmzj_scrapy
--------------------------------------------------------------------------------
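A minimal programmatic alternative to dmzj_start.py, sketched under the assumption that it is run from the repository root so that scrapy.cfg and settings.py are picked up; the URL and comic name passed to crawl() are placeholders:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dmzj_scrapy.spiders.dmzj_spider import DmzjSpiderSpider

if __name__ == '__main__':
    # Load the project settings (ITEM_PIPELINES, DOWNLOAD_DELAY, default headers, ...)
    # and run the spider inside this process instead of shelling out to "scrapy crawl".
    process = CrawlerProcess(get_project_settings())
    process.crawl(DmzjSpiderSpider,
                  manhua_url='https://m.dmzj.com/info/placeholder.html',
                  manhua_names='placeholder')
    process.start()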