├── README.md
├── dmzj_scrapy
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── dmzj_spider.py
├── dmzj_start.py
└── scrapy.cfg

/README.md:
--------------------------------------------------------------------------------
# dmzj_scrapy
Downloads comics from 动漫之家 (dmzj).
Run dmzj_start.py to start downloading a comic.
--------------------------------------------------------------------------------
/dmzj_scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weakmaple/dmzj_scrapy/a879f067429dc3f526df9746141fcd30986572a0/dmzj_scrapy/__init__.py
--------------------------------------------------------------------------------
/dmzj_scrapy/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DmzjScrapyItem(scrapy.Item):
    # pic_urls: the URLs of every image in one chapter
    # title: the name of that chapter ("Chapter 1", "Chapter 2", ...)
    # big_title: the name of the comic itself
    pic_urls = scrapy.Field()
    title = scrapy.Field()
    big_title = scrapy.Field()
--------------------------------------------------------------------------------
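For orientation, a filled-in DmzjScrapyItem handed to the pipeline should look roughly like the sketch below; the URLs and names are illustrative placeholders, not values taken from a real response:

item = DmzjScrapyItem(
    pic_urls=['https://example.invalid/comic/ch01/0001.jpg',
              'https://example.invalid/comic/ch01/0002.jpg'],  # one chapter's images
    title='Chapter 01',           # chapter title scraped from the page
    big_title='Some Comic Name',  # comic name supplied on the command line
)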
/dmzj_scrapy/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Optional Selenium-based downloader middleware, left disabled. Enable it together
# with the commented DOWNLOADER_MIDDLEWARES entry in settings.py if plain requests
# stop returning the inline JavaScript that the spider parses.

# from selenium import webdriver
# import scrapy
# import time
#
# class DmzjScrapyDownloaderMiddleware(object):
#     def __init__(self):
#         self.driver = webdriver.Chrome()
#
#     def process_request(self, request, spider):
#         self.driver.get(request.url)
#         time.sleep(2)
#         source = self.driver.page_source
#         # source = b'%s' % (source)
#         return scrapy.http.HtmlResponse(url=self.driver.current_url, body=source,
#                                         request=request, encoding='utf-8')
--------------------------------------------------------------------------------
/dmzj_scrapy/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os

import requests


class DmzjScrapyPipeline(object):

    def process_item(self, item, spider):
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'cookie': 'UM_distinctid=16706eb86dcd8-02bc2f02cd5057-333b5602-100200-16706eb86dd6e5; bdshare_firstime=1542009358470; laravel_session=eyJpdiI6InJZMTQrZFlRVFphOVlXU2R0dFwvRnl3PT0iLCJ2YWx1ZSI6IjFJZGdacHA5YkJSRzhHSWJoVklwaTNiaTl0dHRCMzFwSzRGT0oxWm81MXl5aGtkT0lDVHFQRTlBcUJkN3hEQ2xWXC9yZWRxbjJzNSthVWg2VFBVbXRUZz09IiwibWFjIjoiZmZiY2Q0YjkyZTAwMzBjNDk4YjAwZmVkYTg1NzY3NmY4MzU5YjM2NjQzZTdlNTExMWI3ZmJiYTMyNjhlN2YwMSJ9; CNZZDATA1255781707=1633799429-1542006327-%7C1542011727; CNZZDATA1000465408=1899900830-1542005345-%7C1542012974',
            'referer': 'https://m.dmzj.com/info/zuizhongwochengleni.html'
        }

        # path = os.path.dirname(os.path.dirname(__file__))
        # path is the download directory; change it if needed (default: current directory)
        path = './'

        # make sure the comic's own directory exists
        manhua_name = os.path.join(path, item['big_title'])
        if not os.path.exists(manhua_name):
            os.mkdir(manhua_name)
        # make sure the chapter's directory exists
        catapot_name = os.path.join(manhua_name, item['title'])
        if not os.path.exists(catapot_name):
            os.mkdir(catapot_name)
        # download every image of this chapter one by one;
        # capter is the file name of the individual image
        for pic_url in item['pic_urls']:
            capter = pic_url.split('/')[-1]
            response = requests.get(pic_url, headers=headers).content
            with open(os.path.join(catapot_name, capter), 'wb') as fp:
                fp.write(response)
            print("Downloading " + item['big_title'] + " " + item['title'] + " " + capter)
        return item
--------------------------------------------------------------------------------
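With the default path = './', the pipeline above should therefore produce a layout like the following when run from the repository root; the comic, chapter, and file names are illustrative placeholders:

./
└── <big_title>
    └── <title>
        ├── 0001.jpg
        └── 0002.jpg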
/dmzj_scrapy/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for dmzj_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'dmzj_scrapy'

SPIDER_MODULES = ['dmzj_scrapy.spiders']
NEWSPIDER_MODULE = 'dmzj_scrapy.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dmzj_scrapy (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'dmzj_scrapy.middlewares.DmzjScrapySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'dmzj_scrapy.middlewares.DmzjScrapyDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'dmzj_scrapy.pipelines.DmzjScrapyPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/dmzj_scrapy/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/dmzj_scrapy/spiders/dmzj_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import json
import re

import scrapy

from dmzj_scrapy.items import DmzjScrapyItem


class DmzjSpiderSpider(scrapy.Spider):
    name = 'dmzj_spider'
    allowed_domains = ['m.dmzj.com']
    start_urls = ['http://m.dmzj.com/']

    def __init__(self, manhua_url=None, manhua_names=None, *args, **kwargs):
        super(DmzjSpiderSpider, self).__init__(*args, **kwargs)
        self.url = manhua_url
        self.manhua_name = manhua_names

    def start_requests(self):
        yield scrapy.http.Request(url=self.url, callback=self.first_url_parse)

    # Convert the desktop-site URL into the mobile-site info-page URL
    def first_url_parse(self, response):
        url = 'https://m.dmzj.com/info/%s.html'
        if 'info' in self.url:
            manhua_id_1 = re.split('/', self.url)[-1]
            manhua_id = re.split(r'\.', manhua_id_1)[0]
        else:
            manhua_id_1 = response.xpath('//head/script/text()').get()
            manhua_id = re.search(r'.*?g_current_id = "(.*?)";.*', manhua_id_1).group(1)
        main_url = url % manhua_id
        yield scrapy.http.Request(url=main_url, callback=self.parse_total)

    # Get the comic's chapter catalogue
    def parse_total(self, response):
        catalog = response.xpath('//body/script[@type="text/javascript"]/text()').getall()[1]
        catalog = re.sub(r'},{"title":.*?,"data":\[(.*?)\]', '', catalog)
        catalog_list = json.loads(re.search(r'"data":(.*?)}]\);.*', catalog).group(1))
        for ls in catalog_list:
            # detail_url is the page of a single chapter
            detail_url = 'https://m.dmzj.com/view/%s/%s.html' % (ls['comic_id'], ls['id'])
            yield scrapy.http.Request(url=detail_url, callback=self.parse_detail)

    # Collect every image URL of the chapter and hand them to the pipeline
    def parse_detail(self, response):
        title = response.xpath('//a[@class="BarTit"]/text()').get()
        pic_urls = response.xpath('//body/script[@type="text/javascript"]/text()').getall()[1]
        pic_urls = json.loads(re.search(r'.*?"page_url":(.*?),"chapter_type".*', pic_urls).group(1))
        item = DmzjScrapyItem(pic_urls=pic_urls, title=title, big_title=self.manhua_name)
        yield item
--------------------------------------------------------------------------------
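To make the extraction in parse_detail() concrete, the snippet below is a self-contained sketch; the script fragment is fabricated to match the shape the regular expression expects, not copied from a real dmzj page:

import json
import re

# A made-up stand-in for the inline <script> text the spider pulls out of the page.
sample_script = 'var data = {"page_url":["https://example.invalid/ch01/0001.jpg","https://example.invalid/ch01/0002.jpg"],"chapter_type":0};'
pic_urls = json.loads(re.search(r'.*?"page_url":(.*?),"chapter_type".*', sample_script).group(1))
print(pic_urls)
# ['https://example.invalid/ch01/0001.jpg', 'https://example.invalid/ch01/0002.jpg']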
/dmzj_start.py:
--------------------------------------------------------------------------------
from scrapy import cmdline
import sys

if __name__ == '__main__':
    # Note: the crawl command is built by splitting on whitespace, so a comic name
    # containing spaces would be broken into several arguments.
    manhua_url = input("Enter the comic's URL: ").strip()
    manhua_names = input("Enter the comic's name: ").strip()
    cmdline.execute(str("scrapy crawl dmzj_spider -a manhua_url=%s -a manhua_names=%s" % (manhua_url, manhua_names)).split())
    # cmdline.execute(str("scrapy crawl dmzj_spider -a manhua_url=%s -a manhua_names=%s" % (sys.argv[1], sys.argv[2])).split())
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = dmzj_scrapy.settings

[deploy]
#url = http://localhost:6800/
project = dmzj_scrapy
--------------------------------------------------------------------------------
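A minimal programmatic alternative to dmzj_start.py, sketched under the assumption that it is run from the repository root so that scrapy.cfg and settings.py are picked up; the URL and comic name passed to crawl() are placeholders:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from dmzj_scrapy.spiders.dmzj_spider import DmzjSpiderSpider

if __name__ == '__main__':
    # Load the project settings (ITEM_PIPELINES, DOWNLOAD_DELAY, default headers, ...)
    # and run the spider inside this process instead of shelling out to "scrapy crawl".
    process = CrawlerProcess(get_project_settings())
    process.crawl(DmzjSpiderSpider,
                  manhua_url='https://m.dmzj.com/info/placeholder.html',
                  manhua_names='placeholder')
    process.start()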