├── .gitignore
├── README.md
├── dist
    └── zxzxspcrawler.exe
└── zxzxspcrawler.py


/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/ignore-files/ for more about ignoring files.
2 | .idea


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 樱桃社爬虫
 2 | 福利吧-樱桃社下载器
 3 | ## Run
 4 | 安装python3
 5 | 
 6 | 安装依赖
 7 | 
 8 | ```
 9 | pip3 install requests -i https://pypi.doubanio.com/simple/
10 | pip3 install beautifulsoup4 -i https://pypi.doubanio.com/simple/
11 | pip3 install lxml -i https://pypi.doubanio.com/simple/
12 | ```
13 | 运行py文件
14 | 
15 | 输入你要从哪一页开始到哪一页结束，
16 | 
17 | 它会自动创建文件夹并多线程下载页面中每一期的**所有图片**。
18 | ## Release
19 | 下载打包好的exe文件一键运行
20 | 
21 | -  [zxzxspcrawler.exe](https://github.com/wanglu58/zxzxspcrawler/releases)
22 | 


--------------------------------------------------------------------------------
/dist/zxzxspcrawler.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanglu58/zxzxspcrawler/443abd358cd2f786aa05afae01c0dd52d62dc152/dist/zxzxspcrawler.exe


--------------------------------------------------------------------------------
/zxzxspcrawler.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import datetime
  4 | import requests
  5 | from bs4 import BeautifulSoup
  6 | from concurrent.futures import ThreadPoolExecutor, as_completed
  7 | requests.packages.urllib3.disable_warnings()
  8 | 
  9 | 
 10 | def GetData(page):
 11 |     json_dict = {}
 12 |     HEADERS = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
 13 |                'Accept-Language': 'zh-CN,zh;q=0.8',
 14 |                'Accept-Encoding': 'gzip, deflate',
 15 |                'Connection': 'close',}
 16 |     headers = HEADERS
 17 |     headers['user-agent'] = "Mozilla/5.0+(Windows+NT+6.2;+WOW64)+AppleWebKit/537.36+" \
 18 |                             "(KHTML,+like+Gecko)+Chrome/45.0.2454.101+Safari/537.36"
 19 |     response = requests.get('https://www.zxzxsp.com/page/{}'.format(page),
 20 |                             allow_redirects=True,timeout=(30,30),headers=headers,verify=False)
 21 |     soup = BeautifulSoup(response.content,'lxml')
 22 |     data = soup.find_all('h2')
 23 |     for j in data[:-1]:
 24 |         url = j.a.get('href')
 25 |         t = 0
 26 |         while t <3 :
 27 |             try:
 28 |                 response = requests.get(url,allow_redirects=True,timeout=(30,30),headers=headers,verify=False)
 29 |                 soup = BeautifulSoup(response.content, 'lxml')
 30 |                 title = soup.h1.string
 31 |                 div_class = soup.find('div', attrs={'class': 'entry-content u-text-format u-clearfix'})
 32 |                 if div_class:
 33 |                     div = div_class.find_all('img')
 34 |                     for img in div:
 35 |                         json_dict.update({img.attrs['src']: [div.index(img) + 1, title]})
 36 |                 else:
 37 |                     div_id = soup.find('div', attrs={'id': 'gallery-1'})
 38 |                     if div_id:
 39 |                         div = div_id.find_all('img')
 40 |                         for img in div:
 41 |                             json_dict.update({img.attrs['src']: [div.index(img) + 1, title]})
 42 |                 t = 3
 43 |             except requests.exceptions.RequestException as e:
 44 |                 if t == 2:
 45 |                     print(e)
 46 |                     print('这期无法访问：{}。'.format(url))
 47 |                 else:
 48 |                     time.sleep(2)
 49 |                 t += 1
 50 | 
 51 |     return json_dict
 52 | 
 53 | 
 54 | def GetDownload(json_dict):
 55 |     path = os.getcwd()
 56 |     HEADERS = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
 57 |                'Accept-Language': 'zh-CN,zh;q=0.8',
 58 |                'Accept-Encoding': 'gzip, deflate',
 59 |                'Connection': 'close', }
 60 |     headers = HEADERS
 61 |     headers['user-agent'] = "Mozilla/5.0+(Windows+NT+6.2;+WOW64)+AppleWebKit/537.36+" \
 62 |                             "(KHTML,+like+Gecko)+Chrome/45.0.2454.101+Safari/537.36"
 63 |     for src in json_dict:
 64 |         number = json_dict[src][0]
 65 |         title = json_dict[src][1]
 66 |         mkdirpath = path + '/{}'.format(title)
 67 |         os.makedirs(mkdirpath,exist_ok=True)
 68 |         t = 0
 69 |         while t < 3:
 70 |             try:
 71 |                 img_file = requests.get(src,allow_redirects=True,timeout=(30,30),headers=headers,verify=False)
 72 |                 if img_file.status_code == 200:
 73 |                     img_type = img_file.headers.get('content-type')
 74 |                     if img_type.split('/')[-1] == 'jpeg':
 75 |                         downPath = mkdirpath + '/{}.jpg'.format(number)
 76 |                     else:
 77 |                         downPath = mkdirpath + '/{}.{}'.format(number,img_type.split('/')[-1])
 78 |                     with open(downPath, 'wb') as f:
 79 |                         f.write(img_file.content)
 80 |                         t = 3
 81 |                 else:
 82 |                     if t == 2:
 83 |                         print('{}第{}张图片无法下载。地址：{}'.format(title,number,src))
 84 |                     else:
 85 |                         time.sleep(2)
 86 |                     t += 1
 87 |             except requests.exceptions.RequestException as e:
 88 |                 if t == 2:
 89 |                     print(e)
 90 |                     print('{}第{}张图片无法下载。地址：{}'.format(title,number,src))
 91 |                 else:
 92 |                     time.sleep(2)
 93 |                 t += 1
 94 | 
 95 | 
 96 | if __name__ == '__main__':
 97 |     print('福利吧-樱桃社下载器\n')
 98 |     print('获取和下载需要一定时间，根据您的网速决定，请耐心等待。')
 99 |     print('一次性爬取的页数不宜过大，10页以内，防止下载失败。')
100 |     print('窗口有可能卡住，CMD窗口自身原因。您可以按回车键刷新输出。\n')
101 |     start = input("请输入你要从哪一页开始爬取（例：第1页：输入1）\n")
102 |     while start.isdigit() == False or start == '':
103 |         print('请好好输！\n')
104 |         start = input("请输入你要从哪一页开始爬取（例：第1页：输入1）\n")
105 |     end = input("请输入你要从哪一页结束爬取（例：第2页：输入2）\n")
106 |     while end.isdigit() == False or end == '' or end < start:
107 |         print('请好好输！\n')
108 |         end = input("请输入你要从哪一页结束爬取（例：第2页：输入2）\n")
109 |     start = int(start)
110 |     end = int(end) + 1
111 |     starttime = datetime.datetime.now()
112 |     print('获取图片地址，请稍候。。。。。。\n')
113 |     with ThreadPoolExecutor() as executor:
114 |         task_list = []
115 |         json_dict = {}
116 |         for i in range(start,end):
117 |             task = executor.submit(GetData,i)
118 |             task_list.append(task)
119 |         for res in as_completed(task_list):
120 |             json_dict.update(res.result())
121 |     print('图片地址获取完毕，开始下载，请稍候。。。。。。\n')
122 |     with ThreadPoolExecutor() as executor:
123 |         task_list = []
124 |         for key in json_dict:
125 |             task = executor.submit(GetDownload,{key:json_dict[key]})
126 |             task_list.append(task)
127 |         for res in as_completed(task_list):
128 |             res.result()
129 |     endtime = datetime.datetime.now()
130 |     print('已全部下载完成！请在当前路径下查看！用时：{}秒'.format(int((endtime - starttime).total_seconds())))
131 |     print('Enjoy it')
132 |     print('Powered by 所向披靡\n')
133 |     key = input('按c键退出\n')
134 |     while key != 'c':
135 |         key = input('按c键退出\n')


--------------------------------------------------------------------------------