├── .gitignore ├── README.md ├── dist └── zxzxspcrawler.exe └── zxzxspcrawler.py /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/ignore-files/ for more about ignoring files. 2 | .idea -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 樱桃社爬虫 2 | 福利吧-樱桃社下载器 3 | ## Run 4 | 安装python3 5 | 6 | 安装依赖 7 | 8 | ``` 9 | pip3 install requests -i https://pypi.doubanio.com/simple/ 10 | pip3 install beautifulsoup4 -i https://pypi.doubanio.com/simple/ 11 | pip3 install lxml -i https://pypi.doubanio.com/simple/ 12 | ``` 13 | 运行py文件 14 | 15 | 输入你要从哪一页开始到哪一页结束, 16 | 17 | 它会自动创建文件夹并多线程下载页面中每一期的**所有图片**。 18 | ## Release 19 | 下载打包好的exe文件一键运行 20 | 21 | - [zxzxspcrawler.exe](https://github.com/wanglu58/zxzxspcrawler/releases) 22 | -------------------------------------------------------------------------------- /dist/zxzxspcrawler.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanglu58/zxzxspcrawler/443abd358cd2f786aa05afae01c0dd52d62dc152/dist/zxzxspcrawler.exe -------------------------------------------------------------------------------- /zxzxspcrawler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import datetime 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | requests.packages.urllib3.disable_warnings() 8 | 9 | 10 | def GetData(page): 11 | json_dict = {} 12 | HEADERS = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9', 13 | 'Accept-Language': 'zh-CN,zh;q=0.8', 14 | 'Accept-Encoding': 'gzip, deflate', 15 | 'Connection': 'close',} 16 | headers = HEADERS 17 | headers['user-agent'] = "Mozilla/5.0+(Windows+NT+6.2;+WOW64)+AppleWebKit/537.36+" \ 18 | "(KHTML,+like+Gecko)+Chrome/45.0.2454.101+Safari/537.36" 19 | response = requests.get('https://www.zxzxsp.com/page/{}'.format(page), 20 | allow_redirects=True,timeout=(30,30),headers=headers,verify=False) 21 | soup = BeautifulSoup(response.content,'lxml') 22 | data = soup.find_all('h2') 23 | for j in data[:-1]: 24 | url = j.a.get('href') 25 | t = 0 26 | while t <3 : 27 | try: 28 | response = requests.get(url,allow_redirects=True,timeout=(30,30),headers=headers,verify=False) 29 | soup = BeautifulSoup(response.content, 'lxml') 30 | title = soup.h1.string 31 | div_class = soup.find('div', attrs={'class': 'entry-content u-text-format u-clearfix'}) 32 | if div_class: 33 | div = div_class.find_all('img') 34 | for img in div: 35 | json_dict.update({img.attrs['src']: [div.index(img) + 1, title]}) 36 | else: 37 | div_id = soup.find('div', attrs={'id': 'gallery-1'}) 38 | if div_id: 39 | div = div_id.find_all('img') 40 | for img in div: 41 | json_dict.update({img.attrs['src']: [div.index(img) + 1, title]}) 42 | t = 3 43 | except requests.exceptions.RequestException as e: 44 | if t == 2: 45 | print(e) 46 | print('这期无法访问:{}。'.format(url)) 47 | else: 48 | time.sleep(2) 49 | t += 1 50 | 51 | return json_dict 52 | 53 | 54 | def GetDownload(json_dict): 55 | path = os.getcwd() 56 | HEADERS = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9', 57 | 'Accept-Language': 'zh-CN,zh;q=0.8', 58 | 'Accept-Encoding': 'gzip, deflate', 59 | 'Connection': 'close', } 60 | headers = HEADERS 61 | headers['user-agent'] = "Mozilla/5.0+(Windows+NT+6.2;+WOW64)+AppleWebKit/537.36+" \ 62 | "(KHTML,+like+Gecko)+Chrome/45.0.2454.101+Safari/537.36" 63 | for src in json_dict: 64 | number = json_dict[src][0] 65 | title = json_dict[src][1] 66 | mkdirpath = path + '/{}'.format(title) 67 | os.makedirs(mkdirpath,exist_ok=True) 68 | t = 0 69 | while t < 3: 70 | try: 71 | img_file = requests.get(src,allow_redirects=True,timeout=(30,30),headers=headers,verify=False) 72 | if img_file.status_code == 200: 73 | img_type = img_file.headers.get('content-type') 74 | if img_type.split('/')[-1] == 'jpeg': 75 | downPath = mkdirpath + '/{}.jpg'.format(number) 76 | else: 77 | downPath = mkdirpath + '/{}.{}'.format(number,img_type.split('/')[-1]) 78 | with open(downPath, 'wb') as f: 79 | f.write(img_file.content) 80 | t = 3 81 | else: 82 | if t == 2: 83 | print('{}第{}张图片无法下载。地址:{}'.format(title,number,src)) 84 | else: 85 | time.sleep(2) 86 | t += 1 87 | except requests.exceptions.RequestException as e: 88 | if t == 2: 89 | print(e) 90 | print('{}第{}张图片无法下载。地址:{}'.format(title,number,src)) 91 | else: 92 | time.sleep(2) 93 | t += 1 94 | 95 | 96 | if __name__ == '__main__': 97 | print('福利吧-樱桃社下载器\n') 98 | print('获取和下载需要一定时间,根据您的网速决定,请耐心等待。') 99 | print('一次性爬取的页数不宜过大,10页以内,防止下载失败。') 100 | print('窗口有可能卡住,CMD窗口自身原因。您可以按回车键刷新输出。\n') 101 | start = input("请输入你要从哪一页开始爬取(例:第1页:输入1)\n") 102 | while start.isdigit() == False or start == '': 103 | print('请好好输!\n') 104 | start = input("请输入你要从哪一页开始爬取(例:第1页:输入1)\n") 105 | end = input("请输入你要从哪一页结束爬取(例:第2页:输入2)\n") 106 | while end.isdigit() == False or end == '' or end < start: 107 | print('请好好输!\n') 108 | end = input("请输入你要从哪一页结束爬取(例:第2页:输入2)\n") 109 | start = int(start) 110 | end = int(end) + 1 111 | starttime = datetime.datetime.now() 112 | print('获取图片地址,请稍候。。。。。。\n') 113 | with ThreadPoolExecutor() as executor: 114 | task_list = [] 115 | json_dict = {} 116 | for i in range(start,end): 117 | task = executor.submit(GetData,i) 118 | task_list.append(task) 119 | for res in as_completed(task_list): 120 | json_dict.update(res.result()) 121 | print('图片地址获取完毕,开始下载,请稍候。。。。。。\n') 122 | with ThreadPoolExecutor() as executor: 123 | task_list = [] 124 | for key in json_dict: 125 | task = executor.submit(GetDownload,{key:json_dict[key]}) 126 | task_list.append(task) 127 | for res in as_completed(task_list): 128 | res.result() 129 | endtime = datetime.datetime.now() 130 | print('已全部下载完成!请在当前路径下查看!用时:{}秒'.format(int((endtime - starttime).total_seconds()))) 131 | print('Enjoy it') 132 | print('Powered by 所向披靡\n') 133 | key = input('按c键退出\n') 134 | while key != 'c': 135 | key = input('按c键退出\n') --------------------------------------------------------------------------------