├── .gitignore
├── README.md
├── dist
│   └── fulibuscrawler.exe
└── fulibuscrawler.py

/.gitignore:
--------------------------------------------------------------------------------
# See https://help.github.com/ignore-files/ for more about ignoring files.
.idea
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Fulibuscrawler
A downloader for the 福利吧 (Fuliba) "福利汇总" roundup posts.
## Run
Install Python 3.

Install the dependencies:

```
pip3 install requests -i https://pypi.doubanio.com/simple/
pip3 install beautifulsoup4 -i https://pypi.doubanio.com/simple/
pip3 install lxml -i https://pypi.doubanio.com/simple/
```
Run the .py file.

Enter the issue to start from and the issue to end at.

The script will create the folders automatically and use multiple threads to download the **images** and **popular video links** from every page of each roundup post.
## Release
Download the prebuilt exe and run it with one click:
- [fulibuscrawler.exe](https://github.com/wanglu58/fulibuscrawler/releases)
--------------------------------------------------------------------------------

/dist/fulibuscrawler.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanglu58/fulibuscrawler/c10419d5fb50b14c19f58212a68ba9a20c407d49/dist/fulibuscrawler.exe
--------------------------------------------------------------------------------
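For orientation before the full script, here is a minimal sketch of the fetch-and-parse step it performs for one issue page, assuming the same `https://fulibus.net/<issue>.html` URL scheme and the `lxml` parser used below; the issue number is illustrative:

```python
import requests
from bs4 import BeautifulSoup

# Illustrative issue number; issues follow the YYYYNNN pattern, e.g. 2020001 = issue 1 of 2020.
issue = 2020001
response = requests.get('https://fulibus.net/{}.html'.format(issue), timeout=30)
soup = BeautifulSoup(response.content, 'lxml')

# The crawler looks for images inside the <p> tags of the article body.
image_urls = [img.get('src') for p in soup.find_all('p') for img in p.find_all('img')]
print('{} images found on page 1'.format(len(image_urls)))
```

The full fulibuscrawler.py below extends this to every page of an issue, collects the popular-video links as well, and downloads everything concurrently.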
/fulibuscrawler.py:
--------------------------------------------------------------------------------
import os
import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup


def GetUrl(number):
    """Collect every image URL (and the popular-video list) for one issue.

    Returns a dict mapping 'issue-page-index' keys to image URLs, plus one key
    equal to the issue number whose value is the list of popular-video links.
    """
    urls = {}
    video_list = []
    video_number = 0
    i = 1
    try:
        response = requests.get('https://fulibus.net/{}.html'.format(number))
        soup = BeautifulSoup(response.content, 'lxml')
        # The paging widget links to pages 2..N, so the page count is len + 2
        # (range() below excludes its upper bound). Fall back to a single page
        # if the widget is missing.
        paging = soup.find('div', attrs={'class': 'article-paging'})
        page = len(paging.find_all('a')) + 2 if paging else 2
        for i in range(1, page):
            if i != 1:
                response = requests.get('https://fulibus.net/{}.html/{}'.format(number, i))
                soup = BeautifulSoup(response.content, 'lxml')
            data = soup.find_all('p')
            img_number = 0
            for d in data:
                if i == 1:
                    # The popular-video links only appear on the first page.
                    for item in d.find_all('a'):
                        if item.get('rel') and item.string != item.get('href') and item.string != '福利吧':
                            video_number += 1
                            video_list.append({
                                'number': video_number,
                                'title': item.string,
                                'href': item['href']
                            })
                for img in d.find_all('img'):
                    img_number += 1
                    picture_url = img.get('src')
                    route_url = '{}-{}-{}'.format(number, i, img_number)
                    urls[route_url] = picture_url
        if video_list:
            urls['{}'.format(number)] = video_list
    except requests.exceptions.RequestException as e:
        print(e)
        print('{0}期网站无法加载,链接:https://fulibus.net/{0}.html/{1}'.format(number, i))
    return urls


def GetData(route_url, data):
    """Download one image, or write the popular-video list for one issue.

    route_url is either 'issue-page-index' (data is an image URL) or just the
    issue number (data is the video list collected by GetUrl).
    """
    route_url_list = route_url.split('-')
    path = os.getcwd()
    if len(route_url_list) == 3:
        # Image download: save to ./<issue>/<page>/<index>.<ext>
        path = path + '/{}/{}'.format(route_url_list[0], route_url_list[1])
        os.makedirs(path, exist_ok=True)
        try:
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'user-agent': "Mozilla/5.0+(Windows+NT+6.2;+WOW64)+AppleWebKit/537.36+"
                              "(KHTML,+like+Gecko)+Chrome/45.0.2454.101+Safari/537.36",
            }
            img_file = requests.get(data, allow_redirects=False, timeout=(60, 60), headers=headers)
            if img_file.status_code == 200:
                # Pick the file extension from the Content-Type header.
                img_type = img_file.headers.get('content-type')
                if img_type.split('/')[-1] == 'jpeg':
                    downPath = path + '/{}.jpg'.format(route_url_list[2])
                else:
                    downPath = path + '/{}.{}'.format(route_url_list[2], img_type.split('/')[-1])
                with open(downPath, 'wb') as f:
                    f.write(img_file.content)
                return '{}期第{}页第{}张图片下载成功,链接:{}'.format(
                    route_url_list[0], route_url_list[1], route_url_list[2], data)
            else:
                return '{}期第{}页第{}张图片下载失败,链接:{}'.format(
                    route_url_list[0], route_url_list[1], route_url_list[2], data)
        except requests.exceptions.RequestException as e:
            print(e)
            return '{}期第{}页第{}张图片下载失败,链接:{}'.format(
                route_url_list[0], route_url_list[1], route_url_list[2], data)
    else:
        # Popular-video list: write one title/link pair per entry to 热门视频.txt.
        path = path + '/{}'.format(route_url_list[0])
        os.makedirs(path, exist_ok=True)
        with open(path + '/热门视频.txt', 'w', encoding='utf-8') as f:
            for i in data:
                f.write('{}、{}'.format(i['number'], i['title']))
                f.write('\n')
                f.write(i['href'])
                f.write('\n')
        return '{}期热门视频链接保存成功。'.format(route_url)


if __name__ == '__main__':
    # Ask for the first and last issue numbers (YYYYNNN, e.g. 2020001).
    issue_start = input("请输入从哪一期开始爬取(例:2020年第1期:输入2020001)\n")
    while not issue_start.isdigit():
        print('请好好输!\n')
        issue_start = input("请输入从哪一期开始爬取(例:2020年第1期:输入2020001)\n")
    issue_end = input("请输入从哪一期结束爬取(例:2020年第60期:输入2020060)\n")
    while not issue_end.isdigit() or int(issue_end) < int(issue_start):
        print('请好好输!\n')
        issue_end = input("请输入从哪一期结束爬取(例:2020年第60期:输入2020060)\n")
    issue_start = int(issue_start)
    issue_end = int(issue_end) + 1
    starttime = datetime.datetime.now()
    print('开始下载,请稍候。。。。。。')

    # Stage 1: collect every image URL and video list, one thread per issue.
    task_list = []
    init_data = {}
    with ThreadPoolExecutor() as executor:
        for i in range(issue_start, issue_end):
            task = executor.submit(GetUrl, i)
            task_list.append(task)
        for res in as_completed(task_list):
            init_data.update(res.result())

    # Stage 2: download everything that was collected, one thread per item.
    task_list = []
    with ThreadPoolExecutor() as executor:
        for key in init_data:
            task = executor.submit(GetData, key, init_data[key])
            task_list.append(task)
        for res in as_completed(task_list):
            print(res.result())

    endtime = datetime.datetime.now()
    print('已全部下载完成!请在当前路径下查看!用时:{}秒'.format(int((endtime - starttime).total_seconds())))
    print('Enjoy it')
    print('Powered by 所向披靡\n')
    key = input('按回车键退出\n')
    while key != '':
        key = input('按回车键退出\n')
--------------------------------------------------------------------------------
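The script is interactive, but since the crawling and downloading logic lives in `GetUrl` and `GetData`, both can also be driven directly from another script. A minimal sketch, assuming `fulibuscrawler.py` sits on the import path and using an illustrative issue number:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

from fulibuscrawler import GetUrl, GetData

# Collect every image URL (and the popular-video list) for one issue, then
# download them all concurrently, mirroring the __main__ block for a single issue.
issue_data = GetUrl(2020001)
with ThreadPoolExecutor() as executor:
    tasks = [executor.submit(GetData, key, value) for key, value in issue_data.items()]
    for task in as_completed(tasks):
        print(task.result())
```

Because the interactive prompts sit under the `if __name__ == '__main__':` guard, importing the module does not trigger them.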