├── .gitignore
├── README.md
├── dist
│   └── fulibuscrawler.exe
└── fulibuscrawler.py

/.gitignore:
--------------------------------------------------------------------------------
# See https://help.github.com/ignore-files/ for more about ignoring files.
.idea
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Fulibuscrawler
A downloader for the 福利吧 (Fuliba) "福利汇总" roundup posts.
## Run
Install Python 3.

Install the dependencies:

```
pip3 install requests -i https://pypi.doubanio.com/simple/
pip3 install beautifulsoup4 -i https://pypi.doubanio.com/simple/
pip3 install lxml -i https://pypi.doubanio.com/simple/
```
Run the .py file.

Enter the issue to start from and the issue to end at.

The script will create the folders automatically and use multiple threads to download the **images** and **popular video links** from every page of each roundup post.
## Release
Download the prebuilt exe and run it with one click:
- [fulibuscrawler.exe](https://github.com/wanglu58/fulibuscrawler/releases)
--------------------------------------------------------------------------------

/dist/fulibuscrawler.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanglu58/fulibuscrawler/c10419d5fb50b14c19f58212a68ba9a20c407d49/dist/fulibuscrawler.exe
--------------------------------------------------------------------------------
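For orientation before the full script, here is a minimal sketch of the fetch-and-parse step it performs for one issue page, assuming the same `https://fulibus.net/<issue>.html` URL scheme and the `lxml` parser used below; the issue number is illustrative:

```python
import requests
from bs4 import BeautifulSoup

# Illustrative issue number; issues follow the YYYYNNN pattern, e.g. 2020001 = issue 1 of 2020.
issue = 2020001
response = requests.get('https://fulibus.net/{}.html'.format(issue), timeout=30)
soup = BeautifulSoup(response.content, 'lxml')

# The crawler looks for images inside the <p> tags of the article body.
image_urls = [img.get('src') for p in soup.find_all('p') for img in p.find_all('img')]
print('{} images found on page 1'.format(len(image_urls)))
```

The full fulibuscrawler.py below extends this to every page of an issue, collects the popular-video links as well, and downloads everything concurrently.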
/fulibuscrawler.py:
--------------------------------------------------------------------------------
import os
import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup


def GetUrl(number):
    """Collect every image URL (and the popular-video list) for one issue.

    Returns a dict mapping 'issue-page-index' keys to image URLs, plus one key
    equal to the issue number whose value is the list of popular-video links.
    """
    urls = {}
    video_list = []
    video_number = 0
    i = 1
    try:
        response = requests.get('https://fulibus.net/{}.html'.format(number))
        soup = BeautifulSoup(response.content, 'lxml')
        # The paging widget links to pages 2..N, so the page count is len + 2
        # (range() below excludes its upper bound). Fall back to a single page
        # if the widget is missing.
        paging = soup.find('div', attrs={'class': 'article-paging'})
        page = len(paging.find_all('a')) + 2 if paging else 2
        for i in range(1, page):
            if i != 1:
                response = requests.get('https://fulibus.net/{}.html/{}'.format(number, i))
                soup = BeautifulSoup(response.content, 'lxml')
            data = soup.find_all('p')
            img_number = 0
            for d in data:
                if i == 1:
                    # The popular-video links only appear on the first page.
                    for item in d.find_all('a'):
                        if item.get('rel') and item.string != item.get('href') and item.string != '福利吧':
                            video_number += 1
                            video_list.append({
                                'number': video_number,
                                'title': item.string,
                                'href': item['href']
                            })
                for img in d.find_all('img'):
                    img_number += 1
                    picture_url = img.get('src')
                    route_url = '{}-{}-{}'.format(number, i, img_number)
                    urls[route_url] = picture_url
        if video_list:
            urls['{}'.format(number)] = video_list
    except requests.exceptions.RequestException as e:
        print(e)
        print('{0}期网站无法加载,链接:https://fulibus.net/{0}.html/{1}'.format(number, i))
    return urls


def GetData(route_url, data):
    """Download one image, or write the popular-video list for one issue.

    route_url is either 'issue-page-index' (data is an image URL) or just the
    issue number (data is the video list collected by GetUrl).
    """
    route_url_list = route_url.split('-')
    path = os.getcwd()
    if len(route_url_list) == 3:
        # Image download: save to ./<issue>/<page>/<index>.<ext>
        path = path + '/{}/{}'.format(route_url_list[0], route_url_list[1])
        os.makedirs(path, exist_ok=True)
        try:
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'user-agent': "Mozilla/5.0+(Windows+NT+6.2;+WOW64)+AppleWebKit/537.36+"
                              "(KHTML,+like+Gecko)+Chrome/45.0.2454.101+Safari/537.36",
            }
            img_file = requests.get(data, allow_redirects=False, timeout=(60, 60), headers=headers)
            if img_file.status_code == 200:
                # Pick the file extension from the Content-Type header.
                img_type = img_file.headers.get('content-type')
                if img_type.split('/')[-1] == 'jpeg':
                    downPath = path + '/{}.jpg'.format(route_url_list[2])
                else:
                    downPath = path + '/{}.{}'.format(route_url_list[2], img_type.split('/')[-1])
                with open(downPath, 'wb') as f:
                    f.write(img_file.content)
                return '{}期第{}页第{}张图片下载成功,链接:{}'.format(
                    route_url_list[0], route_url_list[1], route_url_list[2], data)
            else:
                return '{}期第{}页第{}张图片下载失败,链接:{}'.format(
                    route_url_list[0], route_url_list[1], route_url_list[2], data)
        except requests.exceptions.RequestException as e:
            print(e)
            return '{}期第{}页第{}张图片下载失败,链接:{}'.format(
                route_url_list[0], route_url_list[1], route_url_list[2], data)
    else:
        # Popular-video list: write one title/link pair per entry to 热门视频.txt.
        path = path + '/{}'.format(route_url_list[0])
        os.makedirs(path, exist_ok=True)
        with open(path + '/热门视频.txt', 'w', encoding='utf-8') as f:
            for i in data:
                f.write('{}、{}'.format(i['number'], i['title']))
                f.write('\n')
                f.write(i['href'])
                f.write('\n')
        return '{}期热门视频链接保存成功。'.format(route_url)


if __name__ == '__main__':
    # Ask for the first and last issue numbers (YYYYNNN, e.g. 2020001).
    issue_start = input("请输入从哪一期开始爬取(例:2020年第1期:输入2020001)\n")
    while not issue_start.isdigit():
        print('请好好输!\n')
        issue_start = input("请输入从哪一期开始爬取(例:2020年第1期:输入2020001)\n")
    issue_end = input("请输入从哪一期结束爬取(例:2020年第60期:输入2020060)\n")
    while not issue_end.isdigit() or int(issue_end) < int(issue_start):
        print('请好好输!\n')
        issue_end = input("请输入从哪一期结束爬取(例:2020年第60期:输入2020060)\n")
    issue_start = int(issue_start)
    issue_end = int(issue_end) + 1
    starttime = datetime.datetime.now()
    print('开始下载,请稍候。。。。。。')

    # Stage 1: collect every image URL and video list, one thread per issue.
    task_list = []
    init_data = {}
    with ThreadPoolExecutor() as executor:
        for i in range(issue_start, issue_end):
            task = executor.submit(GetUrl, i)
            task_list.append(task)
        for res in as_completed(task_list):
            init_data.update(res.result())

    # Stage 2: download everything that was collected, one thread per item.
    task_list = []
    with ThreadPoolExecutor() as executor:
        for key in init_data:
            task = executor.submit(GetData, key, init_data[key])
            task_list.append(task)
        for res in as_completed(task_list):
            print(res.result())

    endtime = datetime.datetime.now()
    print('已全部下载完成!请在当前路径下查看!用时:{}秒'.format(int((endtime - starttime).total_seconds())))
    print('Enjoy it')
    print('Powered by 所向披靡\n')
    key = input('按回车键退出\n')
    while key != '':
        key = input('按回车键退出\n')
--------------------------------------------------------------------------------
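The script is interactive, but since the crawling and downloading logic lives in `GetUrl` and `GetData`, both can also be driven directly from another script. A minimal sketch, assuming `fulibuscrawler.py` sits on the import path and using an illustrative issue number:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

from fulibuscrawler import GetUrl, GetData

# Collect every image URL (and the popular-video list) for one issue, then
# download them all concurrently, mirroring the __main__ block for a single issue.
issue_data = GetUrl(2020001)
with ThreadPoolExecutor() as executor:
    tasks = [executor.submit(GetData, key, value) for key, value in issue_data.items()]
    for task in as_completed(tasks):
        print(task.result())
```

Because the interactive prompts sit under the `if __name__ == '__main__':` guard, importing the module does not trigger them.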