├── README.md
├── audio.py
├── banned.py
├── column.py
├── functions
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-37.pyc
    │   ├── database.cpython-37.pyc
    │   ├── deal_json.cpython-37.pyc
    │   ├── requests_func.cpython-37.pyc
    │   └── thread.cpython-37.pyc
    ├── database.py
    ├── deal_json.py
    ├── requests_func.py
    └── thread.py
├── game.py
├── member.py
├── micro_video.py
├── pictures.py
├── rank.py
└── video.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Bilibili网站爬虫
 2 | ![][8] ![][9] ![][10] ![][11]
 3 | 
 4 | 
 5 | 
 6 | -------------
 7 | 我胖虎今天就是要把b站爬完
 8 | --------
 9 | 
10 | 
11 | File Name | Description | Remarks
12 | ---------|----------|---------
13 | /[_video.py_][3] | 视频信息爬取 | 从av100逐一递增遍历视频信息，并录入数据库
14 | /[_rank.py_][4] | 排行榜数据爬取 | 爬取排行榜数据，可自定义爬取分类以及排行榜时间
15 | /[_pictures.py_][5] | 相簿图片爬取 | 爬取相簿图片，并下载至本地
16 | /[_member.py_][6] | 会员信息数据爬取 | 从id0开始逐一递增，爬取所有会员信息，并录入数据库，但是由于会员数量过于庞大，我试过同时开20个进程同时爬取，但是由于请求过于频繁，ip被封了20来分钟，但是思路已经摆在这了，有ip代理的话问题不大
17 | /[_audio.py_][7] | 音频爬取下载 | 爬取音频专辑id，从12000逐一递增爬取，并将音乐下载至本地
18 | /[_banned.py_][13] | 小黑屋数据爬取 | 从第1页开始逐一递增，爬取所有小黑屋数据，并录入数据库
19 | /[_column.py_][19] | 专栏文章爬取 | 遍历所有专栏分类，并爬取对应专栏下的文章数据（不包含文章内容）录入数据库
20 | /[_game.py_][20] | 游戏列表爬取 | 爬取游戏列表所有数据，录入数据库
21 | /[_micro_video.py_][21] | 小视频爬取下载 | 爬取所有小视频分类下的视频信息，将视频信息录入数据库，将视频下载至本地
22 | [_functions_][12]/[_database.py_][15] | Mysql数据库操作相关函数 | None
23 | [_functions_][12]/[_deal_json.py_][16] | Json数据处理相关函数 | None
24 | [_functions_][12]/[_requests_func.py_][17] | http请求相关函数 | None
25 | [_functions_][12]/[_thread.py_][18] | 多线程相关函数 | None
26 | 
27 | 
28 | ---------------
29 | 
30 | 
31 | [1]:https://blog.tryfang.cn
32 | [2]:https://space.bilibili.com/25216986
33 | [3]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/video.py
34 | [4]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/rank.py
35 | [5]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/pictures.py
36 | [6]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/member.py
37 | [7]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/audio.py
38 | [8]:https://img.shields.io/badge/Python-v3.7.1-brightgreen.svg
39 | [9]:https://img.shields.io/badge/requests-2.21-green.svg
40 | [10]:https://img.shields.io/badge/pymysql-0.9.3-red.svg
41 | [11]:https://img.shields.io/badge/Bilibili-%E5%B9%B2%E6%9D%AF-ff69b4.svg
42 | [12]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions
43 | [14]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/
44 | [13]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/banned.py
45 | [15]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions/database.py
46 | [16]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions/deal_json.py
47 | [17]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions/requests_func.py
48 | [18]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions/thread.py
49 | [19]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/column.py
50 | [20]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/game.py
51 | [21]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/micro_video.py
52 | 


--------------------------------------------------------------------------------
/audio.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/7 0:49
 3 | # @Author  : Nismison
 4 | # @FileName: audio.py
 5 | # @Description: bilibili音频爬取下载
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from os.path import dirname, exists
 9 | from os import mkdir
10 | from functions.deal_json import dict_get
11 | from functions.requests_func import url_get
12 | 
13 | 
14 | def audio_crawler(path='songs'):
15 |     """
16 |     音频爬取函数
17 |     :param path: 提供自定义下载路径修改
18 |     :return: None
19 |     """
20 |     # 规定基础路径
21 |     base_dir = dirname(__file__) + "/" + path + "/"
22 |     # 如果路径不存在则创建路径
23 |     if not exists(base_dir):
24 |         mkdir(base_dir)
25 |     # 从12032-20000遍历sid，生成专辑url
26 |     for sid in range(12032, 20000):
27 |         # 拼接专辑url
28 |         url = "https://www.bilibili.com/audio/music-service-c/web/song/of-menu?sid={}&pn=1&ps=100".format(sid)
29 |         res = url_get(url=url, mode="json")
30 |         data = dict_get(res, "data")
31 |         # 如果data为空，则跳过
32 |         if data is None:
33 |             continue
34 |         items = dict_get(data, "data")
35 |         # 获取专辑信息请求
36 |         info_url = "https://www.bilibili.com/audio/music-service-c/web/menu/info?sid={}".format(sid)
37 |         info_get = url_get(url=info_url, mode="json")
38 |         album_title = dict_get(info_get, "title").replace("/", '').replace("<", '').replace(">", '').replace(
39 |             "|", '').replace(":", '').replace("*", '').replace("?", '').replace("\\", '')
40 |         # 如果路径不存在则创建路径
41 |         if not exists(base_dir + album_title):
42 |             mkdir(base_dir + album_title)
43 |         # 遍历专辑下所有音乐
44 |         for item in items:
45 |             author = dict_get(item, "author")  # 歌手
46 |             title = dict_get(item, "title")  # 音乐标题
47 |             sid = dict_get(item, "id")  # 音乐id，用于拼接音乐下载url
48 |             songs_url = "https://www.bilibili.com/audio/music-service-c/web/url?sid={}".format(sid)
49 |             songs_get = url_get(url=songs_url, mode="json")
50 |             file_size = round(dict_get(songs_get, "size") / 1024 / 1024, 2)  # 音频文件大小
51 |             # 分析json中cdns数据，判断音频文件真实地址
52 |             cdns = dict_get(songs_get, "cdns")
53 |             if cdns[0] > cdns[1]:
54 |                 real_url = cdns[0]
55 |             else:
56 |                 real_url = cdns[1]
57 |             print("Downloading Audio")
58 |             song_file_name = base_dir + album_title + "/" + title + " - " + author + '.m4a'
59 |             # 如果文件已存在，则跳过
60 |             if exists(song_file_name):
61 |                 continue
62 |             # 下载音频文件
63 |             song_file_get = url_get(url=real_url, mode="content")
64 |             with open(song_file_name, "wb") as song:
65 |                 song.write(song_file_get)
66 |                 song.close()
67 |             # 显示进程信息
68 |             print("album_title: {}".format(album_title))
69 |             print("author: {}".format(author))
70 |             print("title: {}".format(title))
71 |             print("file_size: {} MB".format(file_size))
72 |             print("-" * 60)
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     audio_crawler()
77 | 


--------------------------------------------------------------------------------
/banned.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/8 15:17
 3 | # @Author  : Nismison
 4 | # @FileName: banned.py
 5 | # @Description: Bilibili小黑屋数据爬取
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from functions.requests_func import url_get
 9 | from functions.deal_json import dict_get
10 | from functions.database import Database
11 | from time import strftime, localtime
12 | 
13 | 
14 | def banned_crawler():
15 |     database = Database("localhost", "root", "", "bilibili")
16 |     pn = 1
17 |     while True:
18 |         data = {}
19 |         banned_url = "https://api.bilibili.com/x/credit/blocked/list?pn={}".format(pn)
20 |         banned_data = url_get(banned_url, mode="json")
21 |         if banned_data['code'] != 0:
22 |             print("爬取完毕")
23 |             print("-" * 60)
24 |             return
25 |         for item in dict_get(banned_data, "data"):
26 |             data["banned_uname"] = dict_get(item, "uname")
27 |             data['banned_uid'] = dict_get(item, "uid")
28 |             data['banned_reason'] = dict_get(item, "reasonTypeName")
29 |             data['banned_days'] = dict_get(item, "blockedDays")
30 |             banned_time = dict_get(item, "punishTime")
31 |             data['banned_time'] = strftime("%Y-%m-%d %H:%M:%S", localtime(banned_time))
32 |             if database.execute_sql(table_name="banned", mode="insert", keys=list(data.keys()), values=list(data.values())):
33 |                 print("用户名： {}".format(data["banned_uname"]))
34 |                 print("用户id： {}".format(data['banned_uid']))
35 |                 print("封禁类型： {}".format(data['banned_reason']))
36 |                 print("封禁时长： {}".format(data['banned_days']))
37 |                 print("封禁时间： {}".format(data['banned_time']))
38 |                 print("-" * 60)
39 |         pn += 1
40 | 
41 | 
42 | if __name__ == '__main__':
43 |     banned_crawler()
44 | 


--------------------------------------------------------------------------------
/column.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019-04-08 16:37:54
 3 | # @Author  : Nismison
 4 | # @FileName: column.py
 5 | # @Description: Bilibili专栏文章爬取
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from functions.requests_func import url_get
 9 | from functions.database import Database
10 | from functions.deal_json import dict_get
11 | from time import strftime, localtime
12 | 
13 | 
14 | 
15 | def column_crawler():
16 |     database = Database("localhost", "root", "", "bilibili")
17 |     table_name = "zhuanlan"
18 |     cid_dict = {
19 |         "动画": 2,
20 |         "游戏": 1,
21 |         "影视": 28,
22 |         "生活": 3,
23 |         "兴趣": 29,
24 |         "轻小说": 16,
25 |         "科技": 17,
26 |     }
27 |     for v in cid_dict.values():
28 |         pn = 1
29 |         while True:
30 |             column_url = "https://api.bilibili.com/x/article/recommends?cid={}&pn={}&ps=100&sort=0".format(v, pn)
31 |             column_get = url_get(column_url, mode="json")
32 |             column_data = dict_get(column_get, "data")
33 |             if len(column_data) == 0:
34 |                 print(pn)
35 |                 break
36 |             for item in column_data:
37 |                 data = {}
38 |                 author_info = dict_get(item, "author")  # 作者信息
39 |                 data['author_mid'] = author_info['mid']  # 作者id
40 |                 data['author_name'] = author_info['name']  # 作者用户名
41 |                 data['category'] = dict_get(item, "category")['name']  # 所属分类
42 |                 data['update_time'] = strftime("%Y-%m-%d %H:%M:%S", localtime(dict_get(item, 'update_time')))  # 上传时间
43 |                 data['art_id'] = dict_get(item, "id")  # 文章id，如果需要爬取文章内容可以拼接url: https://www.bilibili.com/read/cv[文章id]
44 |                 data['art_title'] = dict_get(item, "title")  # 文章标题
45 |                 data['art_words'] = dict_get(item, "words")  # 文章字数
46 |                 data['art_like'] = dict_get(item, "like")  # 文章点赞数
47 |                 data['art_reply'] = dict_get(item, "reply")  # 文章评论数
48 |                 data['art_view'] = dict_get(item, "view")  # 文章浏览数
49 |                 data['art_favorite'] = dict_get(item, "favorite")  # 文章收藏数
50 |                 data['art_coin'] = dict_get(item, "coin")  # 文章投币数
51 |                 data['art_share'] = dict_get(item, "share")  # 文章分享数
52 |                 data['art_summary'] = dict_get(item, "summary")  # 文章摘要
53 |                 data['crawl_time'] = strftime("%Y-%m-%d %H:%M:%S", localtime())  # 爬取时间
54 | 
55 |                 if database.execute_sql(table_name=table_name, select="id", key="art_id", value=data['art_id']) != 0:
56 |                     print("id:{} 重复，跳过".format(data['art_id']))
57 |                     print("-" * 60)
58 |                     # pn += 1
59 |                     continue
60 |                 if database.execute_sql(table_name=table_name, mode="insert", keys=list(
61 |                         data.keys()), values=list(data.values())):
62 |                     print("作者id: {}".format(data['author_mid']))
63 |                     print("作者用户名: {}".format(data['author_name']))
64 |                     print("所属分类: {}".format(data['category']))
65 |                     print("上传时间: {}".format(data['update_time']))
66 |                     print("文章id: {}".format(data['art_id']))
67 |                     print("文章标题: {}".format(data['art_title']))
68 |                     print("文章字数: {}".format(data['art_words']))
69 |                     print("文章点赞数: {}".format(data['art_like']))
70 |                     print("文章评论数: {}".format(data['art_reply']))
71 |                     print("文章浏览数: {}".format(data['art_view']))
72 |                     print("文章收藏数: {}".format(data['art_favorite']))
73 |                     print("文章投币数: {}".format(data['art_coin']))
74 |                     print("文章分享数: {}".format(data['art_share']))
75 |                     print("文章摘要: {}".format(data['art_summary']))
76 |                     print("爬取时间: {}".format(data['crawl_time']))
77 |                     print("-" * 60)
78 |                 else:
79 |                     print("id:{} 异常，跳过".format(data['art_id']))
80 |                     print("-" * 60)
81 |                     # pn += 1
82 |                     continue
83 |             pn += 1
84 | 
85 | if __name__ == "__main__":
86 |     column_crawler()
87 | 


--------------------------------------------------------------------------------
/functions/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Time    : 2019/4/7 3:14
3 | # @Author  : Nismison
4 | # @FileName: __init__.py
5 | # @Description: 
6 | # @Blog    ：https://blog.tryfang.cn
7 | 
8 | 


--------------------------------------------------------------------------------
/functions/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangzhenzhuo/Bilibili/0ec4380453b7013c46fbc505843470b973b67aa8/functions/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/functions/__pycache__/database.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangzhenzhuo/Bilibili/0ec4380453b7013c46fbc505843470b973b67aa8/functions/__pycache__/database.cpython-37.pyc


--------------------------------------------------------------------------------
/functions/__pycache__/deal_json.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangzhenzhuo/Bilibili/0ec4380453b7013c46fbc505843470b973b67aa8/functions/__pycache__/deal_json.cpython-37.pyc


--------------------------------------------------------------------------------
/functions/__pycache__/requests_func.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangzhenzhuo/Bilibili/0ec4380453b7013c46fbc505843470b973b67aa8/functions/__pycache__/requests_func.cpython-37.pyc


--------------------------------------------------------------------------------
/functions/__pycache__/thread.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Liangzhenzhuo/Bilibili/0ec4380453b7013c46fbc505843470b973b67aa8/functions/__pycache__/thread.cpython-37.pyc


--------------------------------------------------------------------------------
/functions/database.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/7 4:17
 3 | # @Author  : Nismison
 4 | # @FileName: database.py
 5 | # @Description: 数据库函数
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from pymysql import connect
 9 | 
10 | 
11 | class Database(object):
12 |     def __init__(self, host, username, password, db_name):
13 |         self.__connection = connect(host, username, password, db_name)
14 |         self.__cursor = self.__connection.cursor()
15 | 
16 |     def execute_sql(self, table_name, mode="search", select="*", **kwargs):
17 |         """
18 |         :param table_name: 需要执行操作的表名（str）
19 |         :param mode: 需要执行的操作（search: 查询，insert: 插入）
20 |             search - key: 需要查询的字段（str/list）
21 |             search -> value: 需要匹配的值（str）
22 |             insert - keys: 需要插入的字段名（list）
23 |             inset - values: 需要插入的数据（list）
24 |         :return: 查询到的数据 or True/False
25 |         """
26 | 
27 |         if mode == "search":
28 |             if isinstance(kwargs['key'], list) and kwargs['value'] == 'all':
29 |                 key = ''
30 |                 for i in range(len(kwargs['key'])):
31 |                     if i == len(kwargs['key']) - 1:
32 |                         key += "{}".format(kwargs['key'][i])
33 |                     else:
34 |                         key += "{}".format(kwargs['key'][i]) + ", "
35 |                 sql = "select {} from {}".format(key, table_name)
36 |                 self.__cursor.execute(sql)
37 |                 return list(self.__cursor.fetchall())
38 |             elif isinstance(kwargs['key'], str) and (isinstance(kwargs['value'], str) or isinstance(kwargs['value'], int) or isinstance(kwargs['value'], float)):
39 |                 sql = "select {} from {} where {}='{}'".format(select, table_name, kwargs['key'], kwargs['value'])
40 |                 return self.__cursor.execute(sql)
41 |             else:
42 |                 raise TypeError("The 'key' must be a list or str type and the 'value' must be a string type.")
43 | 
44 |         elif mode == "insert":
45 |             key = ''
46 |             value = ''
47 |             keys = kwargs['keys']
48 |             values = kwargs['values']
49 |             # 如果keys和values类型不是list，抛出异常
50 |             if not isinstance(keys, list) or not isinstance(values, list):
51 |                 raise TypeError("The 'keys' and 'value' must be list or number type.")
52 |             try:
53 |                 for i in range(len(keys)):
54 |                     if i == len(keys) - 1:
55 |                         key += "{}".format(keys[i])
56 |                     else:
57 |                         key += "{}".format(keys[i]) + ", "
58 |                 for i in range(len(values)):
59 |                     if i == len(values) - 1:
60 |                         value += "'{}'".format(values[i])
61 |                     else:
62 |                         value += "'{}'".format(values[i]) + ", "
63 |                 sql = "insert into {} ({}) values ({})".format(table_name, key, value)
64 |                 # 游标对象执行操作
65 |                 self.__cursor.execute(sql)
66 |                 # connection对象提交操作
67 |                 self.__connection.commit()
68 |                 return True
69 |             except Exception as e:
70 |                 print("Exception:", e)
71 |                 return False
72 | 
73 |     def get_cursor(self):
74 |         """
75 |         :return: Cursor Object
76 |         """
77 |         return self.__cursor
78 | 
79 |     def get_connection(self):
80 |         """
81 |         :return: Connection Object
82 |         """
83 |         return self.__connection
84 | 
85 |     def close(self):
86 |         """
87 |         close connection and cursor
88 |         """
89 |         self.__cursor.close()
90 |         self.__connection.close()
91 | 


--------------------------------------------------------------------------------
/functions/deal_json.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/7 3:15
 3 | # @Author  : Nismison
 4 | # @FileName: deal_json.py
 5 | # @Description: json处理函数
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | 
 9 | def dict_get(dict_, objkey):
10 |     """
11 |     从嵌套的字典中拿到需要的值
12 |     :param dict_: 要遍历的字典
13 |     :param objkey: 目标key
14 |     :return: 目标key对应的value
15 |     """
16 |     if isinstance(dict_, dict):
17 |         for key, value in dict_.items():
18 |             if key == objkey:
19 |                 return value
20 |             else:
21 |                 # 如果value是dict类型，则进行迭代
22 |                 if isinstance(value, dict):
23 |                     ret = dict_get(value, objkey)
24 |                     if ret is not None:
25 |                         return ret
26 |                 # 如果value是list类型，则依次进行迭代
27 |                 elif isinstance(value, list):
28 |                     for i in range(len(value)):
29 |                         ret = dict_get(value[i], objkey)
30 |                         if ret is not None:
31 |                             return ret
32 |         # 如果找不到指定的key，返回None
33 |         return None
34 |     else:
35 |         return None
36 | 


--------------------------------------------------------------------------------
/functions/requests_func.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/7 3:17
 3 | # @Author  : Nismison
 4 | # @FileName: requests_func.py
 5 | # @Description: requests函数
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from requests import get
 9 | from fake_useragent import UserAgent
10 | 
11 | 
12 | def url_get(url, mode=None, timeout=20):
13 |     # 重试次数
14 |     retry_count = 0
15 |     try:
16 |         if mode is None:
17 |             return get(url=url, headers={"User-Agent": UserAgent().random}, timeout=timeout)
18 |         elif mode == "json":
19 |             return get(url=url, headers={"User-Agent": UserAgent().random}, timeout=timeout).json()
20 |         elif mode == "content":
21 |                 return get(url=url, headers={"User-Agent": UserAgent().random}, timeout=timeout).content
22 |         elif mode == "text":
23 |                 return get(url=url, headers={"User-Agent": UserAgent().random}, timeout=timeout).text
24 |         elif mode == "code":
25 |                     return get(url=url, headers={"User-Agent": UserAgent().random}, timeout=timeout).status_code
26 |         else:
27 |             raise ValueError("Mode error, mode must be one of None/json/content/text/code")
28 |     except Exception:
29 |         if retry_count > 3:
30 |             raise Exception("Maximum retries")
31 |         else:
32 |             url_get(url=url, mode=mode)
33 |             retry_count += 1
34 | 


--------------------------------------------------------------------------------
/functions/thread.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/8 0:32
 3 | # @Author  : Nismison
 4 | # @FileName: thread.py
 5 | # @Description: 
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from threading import Thread
 9 | 
10 | 
11 | def thread_create(thread_num, method):
12 |     """
13 |     批量启动线程
14 |     :param thread_num: 线程数
15 |     :param method: 线程调用方法
16 |     """
17 |     # 线程池
18 |     thread_pool = []
19 |     # 批量创建线程放到线程池中
20 |     for i in range(thread_num):
21 |         th = Thread(target=method, args=(4415 + i * 2000, ))
22 |         thread_pool.append(th)
23 |     # 从线程池中批量启动线程
24 |     for i in range(len(thread_pool)):
25 |         thread_pool[i].start()
26 |         print("线程 {} 已启动".format(i + 1))
27 |     # 等待子线程执行结束
28 |     for th in thread_pool:
29 |         Thread.join(th)
30 | 


--------------------------------------------------------------------------------
/game.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/8 20:19
 3 | # @Author  : Nismison
 4 | # @FileName: game.py
 5 | # @Description: Bilibili游戏列表爬取
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from functions.requests_func import url_get
 9 | from functions.database import Database
10 | from functions.deal_json import dict_get
11 | 
12 | 
13 | def game_crawler():
14 |     database = Database("localhost", "root", "", "bilibili")
15 |     table_name = "game_list"
16 |     game_list_url = "https://game.bilibili.com/gamelist.json"
17 |     game_list_json = url_get(game_list_url, "json")
18 |     for game in game_list_json:
19 |         game_info = {}
20 |         game_info['name'] = dict_get(game, "title")
21 |         game_info['summary'] = dict_get(game, "summary")
22 |         game_info['website'] = dict_get(game, "website")
23 | 
24 |         if database.execute_sql(table_name=table_name, key="name", value=game_info['name']) != 0:
25 |             print("{} 重复，跳过".format(game_info['name']))
26 |             print("-" * 60)
27 | 
28 |         if database.execute_sql(table_name=table_name, mode="insert", keys=list(game_info.keys()), values=list(game_info.values())):
29 |             print("游戏名: {}".format(game_info['name']))
30 |             print("游戏介绍: {}".format(game_info['summary']))
31 |             print("游戏官网: {}".format(game_info['website']))
32 |             print("-" * 60)
33 | 
34 | 
35 | if __name__ == '__main__':
36 |     game_crawler()
37 | 


--------------------------------------------------------------------------------
/member.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Time    : 2019/4/7 21:33
  3 | # @Author  : Nismison
  4 | # @FileName: member.py
  5 | # @Description: Bilibili会员信息爬取
  6 | # @Blog    ：https://blog.tryfang.cn
  7 | 
  8 | from functions.requests_func import url_get
  9 | from functions.deal_json import dict_get
 10 | from functions.database import Database
 11 | from functions.thread import thread_create
 12 | 
 13 | 
 14 | def member_crawler(mid):
 15 |     database = Database("localhost", "root", "", "bilibili")
 16 |     while True:
 17 |         follow_url = "https://api.bilibili.com/x/relation/stat?vmid={}".format(mid)
 18 |         view_url = "https://api.bilibili.com/x/space/upstat?mid={}".format(mid)
 19 |         info_url = "https://api.bilibili.com/x/space/acc/info?mid={}".format(mid)
 20 |         tag_url = "https://space.bilibili.com/ajax/member/getTags?mids={}".format(mid)
 21 |         charging_url = "https://elec.bilibili.com/api/query.rank.do?mid={}".format(mid)
 22 |         upload_data_url = "https://api.bilibili.com/x/space/navnum?mid={}".format(mid)
 23 |         try:
 24 |             member_info = url_get(info_url, mode='json')
 25 |             username = dict_get(member_info, "name")
 26 |             if username is None:
 27 |                 print("该会员不存在, 跳过 {}".format(mid))
 28 |                 print("-" * 60)
 29 |                 mid += 1
 30 |                 continue
 31 |             level = dict_get(member_info, "level")
 32 |             member_id = dict_get(member_info, "mid")
 33 |             sex = dict_get(member_info, "sex")
 34 |             coins = dict_get(member_info, "coins")
 35 |             official_data = dict_get(member_info, "official")
 36 |             follow_data = url_get(follow_url, mode="json")
 37 |             following = dict_get(follow_data, 'following')
 38 |             follower = dict_get(follow_data, 'follower')
 39 |             view = dict_get(url_get(view_url, mode="json"), "view")
 40 | 
 41 |             if official_data['role'] == 1:
 42 |                 official = official_data['title']
 43 |             else:
 44 |                 official = "暂无认证"
 45 |             birthday = dict_get(member_info, "birthday")
 46 |             sign = dict_get(member_info, "sign")
 47 |             vip = dict_get(member_info, "status")
 48 |             if vip == 1:
 49 |                 vip_status = "是"
 50 |             else:
 51 |                 vip_status = "否"
 52 |             tag = ''
 53 |             for x in dict_get(url_get(tag_url, mode="json"), "tags"):
 54 |                 tag += x + ' '
 55 |             charging = dict_get(url_get(charging_url, mode="json"), "total_count")
 56 |             video_upload = dict_get(url_get(upload_data_url, mode="json"), "video")
 57 | 
 58 |             if database.execute_sql(table_name="member", mode="search", key="member_id", value=member_id) != 0:
 59 |                 print("该会员已存在, 跳过 {}".format(member_id))
 60 |                 print("-" * 60)
 61 |                 mid += 1
 62 |                 continue
 63 | 
 64 |             insert_data = {
 65 |                 "member_id": member_id,
 66 |                 "username": username,
 67 |                 "sex": sex,
 68 |                 "birthday": birthday,
 69 |                 "level": level,
 70 |                 "coins": coins,
 71 |                 "sign": sign,
 72 |                 "charging": charging,
 73 |                 "video_upload": video_upload,
 74 |                 "tag": tag,
 75 |                 "vip_status": vip_status,
 76 |                 "official": official,
 77 |                 "following": following,
 78 |                 "follower": follower,
 79 |                 "view": view,
 80 |             }
 81 | 
 82 |             if database.execute_sql(mode="insert", table_name="member", keys=list(
 83 |                     insert_data.keys()), values=list(insert_data.values())):
 84 |                 print("用户id: {}".format(member_id))
 85 |                 print("用户名: {}".format(username))
 86 |                 print("性别: {}".format(sex))
 87 |                 print("生日: {}".format(birthday))
 88 |                 print("等级: {}".format(level))
 89 |                 print("B币: {}".format(coins))
 90 |                 print("个人签名: {}".format(sign))
 91 |                 print("充电人数: {}".format(charging))
 92 |                 print("视频数量: {}".format(video_upload))
 93 |                 print("标签: {}".format(tag))
 94 |                 print("B站大会员: {}".format(vip_status))
 95 |                 print("Bilibili认证: {}".format(official))
 96 |                 print("关注数: {}".format(following))
 97 |                 print("粉丝数: {}".format(follower))
 98 |                 print("播放量: {}".format(view))
 99 |                 print("-" * 60)
100 |             mid += 1
101 |         except Exception as e:
102 |             print("错误, 跳过 mid={}".format(mid))
103 |             print(e)
104 |             print("-" * 60)
105 |             mid += 1
106 |             continue
107 | 
108 | 
109 | if __name__ == '__main__':
110 |     member_crawler(mid=0)
111 | 


--------------------------------------------------------------------------------
/micro_video.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/8 20:46
 3 | # @Author  : Nismison
 4 | # @FileName: micro_video.py
 5 | # @Description: Bilibili小视频爬取
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from functions.requests_func import url_get
 9 | from functions.deal_json import dict_get
10 | from functions.database import Database
11 | 
12 | 
13 | def micro_video_crawler(order='', page_num=1):
14 |     """
15 |     :param order: 排序方式，new为按照视频上传时间排序，默认为系统推荐
16 |     """
17 |     database = Database("localhost", "root", "", "bilibili")
18 |     table_name = "micro_video"
19 |     classification = []
20 |     # 获取所有分类
21 |     classification_url = "https://api.vc.bilibili.com/clip/v1/video/zonelist?page=total"
22 |     classification_json = url_get(classification_url, "json")
23 |     classification_data = dict_get(classification_json, "data")
24 |     for i in classification_data:
25 |         if classification_data[i] == '':
26 |             continue
27 |         for j in classification_data[i]['tags']:
28 |             classification.append(j)
29 | 
30 |     for tag in classification:
31 |         ps = 50  # page_size最大50
32 |         pn = page_num  # 开始页，调用时可自定义
33 |         while True:
34 |             next_offset = (pn - 1) * ps
35 |             micro_video_url = "https://api.vc.bilibili.com/clip/v1/video/search?" \
36 |                               "page_size={}&need_playurl=0&next_offset={}&order={}" \
37 |                               "&tag={}".format(ps, next_offset, order, tag)
38 |             micro_video_json = url_get(micro_video_url, "json")
39 |             items = dict_get(micro_video_json, "items")
40 |             if len(items) == 0:
41 |                 break
42 |             for item in items:
43 |                 video_info = {"tag": tag}
44 |                 video_info['title'] = dict_get(item, "description").replace("\n", "")  # 视频标题
45 |                 video_info['video_id'] = dict_get(item, "id")  # 视频id
46 |                 video_info['reply'] = dict_get(item, "reply")  # 视频评论数
47 |                 video_info['upload_time'] = dict_get(item, "upload_time")  # 视频上传时间
48 |                 video_info['video_size'] = round(float(dict_get(item, "video_size")) / 1024**2, 2)  # 视频文件大小，单位mb（float）
49 |                 video_info['video_time'] = dict_get(item, "video_time")  # 视频时长，单位s
50 |                 video_info['video_playurl'] = dict_get(item, "video_playurl")  # 视频播放地址
51 |                 video_info['watched_num'] = dict_get(item, "watched_num")  # 视频播放数
52 |                 video_info['name'] = dict_get(item, "name")  # 上传者用户名
53 |                 video_info['uid'] = dict_get(item, "uid")  # 上传者uid
54 | 
55 |                 # 如果需要下载视频，请把下面注释去掉
56 |                 # video_content = url_get(video_info['video_playurl'], "content")  # 获取视频内容
57 |                 # video_file_name = video_info['title'][:30].replace("/", '').replace("<", '').replace(">", '').replace(
58 |                 #     "|", '').replace(":", '').replace("*", '').replace("?", '').replace("\\", '') + ".mp4"  # 拼接视频文件名
59 |                 # # 保存视频
60 |                 # with open(video_file_name, "wb") as video_file:
61 |                 #     video_file.write(video_content)
62 |                 #     video_file.close()
63 | 
64 |                 # 如果不需要插入数据库，请把下面部分注释掉
65 |                 if database.execute_sql(table_name=table_name, key="video_id", value=video_info['video_id']) != 0:
66 |                     print("视频id：{} 重复，跳过".format(video_info['video_id']))
67 |                     print("-" * 60)
68 |                     continue
69 |                 if database.execute_sql(table_name=table_name, mode="insert",
70 |                                         keys=list(video_info.keys()), values=list(video_info.values())):
71 |                     print("视频标题: {}".format(video_info['title']))
72 |                     print("视频id: {}".format(video_info['video_id']))
73 |                     print("视频评论数: {}".format(video_info['reply']))
74 |                     print("视频上传时间: {}".format(video_info['upload_time']))
75 |                     print("视频大小（mb）: {}".format(video_info['video_size']))
76 |                     print("视频时长: {}".format(video_info['video_time']))
77 |                     print("视频播放地址: {}".format(video_info['video_playurl']))
78 |                     print("视频观看数: {}".format(video_info['watched_num']))
79 |                     print("上传者用户名: {}".format(video_info['name']))
80 |                     print("上传者id: {}".format(video_info['uid']))
81 |                     print("-" * 60)
82 |             pn += 1
83 | 
84 | 
85 | if __name__ == '__main__':
86 |     micro_video_crawler()
87 | 


--------------------------------------------------------------------------------
/pictures.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/6 19:42
 3 | # @Author  : Nismison
 4 | # @FileName: pictures.py
 5 | # @Description: Bilibili相簿爬取
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from os.path import dirname, exists
 9 | from os import mkdir
10 | from functions.requests_func import url_get
11 | 
12 | 
13 | def dict_get(dict_, objkey):
14 |     """
15 |     从嵌套的字典中拿到需要的值
16 |     :param dict_: 要遍历的字典
17 |     :param objkey: 目标key
18 |     :return: 目标key对应的value
19 |     """
20 |     for key, value in dict_.items():
21 |         if key == objkey:
22 |             return value
23 |         else:
24 |             # 如果value是dict类型，则迭代
25 |             if isinstance(value, dict):
26 |                 ret = dict_get(value, objkey)
27 |                 if ret is not None:
28 |                     return ret
29 |             # 如果value是list类型，则取第0个进行迭代
30 |             elif isinstance(value, list):
31 |                 ret = dict_get(value[0], objkey)
32 |                 if ret is not None:
33 |                     return ret
34 |     # 如果找不到指定的key，返回None 
35 |     return None
36 | 
37 | 
38 | def crawler(type_, sort, path='save_picture', page_num=0):
39 |     """
40 |     :param type_: 分类 --> cos or sifu
41 |     :param sort: 排序 --> hot or new
42 |     :param path: 路径（当前目录下）
43 |     :param page_num: 开始页，默认0页开始
44 |     """
45 |     if path != '' and not exists(path):
46 |         mkdir(path)
47 |     base_dir = dirname(__file__) + "/" + path + "/"
48 |     url = "https://api.vc.bilibili.com/link_draw/v2/Photo/list?category={}&type={}&page_num={}&page_size=20".format(
49 |         type_, sort, page_num)
50 |     res = url_get(url=url, mode="json")
51 |     items = dict_get(res, "items")
52 |     if len(items) == 0:
53 |         print("Current page have no any picture, Exit mission!")
54 |         return
55 |     for i in items:
56 |         title = dict_get(i, "title")  # 相簿标题
57 |         up = dict_get(i, "name")  # up主
58 |         directory_name = title.replace("/", '').replace("<", '').replace(">", '').replace(
59 |             "|", '').replace(":", '').replace("*", '').replace("?", '').replace("\\", '') + "-" + up
60 |         if not exists(path + "/" + directory_name):
61 |             mkdir(path + "/" + directory_name)
62 |         picture_list = []  # 存放图片地址
63 |         for picture in dict_get(i, "pictures"):
64 |             picture_list.append(picture['img_src'])
65 |         print("Downloading Pictures")
66 |         for pic in picture_list:
67 |             pic_name = pic.split("/")[-1]
68 |             full_pic_path = base_dir + directory_name + "/" + pic_name
69 |             if not exists(full_pic_path):
70 |                 pic_get = url_get(url=pic, mode="content")
71 |                 with open(full_pic_path, "wb") as pic_file:
72 |                     pic_file.write(pic_get)
73 |             else:
74 |                 continue
75 |         print("current page: {}".format(page_num + 1))
76 |         print("title: {}".format(title))
77 |         print("up: {}".format(up))
78 |         print("picture: {}".format(len(picture_list)))
79 |         print("-" * 60)
80 |     crawler(type_=type_, sort=sort, path=path, page_num=page_num + 1)
81 | 
82 | 
83 | if __name__ == '__main__':
84 |     crawler(type_="sifu", sort="hot")
85 | 


--------------------------------------------------------------------------------
/rank.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019-04-07 18:44:09
 3 | # @Author  : Nismison
 4 | # @FileName: rank.py
 5 | # @Description: Bilibili排行榜爬取
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from functions.requests_func import url_get
 9 | from functions.deal_json import dict_get
10 | from os.path import exists
11 | from os import mkdir
12 | 
13 | 
14 | def rank_crawler():
15 |     # 保存目录
16 |     save_path = "rank"
17 |     # 如果目录不存在则创建目录
18 |     if not exists(save_path):
19 |         mkdir(save_path)
20 |     # rid字典
21 |     rid_dict = {
22 |         "全站": 0,
23 |         "动画": 1,
24 |         "国创相关": 168,
25 |         "音乐": 3,
26 |         "舞蹈": 129,
27 |         "游戏": 4,
28 |         "科技": 36,
29 |         "数码": 188,
30 |         "生活": 160,
31 |         "鬼畜": 119,
32 |         "时尚": 155,
33 |         "娱乐": 5,
34 |         "影视": 181,
35 |     }
36 |     # 排行时间字典
37 |     day_dict = {
38 |         "日排行": 1,
39 |         "三日排行": 3,
40 |         "周排行": 7,
41 |         "月排行": 30,
42 |     }
43 |     # 遍历rid字典
44 |     for k, v in rid_dict.items():
45 |         rid = v
46 |         # 遍历排行时间字典
47 |         for k2, v2 in day_dict.items():
48 |             day = v2
49 |             # 拼接url
50 |             url = "https://api.bilibili.com/x/web-interface/ranking?rid={}&day={}".format(rid, day)
51 |             res = url_get(url=url, mode="json")
52 |             rank_list = dict_get(res, "list")
53 |             for i in range(len(rank_list)):
54 |                 aid = dict_get(rank_list[i], "aid")  # 视频id
55 |                 author = dict_get(rank_list[i], "author")  # up主
56 |                 coins = dict_get(rank_list[i], "coins")  # 投币数
57 |                 play = dict_get(rank_list[i], "play")  # 播放数
58 |                 pts = dict_get(rank_list[i], "pts")  # 综合得分
59 |                 title = dict_get(rank_list[i], "title")  # 视频标题
60 |                 video_review = dict_get(rank_list[i], "video_review")  # 视频弹幕数（？）
61 |                 no_reprint = dict_get(rank_list[i], "no_reprint")
62 |                 if no_reprint == 1:  # 判断是否原创
63 |                     reprint = "原创"
64 |                 else:
65 |                     reprint = "转载"
66 | 
67 |                 # 将数据保存到txt文件中，也可以导入functions.database包将数据保存到数据库中
68 |                 with open("{}/Bilibili-{}-{}.txt".format(save_path, k, k2), "a+", encoding="utf-8") as data_file:
69 |                     data_file.write("排名: {}\n".format(i + 1))
70 |                     data_file.write("视频id: {}\n".format(aid))
71 |                     data_file.write("up主: {}\n".format(author))
72 |                     data_file.write("投币数: {}\n".format(coins))
73 |                     data_file.write("播放数: {}\n".format(play))
74 |                     data_file.write("综合得分: {}\n".format(pts))
75 |                     data_file.write("视频标题: {}\n".format(title))
76 |                     data_file.write("视频弹幕数: {}\n".format(video_review))
77 |                     data_file.write("是否原创: {}\n".format(reprint))
78 |                     data_file.write("-" * 60 + "\n")
79 |                     data_file.close()
80 | 
81 |                 # 打印进程显示
82 |                 print("排名: {}".format(i + 1))
83 |                 print("视频id: {}".format(aid))
84 |                 print("up主: {}".format(author))
85 |                 print("投币数: {}".format(coins))
86 |                 print("播放数: {}".format(play))
87 |                 print("综合得分: {}".format(pts))
88 |                 print("视频标题: {}".format(title))
89 |                 print("视频弹幕数: {}".format(video_review))
90 |                 print("是否原创: {}".format(reprint))
91 |                 print("-" * 60)
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     rank_crawler()
96 | 


--------------------------------------------------------------------------------
/video.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Time    : 2019/4/7 3:12
 3 | # @Author  : Nismison
 4 | # @FileName: video.py
 5 | # @Description: Bilibili视频信息爬取
 6 | # @Blog    ：https://blog.tryfang.cn
 7 | 
 8 | from functions.requests_func import url_get
 9 | from functions.deal_json import dict_get
10 | from functions.database import Database
11 | from time import strftime, gmtime
12 | 
13 | 
14 | def crawler(av):
15 |     database = Database(host="localhost", username="root", password="", db_name="bilibili")
16 |     for av_num in range(av, 48544470):
17 |         url = "https://api.bilibili.com/x/web-interface/view?aid={}".format(av_num)
18 |         get_json = url_get(url=url, mode="json")
19 |         if dict_get(get_json, "code") != 0:
20 |             print('错误！没有此视频！av:{}'.format(av_num))
21 |             print('-' * 60)
22 |             continue
23 |         data = {}
24 |         data['video_av'] = str(av_num)
25 |         data['video_up'] = dict_get(get_json, "name")
26 |         data['video_title'] = dict_get(get_json, "title")
27 |         data['video_classification'] = dict_get(get_json, "tname")
28 |         data['video_view'] = dict_get(get_json, "view")
29 |         data['video_share'] = dict_get(get_json, "share")
30 |         data['video_like'] = dict_get(get_json, "like")
31 |         data['video_favorite'] = dict_get(get_json, "favorite")
32 |         data['video_coin'] = dict_get(get_json, "coin")
33 |         data['video_update'] = strftime("%Y-%m-%d %H:%M:%S", gmtime(dict_get(get_json, "ctime")))
34 |         data['video_reply'] = dict_get(get_json, "reply")
35 |         data['video_danmaku'] = dict_get(get_json, "danmaku")
36 |         video_reprint = dict_get(get_json, "no_reprint")
37 |         if video_reprint == 0:
38 |             data['video_reprint'] = "转载" 
39 |         else:
40 |             data['video_reprint'] = "原创"
41 | 
42 |         db_select = database.execute_sql(table_name="video", mode="search", key="video_av", value=data['video_av'])
43 |         if db_select != 0:
44 |             print('错误！此视频已存在！av:{}'.format(av_num))
45 |             print('-' * 60)
46 |         else:
47 |             if database.execute_sql(table_name="video", mode="insert", keys=list(data.keys()),
48 |                                     values=list(data.values())):
49 |                 print("视频av号: {}".format(data['video_av']))
50 |                 print("作者: {}".format(data['video_up']))
51 |                 print("标题: {}".format(data['video_title']))
52 |                 print("视频分类: {}".format(data['video_classification']))
53 |                 print("观看数: {}".format(data['video_view']))
54 |                 print("分享数: {}".format(data['video_share']))
55 |                 print("点赞数: {}".format(data['video_like']))
56 |                 print("收藏数: {}".format(data['video_favorite']))
57 |                 print("投币数: {}".format(data['video_coin']))
58 |                 print("上传时间: {}".format(data['video_update']))
59 |                 print("评论数: {}".format(data['video_reply']))
60 |                 print("弹幕数: {}".format(data['video_danmaku']))
61 |                 print("性质: {}".format(data['video_reprint']))
62 |                 print("-" * 60)
63 | 
64 | 
65 | if __name__ == '__main__':
66 |     crawler(av=6000)
67 | 


--------------------------------------------------------------------------------