├── README.md ├── audio.py ├── banned.py ├── column.py ├── functions ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── database.cpython-37.pyc │ ├── deal_json.cpython-37.pyc │ ├── requests_func.cpython-37.pyc │ └── thread.cpython-37.pyc ├── database.py ├── deal_json.py ├── requests_func.py └── thread.py ├── game.py ├── member.py ├── micro_video.py ├── pictures.py ├── rank.py └── video.py /README.md: -------------------------------------------------------------------------------- 1 | # Bilibili网站爬虫 2 | ![][8] ![][9] ![][10] ![][11] 3 | 4 | 5 | 6 | ------------- 7 | 我胖虎今天就是要把b站爬完 8 | -------- 9 | 10 | 11 | File Name | Description | Remarks 12 | ---------|----------|--------- 13 | /[_video.py_][3] | 视频信息爬取 | 从av100逐一递增遍历视频信息,并录入数据库 14 | /[_rank.py_][4] | 排行榜数据爬取 | 爬取排行榜数据,可自定义爬取分类以及排行榜时间 15 | /[_pictures.py_][5] | 相簿图片爬取 | 爬取相簿图片,并下载至本地 16 | /[_member.py_][6] | 会员信息数据爬取 | 从id0开始逐一递增,爬取所有会员信息,并录入数据库,但是由于会员数量过于庞大,我试过同时开20个进程同时爬取,但是由于请求过于频繁,ip被封了20来分钟,但是思路已经摆在这了,有ip代理的话问题不大 17 | /[_audio.py_][7] | 音频爬取下载 | 爬取音频专辑id,从12000逐一递增爬取,并将音乐下载至本地 18 | /[_banned.py_][13] | 小黑屋数据爬取 | 从第1页开始逐一递增,爬取所有小黑屋数据,并录入数据库 19 | /[_column.py_][19] | 专栏文章爬取 | 遍历所有专栏分类,并爬取对应专栏下的文章数据(不包含文章内容)录入数据库 20 | /[_game.py_][20] | 游戏列表爬取 | 爬取游戏列表所有数据,录入数据库 21 | /[_micro_video.py_][21] | 小视频爬取下载 | 爬取所有小视频分类下的视频信息,将视频信息录入数据库,将视频下载至本地 22 | [_functions_][12]/[_database.py_][15] | Mysql数据库操作相关函数 | None 23 | [_functions_][12]/[_deal_json.py_][16] | Json数据处理相关函数 | None 24 | [_functions_][12]/[_requests_func.py_][17] | http请求相关函数 | None 25 | [_functions_][12]/[_thread.py_][18] | 多线程相关函数 | None 26 | 27 | 28 | --------------- 29 | 30 | 31 | [1]:https://blog.tryfang.cn 32 | [2]:https://space.bilibili.com/25216986 33 | [3]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/video.py 34 | [4]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/rank.py 35 | [5]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/pictures.py 36 | [6]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/member.py 
37 | [7]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/audio.py 38 | [8]:https://img.shields.io/badge/Python-v3.7.1-brightgreen.svg 39 | [9]:https://img.shields.io/badge/requests-2.21-green.svg 40 | [10]:https://img.shields.io/badge/pymysql-0.9.3-red.svg 41 | [11]:https://img.shields.io/badge/Bilibili-%E5%B9%B2%E6%9D%AF-ff69b4.svg 42 | [12]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions 43 | [14]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/ 44 | [13]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/banned.py 45 | [15]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions/database.py 46 | [16]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions/deal_json.py 47 | [17]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions/requests_func.py 48 | [18]:https://github.com/Liangzhenzhuo/Bilibili/tree/master/functions/thread.py 49 | [19]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/column.py 50 | [20]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/game.py 51 | [21]:https://github.com/Liangzhenzhuo/Bilibili/blob/master/micro_video.py 52 | -------------------------------------------------------------------------------- /audio.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019/4/7 0:49 3 | # @Author : Nismison 4 | # @FileName: audio.py 5 | # @Description: bilibili音频爬取下载 6 | # @Blog :https://blog.tryfang.cn 7 | 8 | from os.path import dirname, exists 9 | from os import mkdir 10 | from functions.deal_json import dict_get 11 | from functions.requests_func import url_get 12 | 13 | 14 | def audio_crawler(path='songs'): 15 | """ 16 | 音频爬取函数 17 | :param path: 提供自定义下载路径修改 18 | :return: None 19 | """ 20 | # 规定基础路径 21 | base_dir = dirname(__file__) + "/" + path + "/" 22 | # 如果路径不存在则创建路径 23 | if not exists(base_dir): 24 | mkdir(base_dir) 25 | # 从12032-20000遍历sid,生成专辑url 26 | for sid in range(12032, 20000): 27 | # 拼接专辑url 
28 | url = "https://www.bilibili.com/audio/music-service-c/web/song/of-menu?sid={}&pn=1&ps=100".format(sid) 29 | res = url_get(url=url, mode="json") 30 | data = dict_get(res, "data") 31 | # 如果data为空,则跳过 32 | if data is None: 33 | continue 34 | items = dict_get(data, "data") 35 | # 获取专辑信息请求 36 | info_url = "https://www.bilibili.com/audio/music-service-c/web/menu/info?sid={}".format(sid) 37 | info_get = url_get(url=info_url, mode="json") 38 | album_title = dict_get(info_get, "title").replace("/", '').replace("<", '').replace(">", '').replace( 39 | "|", '').replace(":", '').replace("*", '').replace("?", '').replace("\\", '') 40 | # 如果路径不存在则创建路径 41 | if not exists(base_dir + album_title): 42 | mkdir(base_dir + album_title) 43 | # 遍历专辑下所有音乐 44 | for item in items: 45 | author = dict_get(item, "author") # 歌手 46 | title = dict_get(item, "title") # 音乐标题 47 | sid = dict_get(item, "id") # 音乐id,用于拼接音乐下载url 48 | songs_url = "https://www.bilibili.com/audio/music-service-c/web/url?sid={}".format(sid) 49 | songs_get = url_get(url=songs_url, mode="json") 50 | file_size = round(dict_get(songs_get, "size") / 1024 / 1024, 2) # 音频文件大小 51 | # 分析json中cdns数据,判断音频文件真实地址 52 | cdns = dict_get(songs_get, "cdns") 53 | if cdns[0] > cdns[1]: 54 | real_url = cdns[0] 55 | else: 56 | real_url = cdns[1] 57 | print("Downloading Audio") 58 | song_file_name = base_dir + album_title + "/" + title + " - " + author + '.m4a' 59 | # 如果文件已存在,则跳过 60 | if exists(song_file_name): 61 | continue 62 | # 下载音频文件 63 | song_file_get = url_get(url=real_url, mode="content") 64 | with open(song_file_name, "wb") as song: 65 | song.write(song_file_get) 66 | song.close() 67 | # 显示进程信息 68 | print("album_title: {}".format(album_title)) 69 | print("author: {}".format(author)) 70 | print("title: {}".format(title)) 71 | print("file_size: {} MB".format(file_size)) 72 | print("-" * 60) 73 | 74 | 75 | if __name__ == '__main__': 76 | audio_crawler() 77 | 
# -*- coding: utf-8 -*-
# @Time    : 2019/4/8 15:17
# @Author  : Nismison
# @FileName: banned.py
# @Description: crawler for Bilibili's blocked-users ("small dark room") list
# @Blog    : https://blog.tryfang.cn

from functions.requests_func import url_get
from functions.deal_json import dict_get
from functions.database import Database
from time import strftime, localtime


def banned_crawler():
    """Walk the blocked-user list page by page and insert each entry into MySQL."""
    db = Database("localhost", "root", "", "bilibili")
    page = 1
    while True:
        resp = url_get("https://api.bilibili.com/x/credit/blocked/list?pn={}".format(page), mode="json")
        # A non-zero code means we ran past the last page - stop.
        if resp['code'] != 0:
            print("爬取完毕")
            print("-" * 60)
            return
        record = {}
        for entry in dict_get(resp, "data"):
            record["banned_uname"] = dict_get(entry, "uname")
            record['banned_uid'] = dict_get(entry, "uid")
            record['banned_reason'] = dict_get(entry, "reasonTypeName")
            record['banned_days'] = dict_get(entry, "blockedDays")
            punish_ts = dict_get(entry, "punishTime")
            record['banned_time'] = strftime("%Y-%m-%d %H:%M:%S", localtime(punish_ts))
            inserted = db.execute_sql(table_name="banned", mode="insert",
                                      keys=list(record.keys()),
                                      values=list(record.values()))
            if inserted:
                print("用户名: {}".format(record["banned_uname"]))
                print("用户id: {}".format(record['banned_uid']))
                print("封禁类型: {}".format(record['banned_reason']))
                print("封禁时长: {}".format(record['banned_days']))
                print("封禁时间: {}".format(record['banned_time']))
                print("-" * 60)
        page += 1


if __name__ == '__main__':
    banned_crawler()
# -*- coding: utf-8 -*-
# @Time    : 2019-04-08 16:37:54
# @Author  : Nismison
# @FileName: column.py
# @Description: Bilibili column-article metadata crawler
# @Blog    : https://blog.tryfang.cn

from functions.requests_func import url_get
from functions.database import Database
from functions.deal_json import dict_get
from time import strftime, localtime


def column_crawler():
    """
    Iterate every column category and insert each article's metadata
    (not the article body) into the `zhuanlan` table.
    """
    database = Database("localhost", "root", "", "bilibili")
    table_name = "zhuanlan"
    # Category name -> cid used by the recommends API.
    cid_dict = {
        "动画": 2,
        "游戏": 1,
        "影视": 28,
        "生活": 3,
        "兴趣": 29,
        "轻小说": 16,
        "科技": 17,
    }
    for cid in cid_dict.values():
        pn = 1
        while True:
            column_url = ("https://api.bilibili.com/x/article/recommends"
                          "?cid={}&pn={}&ps=100&sort=0".format(cid, pn))
            column_get = url_get(column_url, mode="json")
            column_data = dict_get(column_get, "data")
            # An empty page means this category is exhausted.
            if len(column_data) == 0:
                break
            for item in column_data:
                data = {}
                author_info = dict_get(item, "author")     # author sub-object
                data['author_mid'] = author_info['mid']    # author id
                data['author_name'] = author_info['name']  # author username
                data['category'] = dict_get(item, "category")['name']  # category name
                data['update_time'] = strftime("%Y-%m-%d %H:%M:%S",
                                               localtime(dict_get(item, 'update_time')))
                # The article body can be fetched from
                # https://www.bilibili.com/read/cv<art_id> if ever needed.
                data['art_id'] = dict_get(item, "id")
                data['art_title'] = dict_get(item, "title")
                data['art_words'] = dict_get(item, "words")
                data['art_like'] = dict_get(item, "like")
                data['art_reply'] = dict_get(item, "reply")
                data['art_view'] = dict_get(item, "view")
                data['art_favorite'] = dict_get(item, "favorite")
                data['art_coin'] = dict_get(item, "coin")
                data['art_share'] = dict_get(item, "share")
                data['art_summary'] = dict_get(item, "summary")
                data['crawl_time'] = strftime("%Y-%m-%d %H:%M:%S", localtime())

                # Deduplicate on art_id before inserting.
                if database.execute_sql(table_name=table_name, select="id",
                                        key="art_id", value=data['art_id']) != 0:
                    print("id:{} 重复,跳过".format(data['art_id']))
                    print("-" * 60)
                    continue
                if database.execute_sql(table_name=table_name, mode="insert",
                                        keys=list(data.keys()), values=list(data.values())):
                    print("作者id: {}".format(data['author_mid']))
                    print("作者用户名: {}".format(data['author_name']))
                    print("所属分类: {}".format(data['category']))
                    print("上传时间: {}".format(data['update_time']))
                    print("文章id: {}".format(data['art_id']))
                    print("文章标题: {}".format(data['art_title']))
                    print("文章字数: {}".format(data['art_words']))
                    print("文章点赞数: {}".format(data['art_like']))
                    print("文章评论数: {}".format(data['art_reply']))
                    print("文章浏览数: {}".format(data['art_view']))
                    print("文章收藏数: {}".format(data['art_favorite']))
                    print("文章投币数: {}".format(data['art_coin']))
                    print("文章分享数: {}".format(data['art_share']))
                    print("文章摘要: {}".format(data['art_summary']))
                    print("爬取时间: {}".format(data['crawl_time']))
                    print("-" * 60)
                else:
                    print("id:{} 异常,跳过".format(data['art_id']))
                    print("-" * 60)
            pn += 1


if __name__ == "__main__":
    column_crawler()
# -*- coding: utf-8 -*-
# @Time    : 2019/4/7 4:17
# @Author  : Nismison
# @FileName: database.py
# @Description: MySQL helper class
# @Blog    : https://blog.tryfang.cn

from pymysql import connect


class Database(object):
    """Thin wrapper around a single pymysql connection/cursor pair."""

    def __init__(self, host, username, password, db_name):
        # Use keyword arguments: modern PyMySQL removed support for the old
        # positional connect() signature.
        self.__connection = connect(host=host, user=username,
                                    password=password, database=db_name)
        self.__cursor = self.__connection.cursor()

    def execute_sql(self, table_name, mode="search", select="*", **kwargs):
        """
        :param table_name: table to operate on (str)
        :param mode: operation to run (search: query, insert: insert)
                     search - key: field(s) to query (str/list)
                     search - value: value to match, or 'all' with a list key
                     insert - keys: field names to insert (list)
                     insert - values: row values to insert (list)
        :return: fetched rows / matched row count, or True/False for insert
        """
        if mode == "search":
            if isinstance(kwargs['key'], list) and kwargs['value'] == 'all':
                # "Select these columns from every row."
                sql = "select {} from {}".format(", ".join(kwargs['key']), table_name)
                self.__cursor.execute(sql)
                return list(self.__cursor.fetchall())
            elif isinstance(kwargs['key'], str) and isinstance(kwargs['value'], (str, int, float)):
                # SECURITY FIX: pass the value as a query parameter instead of
                # interpolating it into the SQL string (crawled data is
                # untrusted - the old f-string-style quoting was injectable).
                sql = "select {} from {} where {}=%s".format(select, table_name, kwargs['key'])
                return self.__cursor.execute(sql, (kwargs['value'],))
            else:
                raise TypeError("The 'key' must be a list or str type and the 'value' must be a string type.")

        elif mode == "insert":
            keys = kwargs['keys']
            values = kwargs['values']
            # keys and values must both be lists.
            if not isinstance(keys, list) or not isinstance(values, list):
                raise TypeError("The 'keys' and 'value' must be list or number type.")
            try:
                # SECURITY FIX: parameterized insert - placeholders in the SQL,
                # values passed separately to execute().
                placeholders = ", ".join(["%s"] * len(values))
                sql = "insert into {} ({}) values ({})".format(
                    table_name, ", ".join(keys), placeholders)
                self.__cursor.execute(sql, values)
                self.__connection.commit()
                return True
            except Exception as e:
                print("Exception:", e)
                # Roll back so the connection stays usable after a failure.
                self.__connection.rollback()
                return False

    def get_cursor(self):
        """
        :return: Cursor Object
        """
        return self.__cursor

    def get_connection(self):
        """
        :return: Connection Object
        """
        return self.__connection

    def close(self):
        """
        close connection and cursor
        """
        self.__cursor.close()
        self.__connection.close()


# -*- coding: utf-8 -*-
# @Time    : 2019/4/7 3:15
# @Author  : Nismison
# @FileName: deal_json.py
# @Description: JSON helper functions
# @Blog    : https://blog.tryfang.cn


def dict_get(dict_, objkey):
    """
    Depth-first search a nested dict for the first occurrence of *objkey*.

    :param dict_: dict to walk (any other type yields None)
    :param objkey: target key
    :return: the value of the first matching key, or None when absent
    """
    if not isinstance(dict_, dict):
        return None
    for key, value in dict_.items():
        if key == objkey:
            return value
        # Recurse into nested dicts.
        if isinstance(value, dict):
            ret = dict_get(value, objkey)
            if ret is not None:
                return ret
        # Recurse into every element of nested lists, in order.
        elif isinstance(value, list):
            for element in value:
                ret = dict_get(element, objkey)
                if ret is not None:
                    return ret
    # Key not found anywhere.
    return None


# -*- coding: utf-8 -*-
# @Time    : 2019/4/7 3:17
# @Author  : Nismison
# @FileName: requests_func.py
# @Description: requests helper functions
# @Blog    : https://blog.tryfang.cn

from requests import get
from fake_useragent import UserAgent
def url_get(url, mode=None, timeout=20, retries=3):
    """
    GET *url* with a random User-Agent and return it in the requested form.

    :param url: target url
    :param mode: None -> Response object, or one of "json"/"content"/"text"/"code"
    :param timeout: per-request timeout in seconds
    :param retries: extra attempts after the first failure (default 3)
    :raises ValueError: when *mode* is not a recognised value
    :raises Exception: "Maximum retries" when every attempt failed
    """
    # Validate the mode up front: in the original the ValueError was raised
    # inside the try block, so an invalid mode was swallowed by the broad
    # except and retried forever instead of surfacing.
    if mode not in (None, "json", "content", "text", "code"):
        raise ValueError("Mode error, mode must be one of None/json/content/text/code")
    for _attempt in range(retries + 1):
        try:
            response = get(url=url, headers={"User-Agent": UserAgent().random}, timeout=timeout)
            if mode is None:
                return response
            if mode == "json":
                return response.json()
            if mode == "content":
                return response.content
            if mode == "text":
                return response.text
            return response.status_code  # mode == "code"
        except Exception:
            # BUGFIX: the old recursive retry reset its counter on every call
            # (unbounded recursion) and dropped the recursive return value
            # (caller got None). A plain loop retries correctly.
            continue
    raise Exception("Maximum retries")


# -*- coding: utf-8 -*-
# @Time    : 2019/4/8 0:32
# @Author  : Nismison
# @FileName: thread.py
# @Description: threading helper functions
# @Blog    : https://blog.tryfang.cn

from threading import Thread


def thread_create(thread_num, method, start=4415, step=2000):
    """
    Start *thread_num* threads, each calling method(start + i * step),
    then block until they all finish.

    :param thread_num: number of threads to start
    :param method: callable invoked with a single int argument
    :param start: base of the per-thread argument (was hard-coded to 4415)
    :param step: stride between consecutive threads' arguments (was 2000)
    """
    # Create the pool first so all workers exist before any starts.
    thread_pool = [Thread(target=method, args=(start + i * step,))
                   for i in range(thread_num)]
    for i, th in enumerate(thread_pool):
        th.start()
        print("线程 {} 已启动".format(i + 1))
    # Wait for every worker to complete.
    for th in thread_pool:
        th.join()
# -*- coding: utf-8 -*-
# @Time    : 2019/4/8 20:19
# @Author  : Nismison
# @FileName: game.py
# @Description: Bilibili game-list crawler
# @Blog    : https://blog.tryfang.cn

from functions.requests_func import url_get
from functions.database import Database
from functions.deal_json import dict_get


def game_crawler():
    """Fetch the full game list and insert each entry into `game_list`."""
    database = Database("localhost", "root", "", "bilibili")
    table_name = "game_list"
    game_list_url = "https://game.bilibili.com/gamelist.json"
    game_list_json = url_get(game_list_url, "json")
    for game in game_list_json:
        game_info = {}
        game_info['name'] = dict_get(game, "title")
        game_info['summary'] = dict_get(game, "summary")
        game_info['website'] = dict_get(game, "website")

        if database.execute_sql(table_name=table_name, key="name", value=game_info['name']) != 0:
            print("{} 重复,跳过".format(game_info['name']))
            print("-" * 60)
            # BUGFIX: without this `continue` the duplicate row was reported
            # as a duplicate and then inserted again anyway (every sibling
            # crawler in this repo has the continue here).
            continue

        if database.execute_sql(table_name=table_name, mode="insert",
                                keys=list(game_info.keys()), values=list(game_info.values())):
            print("游戏名: {}".format(game_info['name']))
            print("游戏介绍: {}".format(game_info['summary']))
            print("游戏官网: {}".format(game_info['website']))
            print("-" * 60)


if __name__ == '__main__':
    game_crawler()


# -*- coding: utf-8 -*-
# @Time    : 2019/4/7 21:33
# @Author  : Nismison
# @FileName: member.py
# @Description: Bilibili member-profile crawler
# @Blog    : https://blog.tryfang.cn

from functions.requests_func import url_get
from functions.deal_json import dict_get
from functions.database import Database
from functions.thread import thread_create


def member_crawler(mid):
    """
    Crawl member profiles starting at *mid*, incrementing forever.

    :param mid: first member id to fetch
    """
    database = Database("localhost", "root", "", "bilibili")
    while True:
        follow_url = "https://api.bilibili.com/x/relation/stat?vmid={}".format(mid)
        view_url = "https://api.bilibili.com/x/space/upstat?mid={}".format(mid)
        info_url = "https://api.bilibili.com/x/space/acc/info?mid={}".format(mid)
        tag_url = "https://space.bilibili.com/ajax/member/getTags?mids={}".format(mid)
        charging_url = "https://elec.bilibili.com/api/query.rank.do?mid={}".format(mid)
        upload_data_url = "https://api.bilibili.com/x/space/navnum?mid={}".format(mid)
        try:
            member_info = url_get(info_url, mode='json')
            username = dict_get(member_info, "name")
            # A missing name means the account does not exist.
            if username is None:
                print("该会员不存在, 跳过 {}".format(mid))
                print("-" * 60)
                mid += 1
                continue
            level = dict_get(member_info, "level")
            member_id = dict_get(member_info, "mid")
            sex = dict_get(member_info, "sex")
            coins = dict_get(member_info, "coins")
            official_data = dict_get(member_info, "official")
            follow_data = url_get(follow_url, mode="json")
            following = dict_get(follow_data, 'following')
            follower = dict_get(follow_data, 'follower')
            view = dict_get(url_get(view_url, mode="json"), "view")

            # role == 1 marks an officially verified account.
            if official_data['role'] == 1:
                official = official_data['title']
            else:
                official = "暂无认证"
            birthday = dict_get(member_info, "birthday")
            sign = dict_get(member_info, "sign")
            # NOTE(review): dict_get returns the FIRST "status" key found
            # anywhere in the payload - presumably the vip status sub-field;
            # confirm against the API response shape.
            vip = dict_get(member_info, "status")
            if vip == 1:
                vip_status = "是"
            else:
                vip_status = "否"
            # Space-separated tag string.
            tag = ''
            for x in dict_get(url_get(tag_url, mode="json"), "tags"):
                tag += x + ' '
            charging = dict_get(url_get(charging_url, mode="json"), "total_count")
            video_upload = dict_get(url_get(upload_data_url, mode="json"), "video")

            # Skip members already recorded.
            if database.execute_sql(table_name="member", mode="search",
                                    key="member_id", value=member_id) != 0:
                print("该会员已存在, 跳过 {}".format(member_id))
                print("-" * 60)
                mid += 1
                continue

            insert_data = {
                "member_id": member_id,
                "username": username,
                "sex": sex,
                "birthday": birthday,
                "level": level,
                "coins": coins,
                "sign": sign,
                "charging": charging,
                "video_upload": video_upload,
                "tag": tag,
                "vip_status": vip_status,
                "official": official,
                "following": following,
                "follower": follower,
                "view": view,
            }

            if database.execute_sql(mode="insert", table_name="member",
                                    keys=list(insert_data.keys()),
                                    values=list(insert_data.values())):
                print("用户id: {}".format(member_id))
                print("用户名: {}".format(username))
                print("性别: {}".format(sex))
                print("生日: {}".format(birthday))
                print("等级: {}".format(level))
                print("B币: {}".format(coins))
                print("个人签名: {}".format(sign))
                print("充电人数: {}".format(charging))
                print("视频数量: {}".format(video_upload))
                print("标签: {}".format(tag))
                print("B站大会员: {}".format(vip_status))
                print("Bilibili认证: {}".format(official))
                print("关注数: {}".format(following))
                print("粉丝数: {}".format(follower))
                print("播放量: {}".format(view))
                print("-" * 60)
            mid += 1
        except Exception as e:
            # Deliberate best-effort: any network/parse error skips this mid
            # so a single bad profile cannot stop the crawl.
            print("错误, 跳过 mid={}".format(mid))
            print(e)
            print("-" * 60)
            mid += 1
            continue


if __name__ == '__main__':
    member_crawler(mid=0)


# -*- coding: utf-8 -*-
# @Time    : 2019/4/8 20:46
# @Author  : Nismison
# @FileName: micro_video.py
# @Description: Bilibili micro-video crawler
# @Blog    : https://blog.tryfang.cn

from functions.requests_func import url_get
from functions.deal_json import dict_get
from functions.database import Database


def micro_video_crawler(order='', page_num=1):
    """
    Crawl every micro-video category and insert each clip's metadata.

    :param order: sort order; "new" sorts by upload time, default is the
                  site's recommendation order
    :param page_num: first page to fetch for every tag
    """
    database = Database("localhost", "root", "", "bilibili")
    table_name = "micro_video"
    classification = []
    # Collect every tag across all zones.
    classification_url = "https://api.vc.bilibili.com/clip/v1/video/zonelist?page=total"
    classification_json = url_get(classification_url, "json")
    classification_data = dict_get(classification_json, "data")
    for zone in classification_data:
        if classification_data[zone] == '':
            continue
        for tag_name in classification_data[zone]['tags']:
            classification.append(tag_name)

    for tag in classification:
        ps = 50  # page_size, 50 is the API maximum
        pn = page_num
        while True:
            next_offset = (pn - 1) * ps
            micro_video_url = "https://api.vc.bilibili.com/clip/v1/video/search?" \
                              "page_size={}&need_playurl=0&next_offset={}&order={}" \
                              "&tag={}".format(ps, next_offset, order, tag)
            micro_video_json = url_get(micro_video_url, "json")
            items = dict_get(micro_video_json, "items")
            # Empty page: this tag is exhausted.
            if len(items) == 0:
                break
            for item in items:
                video_info = {"tag": tag}
                video_info['title'] = dict_get(item, "description").replace("\n", "")  # title
                video_info['video_id'] = dict_get(item, "id")
                video_info['reply'] = dict_get(item, "reply")                # comment count
                video_info['upload_time'] = dict_get(item, "upload_time")
                video_info['video_size'] = round(float(dict_get(item, "video_size")) / 1024**2, 2)  # MB
                video_info['video_time'] = dict_get(item, "video_time")      # duration, seconds
                video_info['video_playurl'] = dict_get(item, "video_playurl")
                video_info['watched_num'] = dict_get(item, "watched_num")    # play count
                video_info['name'] = dict_get(item, "name")                  # uploader name
                video_info['uid'] = dict_get(item, "uid")                    # uploader uid

                # To also download the video files, uncomment the block below.
                # video_content = url_get(video_info['video_playurl'], "content")
                # video_file_name = video_info['title'][:30].replace("/", '').replace("<", '').replace(">", '').replace(
                #     "|", '').replace(":", '').replace("*", '').replace("?", '').replace("\\", '') + ".mp4"
                # with open(video_file_name, "wb") as video_file:
                #     video_file.write(video_content)

                # Deduplicate on video_id, then insert.
                if database.execute_sql(table_name=table_name, key="video_id",
                                        value=video_info['video_id']) != 0:
                    print("视频id:{} 重复,跳过".format(video_info['video_id']))
                    print("-" * 60)
                    continue
                if database.execute_sql(table_name=table_name, mode="insert",
                                        keys=list(video_info.keys()),
                                        values=list(video_info.values())):
                    print("视频标题: {}".format(video_info['title']))
                    print("视频id: {}".format(video_info['video_id']))
                    print("视频评论数: {}".format(video_info['reply']))
                    print("视频上传时间: {}".format(video_info['upload_time']))
                    print("视频大小(mb): {}".format(video_info['video_size']))
                    print("视频时长: {}".format(video_info['video_time']))
                    print("视频播放地址: {}".format(video_info['video_playurl']))
                    print("视频观看数: {}".format(video_info['watched_num']))
                    print("上传者用户名: {}".format(video_info['name']))
                    print("上传者id: {}".format(video_info['uid']))
                    print("-" * 60)
            pn += 1


if __name__ == '__main__':
    micro_video_crawler()


# -*- coding: utf-8 -*-
# @Time    : 2019/4/6 19:42
# @Author  : Nismison
# @FileName: pictures.py
# @Description: Bilibili album crawler
# @Blog    : https://blog.tryfang.cn

from os.path import dirname, exists
from os import mkdir
from functions.requests_func import url_get


def dict_get(dict_, objkey):
    """
    Depth-first search a nested dict for the first occurrence of *objkey*.

    NOTE: unlike functions.deal_json.dict_get, this local variant only
    descends into the FIRST element of a list value - kept as-is because
    the album payload apparently only needs that.

    :param dict_: dict to walk
    :param objkey: target key
    :return: the value of the first matching key, or None
    """
    for key, value in dict_.items():
        if key == objkey:
            return value
        else:
            if isinstance(value, dict):
                ret = dict_get(value, objkey)
                if ret is not None:
                    return ret
            elif isinstance(value, list):
                ret = dict_get(value[0], objkey)
                if ret is not None:
                    return ret
    return None
objkey) 27 | if ret is not None: 28 | return ret 29 | # 如果value是list类型,则取第0个进行迭代 30 | elif isinstance(value, list): 31 | ret = dict_get(value[0], objkey) 32 | if ret is not None: 33 | return ret 34 | # 如果找不到指定的key,返回None 35 | return None 36 | 37 | 38 | def crawler(type_, sort, path='save_picture', page_num=0): 39 | """ 40 | :param type_: 分类 --> cos or sifu 41 | :param sort: 排序 --> hot or new 42 | :param path: 路径(当前目录下) 43 | :param page_num: 开始页,默认0页开始 44 | """ 45 | if path != '' and not exists(path): 46 | mkdir(path) 47 | base_dir = dirname(__file__) + "/" + path + "/" 48 | url = "https://api.vc.bilibili.com/link_draw/v2/Photo/list?category={}&type={}&page_num={}&page_size=20".format( 49 | type_, sort, page_num) 50 | res = url_get(url=url, mode="json") 51 | items = dict_get(res, "items") 52 | if len(items) == 0: 53 | print("Current page have no any picture, Exit mission!") 54 | return 55 | for i in items: 56 | title = dict_get(i, "title") # 相簿标题 57 | up = dict_get(i, "name") # up主 58 | directory_name = title.replace("/", '').replace("<", '').replace(">", '').replace( 59 | "|", '').replace(":", '').replace("*", '').replace("?", '').replace("\\", '') + "-" + up 60 | if not exists(path + "/" + directory_name): 61 | mkdir(path + "/" + directory_name) 62 | picture_list = [] # 存放图片地址 63 | for picture in dict_get(i, "pictures"): 64 | picture_list.append(picture['img_src']) 65 | print("Downloading Pictures") 66 | for pic in picture_list: 67 | pic_name = pic.split("/")[-1] 68 | full_pic_path = base_dir + directory_name + "/" + pic_name 69 | if not exists(full_pic_path): 70 | pic_get = url_get(url=pic, mode="content") 71 | with open(full_pic_path, "wb") as pic_file: 72 | pic_file.write(pic_get) 73 | else: 74 | continue 75 | print("current page: {}".format(page_num + 1)) 76 | print("title: {}".format(title)) 77 | print("up: {}".format(up)) 78 | print("picture: {}".format(len(picture_list))) 79 | print("-" * 60) 80 | crawler(type_=type_, sort=sort, path=path, page_num=page_num 
+ 1) 81 | 82 | 83 | if __name__ == '__main__': 84 | crawler(type_="sifu", sort="hot") 85 | -------------------------------------------------------------------------------- /rank.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2019-04-07 18:44:09 3 | # @Author : Nismison 4 | # @FileName: rank.py 5 | # @Description: Bilibili排行榜爬取 6 | # @Blog :https://blog.tryfang.cn 7 | 8 | from functions.requests_func import url_get 9 | from functions.deal_json import dict_get 10 | from os.path import exists 11 | from os import mkdir 12 | 13 | 14 | def rank_crawler(): 15 | # 保存目录 16 | save_path = "rank" 17 | # 如果目录不存在则创建目录 18 | if not exists(save_path): 19 | mkdir(save_path) 20 | # rid字典 21 | rid_dict = { 22 | "全站": 0, 23 | "动画": 1, 24 | "国创相关": 168, 25 | "音乐": 3, 26 | "舞蹈": 129, 27 | "游戏": 4, 28 | "科技": 36, 29 | "数码": 188, 30 | "生活": 160, 31 | "鬼畜": 119, 32 | "时尚": 155, 33 | "娱乐": 5, 34 | "影视": 181, 35 | } 36 | # 排行时间字典 37 | day_dict = { 38 | "日排行": 1, 39 | "三日排行": 3, 40 | "周排行": 7, 41 | "月排行": 30, 42 | } 43 | # 遍历rid字典 44 | for k, v in rid_dict.items(): 45 | rid = v 46 | # 遍历排行时间字典 47 | for k2, v2 in day_dict.items(): 48 | day = v2 49 | # 拼接url 50 | url = "https://api.bilibili.com/x/web-interface/ranking?rid={}&day={}".format(rid, day) 51 | res = url_get(url=url, mode="json") 52 | rank_list = dict_get(res, "list") 53 | for i in range(len(rank_list)): 54 | aid = dict_get(rank_list[i], "aid") # 视频id 55 | author = dict_get(rank_list[i], "author") # up主 56 | coins = dict_get(rank_list[i], "coins") # 投币数 57 | play = dict_get(rank_list[i], "play") # 播放数 58 | pts = dict_get(rank_list[i], "pts") # 综合得分 59 | title = dict_get(rank_list[i], "title") # 视频标题 60 | video_review = dict_get(rank_list[i], "video_review") # 视频弹幕数(?) 
def crawler(av, max_av=48544470):
    """Crawl Bilibili video metadata and insert it into MySQL.

    Walks aid numbers upward from *av*, fetches each video's info from the
    web API and inserts one row per video into the ``video`` table,
    skipping missing videos and duplicates.

    :param av: first av (aid) number to fetch
    :param max_av: stop before this aid; default keeps the previously
        hard-coded upper bound, so existing callers are unaffected
    :return: None
    """
    database = Database(host="localhost", username="root", password="", db_name="bilibili")
    for av_num in range(av, max_av):
        url = "https://api.bilibili.com/x/web-interface/view?aid={}".format(av_num)
        get_json = url_get(url=url, mode="json")
        # Non-zero API code means the video does not exist (deleted, etc.).
        if dict_get(get_json, "code") != 0:
            print('错误!没有此视频!av:{}'.format(av_num))
            print('-' * 60)
            continue
        # NOTE(review): gmtime renders ctime as UTC — confirm the database
        # is expected to store UTC rather than local time.
        data = {
            'video_av': str(av_num),
            'video_up': dict_get(get_json, "name"),
            'video_title': dict_get(get_json, "title"),
            'video_classification': dict_get(get_json, "tname"),
            'video_view': dict_get(get_json, "view"),
            'video_share': dict_get(get_json, "share"),
            'video_like': dict_get(get_json, "like"),
            'video_favorite': dict_get(get_json, "favorite"),
            'video_coin': dict_get(get_json, "coin"),
            'video_update': strftime("%Y-%m-%d %H:%M:%S", gmtime(dict_get(get_json, "ctime"))),
            'video_reply': dict_get(get_json, "reply"),
            'video_danmaku': dict_get(get_json, "danmaku"),
            # no_reprint == 0 marks a reposted (non-original) video.
            'video_reprint': "转载" if dict_get(get_json, "no_reprint") == 0 else "原创",
        }
        # Skip videos already present in the table.
        if database.execute_sql(table_name="video", mode="search", key="video_av",
                                value=data['video_av']) != 0:
            print('错误!此视频已存在!av:{}'.format(av_num))
            print('-' * 60)
            continue
        if database.execute_sql(table_name="video", mode="insert", keys=list(data.keys()),
                                values=list(data.values())):
            # Progress report: label -> data key, printed in insert order
            # (replaces thirteen near-identical print statements).
            for label, key in (
                ("视频av号", 'video_av'), ("作者", 'video_up'), ("标题", 'video_title'),
                ("视频分类", 'video_classification'), ("观看数", 'video_view'),
                ("分享数", 'video_share'), ("点赞数", 'video_like'),
                ("收藏数", 'video_favorite'), ("投币数", 'video_coin'),
                ("上传时间", 'video_update'), ("评论数", 'video_reply'),
                ("弹幕数", 'video_danmaku'), ("性质", 'video_reprint'),
            ):
                print("{}: {}".format(label, data[key]))
            print("-" * 60)


if __name__ == '__main__':
    crawler(av=6000)