├── .gitignore ├── README.md ├── aid2uid.py ├── alluid.py ├── avdesc.py ├── avtag.py ├── bilisupport.py ├── comment.py ├── danmaku.py ├── distinct.py ├── space.py └── tag.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | *.db 5 | *.exe 6 | */ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | bilibili-data 2 | === 3 | ## 哔哩哔哩弹幕网数据爬虫 4 | 5 | 6 | 基于Python3,需要以下库支持: 7 | > sudo -H pip install requests pymongo multiprocessing BeautifulSoup4 8 | 9 | 默认使用了Mongodb存储数据,SQL是坏文明。 10 | 11 | > BeautifulSoup 用了 lxml 来解析网页,你可能需要安装 lxml 库: 12 | > sudo -H pip install lxml 13 | 但是 pip 安装 lxml 速度奇慢而且很可能报错,如果你是 Ubuntu 或者 Debian,推荐: 14 | > sudo apt-get install python-lxml 15 | 16 | 如果你是 Windows 用户,请去 [这里](http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml) 下载对应版本的whl来安装 17 | 18 | ## 脚本分类 19 | 20 | | 文件 | 用途 | 21 | | ----- | ----- | 22 | | aid2uid.py | AV号与UP主对应关系 | 23 | | alluid.py | 全站UP主投稿遍历 | 24 | | avdesc.py | 全站AV号基础数据补足 | 25 | | avtag.py | 全站AV号TAG数据对应 | 26 | | bilisupport.py | 使用到的API和常量列表 | 27 | | comment.py | 全站视频评论遍历 | 28 | | danmaku.py | 全站视频弹幕遍历 | 29 | | space.py | 全站用户空间信息遍历 | 30 | | tag.py | 全站TAG数据遍历 | 31 | 32 | ## 稿件数据结构为: 33 | 34 | ```code 35 | { 36 | "aid" : AV号, 37 | "title" : 标题, 38 | "subtitle" : 备注, 39 | "typeid" : 分区, 40 | "description" : 视频简介, 41 | "created" : 投稿时间, 42 | "mid" : UP主id, 43 | "author" : UP主昵称, 44 | "copyright" : 原创/搬运, 45 | "pic" : 封面图, 46 | "length" : 视频时长, 47 | "play" : 播放数, 48 | "review" : 弹幕数, 49 | "coins" : 硬币数, 50 | "favorites" : 收藏数, 51 | "comment" : 评论数, 52 | "video_review" : 评论数 53 | } 54 | ``` 55 | 56 | | aid | title | typeid | mid | author | created | copyright | length | 57 | | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | 58 | | INT | CHAR | INT | INT | CHAR | CHAR | CHAR | INT | 59 | | ----- | ----- 
| 24:'MAD·AMV' | ----- | ----- | ----- | Copy | ----- | 60 | | ----- | ----- | 25:'MMD·3D' | ----- | ----- | ----- | Original | ----- | 61 | | ----- | ----- | 47:'短片·手书·配音' | ----- | ----- | ----- | Unknow | ----- | 62 | | ----- | ----- | 27:'综合' | ----- | ----- | ----- | ----- | ----- | 63 | | ----- | ----- | 33:'连载动画' | ----- | ----- | ----- | ----- | ----- | 64 | | ----- | ----- | 32:'完结动画' | ----- | ----- | ----- | ----- | ----- | 65 | | ----- | ----- | 153:'国产动画' | ----- | ----- | ----- | ----- | ----- | 66 | | ----- | ----- | 51:'番剧资讯' | ----- | ----- | ----- | ----- | ----- | 67 | | ----- | ----- | 152:'官方延伸' | ----- | ----- | ----- | ----- | ----- | 68 | | ----- | ----- | 28:'原创音乐' | ----- | ----- | ----- | ----- | ----- | 69 | | ----- | ----- | 31:'翻唱' | ----- | ----- | ----- | ----- | ----- | 70 | | ----- | ----- | 30:'VOCALOID·UTAU' | ----- | ----- | ----- | ----- | ----- | 71 | | ----- | ----- | 59:'演奏' | ----- | ----- | ----- | ----- | ----- | 72 | | ----- | ----- | 29:'三次元音乐' | ----- | ----- | ----- | ----- | ----- | 73 | | ----- | ----- | 54:'OP/ED/OST' | ----- | ----- | ----- | ----- | ----- | 74 | | ----- | ----- | 130:'音乐选集' | ----- | ----- | ----- | ----- | ----- | 75 | | ----- | ----- | 20:'宅舞' | ----- | ----- | ----- | ----- | ----- | 76 | | ----- | ----- | 154:'三次元舞蹈' | ----- | ----- | ----- | ----- | ----- | 77 | | ----- | ----- | 156:'舞蹈教程' | ----- | ----- | ----- | ----- | ----- | 78 | | ----- | ----- | 17:'单机联机' | ----- | ----- | ----- | ----- | ----- | 79 | | ----- | ----- | 65:'网游·电竞' | ----- | ----- | ----- | ----- | ----- | 80 | | ----- | ----- | 136:'音游' | ----- | ----- | ----- | ----- | ----- | 81 | | ----- | ----- | 19:'Mugen' | ----- | ----- | ----- | ----- | ----- | 82 | | ----- | ----- | 121:'GMV' | ----- | ----- | ----- | ----- | ----- | 83 | | ----- | ----- | 37:'纪录片' | ----- | ----- | ----- | ----- | ----- | 84 | | ----- | ----- | 124:'趣味科普人文' | ----- | ----- | ----- | ----- | ----- | 85 | | ----- | ----- | 122:'野生技术协会' | 
----- | ----- | ----- | ----- | ----- | 86 | | ----- | ----- | 39:'演讲•公开课' | ----- | ----- | ----- | ----- | ----- | 87 | | ----- | ----- | 96:'星海' | ----- | ----- | ----- | ----- | ----- | 88 | | ----- | ----- | 95:'数码' | ----- | ----- | ----- | ----- | ----- | 89 | | ----- | ----- | 98:'机械' | ----- | ----- | ----- | ----- | ----- | 90 | | ----- | ----- | 138:'搞笑' | ----- | ----- | ----- | ----- | ----- | 91 | | ----- | ----- | 21:'日常' | ----- | ----- | ----- | ----- | ----- | 92 | | ----- | ----- | 76:'美食圈' | ----- | ----- | ----- | ----- | ----- | 93 | | ----- | ----- | 75:'动物圈' | ----- | ----- | ----- | ----- | ----- | 94 | | ----- | ----- | 161:'手工' | ----- | ----- | ----- | ----- | ----- | 95 | | ----- | ----- | 162:'绘画' | ----- | ----- | ----- | ----- | ----- | 96 | | ----- | ----- | 163:'运动' | ----- | ----- | ----- | ----- | ----- | 97 | | ----- | ----- | 22:'鬼畜调教' | ----- | ----- | ----- | ----- | ----- | 98 | | ----- | ----- | 26:'音MAD' | ----- | ----- | ----- | ----- | ----- | 99 | | ----- | ----- | 126:'人力VOCALOID' | ----- | ----- | ----- | ----- | ----- | 100 | | ----- | ----- | 127:'教程演示' | ----- | ----- | ----- | ----- | ----- | 101 | | ----- | ----- | 157:'美妆' | ----- | ----- | ----- | ----- | ----- | 102 | | ----- | ----- | 158:'服饰' | ----- | ----- | ----- | ----- | ----- | 103 | | ----- | ----- | 164:'健身' | ----- | ----- | ----- | ----- | ----- | 104 | | ----- | ----- | 159:'时尚资讯' | ----- | ----- | ----- | ----- | ----- | 105 | | ----- | ----- | 166:'广告' | ----- | ----- | ----- | ----- | ----- | 106 | | ----- | ----- | 71:'综艺' | ----- | ----- | ----- | ----- | ----- | 107 | | ----- | ----- | 137:'明星' | ----- | ----- | ----- | ----- | ----- | 108 | | ----- | ----- | 131:'Korea相关' | ----- | ----- | ----- | ----- | ----- | 109 | | ----- | ----- | 82:'电影相关' | ----- | ----- | ----- | ----- | ----- | 110 | | ----- | ----- | 85:'短片' | ----- | ----- | ----- | ----- | ----- | 111 | | ----- | ----- | 145:'欧美电影' | ----- | ----- | ----- | ----- | ----- | 112 
| | ----- | ----- | 146:'日本电影' | ----- | ----- | ----- | ----- | ----- | 113 | | ----- | ----- | 147:'国产电影' | ----- | ----- | ----- | ----- | ----- | 114 | | ----- | ----- | 83:'其他国家' | ----- | ----- | ----- | ----- | ----- | 115 | | ----- | ----- | 15:'连载剧集' | ----- | ----- | ----- | ----- | ----- | 116 | | ----- | ----- | 34:'完结剧集' | ----- | ----- | ----- | ----- | ----- | 117 | | ----- | ----- | 86:'特摄·布袋' | ----- | ----- | ----- | ----- | ----- | 118 | | ----- | ----- | 129:'电视剧相关' | ----- | ----- | ----- | ----- | ----- | 119 | 120 | * 由于分区变动原因实际获取到的 typeid 并不只有上表所列的范围。 121 | * Unknow 状态的稿件随机测试的结果应该是搬运状态,有待确认。 122 | 123 | ## 评论数据结构为: 124 | 125 | ```code 126 | { 127 | "root_str" : "0", 128 | "content" : { 129 | "device" : 设备, 130 | "message" : 评论正文, 131 | "plat" : 平台, 132 | "members" : [] 133 | }, 134 | "parent_str" : 评论父楼层, 135 | "parent" : 评论父楼层, 136 | "mid" : 用户uid, 137 | "root" : 0, 138 | "member" : { 139 | "uname" : 用户昵称, 140 | "rank" : "10000", 141 | "vip" : {……}, vip/大会员相关 142 | "sign" : 用户签名, 143 | "sex" : 用户性别, 144 | "avatar" : 用户头像, 145 | "DisplayRank" : "0", 146 | "mid" : 用户uid, 147 | "level_info" : { 148 | "current_min" : 1500, 149 | "current_level" : 现在等级, 150 | "next_exp" : 距离下一级经验值, 151 | "current_exp" : 现在经验值 152 | }, 153 | "nameplate" : {……}, 154 | "pendant" : {……}, 155 | "official_verify" : {……} 156 | }, 157 | "type" : 1, 158 | "like" : 点赞数, 159 | "rpid" : 评论ID, 160 | "count" : 0, 161 | "rpid_str" : 评论ID, 162 | "ctime" : 评论时间, 163 | "action" : 0, 164 | "oid" : 评论所属视频, 165 | "replies" : [], 166 | "floor" : 评论楼层, 167 | "rcount" : 0, 168 | "state" : 0 169 | } 170 | ``` 171 | 172 | | oid | floor | parent | mid | ctime | content.message | plat | rpid | 173 | | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | 174 | | INT | INT | INT | INT | INT | CHAR | CHAR | INT | 175 | | ----- | ----- | 0: 主楼 | ----- | ----- | ----- | 1: 网页 | ----- | 176 | | ----- | ----- | 9: 楼中楼 | ----- | ----- | ----- | 2: 安卓 | ----- | 177 | | ----- | ----- | 
----- | ----- | ----- | ----- | 3: iOS | ----- | 178 | | ----- | ----- | ----- | ----- | ----- | ----- | 4: Windows Phone | ----- | 179 | | ----- | ----- | ----- | ----- | ----- | ----- | 5: 安卓? | ----- | 180 | 181 | * 设备信息具体对应不确定,仅作参考 182 | 183 | ## 弹幕数据结构为: 184 | 185 | ```code 186 | { 187 | "aid" : 视频av号, # 并不包含在弹幕文件中 188 | "cid" : 视频cid, # 并不包含在弹幕文件中 189 | "time" : 弹幕时间点, 190 | "mode" : 弹幕模式, 191 | "font" : 字号大小, 192 | "color" : 弹幕颜色, 193 | "date" : 弹幕发送日期, 194 | "pool" : 弹幕池, 195 | "hash" : 用户uid的HASH, 196 | "id" : 弹幕id, 197 | "text" : 弹幕内容 198 | } 199 | ``` 200 | 201 | | aid | cid | time | mode | font | color | date | pool | hash | id | text | 202 | | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | 203 | | INT | INT | CHAR | INT | INT | CHAR | CHAR | INT | CHAR | INT | CHAR | 204 | | ----- | ----- | ----- | 1~3: 普通弹幕 | ----- | ----- | #FFFFFF | ----- | ----- | ----- | ----- | 205 | | ----- | ----- | ----- | 4: 底部弹幕 | ----- | ----- | ----- | ----- | ----- | ----- | ----- | 206 | | ----- | ----- | ----- | 5: 顶部弹幕 | ----- | ----- | ----- | ----- | ----- | ----- | ----- | 207 | | ----- | ----- | ----- | 6: 逆向弹幕 | ----- | ----- | ----- | ----- | ----- | ----- | ----- | 208 | | ----- | ----- | ----- | 7~8: 高级弹幕 | ----- | ----- | ----- | ----- | ----- | ----- | ----- | 209 | 210 | * 未确认其他类型弹幕的具体对应关系 211 | 212 | ## 用户空间数据结构为: 213 | 214 | ```code 215 | { 216 | "mid" : 用户uid, 217 | "place" : 地区, 218 | "playNum" : 投稿播放总数, 219 | "sex" : 用户性别, 220 | "coins" : 硬币数, 221 | "spacesta" : 0, 222 | "DisplayRank" : "1001", 223 | "attentions" : [……], 关注列表 224 | "theme_preview" : "", 225 | "friend" : 0, 226 | "official_verify" : {……}, 227 | "toutu" : 空间头图, 228 | "sign" : 签名, 229 | "description" : 签名, 230 | "toutuId" : 1, 231 | "im9_sign" : 兴趣圈, 232 | "name" : 用户昵称, 233 | "level_info" : {……}, 用户等级相关 234 | "nameplate" : {……}, 勋章相关 235 | "approve" : false, 236 | "face" : 用户头像, 237 | "birthday" : 生日, 238 | "article" : 0, 239 | "theme" : 空间使用主题, 
def getdata(avid):
    '''Look up the uploader of one video and store the aid -> mid pair.

    Queries API_VIDEOSTATUS for ``avid`` and inserts ``{'aid', 'mid'}`` into
    the AID2MID collection. ``mid`` is stored as ``None`` when the response
    carries no ``mid`` field.

    Args:
        avid: AV number of the video to look up.

    Returns:
        None. When the request fails or the body is not JSON, the failure is
        printed and nothing is stored for that video.
    '''
    params = {
        'type': 'json',
        'appkey': APPKEY,
        'id': avid
    }
    try:
        # An explicit timeout is required for a timeout error to ever be
        # raised; requests waits indefinitely by default.
        rawdata = requests.get(
            url=API_VIDEOSTATUS, params=params, headers=HEADERS, timeout=60
        ).json()
    except (requests.RequestException, ValueError) as err:
        # requests signals timeouts/connection errors via RequestException
        # (not the builtin TimeoutError); .json() raises ValueError on a
        # non-JSON body. Skip the video instead of using an unbound result.
        print(avid, err)
        return None
    AID2MID.insert_one({
        'aid': avid,
        'mid': rawdata.get('mid')
    })
    return None
def getvideos(uid):
    '''Store every submitted video of one user in AVIDLIST.

    Walks the paginated API_SUBMITVIDEOS listing (30 entries per page, all
    categories) and bulk-inserts each page's ``vlist``.

    Args:
        uid: user id (mid) whose submissions are fetched.

    Returns:
        503 when any page reports a falsy ``status``; otherwise None once the
        last page has been stored.
    '''
    params = {
        'mid': uid,
        'pagesize': 30,
        'tid': 0,
        'page': 1
    }
    while True:
        gsvres = requests.get(url=API_SUBMITVIDEOS, headers=HEADERS, params=params).json()
        # Check every page, not just the first: a failed response is not
        # guaranteed to carry the 'data' structure read below.
        if not gsvres['status']:
            return 503
        videos = gsvres['data'].get('vlist')
        # insert_many rejects an empty document list, so skip empty pages.
        if videos:
            AVIDLIST.insert_many(videos)
        # Stop on the last page instead of requesting one page past the end.
        if params['page'] >= gsvres['data']['pages']:
            return None
        params['page'] += 1
def gettag(aid):
    '''Fetch the tags attached to one video and store the aid -> tag pairs.

    Args:
        aid: AV number as an int or a numeric string; surrounding whitespace
            (such as the newline of a line read from a file) is ignored.

    Returns:
        404 when ``aid`` is empty, blank or zero; otherwise None. On an API
        error the response, extended with the ``aid``, is stored in ERRORLIST.
    '''
    if isinstance(aid, str):
        # A blank line is truthy but not a number; treat it as "no aid".
        aid = aid.strip()
    if not aid:
        return 404
    aid = int(aid)
    aidparams = {
        'aid': aid,
        'jsonp': 'jsonp'
    }
    info = requests.get(url=API_TAG, params=aidparams).json()
    if info.get('code') != 0:
        # Keep the full response but record which video it belongs to.
        ERRORLIST.insert_one(dict(info, aid=aid))
        return None
    tags = [{
        'aid': aid,
        'tag': x.get('tag_id')
    } for x in info.get('data') or []]
    # insert_many rejects an empty document list (video without tags).
    if tags:
        AVTAGLIST.insert_many(tags)
    print(aid)
    return None
DATABASE['bilibili-data']['SubmitVideos'] 10 | DANMAKULIST = DATABASE['bilibili-data']['DanmakuData'] 11 | REPLYLIST = DATABASE['bilibili-data']['CommentData'] 12 | ACCOUNTLIST = DATABASE['bilibili-data']['SpaceInfo'] 13 | TAGLIST = DATABASE['bilibili-data']['TagData'] 14 | AVTAGLIST = DATABASE['bilibili-data']['AVTagData'] 15 | ERRORLIST = DATABASE['bilibili-data']['Errorlist'] 16 | 17 | # APPKEY = '12737ff7776f1ade' 18 | APPKEY = '8e9fc618fbd41e28' 19 | 20 | # CID_DANMAKU.format(cid) 21 | CID_DANMAKU = 'http://comment.bilibili.com/{0}.xml' 22 | # CID_DANMAKU_HIS.format(cid) 23 | CID_DANMAKU_HIS = 'http://comment.bilibili.com/rolldate,{0}' 24 | # CID_HISDANMAKU.format(timestamp, cid) 25 | CID_HISDANMAKU = 'http://comment.bilibili.com/dmroll,{0},{1}' 26 | 27 | # {'aid': aid} 28 | API_PAGELIST = 'http://www.bilibili.com/widget/getPageList?' 29 | 30 | # {'mid': mid, 'pagesize': pagesize, 'tid': tid, 'page': page} 31 | API_SUBMITVIDEOS = 'http://space.bilibili.com/ajax/member/getSubmitVideos?' 32 | 33 | # {'type': 'json', 'appkey': APPKEY, 'id': aid} & cookies needed 34 | API_VIDEOSTATUS = 'http://api.bilibili.com/view?' 35 | 36 | # {'mid': mid} {'Referer': 'http://space.bilibili.com/{mid}/'} POST method 37 | API_SPACE = 'http://space.bilibili.com/ajax/member/GetInfo' 38 | 39 | # {'jsonp': jsonp, 'type': 1, 'sort': 0, 'oid': oid, 'pn': page, 'nohot': 1} 40 | API_REPLY = 'http://api.bilibili.com/x/v2/reply?' 41 | 42 | # {'aid': aid, 'jsonp': 'jsonp'} 43 | API_TAG = 'http://api.bilibili.com/x/tag/archive/tags?' 44 | # {'id': tid, 'jsonp': 'jsonp'} 45 | API_TAGINFO = 'http://api.bilibili.com/tags/info_description?' 
def getcomment(oid):
    '''Store every comment page of one video in REPLYLIST.

    Requests API_REPLY page by page (time-sorted, hot comments excluded) and
    bulk-inserts each page's ``replies`` until a page comes back empty.

    Args:
        oid: AV number of the video, as an int or numeric string.

    Returns:
        None. When any page answers with a non-zero ``code`` the
        oid/code/message triple is written to ERRORLIST and printed, and
        pagination stops.

    Raises:
        ValueError: if ``oid`` cannot be converted to an int.
    '''
    oid = int(oid)
    params = {
        'jsonp': 'jsonp',
        'type': 1,
        'sort': 0,
        'oid': oid,
        'pn': 1,
        'nohot': 1
    }
    while True:
        temp = requests.get(url=API_REPLY, params=params).json()
        # Validate every page: an error response on a later page must be
        # logged rather than crash on a missing 'data' payload.
        if temp.get('code') != 0:
            ERRORLIST.insert_one({
                'oid': oid,
                'code': temp.get('code'),
                'message': temp.get('message')
            })
            print(oid, temp.get('code'), temp.get('message'))
            return None
        data = temp.get('data') or {}
        replylst = data.get('replies')
        if not replylst:
            # An empty page marks the end of the comment list.
            return None
        REPLYLIST.insert_many(replylst)
        print(oid, params['pn'])
        # Follow the page number reported by the server.
        params['pn'] = data['page']['num'] + 1
def _parse_danmaku(aid, cid, node):
    '''Convert one <d> element into the document stored in DANMAKULIST.'''
    # The 'p' attribute is a comma-separated record; split it once.
    fields = node.attrs['p'].split(',')
    return {
        'aid': aid,
        'cid': cid,
        'time': float(fields[0]),
        'mode': int(fields[1]),
        'font': int(fields[2]),
        # Decimal RGB value rendered as an upper-case "#RRGGBB" string.
        'color': ("#%06x" % int(fields[3], 10)).upper(),
        'date': float(fields[4]),
        'pool': int(fields[5]),
        'hash': fields[6],
        'id': int(fields[7]),
        'text': node.string
    }


def getdanmaku(aid, cid):
    '''Download the danmaku XML of one video part and store its entries.

    Args:
        aid: AV number the part belongs to (int or numeric string).
        cid: id of the video part whose danmaku file is fetched.

    Returns:
        404 when ``cid`` is empty or the document has no <i> root element;
        otherwise None after the parsed entries were inserted.
    '''
    if not cid:
        return 404
    aid = int(aid)
    cid = int(cid)
    print(aid, cid)
    response = requests.get(url=CID_DANMAKU.format(cid), timeout=300)
    # Hand the raw bytes to the parser so the encoding is taken from the
    # document itself rather than from requests' header-based guess.
    content = BeautifulSoup(response.content, "xml")
    roots = content.select('i')
    if not roots:
        return 404
    danmaku_data = [_parse_danmaku(aid, cid, x) for x in roots[0].select('d')]
    # insert_many rejects an empty document list (part without danmaku).
    if danmaku_data:
        DANMAKULIST.insert_many(danmaku_data)
    return None
def getspaceinfo(headers, form):
    '''Fetch one user's space info and upsert it into ACCOUNTLIST.

    Args:
        headers: HTTP headers sent with the POST request.
        form: form payload of the request; also used as the filter of the
            upsert and as the record written to ERRORLIST on failure.

    Returns:
        1 when the user data was stored, 0 on any failure (request error,
        non-200 status, non-JSON body, or a ``data`` field that is not a
        non-empty dict). Every failure is recorded in ERRORLIST.
    '''
    try:
        jsondata = requests.post(url=API_SPACE, headers=headers, data=form, timeout=60)
    except (requests.RequestException, TimeoutError):
        # requests reports timeouts via its own exception hierarchy; bail out
        # here because there is no response object to inspect.
        ERRORLIST.insert_one(dict(form))
        return 0
    if jsondata.status_code != 200:
        # Insert a copy so the caller's dict does not gain an '_id' key.
        ERRORLIST.insert_one(dict(form))
        print(u"API 返回 403")
        return 0
    try:
        data = jsondata.json().get('data')
    except ValueError:
        ERRORLIST.insert_one(dict(form))
        return 0
    if not isinstance(data, dict) or not data:
        # On failure 'data' holds a message string (for example
        # "服务器遇到了一些问题") instead of the user record; '$set' needs a
        # non-empty dict, so anything else is treated as an error.
        print(data)
        ERRORLIST.insert_one(dict(form))
        return 0
    ACCOUNTLIST.update_one(form, {'$set': data}, upsert=True)
    print(form)
    return 1
def taginfo(tid):
    '''Fetch the description of one tag and upsert it into TAGLIST.

    Args:
        tid: tag id as an int or numeric string.

    Returns:
        404 when ``tid`` is empty or zero; otherwise None. When the API
        answers with a non-zero ``code`` or without a usable ``result``, the
        tag id is recorded in ERRORLIST instead.
    '''
    if not tid:
        return 404
    tid = int(tid)
    tagparams = {
        'id': tid,
        'jsonp': 'jsonp'
    }
    info = requests.get(url=API_TAGINFO, params=tagparams).json()
    result = info.get('result')
    # '$set' requires a non-empty dict; treat a missing result as an error.
    if info.get('code') != 0 or not isinstance(result, dict) or not result:
        ERRORLIST.insert_one({'tag_id': tid})
        return None
    print(result)
    # update_one replaces the deprecated Collection.update; the upsert keyed
    # on tag_id keeps a single document per tag.
    TAGLIST.update_one({'tag_id': tid}, {'$set': result}, upsert=True)
    return None