├── README.md
└── spiders
    ├── db.py
    ├── cookies_pool.py
    ├── start.py
    ├── author_follow.py
    ├── author_room.py
    ├── tag.py
    ├── rank_add.py
    ├── utils.py
    ├── add_public_video.py
    ├── video_data_spider.py
    ├── author_data_spider.py
    └── biliob.py

/README.md:
--------------------------------------------------------------------------------
Once all environment variables are configured and the required packages are installed:

``` bash
cd spiders
python3 start.py
```
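The spiders read their configuration from environment variables: `MONGO_URL` and `DB_NAME` (see `spiders/db.py`), plus `PYOXIES_URL`, the proxy-pool service queried in `spiders/biliob.py`. A minimal sketch — every value below is a placeholder for your own deployment:

``` bash
export MONGO_URL="mongodb://localhost:27017"  # placeholder
export DB_NAME="biliob"                       # placeholder database name
export PYOXIES_URL="http://localhost:8000"    # placeholder proxy-pool endpoint
```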
--------------------------------------------------------------------------------
/spiders/db.py:
--------------------------------------------------------------------------------
from pymongo import MongoClient
import motor.motor_tornado
import os

env_dist = os.environ

client = MongoClient(env_dist.get("MONGO_URL"))
db = client[env_dist.get("DB_NAME")]  # handle to the database
async_client = motor.motor_tornado.MotorClient(
    env_dist.get("MONGO_URL"))

async_db = async_client[env_dist.get("DB_NAME")]
--------------------------------------------------------------------------------
/spiders/cookies_pool.py:
--------------------------------------------------------------------------------
# Bilibili session cookies used to authenticate API requests. The CookiesPool
# helpers in author_room.py and author_data_spider.py rotate through this list
# when a cookie gets banned.
cookies_pool = ["JSESSIONID=36B4E30CDFE059235541CC81C91743A3; bili_jct=520c3f01daf4d9732fdeb9d95586612c; SESSDATA=4b484445%2C1616518159%2Ca3c73*91; DedeUserID=416679427; sid=7r9xx2ji; buvid3=CDCFA651-49CB-4A84-B114-D107F6AD0B2E138364infoc; DedeUserID__ckMd5=a64ed305aa816673; _uuid=A60FD75F-122D-851F-7994-2DD11583170748070infoc; finger=158939783",
                "JSESSIONID=F226418244D87B8BA9576F6925A77DEE; bili_jct=39e4699e907b32e2102d5bf649efc608; SESSDATA=ff19c0a2%2C1616518648%2Ca1be7*91; DedeUserID=416679530; sid=8ud5j0dx; buvid3=CDCFA651-49CB-4A84-B114-D107F6AD0B2E138364infoc; DedeUserID__ckMd5=a86b9ac2fc85db23; _uuid=98868A14-C781-7EF4-B8AB-CBF11A860D7B41053infoc; finger=158939783"]
--------------------------------------------------------------------------------
/spiders/start.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# encoding=utf-8
import os
import psutil
import schedule
import datetime
from time import sleep


def find_procs_by_name(name):
    """Return a list of PIDs whose command line contains 'name'."""
    ls = []
    for process in psutil.process_iter():
        try:
            for each in process.cmdline():
                if name in each:
                    ls.append(process.pid)
                    break
        except Exception:
            pass
    return ls


def delete_by_name(name):
    for pid in find_procs_by_name(name):
        os.kill(pid, 9)


spiders = [
    'add_public_video.py',
    'author_follow.py',
    'author_data_spider.py',
    'rank_add.py',
    'tag.py',
]


def check():
    for each_spider in spiders:
        if len(find_procs_by_name(each_spider)) == 0:
            run_spider(each_spider)


def run_spider(spider):
    print('[{}] Restarting {}'.format(datetime.datetime.utcnow() +
                                      datetime.timedelta(hours=8), spider))
    delete_by_name(spider)
    cmd = 'nohup python3 {} 1>{}.log 2>&1 &'.format(spider, spider)
    os.system(cmd)


schedule.every(10).seconds.do(check)
for each_spider in spiders:
    run_spider(each_spider)
while True:
    schedule.run_pending()
    sleep(10)
--------------------------------------------------------------------------------
/spiders/author_follow.py:
--------------------------------------------------------------------------------
from time import sleep
from db import db
from biliob import BiliobSpider
import logging


class BiliobAuthorFollowSpider(BiliobSpider):

    def __init__(self):
        super().__init__("Author Follow Spider")
        self.except_content_type = 'application/json'

    async def gen_url(self):
        ps = 50
        mid = 0
        pn_list = [1, 2, 3]
        url = 'http://api.bilibili.com/x/relation/followings?vmid={mid}&pn={pn}&ps={ps}'
        while True:
            try:
                authors = db.author.find(
                    {'mid': {'$gt': mid}},
                    {'mid': 1}).limit(30)
                flag = False
                for each_author in authors:
                    flag = True
                    mid = each_author['mid']  # advance the cursor
                    for pn in pn_list:
                        yield url.format(mid=each_author['mid'], pn=pn, ps=ps)
                if not flag:
                    # reached the end of the collection; start over
                    mid = 0
            except Exception as e:
                logging.exception(e)
                sleep(10)

    async def parse(self, res):
        try:
            j = res.json_data
            item = {
                'mid': int(str(res.url).split('?')[1].split('&')[0].split('=')[1]),
                'follows': []
            }
            for each_member in j['data']['list']:
                item['follows'].append(each_member['mid'])
        except Exception as e:
            self.logger.exception(e)
            return None
        return item

    async def save(self, item):
        if item is None:
            return 0
        db.author.update_one({'mid': item['mid']}, {'$addToSet': {
            'follows': {'$each': item['follows']}
        }})
        return 1


if __name__ == "__main__":
    s = BiliobAuthorFollowSpider()
    s.run()
--------------------------------------------------------------------------------
/spiders/author_room.py:
--------------------------------------------------------------------------------
import datetime
from biliob import BiliobSpider
from cookies_pool import cookies_pool


class CookiesPool():
    def __init__(self):
        self.cookies_pool = cookies_pool
        self.cookies_pool_index = 0
        self.__c = self.__cookies_gener()

    def get_cookies(self):
        return next(self.__c)

    def __cookies_gener(self):
        l = len(self.cookies_pool)
        while True:
            # cookies_pool_index is bumped externally when a cookie gets banned
            yield self.cookies_pool[self.cookies_pool_index % l]


class BiliOLiveRoomSpider(BiliobSpider):
    def __init__(self):
        super().__init__("Author Live Spider", 0.1, 8)
        self.cookies_pool = CookiesPool()
        self.except_content_type = 'application/json'
        self.use_proxy = True
        self.retry = 3

    async def gen_url(self):
        while True:
            authors = self.async_db.author.find({'forceFocus': True}, {'mid': 1}).sort(
                [('biliob.live_update', 1)]).limit(100).batch_size(100)
            async for author in authors:
                yield author['mid']

    async def parse(self, mid):
        try:
            self.headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.8 Safari/537.36',
                'cookie': self.cookies_pool.get_cookies()
            }
            res = await self.get(f'http://api.bilibili.com/x/space/acc/info?mid={mid}')
            if res is None or res.json_data is None:
                return None
            if 'data' not in res.json_data or 'live_room' not in res.json_data['data']:
                return (mid, None)
            data = res.json_data['data']['live_room']
            return (mid, data)
        except Exception as e:
            self.logger.exception(e)
            return None

    async def save(self, item):
        try:
            if item is None:
                return 0
            mid, data = item
            await self.async_db.author.update_one({'mid': mid}, {
                '$set': {'live_room': data, 'biliob.live_update': datetime.datetime.utcnow()}})
            return item
        except Exception as e:
            self.logger.exception(e)


if __name__ == "__main__":
    s = BiliOLiveRoomSpider()
    s.run()
--------------------------------------------------------------------------------
/spiders/tag.py:
--------------------------------------------------------------------------------
from biliob import BiliobSpider
import logging
from utils import enc, dec
import asyncio

# The tag endpoint accepts either id form:
#   https://api.bilibili.com/x/tag/archive/tags?aid=7
#   https://api.bilibili.com/x/tag/archive/tags?bvid=1at411K7R1


class BiliobTagSpider(BiliobSpider):
    def __init__(self):
        super().__init__(name="TAG",
                         interval=0, concurrency=4)

    async def gen_url(self):
        while True:
            try:
                last_tag = await self.async_db.video_info.find_one(
                    {'tag': {'$exists': True}},
                    {'aid': 1, 'bvid': 1}, sort=[('_id', -1)])
                # resume after the most recently tagged document, or start
                # from the beginning if nothing has been tagged yet
                query = {'_id': {'$gt': last_tag['_id']}} if last_tag else {}
                videos = self.async_db.video_info.find(query, {
                    'aid': 1, 'bvid': 1}).limit(30)
                flag = 0
                async for each_video in videos:
                    flag = 1
                    if 'bvid' in each_video:
                        bvid = each_video['bvid']
                    else:
                        bvid = enc(each_video['aid']).lstrip('BV')
                    yield 'https://api.bilibili.com/x/tag/archive/tags?bvid={}'.format(bvid)
                if flag == 0:
                    await asyncio.sleep(1)
            except Exception as e:
                logging.exception(e)

    async def parse(self, res):
        try:
            if res is None or res.json_data is None or res.json_data['data'] is None:
                return None
            bvid = res.url.query['bvid']
            item = {
                'bvid': bvid,
                'aid': dec('BV' + bvid),
                'tag_list': [each['tag_name'] for each in res.json_data['data']],
            }
            return item
        except Exception as e:
            self.logger.exception(e)
            return None

    async def save(self, item):
        if item is None:
            return 0
        if item['tag_list'] == []:
            # mark as fetched even when the video carries no tags
            item['tag_list'] = [None]
        if await self.async_db.video_info.find_one({'bvid': item['bvid']}, {'bvid': 1}) is not None:
            try:
                await self.async_db.video_info.update_one({
                    'bvid': item['bvid']
                }, {
                    '$set': {
                        'aid': item['aid'],
                        'tag': item['tag_list']
                    }
                }, upsert=True)
            except Exception:
                pass
        else:
            await self.async_db.video_info.update_one({
                'aid': item['aid']
            }, {
                '$set': {
                    'bvid': item['bvid'],
                    'tag': item['tag_list']
                }
            }, upsert=True)
        return len(item['tag_list'])


s = BiliobTagSpider()

if __name__ == "__main__":
    s.run()
--------------------------------------------------------------------------------
/spiders/rank_add.py:
--------------------------------------------------------------------------------
import requests
import asyncio
from db import db
import datetime
from biliob import BiliobSpider


def update_interval(interval: int, key: str, value):
    # 'next' stays in plain UTC because the schedulers in biliob.py compare it
    # against datetime.utcnow(); 'date' keeps the UTC+8 display convention
    # used elsewhere in this repo.
    now = datetime.datetime.utcnow()
    return {
        'next': now,
        'date': now + datetime.timedelta(hours=8),
        'interval': interval,
        key: value,
    }


def update_video_interval(interval: int, bvid, aid):
    now = datetime.datetime.utcnow()
    return {
        'next': now,
        'date': now,
        'interval': interval,
        'bvid': bvid,
        'aid': aid
    }


class BiliobRankAdd(BiliobSpider):
    def __init__(self):
        super().__init__("Ranking Board Spider", interval=0, concurrency=8, use_proxy=False)
        self.except_content_type = 'application/json'

    async def gen_url(self):
        while True:
            try:
                online_url = 'http://api.bilibili.com/x/web-interface/online'
                url = 'http://api.bilibili.com/x/web-interface/ranking?rid={}&day=1&type=1&arc_type=0'
                self.logger.info("Get From Online")
                online_data = requests.get(online_url).json()
                rids = online_data['data']['region_count'].keys()
                for rid in rids:
                    self.logger.info(f"Crawl rid: {rid}")
                    yield url.format(rid)
                await asyncio.sleep(86400)  # refresh the ranking board once a day
            except Exception as e:
                self.logger.exception(e)

    async def parse(self, res):
        if res is None:
            return None
        j = res.json_data
        data = j['data']
        items = []
        for video_info in data['list']:
            aid = video_info['aid'] if 'aid' in video_info else None
            bvid = video_info['bvid']
            mid = video_info['mid']
            item = {
                'bvid': bvid.lstrip("BV"),
                'aid': int(aid) if aid is not None else None,
                'mid': int(mid)
            }
            items.append(item)
        return items

    async def save(self, items):
        if items is None:
            return 0
        for item in items:
            author = db.author_interval.find_one({'mid': item['mid']})
            if author is None or author['interval'] > 3600:
                db.author_interval.update_one({
                    'mid': item['mid']},
                    {
                        '$set': update_interval(3600 * 12, 'mid', item['mid'])
                }, upsert=True)
            if 'bvid' in item and item['bvid'] is not None:
                filter_dict = {'bvid': item['bvid']}
            else:
                filter_dict = {'aid': item['aid']}

            self.db.video_interval.update_one(filter_dict,
                                              {
                                                  '$set': update_video_interval(3600 * 12, item['bvid'], item['aid'])
                                              }, upsert=True)
        return len(items)


s = BiliobRankAdd()
if __name__ == "__main__":
    s.run()
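
# For reference, the interval documents seeded above drive all scheduling:
# the generators in biliob.py poll for records whose 'next' timestamp has
# passed, crawl them, then push 'next' forward by 'interval' seconds.
# A video_interval document as this spider writes it (illustrative values):
#
#     {
#         'aid': 170001,           # numeric av id (sample value)
#         'bvid': '17x411w7KC',    # BV id, stored without the 'BV' prefix
#         'date': <utcnow>,        # when the record was (re)seeded
#         'next': <utcnow>,        # due immediately; advanced after each crawl
#         'interval': 3600 * 12,   # 12 h between refetches for ranked videos
#     }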
--------------------------------------------------------------------------------
/spiders/utils.py:
--------------------------------------------------------------------------------
# Maps Bilibili sub-channel names to their parent channel. The keys mirror the
# names returned by the Bilibili API, so they stay in Chinese.
sub_channel_2_channel = {
    'ASMR': '生活',
    'GMV': '游戏',
    'Korea相关': '娱乐',
    'MAD·AMV': '动画',
    '美食制作': '美食',
    '美食侦探': '美食',
    '美食记录': '美食',
    '田园美食': '美食',
    '美食测评': '美食',
    'MMD·3D': '动画',
    'Mugen': '游戏',
    'OP/ED/OST': '音乐',
    'VOCALOID·UTAU': '音乐',
    '三次元舞蹈': '舞蹈',
    '三次元音乐': '音乐',
    '人力VOCALOID': '鬼畜',
    '人文·历史': '纪录片',
    '健身': '时尚',
    '其他': '生活',
    '其他国家': '电影',
    '军事': '纪录片',
    '动物圈': '生活',
    '华语电影': '电影',
    '单机游戏': '游戏',
    '原创音乐': '音乐',
    '国产剧': '电视剧',
    '国产动画': '国创',
    '国产原创相关': '国创',
    '宅舞': '舞蹈',
    '完结动画': '番剧',
    '官方延伸': '番剧',
    '布袋戏': '国创',
    '广告': '广告',
    '影视剪辑': '影视',
    '影视杂谈': '影视',
    '手工': '生活',
    '手机游戏': '游戏',
    '搞笑': '生活',
    '教程演示': '鬼畜',
    '数码': '数码',
    '日常': '生活',
    '明星': '娱乐',
    '星海': '科技',
    '服饰': '时尚',
    '机械': '科技',
    '桌游棋牌': '游戏',
    '欧美电影': '电影',
    '汽车': '科技',
    '海外剧': '电视剧',
    '演奏': '音乐',
    '演讲·公开课': '科技',
    '特摄': '影视',
    '电子竞技': '游戏',
    '短片': '影视',
    '短片·手书·配音': '动画',
    '社会·美食·旅行': '纪录片',
    '科学·探索·自然': '纪录片',
    '绘画': '生活',
    '综艺': '娱乐',
    '网络游戏': '游戏',
    '美妆': '时尚',
    '美食圈': '生活',
    '翻唱': '音乐',
    '舞蹈教程': '舞蹈',
    '资讯': '国创',
    '趣味科普人文': '科技',
    '运动': '生活',
    '连载动画': '番剧',
    '野生技术协会': '科技',
    '音MAD': '鬼畜',
    '音乐选集': '音乐',
    '音游': '游戏',
    '预告 资讯': '影视',
    '预告·资讯': '影视',
    '单机联机': '游戏',
    '鬼畜调教': '鬼畜',
    '演讲• 公开课': '科技',
    '国产电影': '电影',
    '日本电影': '电影',
    '番剧': '番剧',
    '国创': '国创',
    '鬼畜': '鬼畜',
    '电视剧': '电视剧',
    '动画': '动画',
    '时尚': '时尚',
    '娱乐': '娱乐',
    '电影': '电影',
    '舞蹈': '舞蹈',
    '舞蹈综合': '舞蹈',
    '科技': '科技',
    '生活': '生活',
    '音乐': '音乐',
    '纪录片': '纪录片',
    '手机平板': '数码',
    '电脑装机': '数码',
    '影音智能': '数码',
    '摄影摄像': '数码',
    '风尚标': '时尚',
    '电音': '音乐',
    '音乐综合': '音乐',
    'MV': '音乐',
    '音乐现场': '音乐',
    '游戏': '游戏',
    'T台': '时尚',
    '动态漫·广播剧': '国创',
    '明星舞蹈': '舞蹈',
    '街舞': '舞蹈',
    '中国舞': '舞蹈',
    '社科人文': '知识',
    '热点': '资讯',
    '环球': '资讯',
    '社会': '资讯',
    '综合': '',
    '科学科普': '知识',
    '职业职场': '知识',
    '财经': '知识',
    '校园学习': '知识',
    '手办·模玩': '动画',
}


# Base-58 codec between av numbers (aid) and BV ids.
table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
tr = {}
for i in range(58):
    tr[table[i]] = i
s = [11, 10, 3, 8, 4, 6]
xor = 177451812
add = 8728348608


def dec(x):
    """Full 'BV...' string -> aid; returns '' on malformed input."""
    try:
        r = 0
        for i in range(6):
            r += tr[x[s[i]]] * 58 ** i
        return (r - add) ^ xor
    except Exception:
        return ""


def enc(x):
    """aid -> full 'BV...' string; returns -1 on malformed input."""
    try:
        x = (x ^ xor) + add
        # 12-character template; the positions listed in s are filled below
        r = list('BV1  4 1 7  ')
        for i in range(6):
            r[s[i]] = table[x // 58 ** i % 58]
        return ''.join(r)
    except Exception:
        return -1
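
# Round-trip sanity check using the widely circulated test pair for this
# algorithm (av170001 <-> BV17x411w7KC). enc() returns the full BV string;
# the spiders store bvids with the leading 'BV' stripped.
if __name__ == "__main__":
    assert enc(170001) == 'BV17x411w7KC'
    assert dec('BV17x411w7KC') == 170001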
--------------------------------------------------------------------------------
/spiders/add_public_video.py:
--------------------------------------------------------------------------------
from utils import enc, dec
from biliob import BiliobSpider
from datetime import datetime
import asyncio

url = 'http://api.bilibili.com/x/space/arc/search?mid={}&ps=20&tid=0&pn=1&keyword=&order=pubdate'


class AddPublicVideoSpider(BiliobSpider):
    def __init__(self):
        super().__init__(name='Add Public Video Spider', interval=0.1, concurrency=16)
        self.except_content_type = 'application/json'

    async def gen_url(self):
        while True:
            try:
                cursor = self.async_db.author.find({'$or': [{'cFans': {'$gt': 100000}}, {'forceFocus': True}]},
                                                   {'mid': 1}).sort([('biliob.video_update', 1)]).limit(1000).batch_size(1000)
                async for each_author in cursor:
                    mid = each_author['mid']
                    yield url.format(mid)
            except Exception as e:
                self.logger.exception(e)
            finally:
                await asyncio.sleep(1)

    async def parse(self, res):
        try:
            if res is None:
                return None
            j = res.json_data
            result = []
            if j is None or 'data' not in j or j['data'] is None or 'list' not in j['data'] or 'vlist' not in j['data']['list']:
                return None
            if len(j['data']['list']['vlist']) == 0:
                return None
            mid = int(res.url.query['mid'])
            channels = j['data']['list']['tlist']
            channel_list = list(channels.values())
            count = sum(map(lambda d: d['count'], channel_list))
            # an author whose dominant channel covers more than 70 % of their
            # uploads gets that channel; otherwise they are tagged as mixed
            main_channel = '复合'
            for channel in channel_list:
                channel['rate'] = float(channel['count'] / count)
                if channel['rate'] > 0.7:
                    main_channel = channel['name']
            await self.async_db.author.update_one(
                {'mid': mid}, {'$set': {'channels': channels, 'main_channel': main_channel, 'channel_list': channel_list}})
            for each_video in j['data']['list']['vlist']:
                bvid = each_video.get('bvid')
                aid = each_video.get('aid')
                result.append([bvid, aid, mid])
            return result
        except Exception as e:
            self.logger.exception(e)
            return None

    def update_video_interval(self, interval: int, aid, bvid):
        # fill in whichever id is missing: dec() turns a BV id into an aid,
        # enc() turns an aid into a BV id
        if aid is None:
            aid = dec('BV' + bvid)
        if bvid is None:
            bvid = enc(aid).lstrip('BV')
        return {
            'next': datetime.utcfromtimestamp(0),
            'interval': interval,
            'aid': aid,
            'bvid': bvid
        }

    async def save(self, items):
        if items is None:
            return 0
        count = 0
        mid = None
        for (bvid, aid, mid) in items:
            if bvid is not None:
                bvid = bvid.lstrip('BV')
            interval_data = await self.async_db.video_interval.find_one(
                {'bvid': bvid, 'aid': aid})
            if interval_data is None:
                if await self.async_db.video_interval.find_one({'aid': aid}) is not None:
                    continue
                await self.async_db.video_interval.update_one(
                    {'bvid': bvid, 'aid': aid},
                    {'$set': self.update_video_interval(3600 * 24, aid, bvid),
                     '$setOnInsert': {'date': datetime.utcnow()}}, upsert=True)
            count += 1
        if mid is not None:
            await self.async_db.author.update_one({'mid': mid}, {'$set': {'biliob.video_update': datetime.utcnow()}}, upsert=True)
        return count


s = AddPublicVideoSpider()


if __name__ == "__main__":

    s.run()
--------------------------------------------------------------------------------
/spiders/video_data_spider.py:
--------------------------------------------------------------------------------
from db import db
import datetime
from biliob import BiliobSpider
from utils import sub_channel_2_channel


class BiliobNewVideoSpider(BiliobSpider):
    def __init__(self):
        super().__init__("New Video Spider")
        self.except_content_type = 'application/json'

    async def gen_url(self):
        url = 'http://api.bilibili.com/x/web-interface/view?bvid={}'
        # video_gen is an async generator (see biliob.py), so it must be
        # consumed with async for
        async for each_video in self.video_gen():
            yield url.format(each_video['bvid'])

    async def parse(self, res):
        try:
            r = res.json_data
        except Exception:
            return None
        bvid = str(res.url).split('bvid=')[1]
        if r['code'] == -404 or 'data' not in r:
            self.db.video.update_one({'bvid': bvid}, {'$set': {'deleted': True}})
            return None
        d = r["data"]
        if 'aid' in d['stat']:
            aid = d['stat']['aid']
        else:
            aid = None
        bvid = d['bvid'].lstrip('BV')
        author = d['owner']['name']
        mid = d['owner']['mid']
        view = d['stat']['view']
        favorite = d['stat']['favorite']
        danmaku = d['stat']['danmaku']
        coin = d['stat']['coin']
        share = d['stat']['share']
        like = d['stat']['like']
        reply = d['stat']['reply']
        current_date = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
        # composite score = coin*0.4 + favorite*0.3 + danmaku*0.4 + reply*0.4
        #                 + view*0.25 + like*0.4 + share*0.6
        jannchie = int(coin * 0.4 + favorite * 0.3 + danmaku * 0.4 +
                       reply * 0.4 + view * 0.25 + like * 0.4 + share * 0.6)
        data = {
            'view': view,
            'favorite': favorite,
            'danmaku': danmaku,
            'coin': coin,
            'share': share,
            'like': like,
            'reply': reply,
            'jannchie': jannchie,
            'datetime': current_date
        }

        subChannel = d['tname']
        title = d['title']
        date = d['pubdate']
        tid = d['tid']
        pic = d['pic']
        item = {}
        item['current_view'] = view
        item['current_favorite'] = favorite
        item['current_danmaku'] = danmaku
        item['current_coin'] = coin
        item['current_share'] = share
        item['current_reply'] = reply
        item['current_like'] = like
        item['current_datetime'] = current_date
        item['current_jannchie'] = jannchie
        item['aid'] = aid
        item['mid'] = mid
        item['pic'] = pic
        item['author'] = author
        item['bvid'] = bvid
        item['data'] = data
        item['title'] = title
        item['subChannel'] = subChannel
        item['datetime'] = date

        # '资讯' and '综合' are ambiguous sub-channels reused by several parent
        # channels, so they are disambiguated by tid before the generic lookup
        if subChannel == '资讯':
            if tid == 51:
                item['channel'] = '番剧'
            elif tid == 170:
                item['channel'] = '国创'
            elif tid == 159:
                item['channel'] = '娱乐'
            else:
                item['channel'] = '资讯'
        elif subChannel == '综合':
            if tid == 1:
                item['channel'] = '动画'
            else:
                item['channel'] = '资讯'
        elif subChannel != '':
            if subChannel not in sub_channel_2_channel:
                item['channel'] = ''
                self.logger.fatal(subChannel)
            else:
                item['channel'] = sub_channel_2_channel[subChannel]
        else:
            item['channel'] = None

        url_list = str(res.url).split('&')
        if len(url_list) == 2:
            item['object_id'] = url_list[1]
        else:
            item['object_id'] = None
        return item

    async def save(self, item):
        if item['aid'] is not None:
            data_filter = {'aid': item['aid']}
        else:
            data_filter = {'bvid': item['bvid']}
        db['video'].update_one(data_filter, {
            '$set': {
                'cView': item['current_view'],
                'cFavorite': item['current_favorite'],
                'cDanmaku': item['current_danmaku'],
                'cCoin': item['current_coin'],
                'cShare': item['current_share'],
                'cLike': item['current_like'],
                'cReply': item['current_reply'],
                'cJannchie': item['current_jannchie'],
                'cDatetime': item['current_datetime'],
                'author': item['author'],
                'subChannel': item['subChannel'],
                'channel': item['channel'],
                'bvid': item['bvid'],
                'mid': item['mid'],
                'pic': item['pic'],
                'title': item['title'],
                'datetime': datetime.datetime.fromtimestamp(
                    item['datetime'])
            },
            '$push': {
                'data': {
                    '$each': [item['data']],
                    '$position': 0
                }
            }
        }, True)
        return 1


s = BiliobNewVideoSpider()
if __name__ == "__main__":
    s.run()
--------------------------------------------------------------------------------
/spiders/author_data_spider.py:
--------------------------------------------------------------------------------
from db import db
import datetime
from biliob import BiliobSpider
import asyncio
from cookies_pool import cookies_pool


class CookiesPool():
    def __init__(self):
        self.cookies_pool = cookies_pool
        self.cookies_pool_index = 0
        self.__c = self.__cookies_gener()

    def get_cookies(self):
        return next(self.__c)

    def __cookies_gener(self):
        l = len(self.cookies_pool)
        while True:
            # cookies_pool_index is bumped externally when a cookie gets banned
            yield self.cookies_pool[self.cookies_pool_index % l]


class BiliOBAuthorDataSpider(BiliobSpider):
    async def reset_interval(self, reason="task failed", mid=0):
        self.logger.warning("{}: {}".format(reason, mid))
        return None

    def __init__(self):
        super().__init__("Author Data Spider", 0.1, 8)
        self.cookies_pool = CookiesPool()

        self.except_content_type = 'application/json'
        self.use_proxy = True
        self.retry = 3

        self.crawl_like_and_count = True

    async def gen_url(self):
        async for each in self.mid_gener():
            yield each
            await asyncio.sleep(0)

    async def parse(self, mid):
        try:
            self.headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.8 Safari/537.36',
                'cookie': self.cookies_pool.get_cookies()
            }
            url = 'http://api.bilibili.com/x/web-interface/card?mid={}&jsonp=jsonp'
            try:
                res = await self.get(url.format(mid))
                if res is None:
                    return await self.reset_interval("failed to fetch card info", mid)
                j = res.json_data
            except Exception:
                return await self.reset_interval("failed to parse card info", mid)
            # the author no longer exists: drop the schedule record
            if j['code'] == -400 or j['code'] == -404:
                self.db.author_interval.delete_one({'mid': mid})
                self.logger.warning(j)
                return None
            if 'code' in j and j['code'] == -412:
                return await self.reset_interval("card request banned", mid)
            if 'data' not in j or j['data'] is None or j['data'].get('card') is None:
                saved_data = db['author'].find_one({'mid': mid})
                if saved_data is None or 'data' not in saved_data:
                    db['author_interval'].delete_one({'mid': mid})
                return await self.reset_interval("card info missing", mid)
            name = j['data']['card']['name']
            if mid != int(j['data']['card']['mid']):
                return await self.reset_interval("response looks cached", mid)
            sex = j['data']['card']['sex']
            face = j['data']['card']['face']
            level = j['data']['card']['level_info']['current_level']
            official = j['data']['card']['Official']['title']
            archive = j['data']['archive_count']
            article = j['data']['article_count']
            fans = j['data']['follower']
            attention = j['data']['card']['attention']
            item = {}
            item['mid'] = int(mid)
            item['name'] = name
            item['face'] = face
            item['official'] = official
            item['sex'] = sex
            item['level'] = int(level)
            item['data'] = {
                'fans': int(fans),
                'attention': int(attention),
                'archive': int(archive),
                'article': int(article),
                'datetime': datetime.datetime.utcnow() + datetime.timedelta(hours=8)
            }
            item['c_fans'] = int(fans)
            item['c_attention'] = int(attention)
            item['c_archive'] = int(archive)
            item['c_article'] = int(article)

            if self.crawl_like_and_count:
                try:
                    view_data_res = await self.get(
                        "{}://api.bilibili.com/x/space/upstat?mid={}".format('http', mid))
                    if view_data_res is None:
                        return await self.reset_interval("failed to fetch view/like stats", mid)
                    j = view_data_res.json_data
                    if 'code' in j and j['code'] == -412:
                        # rotate to the next cookie before giving up
                        self.cookies_pool.cookies_pool_index += 1
                        return await self.reset_interval("view/like request banned", mid)
                except Exception:
                    return await self.reset_interval("failed to parse view/like stats", mid)
                archive_view = j['data']['archive']['view']
                article_view = j['data']['article']['view']
                like = j['data']['likes']
                item['data']['archiveView'] = archive_view
                item['data']['articleView'] = article_view
                item['data']['like'] = like
                item['c_like'] = like
                item['c_archive_view'] = int(archive_view)
                item['c_article_view'] = int(article_view)

            # fan growth rate: extrapolate the change since the most recent
            # sample at least one day old to a per-day figure, e.g. +500 fans
            # over 12 h -> c_rate = 1000
            now = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
            last_data = self.db.author_data.find_one(
                {'mid': item['mid'], 'datetime': {'$lt': now - datetime.timedelta(1)}},
                sort=[('datetime', -1)])
            if last_data is None:
                last_data = self.db.author_data.find_one(
                    {'mid': item['mid']})
                if last_data is not None:
                    item['c_rate'] = item['data']['fans'] - last_data['fans']
                else:
                    item["c_rate"] = 0
            else:
                delta_seconds = now.timestamp() - last_data['datetime'].timestamp()
                delta_fans = item['data']['fans'] - last_data['fans']
                item['c_rate'] = int(delta_fans / delta_seconds * 86400)
            return item
        except Exception as e:
            self.logger.exception(e)
            return await self.reset_interval("parse failed", mid)

    async def save(self, item):
        try:
            if item is None:
                return 0
            mid = item['mid']
            s = {
                'focus': True,
                'sex': item['sex'],
                'name': item['name'],
                'face': item['face'],
                'level': item['level'],
                'cFans': item['c_fans'],
                'cRate': item['c_rate'],
                'cLike': item['c_like'],
                'cArchive_view': item['c_archive_view'],
                'cArticle_view': item['c_article_view'],
                'cArchive': item['c_archive'],
                'cArticle': item['c_article'],
                'official': item['official'],
                'data': item['data']
            }
            await self.async_db.author.update_one({
                'mid': item['mid']
            }, {
                '$set': s
            }, True)

            item['data']['mid'] = item['mid']
            await self.async_db.author_data.replace_one(
                {'mid': item['data']['mid'],
                 'datetime': item['data']['datetime']}, item['data'], upsert=True)
            await self.update_author_interval_by_mid(mid)
            return item
        except Exception as e:
            self.logger.exception(e)
            await self.reset_interval("save failed", item['mid'])


if __name__ == "__main__":
    s = BiliOBAuthorDataSpider()
    s.run()
--------------------------------------------------------------------------------
/spiders/biliob.py:
--------------------------------------------------------------------------------
import socket
import asyncio
from db import async_db, db
import datetime
from simpyder.spiders import AsynSpider
from simpyder import FAKE_UA
from utils import enc
from utils import dec
import os

env_dist = os.environ


class BiliobSpider(AsynSpider):

    async def gen_proxy(self):
        url = env_dist['PYOXIES_URL']
        while True:
            try:
                async with self.sem_gen_proxy:
                    res = await self.get(f"{url}/proxies", proxy=None)
                    proxies = await res.json()
                    self.logger.info(
                        f"Get Proxies From: {url}/proxies")
                    for proxy in proxies['proxies']:
                        self.logger.info(
                            f"Get Proxies : {proxy}")
                        yield proxy
                await asyncio.sleep(1)
            except Exception as e:
                self.logger.exception(e)

    def __init__(self, name="BiliOB Spider", interval=1.5, concurrency=4, use_proxy=True):
        self.sem_gen_proxy = asyncio.Semaphore(1)
        super().__init__(name=name, user_agent=FAKE_UA,
                         interval=interval, concurrency=concurrency)
        self.db = db
        self.async_db = async_db
        self.hostname = socket.gethostname()
        self.use_proxy = use_proxy

    async def mid_gener(self):
        last_data = set()
        while True:
            try:
                data = []
                mc = self.async_db.author_interval.find(
                    {'next': {'$lt': datetime.datetime.utcnow()}}).sort([('next', 1)]).limit(100)
                async for d in mc:
                    data.append(d)
                    # mark manually ordered refreshes as executed
                    if 'order' in d:
                        await self.async_db.user_record.update_many({'_id': {'$in': d['order']}}, {'$set': {
                            'isExecuted': True
                        }})
                tmp_set = set()
                for each_data in data:
                    if 'mid' not in each_data:
                        await self.async_db.author_interval.delete_one({'mid': None})
                        self.logger.warning('removed author_interval record without mid')
                        continue
                    if each_data['mid'] not in last_data:
                        yield each_data['mid']
                    tmp_set.add(each_data['mid'])
                last_data = tmp_set
                if not data:
                    # nothing due yet; avoid a busy loop
                    await asyncio.sleep(1)
            except Exception as e:
                self.logger.exception(e)

    async def video_gen_without_lock(self):
        last_data = set()
        batch = 2000
        while True:
            try:
                d = []
                data = self.async_db.video_interval.find(
                    {'next': {'$lte': datetime.datetime.utcnow()}}).sort([('next', 1)]).hint("idx_next").limit(batch)
                async for each in data:
                    if 'aid' not in each and each.get('bvid'):
                        each['aid'] = dec('BV' + each['bvid'].lstrip('BV'))
                    elif ('bvid' not in each or each['bvid'] == '') and 'aid' in each:
                        each['bvid'] = enc(each['aid']).lstrip('BV')
                    elif ('aid' in each and 'bvid' in each
                          and each['aid'] is not None
                          and not isinstance(each['aid'], str)
                          and each['aid'] > 0):
                        pass
                    else:
                        # record carries no usable id; drop it
                        await self.async_db.video_interval.delete_one({'_id': each['_id']})
                        continue
                    d.append(each)
                for data in d:
                    if 'aid' not in data:
                        continue
                    if data['aid'] not in last_data:
                        last_data.add(data['aid'])
                        yield data
                last_data = set()

                if len(d) < batch / 2:
                    await asyncio.sleep(10)
            except Exception as e:
                self.logger.exception(e)

    async def update_author_interval_by_mid(self, mid):
        interval_data = await self.async_db.author_interval.find_one({'mid': mid})
        if interval_data is None:
            return
        self.logger.debug(f"Updating {mid}")
        await self.update_author_interval(interval_data)

    async def update_author_interval(self, interval_data):
        try:
            if 'interval' in interval_data:
                interval = interval_data['interval']
            else:
                interval = 86400
                interval_data['interval'] = interval
            interval_data['next'] = datetime.datetime.utcnow() + \
                datetime.timedelta(seconds=interval)
            interval_data['order'] = []
            await self.async_db.author_interval.update_one(
                {'mid': interval_data['mid']}, {'$set': interval_data})
        except Exception as e:
            self.logger.exception(e)

    async def total_video_gen(self):
        aid = 55157643
        while True:
            self.logger.info(f'now: {aid}')
            c = self.async_db.video.find({'aid': {"$gt": aid}}, {
                'aid': 1}).sort([('aid', 1)]).limit(100)
            async for doc in c:
                aid = doc['aid']
                yield aid

    async def video_gen(self):
        while True:
            # spin while another worker holds the lock
            if await self.async_db.lock.count_documents({"name": "video_interval"}):
                await asyncio.sleep(0.1)
                continue
            # take the lock
            await self.async_db.lock.insert_one(
                {"name": "video_interval", "date": datetime.datetime.utcnow()})
            d = []
            try:
                cursor = self.async_db.video_interval.find(
                    {'order': {'$exists': True, '$ne': []}}).hint("idx_order").limit(100)
                async for each in cursor:
                    d.append(each)
                cursor = self.async_db.video_interval.find(
                    {'next': {'$lt': datetime.datetime.utcnow()}}).limit(100)
                async for each in cursor:
                    d.append(each)
                for data in d:
                    # a manual order forces a refresh; mark it executed
                    if 'order' in data:
                        for order_id in data['order']:
                            await self.async_db.user_record.update_one({'_id': order_id}, {'$set': {
                                'isExecuted': True
                            }})
                        data['order'] = []
                    data['next'] = data['next'] + \
                        datetime.timedelta(seconds=data['interval'])
                    try:
                        if 'aid' not in data:
                            data['aid'] = dec('BV' + data['bvid'])
                            filt = {'bvid': data['bvid']}
                        elif 'bvid' not in data:
                            data['bvid'] = enc(data['aid']).lstrip("BV")
                            filt = {'aid': data['aid']}
                        else:
                            filt = {'bvid': data['bvid']}
                    except Exception:
                        if 'aid' in data:
                            await self.async_db.video_interval.delete_many({'aid': data['aid']})
                        else:
                            await self.async_db.video_interval.delete_many({'bvid': data['bvid']})
                        continue
                    if await self.async_db.video_interval.count_documents(filt) > 1:
                        await self.async_db.video_interval.delete_many(filt)

                    await self.async_db.video_interval.update_one(
                        filt, {'$set': data})
            except Exception as e:
                self.logger.exception(e)
            finally:
                # always release the lock, even if processing failed
                await self.async_db.lock.delete_one(
                    {"name": "video_interval"})
            for data in d:
                yield data
--------------------------------------------------------------------------------
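
Every spider above follows the same contract inherited from `BiliobSpider`: an async `gen_url` generator yields work items, `parse` turns a response into an item, and `save` writes it to MongoDB. A minimal sketch of a new spider under that contract — the endpoint is the one `rank_add.py` already polls, but the spider name and the `example` collection are illustrative only:

``` python
from biliob import BiliobSpider


class ExampleSpider(BiliobSpider):
    def __init__(self):
        # name / interval / concurrency mirror the constructor used above
        super().__init__(name="Example Spider", interval=1, concurrency=4)
        self.except_content_type = 'application/json'

    async def gen_url(self):
        # real spiders loop forever; a single URL keeps the sketch short
        yield 'http://api.bilibili.com/x/web-interface/online'

    async def parse(self, res):
        # res.json_data holds the decoded JSON body
        if res is None or res.json_data is None:
            return None
        return res.json_data.get('data')

    async def save(self, item):
        if item is None:
            return 0
        await self.async_db.example.insert_one(item)  # illustrative collection
        return 1


if __name__ == "__main__":
    ExampleSpider().run()
```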