├── image ├── 1.png └── online.png ├── bili_online ├── README.MD ├── data.php ├── bili_online.sql ├── biliOnline.py └── show.html ├── bilibili ├── README.MD ├── bilidata.php ├── bilibili.sql ├── show.html └── bilibili.py ├── README.md ├── spider.py ├── qiubai.py ├── taobaomm.py └── zhihu.py /image/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StephinChou/Pythonspider/HEAD/image/1.png -------------------------------------------------------------------------------- /image/online.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StephinChou/Pythonspider/HEAD/image/online.png -------------------------------------------------------------------------------- /bili_online/README.MD: -------------------------------------------------------------------------------- 1 | # 操作流程: 2 | * 直接在window下运行biliOnline.py获取在线数据 3 | * 推荐用cron计划任务来运行,如果采用此方式,请__去掉代码里的死循环__ 4 | * 构造php运行环境,配置data.php文件(推荐直接使用wamp) 5 | * 运行show.html 文件即可查看数据图标(注意data.php的**路径**) 6 | 7 | # 数据展示 8 | ![](https://github.com/StephinChou/Pythonspider/blob/master/image/online.png) 9 | -------------------------------------------------------------------------------- /bilibili/README.MD: -------------------------------------------------------------------------------- 1 | # 爬取B站 视频热度排行的 视频数据 bilibili.py 2 | * 只需输入一个大模块名,如游戏模块名为'game',自行会爬取下面几个小类,并按播放数、硬币数等排行分别爬取 3 | * 已解析出数据接口,直接获取视频数据,不使用webDriver,爬取速度提升数十倍,并且不会miss数据2016.9.22 4 | * @TODO 对爬取到的视频做日期归类(待完成) 5 | * 目前爬取的信息有: 6 | * up主id 7 | * up主名 8 | * 视频AV号 9 | * 播放数 10 | * 收藏数 11 | * 弹幕数 12 | * 视频描述 13 | * 硬币数(获取不稳定,少数会获取不到) 14 | * 分享数(同上) 15 | 16 | 17 | # 数据展示 18 | ![](https://github.com/StephinChou/Pythonspider/blob/master/image/1.png) 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pythonspider,一个简单的python爬虫 2 | * 娱乐随手写的,代码不太严谨,仅仅实现功能 3 | * 原生python+BeautifulSoup4 4 | * python3.4版本 5 | * 所有脚本要和spider.py放到同一目录下 6 | * 自行下载BeautifulSoup4 的类库 window下 `pip install bs4`即可 7 | 8 | ## 爬取知乎的爬虫 zhihu.py 9 | * 主要实现 爬取一个收藏夹 里 所有问题答案下的 图片 10 | * 文字信息暂未收录,可自行实现,比图片更简单 11 | * 具体代码里有详细注释,请自行阅读 12 | 13 | ## 子文件夹文件说明: 14 | |文件名|说明| 15 | |:-:|:-:| 16 | |\*.py文件|爬虫程序| 17 | |\*.sql文件|数据表结构| 18 | |\*.html文件|数据展示| 19 | |\*.php文件|数据展示界面数据接口| 20 | -------------------------------------------------------------------------------- /bili_online/data.php: -------------------------------------------------------------------------------- 1 | "播放量","coin"=>"硬币数","collect"=>"收藏数","danmu"=>"弹幕数"); 10 | if(!isset($fields[$order])){ 11 | echo 0; 12 | die; 13 | } 14 | if($average){ 15 | $by = 'avg'.$order; 16 | } 17 | $sql = "SELECT author_name,count('*') as 'avNum', AVG(`{$order}`) as avg{$order},sum(`{$order}`) as {$order} FROM `bilibili` group by author order by {$order} DESC limit {$limit}"; 18 | 19 | 20 | $result = mysqli_query($link,$sql); 21 | file_put_contents("a.txt", $sql); 22 | 23 | $data['name'] = $average ? "平均每视频".$fields[$order] :$fields[$order]; 24 | while($row = mysqli_fetch_array($result)) 25 | { 26 | if($average){ 27 | $num = intval($row['avg'.$order]); 28 | }else{ 29 | $num = intval($row[$order]); 30 | } 31 | $sort[] = $num; 32 | $tmp = array($row['author_name'],$num); 33 | $data['data'][] = $tmp; 34 | } 35 | 36 | array_multisort($data['data'],SORT_ASC ,SORT_NUMERIC ,$sort); 37 | $callback = $_GET['callback'] ? 
$_GET['callback'] :"callback"; 38 | 39 | echo "{$callback}(".json_encode($data).")"; -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | #!/url/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'waiting' 4 | import os,re,codecs,urllib,io,gzip,zlib 5 | from urllib import request 6 | from bs4 import BeautifulSoup 7 | import chardet 8 | 9 | 10 | class SpiderHTML(object): 11 | #打开页面 12 | def getUrl(self, url, coding='utf-8'): 13 | req = request.Request(url) 14 | req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 UBrowser/5.5.9703.2 Safari/537.36') 15 | req.add_header('Accept-encoding', 'gzip') 16 | with request.urlopen(req) as response: 17 | gzipd = response.headers.get('Content-Encoding') 18 | if gzipd == 'gzip': 19 | data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS) 20 | 21 | else: 22 | data = response.read() 23 | return BeautifulSoup(data.decode(coding)) 24 | 25 | #保存文本内容到本地 26 | def saveText(self,filename,content,mode='w'): 27 | self._checkPath(filename) 28 | with codecs.open(filename, encoding='utf-8', mode=mode) as f: 29 | f.write(content) 30 | 31 | 32 | #保存图片 33 | def saveImg(self, imgUrl, imgName): 34 | data=request.urlopen(imgUrl).read() 35 | self._checkPath(imgName) 36 | with open(imgName,'wb') as f: 37 | f.write(data) 38 | 39 | #创建目录 40 | def _checkPath(self, path): 41 | dirname = os.path.dirname(path.strip()) 42 | if not os.path.exists(dirname): 43 | os.makedirs(dirname) 44 | -------------------------------------------------------------------------------- /bilibili/bilibili.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat MySQL Data Transfer 3 | 4 | Source Server : localhost 5 | Source Server Version : 50617 6 | Source Host : localhost:3306 7 | Source Database : test 8 | 9 | Target Server Type : MYSQL 10 | Target Server Version : 50617 11 | File Encoding : 65001 12 | 13 | Date: 2016-09-20 15:20:15 14 | */ 15 | 16 | SET FOREIGN_KEY_CHECKS=0; 17 | 18 | -- ---------------------------- 19 | -- Table structure for `bilibili` 20 | -- ---------------------------- 21 | DROP TABLE IF EXISTS `bilibili`; 22 | CREATE TABLE `bilibili` ( 23 | `id` int(11) NOT NULL AUTO_INCREMENT, 24 | `av` varchar(10) NOT NULL COMMENT '视频av号', 25 | `title` varchar(100) NOT NULL COMMENT '视频标题', 26 | `module` varchar(20) NOT NULL DEFAULT '' COMMENT '视频模块', 27 | `tid` varchar(5) NOT NULL DEFAULT '' COMMENT '模块编号', 28 | `author` varchar(10) NOT NULL COMMENT '作者id', 29 | `author_name` varchar(30) NOT NULL COMMENT '作者名字', 30 | `play` int(11) NOT NULL COMMENT '播放数', 31 | `danmu` int(11) NOT NULL COMMENT '弹幕数', 32 | `collect` int(11) NOT NULL COMMENT '收藏数', 33 | `desc` varchar(500) NOT NULL COMMENT '视频描述', 34 | `share` int(11) NOT NULL COMMENT '分享数', 35 | `coin` int(11) NOT NULL COMMENT '硬币数', 36 | `mtime` int(11) NOT NULL COMMENT '修改时间', 37 | `ctime` int(11) NOT NULL COMMENT '创建时间', 38 | PRIMARY KEY (`id`), 39 | UNIQUE KEY `UQ_video` (`av`) USING BTREE 40 | ) ENGINE=InnoDB AUTO_INCREMENT=436 DEFAULT CHARSET=utf8mb4; 41 | -------------------------------------------------------------------------------- /qiubai.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import sys,os,pdb,re,time,random,datetime 3 | from spider import SpiderHTML 4 | from bs4 import BeautifulSoup 5 | 6 | class 
QiubaiSpider(SpiderHTML): 7 | def __init__(self,contentType,pageStart=1, pageEnd=1): 8 | #super.__init__(self) 9 | self._contentType = contentType 10 | self._pageStart = int(pageStart) 11 | self._pageEnd = int(pageEnd)+1 12 | self.__url = {'new':'http://www.qiushibaike.com/textnew/page/','hot':'http://www.qiushibaike.com/text/page/'} 13 | 14 | def getJokes(self): 15 | reqUrl = '' 16 | 17 | if contentType in self.__url: 18 | reqUrl = self.__url[self._contentType] 19 | else: 20 | reqUrl = self.__url['new'] 21 | for i in range(self._pageStart,self._pageEnd): 22 | pageUrl = reqUrl+str(i)+'/' 23 | jokes = self.getUrl(pageUrl) 24 | jokes = jokes.find_all('div',id=re.compile('qiushi_tag_\d+')) 25 | filepath = os.path.join('E:\\','qiubai',str(datetime.date.today())+self._contentType+str(i)) 26 | info = '正在保存第{page}页的糗事到文件 {file}.txt' 27 | print(info.format(page=i,file=filepath)) 28 | for joke in jokes: 29 | jokeContent = str(joke.find('div',attrs={'class':'content'})) 30 | jokeContent = re.sub('
<div class="content">','',jokeContent) 31 | jokeContent = re.sub('</div>','',jokeContent) 32 | jokeContent = re.sub('<span>','',jokeContent) 33 | jokeContent = re.sub('</span>','\n',jokeContent) 34 | jokeContent = re.sub('<br/>
','\n',jokeContent) 35 | try: 36 | author = joke.find(attrs={'class':'author clearfix'}).find('h2').string 37 | upvote = joke.find(attrs={'class':'stats'}).span.i.string 38 | except AttributeError: 39 | pass 40 | 41 | joke = '-----------------------------\r\n作者:{author}\r\n{joke}\r\n\r\n{upvote}人觉得很赞\r\n'.format(joke=jokeContent.strip(),author=author,upvote=upvote) 42 | 43 | self.saveText(filepath+'.txt',joke,'a') 44 | if i%2 == 0: #防止被封,间隔时间长一点 45 | time.sleep(random.random()*3) 46 | 47 | if __name__ == '__main__': 48 | contentType = 'new' 49 | page = 0 50 | paramsNum = len(sys.argv) 51 | 52 | #输入想获取最新的糗百还是最热的糗百 53 | #参数2,3为想要获取的页数 54 | if paramsNum>=4: 55 | contentType = sys.argv[1] 56 | page = sys.argv[2] 57 | pageEnd = sys.argv[3] 58 | elif paramsNum>=3: 59 | contentType = sys.argv[1] 60 | page = sys.argv[2] 61 | pageEnd = page 62 | elif paramsNum == 2: 63 | contentType = sys.argv[1] 64 | page,pageEnd = 1,1 65 | else: 66 | contentType = 'new' 67 | page,pageEnd = 1,1 68 | 69 | qiubai = QiubaiSpider(contentType,page,pageEnd) 70 | qiubai.getJokes() -------------------------------------------------------------------------------- /bili_online/show.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 75 | 76 | 77 | 78 |
79 | -------------------------------------------------------------------------------- /taobaomm.py: -------------------------------------------------------------------------------- 1 | from spider import SpiderHTML 2 | import re,os,sys,time,urllib,random,http 3 | ''' 4 | 抓取淘宝模特的靓图 5 | ''' 6 | 7 | class TaobaommSpider(SpiderHTML): 8 | #抓取起始页,结束页,每个妹子抓取的图片数量 9 | def __init__(self,pageStart, pageEnd,limit_img): 10 | self._pageStart = int(pageStart) 11 | self._pageEnd = int(pageEnd)+1 12 | self._limit = limit_img 13 | self.__url = 'https://mm.taobao.com/json/request_top_list.htm?page=' 14 | self.__dir = 'E:\\taobaomm' 15 | 16 | def start(self): 17 | for page in range(self._pageStart,self._pageEnd): 18 | url = self.__url + str(page) 19 | contents = self.getUrl(url,'gbk') 20 | lists = contents.find_all('div',class_='personal-info') 21 | for girl in lists: 22 | info = girl.find('a',attrs={'class':'lady-name'}) 23 | avatar = girl.find('a',class_='lady-avatar') 24 | 25 | girlinfo = {} 26 | girlinfo['name'] = info.string 27 | girlinfo['age'] = info.find_next_sibling('em').strong.string 28 | girlinfo['city'] = info.find_next('span').string 29 | girlinfo['url'] = 'https:'+avatar['href'] 30 | #去除掉缩小的图片 31 | girlinfo['avatar'] = 'https:'+re.sub('_\d+x\d+\.\w+$','',avatar.img['src']) 32 | imgType = os.path.splitext(girlinfo['avatar'])[1] 33 | logInfo = '找到一位MM:{name},{age}岁,她在{city}'.format(**girlinfo) 34 | print(logInfo) 35 | tmpDir = os.path.join(self.__dir,girlinfo['name']+'-'+girlinfo['age']+'-'+girlinfo['city']) 36 | if(os.path.exists(tmpDir)): 37 | print('已经获得过信息,去找下一位') 38 | continue 39 | #以名字命名,保存图片和基本信息 40 | self.saveImg(girlinfo['avatar'],os.path.join(tmpDir,'avatar'+imgType)) 41 | print('正在进入她的个人中心获取私图') 42 | 43 | gilrsCenter = self.getUrl(girlinfo['url'],'gbk') 44 | imgs = gilrsCenter.find('div',class_='mm-aixiu-content').find_all('img') 45 | i = 0 46 | for img in imgs: 47 | i = i + 1 48 | if i % 5 == 0: 49 | print('正在获取第{i}张图'.format(i=i)) 50 | try: 51 | imgurl = 'https:'+img['src'] 52 | extend_name = os.path.splitext(img['src'])[1] 53 | if extend_name == '.gif': 54 | continue #一般都是表情图,略过 55 | self.saveImg(imgurl,os.path.join(tmpDir,str(i)+extend_name)) 56 | except urllib.error.HTTPError as e: 57 | pass 58 | except KeyError as e: 59 | pass 60 | except http.client.IncompleteRead: 61 | pass 62 | 63 | if i >= self._limit: 64 | pass #若要限制每个模特抓图的张数,此处改为break 65 | time.sleep(random.random()*2) 66 | 67 | 68 | if __name__ == '__main__': 69 | page, limit, paramsNum= 1, 0, len(sys.argv) 70 | if paramsNum>=4: 71 | page, pageEnd, limit = sys.argv[1], sys.argv[2], int(sys.argv[3]) 72 | elif paramsNum == 2: 73 | page = sys.argv[1] 74 | pageEnd = page 75 | else: 76 | page,pageEnd = 1,1 77 | 78 | if limit <5: 79 | limit = 20 80 | spider = TaobaommSpider(page,pageEnd,limit) 81 | spider.start() 82 | -------------------------------------------------------------------------------- /zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from spider import SpiderHTML 4 | from multiprocessing import Pool 5 | import sys,urllib,http,os,random,re,time 6 | __author__ = 'waiting' 7 | ''' 8 | 使用了第三方的类库 BeautifulSoup4,请自行安装 9 | 需要目录下的spider.py文件 10 | 运行环境:python3.4,windows7 11 | ''' 12 | 13 | #收藏夹的地址 14 | url = 'https://www.zhihu.com/collection/30822111' #page参数改为代码添加 15 | 16 | #本地存放的路径,不存在会自动创建 17 | store_path = 'E:\\zhihu\收藏夹\\会员才知道的世界' 18 | 19 | class zhihuCollectionSpider(SpiderHTML): 20 | def __init__(self,pageStart, pageEnd, url): 21 | self._url 
= url 22 | self._pageStart = int(pageStart) 23 | self._pageEnd = int(pageEnd)+1 24 | self.downLimit = 0 #低于此赞同的答案不收录 25 | 26 | def start(self): 27 | for page in range(self._pageStart,self._pageEnd): #收藏夹的页数 28 | url = self._url + '?page='+str(page) 29 | content = self.getUrl(url) 30 | questionList = content.find_all('div',class_='zm-item') 31 | for question in questionList: #收藏夹的每个问题 32 | Qtitle = question.find('h2',class_='zm-item-title') 33 | if Qtitle is None: #被和谐了 34 | continue 35 | 36 | questionStr = Qtitle.a.string 37 | Qurl = 'https://www.zhihu.com'+Qtitle.a['href'] #问题题目 38 | Qtitle = re.sub(r'[\\/:*?"<>]','#',Qtitle.a.string) #windows文件/目录名不支持的特殊符号 39 | try: 40 | print('-----正在获取问题:'+Qtitle+'-----') #获取到问题的链接和标题,进入抓取 41 | except UnicodeEncodeError: 42 | print(r'---问题含有特殊字符无法显示---') 43 | try: 44 | Qcontent = self.getUrl(Qurl) 45 | except: 46 | print('!!!!获取出错!!!!!') 47 | pass 48 | answerList = Qcontent.find_all('div',class_='zm-item-answer zm-item-expanded') 49 | self._processAnswer(answerList,Qtitle) #处理问题的答案 50 | time.sleep(5) 51 | 52 | 53 | def _processAnswer(self,answerList,Qtitle): 54 | j = 0 55 | for answer in answerList: 56 | j = j + 1 57 | 58 | upvoted = int(answer.find('span',class_='count').string.replace('K','000')) #获得此答案赞同数 59 | if upvoted < self.downLimit: 60 | continue 61 | authorInfo = answer.find('div',class_='zm-item-answer-author-info') #获取作者信息 62 | author = {'introduction':'','link':''} 63 | try: 64 | author['name'] = authorInfo.find('a',class_='author-link').string #获得作者的名字 65 | author['introduction'] = str(authorInfo.find('span',class_='bio')['title']) #获得作者的简介 66 | author['link'] = authorInfo.find('a',class_='author-link')['href'] 67 | except AttributeError: 68 | author['name'] = '匿名用户'+str(j) 69 | except TypeError: #简介为空的情况 70 | pass #匿名用户没有链接 71 | 72 | file_name = os.path.join(store_path,Qtitle,'info',author['name']+'_info.txt') 73 | if os.path.exists(file_name): #已经抓取过 74 | continue 75 | 76 | self.saveText(file_name,'{introduction}\r\n{link}'.format(**author)) #保存作者的信息 77 | print('正在获取用户`{name}`的答案'.format(**author)) 78 | answerContent = answer.find('div',class_='zm-editable-content clearfix') 79 | if answerContent is None: #被举报的用户没有答案内容 80 | continue 81 | 82 | imgs = answerContent.find_all('img') 83 | if len(imgs) == 0: #答案没有上图 84 | pass 85 | else: 86 | self._getImgFromAnswer(imgs,Qtitle,**author) 87 | 88 | #收录图片 89 | def _getImgFromAnswer(self,imgs,Qtitle,**author): 90 | i = 0 91 | for img in imgs: 92 | if 'inline-image' in img['class']: #不抓取知乎的小图 93 | continue 94 | i = i + 1 95 | imgUrl = img['src'] 96 | extension = os.path.splitext(imgUrl)[1] 97 | path_name = os.path.join(store_path,Qtitle,author['name']+'_'+str(i)+extension) 98 | try: 99 | self.saveImg(imgUrl,path_name) #捕获各种图片异常,流程不中断 100 | except: 101 | pass 102 | 103 | #收录文字 104 | def _getTextFromAnswer(self): 105 | pass 106 | 107 | #命令行下运行,例:zhihu.py 1 5 获取1到5页的数据 108 | if __name__ == '__main__': 109 | page, limit, paramsNum= 1, 0, len(sys.argv) 110 | if paramsNum>=3: 111 | page, pageEnd = sys.argv[1], sys.argv[2] 112 | elif paramsNum == 2: 113 | page = sys.argv[1] 114 | pageEnd = page 115 | else: 116 | page,pageEnd = 1,1 117 | 118 | spider = zhihuCollectionSpider(page,pageEnd,url) 119 | spider.start() 120 | 121 | -------------------------------------------------------------------------------- /bilibili/show.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 64 | 65 | 66 | 67 | 68 | 69 |
[bilibili/show.html lines 70-137: the page markup was stripped during extraction; only the three control-group labels survive: 统计类型 (statistic type, i.e. which metric to rank by), up主数量 (number of uploaders to show), 按视频平均数 (per-video average toggle). The page displays the ranking served by its PHP data interface as a chart; a hedged Python sketch of such a request, written against the bili_online data.php shown earlier, follows.]
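A minimal sketch, in Python, of consuming the JSONP ranking emitted by bili_online/data.php; this is roughly what the stripped show.html scripts do in the browser. The endpoint URL and the order/limit/average parameter names are assumptions (the part of data.php that reads them from $_GET was also lost in extraction); the field keys play/coin/collect/danmu and the callback parameter come from the surviving data.php source.

```python
# Sketch (not part of the repo) of consuming the JSONP ranking from data.php.
# Assumptions: the endpoint URL and the order/limit/average parameter names;
# the $_GET-reading preamble of data.php was lost in extraction.
import json
from urllib import request, parse

def fetch_ranking(order='play', limit=20, average=0,
                  endpoint='http://localhost/bili_online/data.php'):
    # order must be one of the keys in data.php's $fields: play, coin, collect, danmu
    qs = parse.urlencode({'order': order, 'limit': limit,
                          'average': average, 'callback': 'callback'})
    with request.urlopen(endpoint + '?' + qs) as resp:
        body = resp.read().decode('utf-8')
    if '(' not in body:          # data.php answers a bare 0 for an unknown order field
        return None
    payload = body[body.index('(') + 1:body.rindex(')')]   # strip the callback(...) wrapper
    return json.loads(payload)   # {'name': ..., 'data': [[author_name, num], ...]}

if __name__ == '__main__':
    ranking = fetch_ranking('coin', limit=10)
    if ranking:
        for author, num in ranking['data']:
            print(author, num)
```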
138 | -------------------------------------------------------------------------------- /bilibili/bilibili.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2016-09-20 15:42:13 4 | # @Author : waitingChou (zhouzt52@qq.com) 5 | # @Link : https://github.com/StephinChou/ 6 | __author__ = 'waiting' 7 | from spider import SpiderHTML 8 | from multiprocessing import Pool 9 | import sys,urllib,http,os,re,time,codecs,json 10 | import pymysql 11 | pymysql.install_as_MySQLdb() 12 | 13 | #从本地记录里获取曾经爬取过的视频号 14 | f = open('avSet.txt','r') 15 | avSet = set([]) 16 | for line in f: 17 | avSet = set(line.split(',')) 18 | 19 | #一些配置 20 | conn = pymysql.connect(host='localhost',user='root',passwd='',db='test',port=3306,use_unicode=True, charset="utf8") 21 | cur=conn.cursor() 22 | pattern = re.compile(r'\d+') #获取av号的正则表达式 23 | orders = {"hot":"播放量","review":"评论数","promote":"硬币数","stow":"收藏数"} 24 | biliUrl = 'http://www.bilibili.com' 25 | 26 | class BilibiliSpider(SpiderHTML): 27 | def __init__(self,module,timeStart,timeEnd,limit): 28 | self.url = biliUrl + '/video/' + module + '.html' 29 | self.timeStart = timeStart 30 | self.timeEnd = timeEnd 31 | self.limit = limit 32 | 33 | def start(self): 34 | content = self.getUrl(self.url) 35 | sorts = content.find('ul',class_='n_num') 36 | subSorts = sorts.find_all('a') 37 | 38 | #处理该类别下的子模块 39 | for sub in subSorts: 40 | subName = sub.string 41 | if(subName == '全部'): 42 | continue 43 | #子模块只需要tid即可 44 | tid = sub.parent['tid'] 45 | if tid is None or tid == '' : 46 | print('模块{type} tid解析错误'.format(type=subName)) 47 | continue 48 | self.parsePage(subName,tid) 49 | 50 | #处理一个子模块的页面 51 | def parsePage(self,typeName,tid): 52 | for (order,name) in orders.items(): 53 | sumData = dict() 54 | print("对子模块‘{typeName}’进行‘{name}’排序的分析".format(name=name,typeName=typeName)) 55 | sort = 0; 56 | #是否获取到足够的排名 57 | isBreak = False 58 | for page in range(1,5): 59 | # http://www.bilibili.com/list/stow-65-1-2016-09-12~2016-09-19.html 60 | urlTmp = biliUrl + "/list/{order}-{tid}-{page}-{start}~{end}.html".format(order=order,tid=tid,page=page,start=self.timeStart,end=self.timeEnd) 61 | content = self.getUrl(urlTmp) 62 | 63 | videoContent = content.find('ul',class_='vd-list l1') 64 | videoList = videoContent.find_all('div',class_='l-item') 65 | 66 | for video in videoList: 67 | AVInfo = dict() #作品信息 68 | AVInfo['av'] = pattern.search(video.find('a',class_='title')['href']).group() #av号 69 | AVInfo['title'] = video.find('a',class_='title').string #标题 70 | sort=sort+1 71 | if AVInfo['av'] in avSet: 72 | print("已经爬取过该视频av{av},{title}".format(**AVInfo)) 73 | continue 74 | 75 | AVInfo['author_name'] = video.find('a',class_='v-author').string #作者 76 | AVInfo['module'] = typeName #模块名 77 | AVInfo['tid'] = tid #模块id 78 | coinInfo = self.parseAV(AVInfo['av']) #解析详细视频页面获取硬币和收藏数 79 | if coinInfo == 0: 80 | sort=sort-1 81 | print("作品名:{title},【视频信息获取失败】".format(**AVInfo)) 82 | continue 83 | 84 | AVInfo['play'] = video.find('span',class_='v-info-i gk').span.string #播放数 85 | AVInfo['danmu'] = video.find('span',class_='v-info-i dm').span.string #弹幕数 86 | AVInfo['collect'] = video.find('span',class_='v-info-i sc').span.string #收藏数 87 | AVInfo['url'] = biliUrl + video.find('a',class_='title')['href'] #视频链接 88 | AVInfo['desc'] = video.find('div',class_='v-desc').string #视频描述 89 | AVInfo['author'] = video.find('a',class_='v-author')['href'].split('/')[-1] #用户id 90 | #将此视频加入已经爬取过的列表 91 | avSet.add(AVInfo['av']) 
92 | AVInfo['mtime'] = int(time.time()) 93 | AVInfo['ctime'] = int(time.time()) 94 | #合并信息 95 | AVInfo = dict(coinInfo,**AVInfo) 96 | 97 | print("排名第{sort}:\t{author_name},\t播放量:{play},\t收藏数:{collect},\t硬币数:{coin},\t作品名:{title}".format(sort=sort,**AVInfo)) 98 | sql = "INSERT IGNORE INTO `bilibili`(`av`, `title`, `module`,`tid`,`author`, `author_name`, `play`, `danmu`, `collect`, `desc`, `share`, `coin`, `mtime`, `ctime`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 99 | args = (AVInfo['av'],AVInfo['title'],AVInfo['module'],AVInfo['tid'],AVInfo['author'],AVInfo['author_name'],AVInfo['play'],AVInfo['danmu'],AVInfo['collect'],AVInfo['desc'],AVInfo['share'],AVInfo['coin'],AVInfo['mtime'],AVInfo['ctime']) 100 | cur.execute(sql,args) 101 | conn.commit() 102 | if sort >= self.limit: 103 | isBreak = True 104 | break 105 | if isBreak == True: 106 | break 107 | #全部获取完毕,保存av号 108 | with codecs.open('avSet.txt', encoding='utf-8', mode='w') as f: 109 | f.write(','.join(str(s) for s in avSet)) 110 | 111 | 112 | #解析单独的一个视频 113 | # @param avNum String video/av6315006/ 114 | def parseAV(self,avNum): 115 | url = "http://api.bilibili.com/archive_stat/stat?callback=&aid={av}&type=jsonp&_={time}".format(av=avNum,time=int(time.time()*1000)) 116 | info = dict() 117 | 118 | try: 119 | content = self.getUrl(url) 120 | data = json.loads(str(content)) 121 | info['coin'] = data['data']['coin'] 122 | info['share'] = data['data']['share'] 123 | except: 124 | return 0; 125 | return info 126 | 127 | #module 为 分类 :游戏 game 舞蹈 dance等 128 | module = 'game' 129 | #热度统计开始时间 130 | start = '2016-07-01' 131 | #热度统计结束时间 132 | end = '2016-07-31' 133 | #单个模块排名获取个数100以内 134 | limit = 40 135 | spider = BilibiliSpider(module,start, end,limit) 136 | print("分析周期:`{start}` ~ `{end}`".format(start=start,end=end)) 137 | spider.start() 138 | 139 | --------------------------------------------------------------------------------
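The bilibili README stresses that the spider hits the site's data interface instead of driving a webDriver; concretely, BilibiliSpider.parseAV() builds the archive_stat URL below and, because the callback parameter is left empty, the endpoint answers with plain JSON. A standalone sketch of that call, with the URL copied verbatim from bilibili.py (whether the endpoint still responds this way is not guaranteed):

```python
# Standalone version of the stat lookup done by BilibiliSpider.parseAV():
# with callback= left empty the archive_stat endpoint returns plain JSON,
# so no HTML parsing is needed. URL copied from bilibili.py; the endpoint's
# current behaviour is not guaranteed.
import json, time
from urllib import request

def fetch_stat(av_num):
    url = ('http://api.bilibili.com/archive_stat/stat'
           '?callback=&aid={av}&type=jsonp&_={ts}').format(av=av_num, ts=int(time.time() * 1000))
    with request.urlopen(url) as resp:
        data = json.loads(resp.read().decode('utf-8'))
    return {'coin': data['data']['coin'], 'share': data['data']['share']}
```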
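bilibili.py keeps the AV numbers it has already crawled in avSet.txt as a single comma-separated line: the file is read into a set at start-up and rewritten once a crawl finishes. The open('avSet.txt','r') at the top of the file assumes the file already exists; a hedged sketch of the same round trip with a first-run guard added:

```python
# Round trip for the avSet.txt dedupe file used by bilibili.py (one
# comma-separated line of AV numbers). The os.path.exists guard is an
# addition: the original open('avSet.txt','r') raises FileNotFoundError
# on the very first run, before the file has ever been written.
import codecs, os

def load_av_set(path='avSet.txt'):
    if not os.path.exists(path):
        return set()
    with open(path, 'r') as f:
        return set(f.read().strip().split(','))

def save_av_set(av_set, path='avSet.txt'):
    with codecs.open(path, encoding='utf-8', mode='w') as f:
        f.write(','.join(str(s) for s in av_set))
```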
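All four spiders subclass the SpiderHTML base class in spider.py: getUrl() fetches a page (inflating gzip responses when the server sends them) and returns a BeautifulSoup tree, while saveText() and saveImg() create any missing directories before writing. A minimal subclass sketch; the target URL is borrowed from qiubai.py and the output paths are placeholders, not part of the repo:

```python
# Sketch of the subclassing pattern used by qiubai.py, taobaomm.py,
# zhihu.py and bilibili.py. The output paths are placeholders.
import os
from spider import SpiderHTML

class DemoSpider(SpiderHTML):
    def start(self):
        page = self.getUrl('http://www.qiushibaike.com/text/page/1/')   # BeautifulSoup tree
        title_tag = page.find('title')
        if title_tag and title_tag.string:
            self.saveText('E:\\demo\\title.txt', title_tag.string)      # parent dirs auto-created
        for i, img in enumerate(page.find_all('img')[:3]):
            src = img.get('src', '')
            if src.startswith('http'):
                ext = os.path.splitext(src)[1] or '.jpg'
                self.saveImg(src, 'E:\\demo\\img{0}{1}'.format(i, ext))

if __name__ == '__main__':
    DemoSpider().start()
```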