├── image
│   ├── 1.png
│   └── online.png
├── bili_online
│   ├── README.MD
│   ├── data.php
│   ├── bili_online.sql
│   ├── biliOnline.py
│   └── show.html
├── bilibili
│   ├── README.MD
│   ├── bilidata.php
│   ├── bilibili.sql
│   ├── show.html
│   └── bilibili.py
├── README.md
├── spider.py
├── qiubai.py
├── taobaomm.py
└── zhihu.py
/image/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephinChou/Pythonspider/HEAD/image/1.png
--------------------------------------------------------------------------------
/image/online.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StephinChou/Pythonspider/HEAD/image/online.png
--------------------------------------------------------------------------------
/bili_online/README.MD:
--------------------------------------------------------------------------------
 1 | # How to run
 2 | * Run biliOnline.py directly under Windows to collect the online data
 3 | * Scheduling it with cron or a scheduled task is recommended; if you do so, __remove the infinite loop in the code__
 4 | * Set up a PHP environment and configure data.php (using WAMP directly is recommended)
 5 | * Open show.html to view the data chart (mind the **path** to data.php); the sketch below shows how to test data.php on its own
6 |
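Once data.php is served by your PHP environment, you can check it independently of show.html. A minimal sketch, assuming a WAMP-style layout where the project is reachable at `http://localhost/bili_online/` (adjust the URL to your setup; the query parameters simply mirror the variable names used in data.php):

```python
# Minimal check of the JSONP payload that show.html consumes from data.php.
# The host/path below is an assumption about the local WAMP layout.
from urllib import request

url = 'http://localhost/bili_online/data.php?order=play&limit=10&callback=cb'
with request.urlopen(url) as resp:
    # expected shape: cb({"name": "...", "data": [[author, value], ...]})
    print(resp.read().decode('utf-8'))
```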
 7 | # Data preview
 8 | ![online data chart](../image/online.png)
9 |
--------------------------------------------------------------------------------
/bilibili/README.MD:
--------------------------------------------------------------------------------
 1 | # bilibili.py: scrape video data from Bilibili's popularity rankings
 2 | * Just give it a top-level category name (such as 'game' for the games section); it finds the sub-sections on its own and crawls each ranking (plays, coins, and so on) separately; see the usage sketch below
 3 | * The data API has been worked out, so video data is fetched directly without a WebDriver; crawling is dozens of times faster and no longer misses data (2016-09-22)
 4 | * @TODO group the crawled videos by date (pending)
 5 | * Fields currently collected:
 6 |     * uploader id
 7 |     * uploader name
 8 |     * video AV number
 9 |     * play count
10 |     * favourite count
11 |     * danmaku count
12 |     * video description
13 |     * coin count (unreliable; occasionally missing)
14 |     * share count (same as above)
15 |
16 |
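A run is configured by the block at the bottom of bilibili.py; edit those values and run the script. For reference, that block looks like this (values are the script's own defaults, comments translated):

```python
# excerpt of the tail of bilibili.py; BilibiliSpider is defined earlier in the same file
module = 'game'        # top-level category: 'game' for games, 'dance' for dance, etc.
start = '2016-07-01'   # start of the ranking window
end = '2016-07-31'     # end of the ranking window
limit = 40             # ranked entries to keep per sub-section (at most 100)
spider = BilibiliSpider(module, start, end, limit)
spider.start()
```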
17 | # Data preview
18 | ![bilibili data chart](../image/1.png)
19 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pythonspider, a simple Python crawler
 2 | * Written casually for fun; the code is not very rigorous and only aims to get the job done
 3 | * Plain Python plus BeautifulSoup4
 4 | * Python 3.4
 5 | * All scripts must sit in the same directory as spider.py (they build on its SpiderHTML base class; see the sketch below)
 6 | * Install the BeautifulSoup4 library yourself; on Windows, `pip install bs4` is enough
7 |
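Each crawler subclasses `SpiderHTML` from spider.py, which provides `getUrl` (fetch a page as a BeautifulSoup tree), `saveText` and `saveImg`. A minimal sketch of a new crawler built on it; the target URL and output path are placeholders, not part of this repo:

```python
# Minimal sketch of a crawler built on spider.py's SpiderHTML base class.
from spider import SpiderHTML

class DemoSpider(SpiderHTML):
    def run(self):
        page = self.getUrl('https://example.com/')   # BeautifulSoup tree of the page
        title = page.find('title')
        # saveText creates the target directory if it does not exist
        self.saveText('demo/title.txt', title.string if title else '')

if __name__ == '__main__':
    DemoSpider().run()
```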
 8 | ## zhihu.py: a Zhihu crawler
 9 | * Downloads the images found under every answer of a given collection (see the usage sketch below)
10 | * Text content is not collected yet; adding it yourself is even simpler than the images
11 | * The code is commented in detail; please read it
12 |
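From the command line, `python zhihu.py 1 5` fetches pages 1 to 5 of the collection configured at the top of zhihu.py. The programmatic equivalent, mirroring the script's `__main__` block:

```python
# Crawl pages 1-5 of the collection URL configured in zhihu.py.
from zhihu import zhihuCollectionSpider, url

spider = zhihuCollectionSpider(1, 5, url)
spider.start()
```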
13 | ## Files in the sub-folders:
14 | |File|Description|
15 | |:-:|:-:|
16 | |\*.py|crawler program|
17 | |\*.sql|table schema|
18 | |\*.html|data visualisation page|
19 | |\*.php|data endpoint for the visualisation page|
20 |
--------------------------------------------------------------------------------
/bili_online/data.php:
--------------------------------------------------------------------------------
1 | "播放量","coin"=>"硬币数","collect"=>"收藏数","danmu"=>"弹幕数");
10 | if(!isset($fields[$order])){
11 | echo 0;
12 | die;
13 | }
14 | // rank by the per-video average when requested, otherwise by the total
15 | $by = $average ? 'avg'.$order : $order;
16 |
17 | $sql = "SELECT author_name, COUNT(*) AS avNum, AVG(`{$order}`) AS avg{$order}, SUM(`{$order}`) AS {$order} FROM `bilibili` GROUP BY author ORDER BY {$by} DESC LIMIT {$limit}";
18 |
19 |
20 | $result = mysqli_query($link,$sql);
21 | file_put_contents("a.txt", $sql);
22 |
23 | $data['name'] = $average ? "平均每视频".$fields[$order] :$fields[$order];
24 | while($row = mysqli_fetch_array($result))
25 | {
26 | if($average){
27 | $num = intval($row['avg'.$order]);
28 | }else{
29 | $num = intval($row[$order]);
30 | }
31 | $sort[] = $num;
32 | $tmp = array($row['author_name'],$num);
33 | $data['data'][] = $tmp;
34 | }
35 |
36 | array_multisort($data['data'],SORT_ASC ,SORT_NUMERIC ,$sort);
37 | $callback = isset($_GET['callback']) ? $_GET['callback'] : "callback";
38 |
39 | echo "{$callback}(".json_encode($data).")";
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | __author__ = 'waiting'
4 | import os,re,codecs,urllib,io,gzip,zlib
5 | from urllib import request
6 | from bs4 import BeautifulSoup
7 | import chardet
8 |
9 |
10 | class SpiderHTML(object):
11 | #fetch a page and return it as a BeautifulSoup tree
12 | def getUrl(self, url, coding='utf-8'):
13 | req = request.Request(url)
14 | req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 UBrowser/5.5.9703.2 Safari/537.36')
15 | req.add_header('Accept-encoding', 'gzip')
16 | with request.urlopen(req) as response:
17 | gzipd = response.headers.get('Content-Encoding')
18 | if gzipd == 'gzip':
19 | data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
20 |
21 | else:
22 | data = response.read()
23 | return BeautifulSoup(data.decode(coding))
24 |
25 | #save text content to a local file
26 | def saveText(self,filename,content,mode='w'):
27 | self._checkPath(filename)
28 | with codecs.open(filename, encoding='utf-8', mode=mode) as f:
29 | f.write(content)
30 |
31 |
32 | #save an image to a local file
33 | def saveImg(self, imgUrl, imgName):
34 | data=request.urlopen(imgUrl).read()
35 | self._checkPath(imgName)
36 | with open(imgName,'wb') as f:
37 | f.write(data)
38 |
39 | #create the directory of the given path if it does not exist
40 | def _checkPath(self, path):
41 | dirname = os.path.dirname(path.strip())
42 | if not os.path.exists(dirname):
43 | os.makedirs(dirname)
44 |
--------------------------------------------------------------------------------
/bilibili/bilibili.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : localhost
5 | Source Server Version : 50617
6 | Source Host : localhost:3306
7 | Source Database : test
8 |
9 | Target Server Type : MYSQL
10 | Target Server Version : 50617
11 | File Encoding : 65001
12 |
13 | Date: 2016-09-20 15:20:15
14 | */
15 |
16 | SET FOREIGN_KEY_CHECKS=0;
17 |
18 | -- ----------------------------
19 | -- Table structure for `bilibili`
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `bilibili`;
22 | CREATE TABLE `bilibili` (
23 | `id` int(11) NOT NULL AUTO_INCREMENT,
24 | `av` varchar(10) NOT NULL COMMENT 'video AV number',
25 | `title` varchar(100) NOT NULL COMMENT 'video title',
26 | `module` varchar(20) NOT NULL DEFAULT '' COMMENT 'video category',
27 | `tid` varchar(5) NOT NULL DEFAULT '' COMMENT 'category id',
28 | `author` varchar(10) NOT NULL COMMENT 'uploader id',
29 | `author_name` varchar(30) NOT NULL COMMENT 'uploader name',
30 | `play` int(11) NOT NULL COMMENT 'play count',
31 | `danmu` int(11) NOT NULL COMMENT 'danmaku count',
32 | `collect` int(11) NOT NULL COMMENT 'favourite count',
33 | `desc` varchar(500) NOT NULL COMMENT 'video description',
34 | `share` int(11) NOT NULL COMMENT 'share count',
35 | `coin` int(11) NOT NULL COMMENT 'coin count',
36 | `mtime` int(11) NOT NULL COMMENT 'modified time (unix timestamp)',
37 | `ctime` int(11) NOT NULL COMMENT 'created time (unix timestamp)',
38 | PRIMARY KEY (`id`),
39 | UNIQUE KEY `UQ_video` (`av`) USING BTREE
40 | ) ENGINE=InnoDB AUTO_INCREMENT=436 DEFAULT CHARSET=utf8mb4;
41 |
--------------------------------------------------------------------------------
/qiubai.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import sys,os,pdb,re,time,random,datetime
3 | from spider import SpiderHTML
4 | from bs4 import BeautifulSoup
5 |
6 | class QiubaiSpider(SpiderHTML):
7 | def __init__(self,contentType,pageStart=1, pageEnd=1):
8 | #super.__init__(self)
9 | self._contentType = contentType
10 | self._pageStart = int(pageStart)
11 | self._pageEnd = int(pageEnd)+1
12 | self.__url = {'new':'http://www.qiushibaike.com/textnew/page/','hot':'http://www.qiushibaike.com/text/page/'}
13 |
14 | def getJokes(self):
15 | reqUrl = ''
16 |
17 | if self._contentType in self.__url:
18 | reqUrl = self.__url[self._contentType]
19 | else:
20 | reqUrl = self.__url['new']
21 | for i in range(self._pageStart,self._pageEnd):
22 | pageUrl = reqUrl+str(i)+'/'
23 | jokes = self.getUrl(pageUrl)
24 | jokes = jokes.find_all('div',id=re.compile('qiushi_tag_\d+'))
25 | filepath = os.path.join('E:\\','qiubai',str(datetime.date.today())+self._contentType+str(i))
26 | info = '正在保存第{page}页的糗事到文件 {file}.txt'
27 | print(info.format(page=i,file=filepath))
28 | for joke in jokes:
29 | jokeContent = str(joke.find('div',attrs={'class':'content'}))
30 | jokeContent = re.sub('<div class="content">','',jokeContent)  #strip wrapper tags (patterns are a reconstruction)
31 | jokeContent = re.sub('</div>','',jokeContent)
32 | jokeContent = re.sub('<span>','',jokeContent)
33 | jokeContent = re.sub('<br/>','\n',jokeContent)  #turn line breaks into newlines
34 | jokeContent = re.sub('<br>','\n',jokeContent)
35 | try:
36 | author = joke.find(attrs={'class':'author clearfix'}).find('h2').string
37 | upvote = joke.find(attrs={'class':'stats'}).span.i.string
38 | except AttributeError:
39 | continue  #author or vote count missing; skip this joke
40 |
41 | joke = '-----------------------------\r\n作者:{author}\r\n{joke}\r\n\r\n{upvote}人觉得很赞\r\n'.format(joke=jokeContent.strip(),author=author,upvote=upvote)
42 |
43 | self.saveText(filepath+'.txt',joke,'a')
44 | if i%2 == 0: #pause occasionally so we do not get blocked
45 | time.sleep(random.random()*3)
46 |
47 | if __name__ == '__main__':
48 | contentType = 'new'
49 | page = 0
50 | paramsNum = len(sys.argv)
51 |
52 | #first argument: fetch the newest ('new') or the hottest ('hot') jokes
53 | #arguments 2 and 3: the page range to fetch
54 | if paramsNum>=4:
55 | contentType = sys.argv[1]
56 | page = sys.argv[2]
57 | pageEnd = sys.argv[3]
58 | elif paramsNum>=3:
59 | contentType = sys.argv[1]
60 | page = sys.argv[2]
61 | pageEnd = page
62 | elif paramsNum == 2:
63 | contentType = sys.argv[1]
64 | page,pageEnd = 1,1
65 | else:
66 | contentType = 'new'
67 | page,pageEnd = 1,1
68 |
69 | qiubai = QiubaiSpider(contentType,page,pageEnd)
70 | qiubai.getJokes()
--------------------------------------------------------------------------------
/bili_online/show.html:
--------------------------------------------------------------------------------
(chart page; markup not recoverable from this export)
--------------------------------------------------------------------------------
/taobaomm.py:
--------------------------------------------------------------------------------
1 | from spider import SpiderHTML
2 | import re,os,sys,time,urllib,random,http
3 | '''
 4 | Scrape photos of Taobao models
5 | '''
6 |
7 | class TaobaommSpider(SpiderHTML):
 8 | #start page, end page, and the number of images to grab per model
9 | def __init__(self,pageStart, pageEnd,limit_img):
10 | self._pageStart = int(pageStart)
11 | self._pageEnd = int(pageEnd)+1
12 | self._limit = limit_img
13 | self.__url = 'https://mm.taobao.com/json/request_top_list.htm?page='
14 | self.__dir = 'E:\\taobaomm'
15 |
16 | def start(self):
17 | for page in range(self._pageStart,self._pageEnd):
18 | url = self.__url + str(page)
19 | contents = self.getUrl(url,'gbk')
20 | lists = contents.find_all('div',class_='personal-info')
21 | for girl in lists:
22 | info = girl.find('a',attrs={'class':'lady-name'})
23 | avatar = girl.find('a',class_='lady-avatar')
24 |
25 | girlinfo = {}
26 | girlinfo['name'] = info.string
27 | girlinfo['age'] = info.find_next_sibling('em').strong.string
28 | girlinfo['city'] = info.find_next('span').string
29 | girlinfo['url'] = 'https:'+avatar['href']
30 | #drop the thumbnail size suffix to get the full-size image
31 | girlinfo['avatar'] = 'https:'+re.sub('_\d+x\d+\.\w+$','',avatar.img['src'])
32 | imgType = os.path.splitext(girlinfo['avatar'])[1]
33 | logInfo = '找到一位MM:{name},{age}岁,她在{city}'.format(**girlinfo)
34 | print(logInfo)
35 | tmpDir = os.path.join(self.__dir,girlinfo['name']+'-'+girlinfo['age']+'-'+girlinfo['city'])
36 | if(os.path.exists(tmpDir)):
37 | print('已经获得过信息,去找下一位')
38 | continue
39 | #save the avatar and basic info in a folder named after her
40 | self.saveImg(girlinfo['avatar'],os.path.join(tmpDir,'avatar'+imgType))
41 | print('正在进入她的个人中心获取私图')
42 |
43 | girlCenter = self.getUrl(girlinfo['url'],'gbk')
44 | imgs = girlCenter.find('div',class_='mm-aixiu-content').find_all('img')
45 | i = 0
46 | for img in imgs:
47 | i = i + 1
48 | if i % 5 == 0:
49 | print('正在获取第{i}张图'.format(i=i))
50 | try:
51 | imgurl = 'https:'+img['src']
52 | extend_name = os.path.splitext(img['src'])[1]
53 | if extend_name == '.gif':
54 | continue #usually just emoticon images; skip them
55 | self.saveImg(imgurl,os.path.join(tmpDir,str(i)+extend_name))
56 | except urllib.error.HTTPError as e:
57 | pass
58 | except KeyError as e:
59 | pass
60 | except http.client.IncompleteRead:
61 | pass
62 |
63 | if i >= self._limit:
64 | pass #change this to break to cap the number of images per model
65 | time.sleep(random.random()*2)
66 |
67 |
68 | if __name__ == '__main__':
69 | page, limit, paramsNum= 1, 0, len(sys.argv)
70 | if paramsNum>=4:
71 | page, pageEnd, limit = sys.argv[1], sys.argv[2], int(sys.argv[3])
72 | elif paramsNum == 2:
73 | page = sys.argv[1]
74 | pageEnd = page
75 | else:
76 | page,pageEnd = 1,1
77 |
78 | if limit <5:
79 | limit = 20
80 | spider = TaobaommSpider(page,pageEnd,limit)
81 | spider.start()
82 |
--------------------------------------------------------------------------------
/zhihu.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from spider import SpiderHTML
4 | from multiprocessing import Pool
5 | import sys,urllib,http,os,random,re,time
6 | __author__ = 'waiting'
7 | '''
8 | Uses the third-party library BeautifulSoup4; install it yourself
9 | Requires spider.py in the same directory
10 | Environment: Python 3.4, Windows 7
11 | '''
12 |
13 | #URL of the collection
14 | url = 'https://www.zhihu.com/collection/30822111' #the page parameter is appended in code
15 |
16 | #local save path; created automatically if it does not exist
17 | store_path = 'E:\\zhihu\\收藏夹\\会员才知道的世界'
18 |
19 | class zhihuCollectionSpider(SpiderHTML):
20 | def __init__(self,pageStart, pageEnd, url):
21 | self._url = url
22 | self._pageStart = int(pageStart)
23 | self._pageEnd = int(pageEnd)+1
24 | self.downLimit = 0 #answers with fewer upvotes than this are not collected
25 |
26 | def start(self):
27 | for page in range(self._pageStart,self._pageEnd): #pages of the collection
28 | url = self._url + '?page='+str(page)
29 | content = self.getUrl(url)
30 | questionList = content.find_all('div',class_='zm-item')
31 | for question in questionList: #each question in the collection
32 | Qtitle = question.find('h2',class_='zm-item-title')
33 | if Qtitle is None: #the question has been removed
34 | continue
35 |
36 | questionStr = Qtitle.a.string
37 | Qurl = 'https://www.zhihu.com'+Qtitle.a['href'] #question URL
38 | Qtitle = re.sub(r'[\\/:*?"<>|]','#',Qtitle.a.string) #characters not allowed in Windows file/directory names
39 | try:
40 | print('-----正在获取问题:'+Qtitle+'-----') #got the question link and title, start fetching
41 | except UnicodeEncodeError:
42 | print(r'---问题含有特殊字符无法显示---')
43 | try:
44 | Qcontent = self.getUrl(Qurl)
45 | except:
46 | print('!!!!获取出错!!!!!')
47 | continue #skip this question if the request failed
48 | answerList = Qcontent.find_all('div',class_='zm-item-answer zm-item-expanded')
49 | self._processAnswer(answerList,Qtitle) #process the question's answers
50 | time.sleep(5)
51 |
52 |
53 | def _processAnswer(self,answerList,Qtitle):
54 | j = 0
55 | for answer in answerList:
56 | j = j + 1
57 |
58 | upvoted = int(answer.find('span',class_='count').string.replace('K','000')) #upvote count of this answer ('1K' becomes 1000)
59 | if upvoted < self.downLimit:
60 | continue
61 | authorInfo = answer.find('div',class_='zm-item-answer-author-info') #author info block
62 | author = {'introduction':'','link':''}
63 | try:
64 | author['name'] = authorInfo.find('a',class_='author-link').string #author's name
65 | author['introduction'] = str(authorInfo.find('span',class_='bio')['title']) #author's bio
66 | author['link'] = authorInfo.find('a',class_='author-link')['href']
67 | except AttributeError:
68 | author['name'] = '匿名用户'+str(j)
69 | except TypeError: #the bio is empty
70 | pass #anonymous users have no link
71 |
72 | file_name = os.path.join(store_path,Qtitle,'info',author['name']+'_info.txt')
73 | if os.path.exists(file_name): #already scraped
74 | continue
75 |
76 | self.saveText(file_name,'{introduction}\r\n{link}'.format(**author)) #save the author's info
77 | print('正在获取用户`{name}`的答案'.format(**author))
78 | answerContent = answer.find('div',class_='zm-editable-content clearfix')
79 | if answerContent is None: #answers that were reported have no content
80 | continue
81 |
82 | imgs = answerContent.find_all('img')
83 | if len(imgs) == 0: #the answer contains no images
84 | pass
85 | else:
86 | self._getImgFromAnswer(imgs,Qtitle,**author)
87 |
88 | #collect the images
89 | def _getImgFromAnswer(self,imgs,Qtitle,**author):
90 | i = 0
91 | for img in imgs:
92 | if 'inline-image' in img['class']: #skip Zhihu's small inline images
93 | continue
94 | i = i + 1
95 | imgUrl = img['src']
96 | extension = os.path.splitext(imgUrl)[1]
97 | path_name = os.path.join(store_path,Qtitle,author['name']+'_'+str(i)+extension)
98 | try:
99 | self.saveImg(imgUrl,path_name) #swallow any image error so the run is not interrupted
100 | except:
101 | pass
102 |
103 | #collect the text (not implemented)
104 | def _getTextFromAnswer(self):
105 | pass
106 |
107 | #run from the command line, e.g. `zhihu.py 1 5` fetches pages 1 to 5
108 | if __name__ == '__main__':
109 | page, limit, paramsNum= 1, 0, len(sys.argv)
110 | if paramsNum>=3:
111 | page, pageEnd = sys.argv[1], sys.argv[2]
112 | elif paramsNum == 2:
113 | page = sys.argv[1]
114 | pageEnd = page
115 | else:
116 | page,pageEnd = 1,1
117 |
118 | spider = zhihuCollectionSpider(page,pageEnd,url)
119 | spider.start()
120 |
121 |
--------------------------------------------------------------------------------
/bilibili/show.html:
--------------------------------------------------------------------------------
(chart page; markup not recoverable from this export)
--------------------------------------------------------------------------------
/bilibili/bilibili.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Date : 2016-09-20 15:42:13
4 | # @Author : waitingChou (zhouzt52@qq.com)
5 | # @Link : https://github.com/StephinChou/
6 | __author__ = 'waiting'
7 | from spider import SpiderHTML
8 | from multiprocessing import Pool
9 | import sys,urllib,http,os,re,time,codecs,json
10 | import pymysql
11 | pymysql.install_as_MySQLdb()
12 |
13 | #load the AV numbers that have already been crawled (the file may not exist on the first run)
14 | avSet = set()
15 | if os.path.exists('avSet.txt'):
16 |     with open('avSet.txt', 'r') as f:
17 |         avSet.update(f.read().strip().split(','))
18 |
19 | #configuration
20 | conn = pymysql.connect(host='localhost',user='root',passwd='',db='test',port=3306,use_unicode=True, charset="utf8")
21 | cur=conn.cursor()
22 | pattern = re.compile(r'\d+') #regex for extracting the AV number
23 | orders = {"hot":"播放量","review":"评论数","promote":"硬币数","stow":"收藏数"}
24 | biliUrl = 'http://www.bilibili.com'
25 |
26 | class BilibiliSpider(SpiderHTML):
27 | def __init__(self,module,timeStart,timeEnd,limit):
28 | self.url = biliUrl + '/video/' + module + '.html'
29 | self.timeStart = timeStart
30 | self.timeEnd = timeEnd
31 | self.limit = limit
32 |
33 | def start(self):
34 | content = self.getUrl(self.url)
35 | sorts = content.find('ul',class_='n_num')
36 | subSorts = sorts.find_all('a')
37 |
38 | #handle the sub-sections under this category
39 | for sub in subSorts:
40 | subName = sub.string
41 | if(subName == '全部'):
42 | continue
43 | #only the tid is needed for a sub-section
44 | tid = sub.parent['tid']
45 | if tid is None or tid == '' :
46 | print('模块{type} tid解析错误'.format(type=subName))
47 | continue
48 | self.parsePage(subName,tid)
49 |
50 | #crawl the ranking pages of one sub-section
51 | def parsePage(self,typeName,tid):
52 | for (order,name) in orders.items():
53 | sumData = dict()
54 | print("对子模块‘{typeName}’进行‘{name}’排序的分析".format(name=name,typeName=typeName))
55 | sort = 0
56 | #set to True once enough ranked entries have been collected
57 | isBreak = False
58 | for page in range(1,5):
59 | # http://www.bilibili.com/list/stow-65-1-2016-09-12~2016-09-19.html
60 | urlTmp = biliUrl + "/list/{order}-{tid}-{page}-{start}~{end}.html".format(order=order,tid=tid,page=page,start=self.timeStart,end=self.timeEnd)
61 | content = self.getUrl(urlTmp)
62 |
63 | videoContent = content.find('ul',class_='vd-list l1')
64 | videoList = videoContent.find_all('div',class_='l-item')
65 |
66 | for video in videoList:
67 | AVInfo = dict() #video info
68 | AVInfo['av'] = pattern.search(video.find('a',class_='title')['href']).group() #AV number
69 | AVInfo['title'] = video.find('a',class_='title').string #title
70 | sort=sort+1
71 | if AVInfo['av'] in avSet:
72 | print("已经爬取过该视频av{av},{title}".format(**AVInfo))
73 | continue
74 |
75 | AVInfo['author_name'] = video.find('a',class_='v-author').string #uploader name
76 | AVInfo['module'] = typeName #category name
77 | AVInfo['tid'] = tid #category id
78 | coinInfo = self.parseAV(AVInfo['av']) #query the per-video stat API for coin and share counts
79 | if coinInfo == 0:
80 | sort=sort-1
81 | print("作品名:{title},【视频信息获取失败】".format(**AVInfo))
82 | continue
83 |
84 | AVInfo['play'] = video.find('span',class_='v-info-i gk').span.string #play count
85 | AVInfo['danmu'] = video.find('span',class_='v-info-i dm').span.string #danmaku count
86 | AVInfo['collect'] = video.find('span',class_='v-info-i sc').span.string #favourite count
87 | AVInfo['url'] = biliUrl + video.find('a',class_='title')['href'] #video URL
88 | AVInfo['desc'] = video.find('div',class_='v-desc').string #video description
89 | AVInfo['author'] = video.find('a',class_='v-author')['href'].split('/')[-1] #uploader id
90 | #mark this video as crawled
91 | avSet.add(AVInfo['av'])
92 | AVInfo['mtime'] = int(time.time())
93 | AVInfo['ctime'] = int(time.time())
94 | #merge the stat info into the video info
95 | AVInfo = dict(coinInfo,**AVInfo)
96 |
97 | print("排名第{sort}:\t{author_name},\t播放量:{play},\t收藏数:{collect},\t硬币数:{coin},\t作品名:{title}".format(sort=sort,**AVInfo))
98 | sql = "INSERT IGNORE INTO `bilibili`(`av`, `title`, `module`,`tid`,`author`, `author_name`, `play`, `danmu`, `collect`, `desc`, `share`, `coin`, `mtime`, `ctime`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
99 | args = (AVInfo['av'],AVInfo['title'],AVInfo['module'],AVInfo['tid'],AVInfo['author'],AVInfo['author_name'],AVInfo['play'],AVInfo['danmu'],AVInfo['collect'],AVInfo['desc'],AVInfo['share'],AVInfo['coin'],AVInfo['mtime'],AVInfo['ctime'])
100 | cur.execute(sql,args)
101 | conn.commit()
102 | if sort >= self.limit:
103 | isBreak = True
104 | break
105 | if isBreak == True:
106 | break
107 | #all fetched; save the AV numbers
108 | with codecs.open('avSet.txt', encoding='utf-8', mode='w') as f:
109 | f.write(','.join(str(s) for s in avSet))
110 |
111 |
112 | #fetch coin and share counts for a single video from the stat API
113 | # @param avNum String, the numeric AV id, e.g. '6315006'
114 | def parseAV(self,avNum):
115 | url = "http://api.bilibili.com/archive_stat/stat?callback=&aid={av}&type=jsonp&_={time}".format(av=avNum,time=int(time.time()*1000))
116 | info = dict()
117 |
118 | try:
119 | content = self.getUrl(url)
120 | data = json.loads(str(content))
121 | info['coin'] = data['data']['coin']
122 | info['share'] = data['data']['share']
123 | except:
124 | return 0;
125 | return info
126 |
127 | #module is the category: 'game' for games, 'dance' for dance, etc.
128 | module = 'game'
129 | #start of the ranking window
130 | start = '2016-07-01'
131 | #end of the ranking window
132 | end = '2016-07-31'
133 | #number of ranked entries to fetch per sub-section (at most 100)
134 | limit = 40
135 | spider = BilibiliSpider(module,start, end,limit)
136 | print("分析周期:`{start}` ~ `{end}`".format(start=start,end=end))
137 | spider.start()
138 |
139 |
--------------------------------------------------------------------------------