├── image ├── 1.png └── online.png ├── bili_online ├── README.MD ├── data.php ├── bili_online.sql ├── biliOnline.py └── show.html ├── bilibili ├── README.MD ├── bilidata.php ├── bilibili.sql ├── show.html └── bilibili.py ├── README.md ├── spider.py ├── qiubai.py ├── taobaomm.py └── zhihu.py /image/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StephinChou/Pythonspider/HEAD/image/1.png -------------------------------------------------------------------------------- /image/online.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StephinChou/Pythonspider/HEAD/image/online.png -------------------------------------------------------------------------------- /bili_online/README.MD: -------------------------------------------------------------------------------- 1 | # 操作流程: 2 | * 直接在window下运行biliOnline.py获取在线数据 3 | * 推荐用cron计划任务来运行,如果采用此方式,请__去掉代码里的死循环__ 4 | * 构造php运行环境,配置data.php文件(推荐直接使用wamp) 5 | * 运行show.html 文件即可查看数据图标(注意data.php的**路径**) 6 | 7 | # 数据展示 8 | ![](https://github.com/StephinChou/Pythonspider/blob/master/image/online.png) 9 | -------------------------------------------------------------------------------- /bilibili/README.MD: -------------------------------------------------------------------------------- 1 | # 爬取B站 视频热度排行的 视频数据 bilibili.py 2 | * 只需输入一个大模块名,如游戏模块名为'game',自行会爬取下面几个小类,并按播放数、硬币数等排行分别爬取 3 | * 已解析出数据接口,直接获取视频数据,不使用webDriver,爬取速度提升数十倍,并且不会miss数据2016.9.22 4 | * @TODO 对爬取到的视频做日期归类(待完成) 5 | * 目前爬取的信息有: 6 | * up主id 7 | * up主名 8 | * 视频AV号 9 | * 播放数 10 | * 收藏数 11 | * 弹幕数 12 | * 视频描述 13 | * 硬币数(获取不稳定,少数会获取不到) 14 | * 分享数(同上) 15 | 16 | 17 | # 数据展示 18 | ![](https://github.com/StephinChou/Pythonspider/blob/master/image/1.png) 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pythonspider,一个简单的python爬虫 2 | * 娱乐随手写的,代码不太严谨,仅仅实现功能 3 | * 原生python+BeautifulSoup4 4 | * python3.4版本 5 | * 所有脚本要和spider.py放到同一目录下 6 | * 自行下载BeautifulSoup4 的类库 window下 `pip install bs4`即可 7 | 8 | ## 爬取知乎的爬虫 zhihu.py 9 | * 主要实现 爬取一个收藏夹 里 所有问题答案下的 图片 10 | * 文字信息暂未收录,可自行实现,比图片更简单 11 | * 具体代码里有详细注释,请自行阅读 12 | 13 | ## 子文件夹文件说明: 14 | |文件名|说明| 15 | |:-:|:-:| 16 | |\*.py文件|爬虫程序| 17 | |\*.sql文件|数据表结构| 18 | |\*.html文件|数据展示| 19 | |\*.php文件|数据展示界面数据接口| 20 | -------------------------------------------------------------------------------- /bili_online/data.php: -------------------------------------------------------------------------------- 1 | "播放量","coin"=>"硬币数","collect"=>"收藏数","danmu"=>"弹幕数"); 10 | if(!isset($fields[$order])){ 11 | echo 0; 12 | die; 13 | } 14 | if($average){ 15 | $by = 'avg'.$order; 16 | } 17 | $sql = "SELECT author_name,count('*') as 'avNum', AVG(`{$order}`) as avg{$order},sum(`{$order}`) as {$order} FROM `bilibili` group by author order by {$order} DESC limit {$limit}"; 18 | 19 | 20 | $result = mysqli_query($link,$sql); 21 | file_put_contents("a.txt", $sql); 22 | 23 | $data['name'] = $average ? "平均每视频".$fields[$order] :$fields[$order]; 24 | while($row = mysqli_fetch_array($result)) 25 | { 26 | if($average){ 27 | $num = intval($row['avg'.$order]); 28 | }else{ 29 | $num = intval($row[$order]); 30 | } 31 | $sort[] = $num; 32 | $tmp = array($row['author_name'],$num); 33 | $data['data'][] = $tmp; 34 | } 35 | 36 | array_multisort($data['data'],SORT_ASC ,SORT_NUMERIC ,$sort); 37 | $callback = $_GET['callback'] ? 
$_GET['callback'] :"callback"; 38 | 39 | echo "{$callback}(".json_encode($data).")"; -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | #!/url/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'waiting' 4 | import os,re,codecs,urllib,io,gzip,zlib 5 | from urllib import request 6 | from bs4 import BeautifulSoup 7 | import chardet 8 | 9 | 10 | class SpiderHTML(object): 11 | #打开页面 12 | def getUrl(self, url, coding='utf-8'): 13 | req = request.Request(url) 14 | req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 UBrowser/5.5.9703.2 Safari/537.36') 15 | req.add_header('Accept-encoding', 'gzip') 16 | with request.urlopen(req) as response: 17 | gzipd = response.headers.get('Content-Encoding') 18 | if gzipd == 'gzip': 19 | data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS) 20 | 21 | else: 22 | data = response.read() 23 | return BeautifulSoup(data.decode(coding)) 24 | 25 | #保存文本内容到本地 26 | def saveText(self,filename,content,mode='w'): 27 | self._checkPath(filename) 28 | with codecs.open(filename, encoding='utf-8', mode=mode) as f: 29 | f.write(content) 30 | 31 | 32 | #保存图片 33 | def saveImg(self, imgUrl, imgName): 34 | data=request.urlopen(imgUrl).read() 35 | self._checkPath(imgName) 36 | with open(imgName,'wb') as f: 37 | f.write(data) 38 | 39 | #创建目录 40 | def _checkPath(self, path): 41 | dirname = os.path.dirname(path.strip()) 42 | if not os.path.exists(dirname): 43 | os.makedirs(dirname) 44 | -------------------------------------------------------------------------------- /bilibili/bilibili.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat MySQL Data Transfer 3 | 4 | Source Server : localhost 5 | Source Server Version : 50617 6 | Source Host : localhost:3306 7 | Source Database : test 8 | 9 | Target Server Type : MYSQL 10 | Target Server Version : 50617 11 | File Encoding : 65001 12 | 13 | Date: 2016-09-20 15:20:15 14 | */ 15 | 16 | SET FOREIGN_KEY_CHECKS=0; 17 | 18 | -- ---------------------------- 19 | -- Table structure for `bilibili` 20 | -- ---------------------------- 21 | DROP TABLE IF EXISTS `bilibili`; 22 | CREATE TABLE `bilibili` ( 23 | `id` int(11) NOT NULL AUTO_INCREMENT, 24 | `av` varchar(10) NOT NULL COMMENT '视频av号', 25 | `title` varchar(100) NOT NULL COMMENT '视频标题', 26 | `module` varchar(20) NOT NULL DEFAULT '' COMMENT '视频模块', 27 | `tid` varchar(5) NOT NULL DEFAULT '' COMMENT '模块编号', 28 | `author` varchar(10) NOT NULL COMMENT '作者id', 29 | `author_name` varchar(30) NOT NULL COMMENT '作者名字', 30 | `play` int(11) NOT NULL COMMENT '播放数', 31 | `danmu` int(11) NOT NULL COMMENT '弹幕数', 32 | `collect` int(11) NOT NULL COMMENT '收藏数', 33 | `desc` varchar(500) NOT NULL COMMENT '视频描述', 34 | `share` int(11) NOT NULL COMMENT '分享数', 35 | `coin` int(11) NOT NULL COMMENT '硬币数', 36 | `mtime` int(11) NOT NULL COMMENT '修改时间', 37 | `ctime` int(11) NOT NULL COMMENT '创建时间', 38 | PRIMARY KEY (`id`), 39 | UNIQUE KEY `UQ_video` (`av`) USING BTREE 40 | ) ENGINE=InnoDB AUTO_INCREMENT=436 DEFAULT CHARSET=utf8mb4; 41 | -------------------------------------------------------------------------------- /qiubai.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import sys,os,pdb,re,time,random,datetime 3 | from spider import SpiderHTML 4 | from bs4 import BeautifulSoup 5 | 6 | class 
QiubaiSpider(SpiderHTML): 7 | def __init__(self,contentType,pageStart=1, pageEnd=1): 8 | #super.__init__(self) 9 | self._contentType = contentType 10 | self._pageStart = int(pageStart) 11 | self._pageEnd = int(pageEnd)+1 12 | self.__url = {'new':'http://www.qiushibaike.com/textnew/page/','hot':'http://www.qiushibaike.com/text/page/'} 13 | 14 | def getJokes(self): 15 | reqUrl = '' 16 | 17 | if contentType in self.__url: 18 | reqUrl = self.__url[self._contentType] 19 | else: 20 | reqUrl = self.__url['new'] 21 | for i in range(self._pageStart,self._pageEnd): 22 | pageUrl = reqUrl+str(i)+'/' 23 | jokes = self.getUrl(pageUrl) 24 | jokes = jokes.find_all('div',id=re.compile('qiushi_tag_\d+')) 25 | filepath = os.path.join('E:\\','qiubai',str(datetime.date.today())+self._contentType+str(i)) 26 | info = '正在保存第{page}页的糗事到文件 {file}.txt' 27 | print(info.format(page=i,file=filepath)) 28 | for joke in jokes: 29 | jokeContent = str(joke.find('div',attrs={'class':'content'})) 30 | jokeContent = re.sub('
<div class="content">','',jokeContent) 31 | jokeContent = re.sub('</div>','',jokeContent) 32 | jokeContent = re.sub('<span>','',jokeContent) 33 | jokeContent = re.sub('</span>','\n',jokeContent) 34 | jokeContent = re.sub('<br/>
','\n',jokeContent) 35 | try: 36 | author = joke.find(attrs={'class':'author clearfix'}).find('h2').string 37 | upvote = joke.find(attrs={'class':'stats'}).span.i.string 38 | except AttributeError: 39 | pass 40 | 41 | joke = '-----------------------------\r\n作者:{author}\r\n{joke}\r\n\r\n{upvote}人觉得很赞\r\n'.format(joke=jokeContent.strip(),author=author,upvote=upvote) 42 | 43 | self.saveText(filepath+'.txt',joke,'a') 44 | if i%2 == 0: #防止被封,间隔时间长一点 45 | time.sleep(random.random()*3) 46 | 47 | if __name__ == '__main__': 48 | contentType = 'new' 49 | page = 0 50 | paramsNum = len(sys.argv) 51 | 52 | #输入想获取最新的糗百还是最热的糗百 53 | #参数2,3为想要获取的页数 54 | if paramsNum>=4: 55 | contentType = sys.argv[1] 56 | page = sys.argv[2] 57 | pageEnd = sys.argv[3] 58 | elif paramsNum>=3: 59 | contentType = sys.argv[1] 60 | page = sys.argv[2] 61 | pageEnd = page 62 | elif paramsNum == 2: 63 | contentType = sys.argv[1] 64 | page,pageEnd = 1,1 65 | else: 66 | contentType = 'new' 67 | page,pageEnd = 1,1 68 | 69 | qiubai = QiubaiSpider(contentType,page,pageEnd) 70 | qiubai.getJokes() -------------------------------------------------------------------------------- /bili_online/show.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 75 | 76 | 77 | 78 |
79 | -------------------------------------------------------------------------------- /taobaomm.py: -------------------------------------------------------------------------------- 1 | from spider import SpiderHTML 2 | import re,os,sys,time,urllib,random,http 3 | ''' 4 | 抓取淘宝模特的靓图 5 | ''' 6 | 7 | class TaobaommSpider(SpiderHTML): 8 | #抓取起始页,结束页,每个妹子抓取的图片数量 9 | def __init__(self,pageStart, pageEnd,limit_img): 10 | self._pageStart = int(pageStart) 11 | self._pageEnd = int(pageEnd)+1 12 | self._limit = limit_img 13 | self.__url = 'https://mm.taobao.com/json/request_top_list.htm?page=' 14 | self.__dir = 'E:\\taobaomm' 15 | 16 | def start(self): 17 | for page in range(self._pageStart,self._pageEnd): 18 | url = self.__url + str(page) 19 | contents = self.getUrl(url,'gbk') 20 | lists = contents.find_all('div',class_='personal-info') 21 | for girl in lists: 22 | info = girl.find('a',attrs={'class':'lady-name'}) 23 | avatar = girl.find('a',class_='lady-avatar') 24 | 25 | girlinfo = {} 26 | girlinfo['name'] = info.string 27 | girlinfo['age'] = info.find_next_sibling('em').strong.string 28 | girlinfo['city'] = info.find_next('span').string 29 | girlinfo['url'] = 'https:'+avatar['href'] 30 | #去除掉缩小的图片 31 | girlinfo['avatar'] = 'https:'+re.sub('_\d+x\d+\.\w+$','',avatar.img['src']) 32 | imgType = os.path.splitext(girlinfo['avatar'])[1] 33 | logInfo = '找到一位MM:{name},{age}岁,她在{city}'.format(**girlinfo) 34 | print(logInfo) 35 | tmpDir = os.path.join(self.__dir,girlinfo['name']+'-'+girlinfo['age']+'-'+girlinfo['city']) 36 | if(os.path.exists(tmpDir)): 37 | print('已经获得过信息,去找下一位') 38 | continue 39 | #以名字命名,保存图片和基本信息 40 | self.saveImg(girlinfo['avatar'],os.path.join(tmpDir,'avatar'+imgType)) 41 | print('正在进入她的个人中心获取私图') 42 | 43 | gilrsCenter = self.getUrl(girlinfo['url'],'gbk') 44 | imgs = gilrsCenter.find('div',class_='mm-aixiu-content').find_all('img') 45 | i = 0 46 | for img in imgs: 47 | i = i + 1 48 | if i % 5 == 0: 49 | print('正在获取第{i}张图'.format(i=i)) 50 | try: 51 | imgurl = 'https:'+img['src'] 52 | extend_name = os.path.splitext(img['src'])[1] 53 | if extend_name == '.gif': 54 | continue #一般都是表情图,略过 55 | self.saveImg(imgurl,os.path.join(tmpDir,str(i)+extend_name)) 56 | except urllib.error.HTTPError as e: 57 | pass 58 | except KeyError as e: 59 | pass 60 | except http.client.IncompleteRead: 61 | pass 62 | 63 | if i >= self._limit: 64 | pass #若要限制每个模特抓图的张数,此处改为break 65 | time.sleep(random.random()*2) 66 | 67 | 68 | if __name__ == '__main__': 69 | page, limit, paramsNum= 1, 0, len(sys.argv) 70 | if paramsNum>=4: 71 | page, pageEnd, limit = sys.argv[1], sys.argv[2], int(sys.argv[3]) 72 | elif paramsNum == 2: 73 | page = sys.argv[1] 74 | pageEnd = page 75 | else: 76 | page,pageEnd = 1,1 77 | 78 | if limit <5: 79 | limit = 20 80 | spider = TaobaommSpider(page,pageEnd,limit) 81 | spider.start() 82 | -------------------------------------------------------------------------------- /zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from spider import SpiderHTML 4 | from multiprocessing import Pool 5 | import sys,urllib,http,os,random,re,time 6 | __author__ = 'waiting' 7 | ''' 8 | 使用了第三方的类库 BeautifulSoup4,请自行安装 9 | 需要目录下的spider.py文件 10 | 运行环境:python3.4,windows7 11 | ''' 12 | 13 | #收藏夹的地址 14 | url = 'https://www.zhihu.com/collection/30822111' #page参数改为代码添加 15 | 16 | #本地存放的路径,不存在会自动创建 17 | store_path = 'E:\\zhihu\收藏夹\\会员才知道的世界' 18 | 19 | class zhihuCollectionSpider(SpiderHTML): 20 | def __init__(self,pageStart, pageEnd, url): 21 | self._url 
= url 22 | self._pageStart = int(pageStart) 23 | self._pageEnd = int(pageEnd)+1 24 | self.downLimit = 0 #低于此赞同的答案不收录 25 | 26 | def start(self): 27 | for page in range(self._pageStart,self._pageEnd): #收藏夹的页数 28 | url = self._url + '?page='+str(page) 29 | content = self.getUrl(url) 30 | questionList = content.find_all('div',class_='zm-item') 31 | for question in questionList: #收藏夹的每个问题 32 | Qtitle = question.find('h2',class_='zm-item-title') 33 | if Qtitle is None: #被和谐了 34 | continue 35 | 36 | questionStr = Qtitle.a.string 37 | Qurl = 'https://www.zhihu.com'+Qtitle.a['href'] #问题题目 38 | Qtitle = re.sub(r'[\\/:*?"<>]','#',Qtitle.a.string) #windows文件/目录名不支持的特殊符号 39 | try: 40 | print('-----正在获取问题:'+Qtitle+'-----') #获取到问题的链接和标题,进入抓取 41 | except UnicodeEncodeError: 42 | print(r'---问题含有特殊字符无法显示---') 43 | try: 44 | Qcontent = self.getUrl(Qurl) 45 | except: 46 | print('!!!!获取出错!!!!!') 47 | pass 48 | answerList = Qcontent.find_all('div',class_='zm-item-answer zm-item-expanded') 49 | self._processAnswer(answerList,Qtitle) #处理问题的答案 50 | time.sleep(5) 51 | 52 | 53 | def _processAnswer(self,answerList,Qtitle): 54 | j = 0 55 | for answer in answerList: 56 | j = j + 1 57 | 58 | upvoted = int(answer.find('span',class_='count').string.replace('K','000')) #获得此答案赞同数 59 | if upvoted < self.downLimit: 60 | continue 61 | authorInfo = answer.find('div',class_='zm-item-answer-author-info') #获取作者信息 62 | author = {'introduction':'','link':''} 63 | try: 64 | author['name'] = authorInfo.find('a',class_='author-link').string #获得作者的名字 65 | author['introduction'] = str(authorInfo.find('span',class_='bio')['title']) #获得作者的简介 66 | author['link'] = authorInfo.find('a',class_='author-link')['href'] 67 | except AttributeError: 68 | author['name'] = '匿名用户'+str(j) 69 | except TypeError: #简介为空的情况 70 | pass #匿名用户没有链接 71 | 72 | file_name = os.path.join(store_path,Qtitle,'info',author['name']+'_info.txt') 73 | if os.path.exists(file_name): #已经抓取过 74 | continue 75 | 76 | self.saveText(file_name,'{introduction}\r\n{link}'.format(**author)) #保存作者的信息 77 | print('正在获取用户`{name}`的答案'.format(**author)) 78 | answerContent = answer.find('div',class_='zm-editable-content clearfix') 79 | if answerContent is None: #被举报的用户没有答案内容 80 | continue 81 | 82 | imgs = answerContent.find_all('img') 83 | if len(imgs) == 0: #答案没有上图 84 | pass 85 | else: 86 | self._getImgFromAnswer(imgs,Qtitle,**author) 87 | 88 | #收录图片 89 | def _getImgFromAnswer(self,imgs,Qtitle,**author): 90 | i = 0 91 | for img in imgs: 92 | if 'inline-image' in img['class']: #不抓取知乎的小图 93 | continue 94 | i = i + 1 95 | imgUrl = img['src'] 96 | extension = os.path.splitext(imgUrl)[1] 97 | path_name = os.path.join(store_path,Qtitle,author['name']+'_'+str(i)+extension) 98 | try: 99 | self.saveImg(imgUrl,path_name) #捕获各种图片异常,流程不中断 100 | except: 101 | pass 102 | 103 | #收录文字 104 | def _getTextFromAnswer(self): 105 | pass 106 | 107 | #命令行下运行,例:zhihu.py 1 5 获取1到5页的数据 108 | if __name__ == '__main__': 109 | page, limit, paramsNum= 1, 0, len(sys.argv) 110 | if paramsNum>=3: 111 | page, pageEnd = sys.argv[1], sys.argv[2] 112 | elif paramsNum == 2: 113 | page = sys.argv[1] 114 | pageEnd = page 115 | else: 116 | page,pageEnd = 1,1 117 | 118 | spider = zhihuCollectionSpider(page,pageEnd,url) 119 | spider.start() 120 | 121 | -------------------------------------------------------------------------------- /bilibili/show.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 64 | 65 | 66 | 67 | 68 | 69 |
[bilibili/show.html lines 70-137: the page markup was stripped during extraction; only the three control-group labels survive: 统计类型 (statistic type, i.e. which metric to rank by), up主数量 (number of uploaders to show), 按视频平均数 (per-video average toggle). The page displays the ranking served by its PHP data interface as a chart; a hedged Python sketch of such a request, written against the bili_online data.php shown earlier, follows.]
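A minimal sketch, in Python, of consuming the JSONP ranking emitted by bili_online/data.php; this is roughly what the stripped show.html scripts do in the browser. The endpoint URL and the order/limit/average parameter names are assumptions (the part of data.php that reads them from $_GET was also lost in extraction); the field keys play/coin/collect/danmu and the callback parameter come from the surviving data.php source.

```python
# Sketch (not part of the repo) of consuming the JSONP ranking from data.php.
# Assumptions: the endpoint URL and the order/limit/average parameter names;
# the $_GET-reading preamble of data.php was lost in extraction.
import json
from urllib import request, parse

def fetch_ranking(order='play', limit=20, average=0,
                  endpoint='http://localhost/bili_online/data.php'):
    # order must be one of the keys in data.php's $fields: play, coin, collect, danmu
    qs = parse.urlencode({'order': order, 'limit': limit,
                          'average': average, 'callback': 'callback'})
    with request.urlopen(endpoint + '?' + qs) as resp:
        body = resp.read().decode('utf-8')
    if '(' not in body:          # data.php answers a bare 0 for an unknown order field
        return None
    payload = body[body.index('(') + 1:body.rindex(')')]   # strip the callback(...) wrapper
    return json.loads(payload)   # {'name': ..., 'data': [[author_name, num], ...]}

if __name__ == '__main__':
    ranking = fetch_ranking('coin', limit=10)
    if ranking:
        for author, num in ranking['data']:
            print(author, num)
```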
138 | -------------------------------------------------------------------------------- /bilibili/bilibili.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Date : 2016-09-20 15:42:13 4 | # @Author : waitingChou (zhouzt52@qq.com) 5 | # @Link : https://github.com/StephinChou/ 6 | __author__ = 'waiting' 7 | from spider import SpiderHTML 8 | from multiprocessing import Pool 9 | import sys,urllib,http,os,re,time,codecs,json 10 | import pymysql 11 | pymysql.install_as_MySQLdb() 12 | 13 | #从本地记录里获取曾经爬取过的视频号 14 | f = open('avSet.txt','r') 15 | avSet = set([]) 16 | for line in f: 17 | avSet = set(line.split(',')) 18 | 19 | #一些配置 20 | conn = pymysql.connect(host='localhost',user='root',passwd='',db='test',port=3306,use_unicode=True, charset="utf8") 21 | cur=conn.cursor() 22 | pattern = re.compile(r'\d+') #获取av号的正则表达式 23 | orders = {"hot":"播放量","review":"评论数","promote":"硬币数","stow":"收藏数"} 24 | biliUrl = 'http://www.bilibili.com' 25 | 26 | class BilibiliSpider(SpiderHTML): 27 | def __init__(self,module,timeStart,timeEnd,limit): 28 | self.url = biliUrl + '/video/' + module + '.html' 29 | self.timeStart = timeStart 30 | self.timeEnd = timeEnd 31 | self.limit = limit 32 | 33 | def start(self): 34 | content = self.getUrl(self.url) 35 | sorts = content.find('ul',class_='n_num') 36 | subSorts = sorts.find_all('a') 37 | 38 | #处理该类别下的子模块 39 | for sub in subSorts: 40 | subName = sub.string 41 | if(subName == '全部'): 42 | continue 43 | #子模块只需要tid即可 44 | tid = sub.parent['tid'] 45 | if tid is None or tid == '' : 46 | print('模块{type} tid解析错误'.format(type=subName)) 47 | continue 48 | self.parsePage(subName,tid) 49 | 50 | #处理一个子模块的页面 51 | def parsePage(self,typeName,tid): 52 | for (order,name) in orders.items(): 53 | sumData = dict() 54 | print("对子模块‘{typeName}’进行‘{name}’排序的分析".format(name=name,typeName=typeName)) 55 | sort = 0; 56 | #是否获取到足够的排名 57 | isBreak = False 58 | for page in range(1,5): 59 | # http://www.bilibili.com/list/stow-65-1-2016-09-12~2016-09-19.html 60 | urlTmp = biliUrl + "/list/{order}-{tid}-{page}-{start}~{end}.html".format(order=order,tid=tid,page=page,start=self.timeStart,end=self.timeEnd) 61 | content = self.getUrl(urlTmp) 62 | 63 | videoContent = content.find('ul',class_='vd-list l1') 64 | videoList = videoContent.find_all('div',class_='l-item') 65 | 66 | for video in videoList: 67 | AVInfo = dict() #作品信息 68 | AVInfo['av'] = pattern.search(video.find('a',class_='title')['href']).group() #av号 69 | AVInfo['title'] = video.find('a',class_='title').string #标题 70 | sort=sort+1 71 | if AVInfo['av'] in avSet: 72 | print("已经爬取过该视频av{av},{title}".format(**AVInfo)) 73 | continue 74 | 75 | AVInfo['author_name'] = video.find('a',class_='v-author').string #作者 76 | AVInfo['module'] = typeName #模块名 77 | AVInfo['tid'] = tid #模块id 78 | coinInfo = self.parseAV(AVInfo['av']) #解析详细视频页面获取硬币和收藏数 79 | if coinInfo == 0: 80 | sort=sort-1 81 | print("作品名:{title},【视频信息获取失败】".format(**AVInfo)) 82 | continue 83 | 84 | AVInfo['play'] = video.find('span',class_='v-info-i gk').span.string #播放数 85 | AVInfo['danmu'] = video.find('span',class_='v-info-i dm').span.string #弹幕数 86 | AVInfo['collect'] = video.find('span',class_='v-info-i sc').span.string #收藏数 87 | AVInfo['url'] = biliUrl + video.find('a',class_='title')['href'] #视频链接 88 | AVInfo['desc'] = video.find('div',class_='v-desc').string #视频描述 89 | AVInfo['author'] = video.find('a',class_='v-author')['href'].split('/')[-1] #用户id 90 | #将此视频加入已经爬取过的列表 91 | avSet.add(AVInfo['av']) 
92 | AVInfo['mtime'] = int(time.time()) 93 | AVInfo['ctime'] = int(time.time()) 94 | #合并信息 95 | AVInfo = dict(coinInfo,**AVInfo) 96 | 97 | print("排名第{sort}:\t{author_name},\t播放量:{play},\t收藏数:{collect},\t硬币数:{coin},\t作品名:{title}".format(sort=sort,**AVInfo)) 98 | sql = "INSERT IGNORE INTO `bilibili`(`av`, `title`, `module`,`tid`,`author`, `author_name`, `play`, `danmu`, `collect`, `desc`, `share`, `coin`, `mtime`, `ctime`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 99 | args = (AVInfo['av'],AVInfo['title'],AVInfo['module'],AVInfo['tid'],AVInfo['author'],AVInfo['author_name'],AVInfo['play'],AVInfo['danmu'],AVInfo['collect'],AVInfo['desc'],AVInfo['share'],AVInfo['coin'],AVInfo['mtime'],AVInfo['ctime']) 100 | cur.execute(sql,args) 101 | conn.commit() 102 | if sort >= self.limit: 103 | isBreak = True 104 | break 105 | if isBreak == True: 106 | break 107 | #全部获取完毕,保存av号 108 | with codecs.open('avSet.txt', encoding='utf-8', mode='w') as f: 109 | f.write(','.join(str(s) for s in avSet)) 110 | 111 | 112 | #解析单独的一个视频 113 | # @param avNum String video/av6315006/ 114 | def parseAV(self,avNum): 115 | url = "http://api.bilibili.com/archive_stat/stat?callback=&aid={av}&type=jsonp&_={time}".format(av=avNum,time=int(time.time()*1000)) 116 | info = dict() 117 | 118 | try: 119 | content = self.getUrl(url) 120 | data = json.loads(str(content)) 121 | info['coin'] = data['data']['coin'] 122 | info['share'] = data['data']['share'] 123 | except: 124 | return 0; 125 | return info 126 | 127 | #module 为 分类 :游戏 game 舞蹈 dance等 128 | module = 'game' 129 | #热度统计开始时间 130 | start = '2016-07-01' 131 | #热度统计结束时间 132 | end = '2016-07-31' 133 | #单个模块排名获取个数100以内 134 | limit = 40 135 | spider = BilibiliSpider(module,start, end,limit) 136 | print("分析周期:`{start}` ~ `{end}`".format(start=start,end=end)) 137 | spider.start() 138 | 139 | --------------------------------------------------------------------------------
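The bilibili README stresses that the spider hits the site's data interface instead of driving a webDriver; concretely, BilibiliSpider.parseAV() builds the archive_stat URL below and, because the callback parameter is left empty, the endpoint answers with plain JSON. A standalone sketch of that call, with the URL copied verbatim from bilibili.py (whether the endpoint still responds this way is not guaranteed):

```python
# Standalone version of the stat lookup done by BilibiliSpider.parseAV():
# with callback= left empty the archive_stat endpoint returns plain JSON,
# so no HTML parsing is needed. URL copied from bilibili.py; the endpoint's
# current behaviour is not guaranteed.
import json, time
from urllib import request

def fetch_stat(av_num):
    url = ('http://api.bilibili.com/archive_stat/stat'
           '?callback=&aid={av}&type=jsonp&_={ts}').format(av=av_num, ts=int(time.time() * 1000))
    with request.urlopen(url) as resp:
        data = json.loads(resp.read().decode('utf-8'))
    return {'coin': data['data']['coin'], 'share': data['data']['share']}
```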
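bilibili.py keeps the AV numbers it has already crawled in avSet.txt as a single comma-separated line: the file is read into a set at start-up and rewritten once a crawl finishes. The open('avSet.txt','r') at the top of the file assumes the file already exists; a hedged sketch of the same round trip with a first-run guard added:

```python
# Round trip for the avSet.txt dedupe file used by bilibili.py (one
# comma-separated line of AV numbers). The os.path.exists guard is an
# addition: the original open('avSet.txt','r') raises FileNotFoundError
# on the very first run, before the file has ever been written.
import codecs, os

def load_av_set(path='avSet.txt'):
    if not os.path.exists(path):
        return set()
    with open(path, 'r') as f:
        return set(f.read().strip().split(','))

def save_av_set(av_set, path='avSet.txt'):
    with codecs.open(path, encoding='utf-8', mode='w') as f:
        f.write(','.join(str(s) for s in av_set))
```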
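All four spiders subclass the SpiderHTML base class in spider.py: getUrl() fetches a page (inflating gzip responses when the server sends them) and returns a BeautifulSoup tree, while saveText() and saveImg() create any missing directories before writing. A minimal subclass sketch; the target URL is borrowed from qiubai.py and the output paths are placeholders, not part of the repo:

```python
# Sketch of the subclassing pattern used by qiubai.py, taobaomm.py,
# zhihu.py and bilibili.py. The output paths are placeholders.
import os
from spider import SpiderHTML

class DemoSpider(SpiderHTML):
    def start(self):
        page = self.getUrl('http://www.qiushibaike.com/text/page/1/')   # BeautifulSoup tree
        title_tag = page.find('title')
        if title_tag and title_tag.string:
            self.saveText('E:\\demo\\title.txt', title_tag.string)      # parent dirs auto-created
        for i, img in enumerate(page.find_all('img')[:3]):
            src = img.get('src', '')
            if src.startswith('http'):
                ext = os.path.splitext(src)[1] or '.jpg'
                self.saveImg(src, 'E:\\demo\\img{0}{1}'.format(i, ext))

if __name__ == '__main__':
    DemoSpider().start()
```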