├── README.md ├── bilibili_video.py └── video.sql /README.md: -------------------------------------------------------------------------------- 1 | # bilibili-video 2 | Bilibili视频爬虫 3 | -------------------------------------------------------------------------------- /bilibili_video.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf8-*- 2 | 3 | from lxml import etree 4 | from multiprocessing.dummy import Pool as ThreadPool 5 | import requests 6 | import time 7 | import sys 8 | import re 9 | import json 10 | import MySQLdb 11 | 12 | reload(sys) 13 | 14 | sys.setdefaultencoding('utf-8') 15 | 16 | # id av cid title tminfo time click danmu coins favourites duration honor_click honor_coins honor_favourites 17 | # mid name article fans tags[3] common 18 | 19 | urls = [] 20 | 21 | head = { 22 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36' 23 | } 24 | 25 | time1 = time.time() 26 | 27 | for i in range(17501, 100000): 28 | url = 'http://bilibili.com/video/av' + str(i) 29 | urls.append(url) 30 | 31 | 32 | def spider(url): 33 | html = requests.get(url, headers=head) 34 | selector = etree.HTML(html.text) 35 | content = selector.xpath("//html") 36 | for each in content: 37 | title = each.xpath('//div[@class="v-title"]/h1/@title') 38 | if title: 39 | av = url.replace("http://bilibili.com/video/av", "") 40 | title = title[0] 41 | tminfo1_log = each.xpath('//div[@class="tminfo"]/a/text()') 42 | tminfo2_log = each.xpath('//div[@class="tminfo"]/span[1]/a/text()') 43 | tminfo3_log = each.xpath('//div[@class="tminfo"]/span[2]/a/text()') 44 | if tminfo1_log: 45 | tminfo1 = tminfo1_log[0] 46 | else: 47 | tminfo1 = "" 48 | if tminfo2_log: 49 | tminfo2 = tminfo2_log[0] 50 | else: 51 | tminfo2 = "" 52 | if tminfo3_log: 53 | tminfo3 = tminfo3_log[0] 54 | else: 55 | tminfo3 = "" 56 | tminfo = tminfo1 + '-' + tminfo2 + '-' + tminfo3 57 | time_log = each.xpath('//div[@class="tminfo"]/time/i/text()') 58 | mid_log = each.xpath('//div[@class="b-btn f hide"]/@mid') 59 | name_log = each.xpath('//div[@class="usname"]/a/@title') 60 | article_log = each.xpath('//div[@class="up-video-message"]/div[1]/text()') 61 | fans_log = each.xpath('//div[@class="up-video-message"]/div[2]/text()') 62 | 63 | if time_log: 64 | time = time_log[0] 65 | else: 66 | time = "" 67 | if mid_log: 68 | mid = mid_log[0] 69 | else: 70 | mid = "" 71 | if name_log: 72 | name = name_log[0] 73 | else: 74 | name = "" 75 | if article_log: 76 | article = article_log[0].replace(u"投稿:","") 77 | else: 78 | article = "-1" 79 | if fans_log: 80 | fans = fans_log[0].replace(u"粉丝:","") 81 | else: 82 | fans = "-1" 83 | 84 | tag1_log = each.xpath('//ul[@class="tag-list"]/li[1]/a/text()') 85 | tag2_log = each.xpath('//ul[@class="tag-list"]/li[2]/a/text()') 86 | tag3_log = each.xpath('//ul[@class="tag-list"]/li[3]/a/text()') 87 | if tag1_log: 88 | tag1 = tag1_log[0] 89 | else: 90 | tag1 = "" 91 | if tag2_log: 92 | tag2 = tag2_log[0] 93 | else: 94 | tag2 = "" 95 | if tag3_log: 96 | tag3 = tag3_log[0] 97 | else: 98 | tag3 = "" 99 | 100 | cid_html_1 = each.xpath('//div[@class="scontent"]/iframe/@src') 101 | cid_html_2 = each.xpath('//div[@class="scontent"]/script/text()') 102 | if cid_html_1 or cid_html_2: 103 | if cid_html_1: 104 | cid_html = cid_html_1[0] 105 | else: 106 | cid_html = cid_html_2[0] 107 | 108 | cids = re.findall(r'cid=.+&aid', cid_html) 109 | cid = cids[0].replace("cid=", "").replace("&aid", "") 110 | info_url = "http://interface.bilibili.com/player?id=cid:" + str(cid) + "&aid=" + av 111 | video_info = requests.get(info_url) 112 | video_selector = etree.HTML(video_info.text) 113 | for video_each in video_selector: 114 | click_log = video_each.xpath('//click/text()') 115 | danmu_log = video_each.xpath('//danmu/text()') 116 | coins_log = video_each.xpath('//coins/text()') 117 | favourites_log = video_each.xpath('//favourites/text()') 118 | duration_log = video_each.xpath('//duration/text()') 119 | honor_click_log = video_each.xpath('//honor[@t="click"]/text()') 120 | honor_coins_log = video_each.xpath('//honor[@t="coins"]/text()') 121 | honor_favourites_log = video_each.xpath('//honor[@t="favourites"]/text()') 122 | 123 | if honor_click_log: 124 | honor_click = honor_click_log[0] 125 | else: 126 | honor_click = 0 127 | if honor_coins_log: 128 | honor_coins = honor_coins_log[0] 129 | else: 130 | honor_coins = 0 131 | if honor_favourites_log: 132 | honor_favourites = honor_favourites_log[0] 133 | else: 134 | honor_favourites = 0 135 | 136 | if click_log: 137 | click = click_log[0] 138 | else: 139 | click = -1 140 | if danmu_log: 141 | danmu = danmu_log[0] 142 | else: 143 | danmu = -1 144 | if coins_log: 145 | coins = coins_log[0] 146 | else: 147 | coins = -1 148 | if favourites_log: 149 | favourites = favourites_log[0] 150 | else: 151 | favourites = -1 152 | if duration_log: 153 | duration = duration_log[0] 154 | else: 155 | duration = "" 156 | 157 | json_url = "http://api.bilibili.com/x/reply?jsonp=jsonp&type=1&sort=0&pn=1&nohot=1&oid=" + av 158 | jsoncontent = requests.get(json_url, headers=head).content 159 | jsDict = json.loads(jsoncontent) 160 | if jsDict['code'] == 0: 161 | jsData = jsDict['data'] 162 | jsPages = jsData['page'] 163 | common = jsPages['acount'] 164 | try: 165 | conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, charset='utf8') 166 | cur = conn.cursor() 167 | conn.select_db('python') 168 | cur.execute('INSERT INTO video VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', 169 | [str(av), str(av), cid, title, tminfo, time, click, danmu, coins, favourites, duration, 170 | mid, name, article, fans, tag1, tag2, tag3, str(common), honor_click, honor_coins, honor_favourites]) 171 | 172 | print "Succeed: av" + str(av) 173 | except MySQLdb.Error, e: 174 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 175 | else: 176 | print "Error_Json: " + url 177 | else: 178 | print "Error_noCid:" + url 179 | else: 180 | print "Error_404: " + url 181 | 182 | 183 | pool = ThreadPool(10) 184 | # results = pool.map(spider, urls) 185 | try: 186 | results = pool.map(spider, urls) 187 | except Exception, e: 188 | # print 'ConnectionError' 189 | print e 190 | time.sleep(300) 191 | results = pool.map(spider, urls) 192 | 193 | pool.close() 194 | pool.join() 195 | -------------------------------------------------------------------------------- /video.sql: -------------------------------------------------------------------------------- 1 | # ************************************************************ 2 | # Sequel Pro SQL dump 3 | # Version 4135 4 | # 5 | # http://www.sequelpro.com/ 6 | # http://code.google.com/p/sequel-pro/ 7 | # 8 | # Host: 127.0.0.1 (MySQL 5.1.63) 9 | # Database: python 10 | # Generation Time: 2016-03-23 04:37:40 +0000 11 | # ************************************************************ 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 21 | 22 | 23 | # Dump of table video 24 | # ------------------------------------------------------------ 25 | 26 | CREATE TABLE `video` ( 27 | `id` int(11) NOT NULL AUTO_INCREMENT, 28 | `av` int(11) DEFAULT NULL, 29 | `cid` int(11) DEFAULT NULL, 30 | `title` varchar(150) DEFAULT NULL, 31 | `tminfo` varchar(45) DEFAULT NULL, 32 | `time` varchar(45) DEFAULT NULL, 33 | `click` int(11) DEFAULT NULL, 34 | `danmu` int(11) DEFAULT NULL, 35 | `coins` int(11) DEFAULT NULL, 36 | `favourites` int(11) DEFAULT NULL, 37 | `duration` varchar(45) DEFAULT NULL, 38 | `mid` int(11) DEFAULT NULL, 39 | `name` varchar(45) DEFAULT NULL, 40 | `article` int(11) DEFAULT NULL, 41 | `fans` int(11) DEFAULT NULL, 42 | `tag1` varchar(45) DEFAULT NULL, 43 | `tag2` varchar(45) DEFAULT NULL, 44 | `tag3` varchar(45) DEFAULT NULL, 45 | `common` int(11) DEFAULT NULL, 46 | `honor_click` int(11) DEFAULT NULL, 47 | `honor_coins` int(11) DEFAULT NULL, 48 | `honor_favourites` int(11) DEFAULT NULL, 49 | PRIMARY KEY (`id`) 50 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8; 51 | 52 | 53 | 54 | 55 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 56 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 57 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 58 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 59 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 60 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 61 | --------------------------------------------------------------------------------