├── LICENSE
├── README.md
├── analysis.py
├── excelSave.py
├── img
│   ├── fy.png
│   ├── seg.png
│   └── weibo.png
├── normal-topic-spyder.py
├── seg.py
├── seg_result
│   ├── IG+rng.txt
│   ├── edg.txt
│   ├── igbanlan.txt
│   ├── jackeylove.txt
│   ├── mlxg.txt
│   ├── result.txt
│   ├── teamwe.txt
│   ├── theshy.txt
│   ├── uzi.txt
│   └── 英雄联盟.txt
├── super-topic-spyder.py
└── weibodata
    ├── IG+rng.xls
    ├── edg.xls
    ├── igbanlan.xls
    ├── jackeylove.xls
    ├── mlxg.xls
    ├── teamwe.xls
    ├── theshy.xls
    ├── uzi.xls
    ├── 肺炎.xls
    └── 英雄联盟.xls

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Greatcat

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# weibo-topic-spyder
A crawler for Weibo super topics, with word-frequency statistics, sentiment analysis, and simple classification of the collected posts.

Newly added: crawling of ordinary Weibo topics. Fetching their read and discussion counts is still a work in progress.


Sample of the crawled data:
![](img/fy.png)
![](img/weibo.png)

### Usage
Main crawler scripts:

Ordinary Weibo topics: normal-topic-spyder.py

Weibo super topics: super-topic-spyder.py

Enter your account, password, and the name of the topic you want to crawl in the main function of the crawler script to start crawling; a filled-in example follows below. The required Python libraries and the chromedriver executable must be installed beforehand.

When crawling finishes, the data is saved automatically to an Excel file in the current directory, one post per row.

Note: ordinary topics must be wrapped in # marks, e.g. #topic#; super topics must not. As a rule of thumb, a super topic carries a diamond icon on Weibo, while an ordinary topic appears in the #topic# format.
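For reference, the whole configuration lives in the `__main__` block at the bottom of each script. This is the super-topic version with placeholder values (replace the account, password, and keywords with your own):

```python
if __name__ == '__main__':
    username = "your_account"     # placeholder: your Weibo login name
    password = "your_password"    # placeholder: your password
    driver = webdriver.Chrome()   # chromedriver must be installed and on PATH
    book_name_xls = "test.xls"    # output Excel file, created automatically
    sheet_name_xls = '微博数据'    # sheet name
    maxWeibo = 5000               # upper bound on the number of posts to crawl
    keywords = ["肺炎"]           # super-topic name(s); no # marks here
    num = 1                       # running index for downloaded images
    save_pic = False              # set True to also download post images
    pic_addr = 'img/'             # folder for downloaded images

    for keyword in keywords:
        spider(username, password, book_name_xls, sheet_name_xls,
               keyword, maxWeibo, num, save_pic)
```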
### Super-topic crawler

Uses selenium to log in through a simulated browser and then crawl. The number of posts per topic is capped by Weibo itself; currently about 8,000 posts can be fetched for a single topic. The mobile web version of the site is crawled, which gives the best results.

Extra accounts and IPs do not help much for a single super topic, so only a single-account, single-IP mode is provided; multi-topic parallel crawling can be added on your own if needed.

If you want to crawl several super topics, logging in with cookies is the most convenient option; a sketch follows.
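A minimal sketch of cookie-based login, matching the commented-out block in super-topic-spyder.py; it assumes `weibocookie.txt` holds a JSON object of cookie name/value pairs copied from the browser console:

```python
import json
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://m.weibo.cn')   # the domain must be open before cookies can be added

driver.delete_all_cookies()
with open('./weibocookie.txt') as f:
    cookies = json.loads(f.read())                     # e.g. {"SUB": "...", "SUBP": "..."}
for name, value in cookies.items():
    driver.add_cookie({'name': name, 'value': value})
driver.refresh()                   # reload the page; the session should now be logged in
```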
### Word-frequency statistics
Uses the jieba library for word segmentation; the resulting tokens are tallied and the counts written to txt files. The sketch below shows the core loop.
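A minimal sketch of the counting step (the sample sentence is made up):

```python
import jieba
from collections import Counter

cnt = Counter()
for token in jieba.cut("微博超级话题爬虫,词频统计与情感分析"):
    cnt[token] += 1

print(cnt.most_common(5))   # the five most frequent tokens and their counts
```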
### Sentiment analysis
Calls the Baidu Brain API; you can register and obtain your own keys. The platform does not limit the number of calls. See [Baidu Brain](https://ai.baidu.com/tech/nlp_apply/sentiment_classify) for the interface details; a minimal call is sketched below.
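A minimal sketch of the API call used in analysis.py (install the SDK with `pip install baidu-aip`; the keys are placeholders, and the response fields are the ones the script already relies on):

```python
from aip import AipNlp

APP_ID, API_KEY, SECRET_KEY = '', '', ''     # placeholders: your own credentials
client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

result = client.sentimentClassify("这部电影真好看")
prob = result['items'][0]['positive_prob']   # probability that the text is positive
print("积极" if prob > 0.5 else "消极")
```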
### Other
Contributions and improvements are welcome; if you run into any problem, please open an issue.
--------------------------------------------------------------------------------
/analysis.py:
--------------------------------------------------------------------------------
from aip import AipNlp
import pandas as pd


# Fill in your Baidu AI credentials here
APP_ID = ''
API_KEY = ''
SECRET_KEY = ''

client = AipNlp(APP_ID, API_KEY, SECRET_KEY)

def isPostive(text):
    try:
        if client.sentimentClassify(text)['items'][0]['positive_prob'] > 0.5:
            return "积极"
        else:
            return "消极"
    except:
        # if the API call fails, default to positive
        return "积极"

# Read the file; adjust the path as needed
file_path = 'mlxg.xls'
data = pd.read_excel(file_path, encoding='utf-8')

moods = []
count = 1
for i in data['微博内容']:
    moods.append(isPostive(i))
    print("Analyzed so far: " + str(count))
    count += 1

data['情感倾向'] = pd.Series(moods)

# This overwrites the input file
data.to_excel(file_path)
print("Analysis finished and saved")


'''
# A simple keyword-based classifier :P
def fenlei(text):
    xf = ['抽奖', "抽一个", "抽一位", "买", "通贩"]
    cz = ["画", "实物", "返图", "合集", "摸鱼", "漫", "自制", "攻略", "授权", "草稿", "绘"]
    gj = ["hz", "狗粉丝", "狗女儿"]
    for j in cz:
        if j in text:
            return "创作"
    for i in xf:
        if i in text:
            return "消费"
    for k in gj:
        if k in text:
            return "攻击"
    return "其他"
'''
--------------------------------------------------------------------------------
/excelSave.py:
--------------------------------------------------------------------------------
import xlrd
import xlwt
from xlutils.copy import copy

def write_excel_xls(path, sheet_name, value):
    index = len(value)                      # number of rows to write
    workbook = xlwt.Workbook()              # create a new workbook
    sheet = workbook.add_sheet(sheet_name)  # add a sheet to the workbook
    for i in range(0, index):
        for j in range(0, len(value[i])):
            sheet.write(i, j, value[i][j])  # write data cell by cell (row, column)
    workbook.save(path)                     # save the workbook
    print("xls data written successfully!")

def read_excel_xls(path):
    data = []
    workbook = xlrd.open_workbook(path)            # open the workbook
    sheets = workbook.sheet_names()                # list all sheets in the workbook
    worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
    if worksheet.nrows == 1:
        print("Only the header row exists so far")
    else:
        for i in range(1, worksheet.nrows):        # read from the second row on
            dataTemp = []
            for j in range(0, worksheet.ncols):
                dataTemp.append(worksheet.cell_value(i, j))  # read cell by cell
            data.append(dataTemp)
    return data

def write_excel_xls_append_norepeat(path, value):
    workbook = xlrd.open_workbook(path)            # open the workbook
    sheets = workbook.sheet_names()                # list all sheets
    worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
    rows_old = worksheet.nrows                     # rows already present
    new_workbook = copy(workbook)                  # convert the xlrd object into an xlwt one
    new_worksheet = new_workbook.get_sheet(0)      # first sheet of the copy
    rid = 0
    # rows to append, with the rid column dropped for the duplicate check
    value_temp = [row[1:] for row in value]
    for i in range(0, len(value)):
        # re-read the file each round so rows appended earlier in this call
        # are also seen by the duplicate check
        data = read_excel_xls(path)
        data_temp = [row[1:] for row in data]
        if value_temp[i] not in data_temp:
            for j in range(0, len(value[i])):
                new_worksheet.write(rid + rows_old, j, value[i][j])  # append below the existing rows
            rid = rid + 1
            new_workbook.save(path)  # save the workbook
            print("xls data appended successfully!")
        else:
            print("Duplicate row skipped")
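# --- minimal usage sketch (added for illustration; demo.xls and the rows are made-up) ---
if __name__ == '__main__':
    # create a sheet with a header row, then append data rows;
    # duplicates (ignoring the rid column) are skipped
    write_excel_xls("demo.xls", "sheet1", [["rid", "name", "text"]])
    rows = [[1, "user_a", "hello"],
            [2, "user_a", "hello"]]   # repeats the first row apart from rid
    write_excel_xls_append_norepeat("demo.xls", rows)   # writes one row, skips one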
--------------------------------------------------------------------------------
/img/fy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/img/fy.png
--------------------------------------------------------------------------------
/img/seg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/img/seg.png
--------------------------------------------------------------------------------
/img/weibo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/img/weibo.png
--------------------------------------------------------------------------------
/normal-topic-spyder.py:
--------------------------------------------------------------------------------
import time
import xlrd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import excelSave as save

# scroll the page down
def Transfer_Clicks(browser):
    time.sleep(5)
    try:
        browser.execute_script("window.scrollBy(0,document.body.scrollHeight)", "")
    except:
        pass
    return "Transfer successfully \n"

# check whether the page has finished loading
def isPresent():
    temp = 1
    try:
        driver.find_elements_by_css_selector('div.line-around.layout-box.mod-pagination > a:nth-child(2) > div > select > option')
    except:
        temp = 0
    return temp

# insert the scraped rows into the Excel file
def insert_data(elems, path, name, yuedu, taolun):
    for elem in elems:
        workbook = xlrd.open_workbook(path)            # open the workbook
        sheets = workbook.sheet_names()                # list all sheets
        worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
        rows_old = worksheet.nrows                     # rows already present
        rid = rows_old
        # user name
        weibo_username = elem.find_elements_by_css_selector('h3.m-text-cut')[0].text
        weibo_userlevel = "普通用户"
        # user verification level
        try:
            weibo_userlevel_color_class = elem.find_elements_by_css_selector("i.m-icon")[0].get_attribute("class").replace("m-icon ", "")
            if weibo_userlevel_color_class == "m-icon-yellowv":
                weibo_userlevel = "黄v"
            if weibo_userlevel_color_class == "m-icon-bluev":
                weibo_userlevel = "蓝v"
            if weibo_userlevel_color_class == "m-icon-goldv-static":
                weibo_userlevel = "金v"
            if weibo_userlevel_color_class == "m-icon-club":
                weibo_userlevel = "微博达人"
        except:
            weibo_userlevel = "普通用户"
        # post text
        weibo_content = elem.find_elements_by_css_selector('div.weibo-text')[0].text
        shares = elem.find_elements_by_css_selector('i.m-font.m-font-forward + h4')[0].text
        comments = elem.find_elements_by_css_selector('i.m-font.m-font-comment + h4')[0].text
        likes = elem.find_elements_by_css_selector('i.m-icon.m-icon-like + h4')[0].text

        # publish time
        weibo_time = elem.find_elements_by_css_selector('span.time')[0].text
        '''
        print("user: " + weibo_username + "|"
              "level: " + weibo_userlevel + "|"
              "text: " + weibo_content + "|"
              "shares: " + shares + "|"
              "comments: " + comments + "|"
              "likes: " + likes + "|"
              "time: " + weibo_time + "|"
              "topic: " + name + "|"
              "topic reads: " + yuedu + "|"
              "topic discussions: " + taolun)
        '''
        # keyword is the module-level loop variable from __main__
        value1 = [[rid, weibo_username, weibo_userlevel, weibo_content, shares, comments, likes, weibo_time, keyword, name, yuedu, taolun], ]
        print("Inserting row %d" % rid)
        save.write_excel_xls_append_norepeat(path, value1)

# scrape the posts currently loaded on the page
def get_current_weibo_data(elems, book_name_xls, name, yuedu, taolun, maxWeibo):
    before = 0
    after = 0
    n = 0
    timeToSleep = 100
    while True:
        before = after
        Transfer_Clicks(driver)
        time.sleep(3)
        elems = driver.find_elements_by_css_selector('div.card.m-panel.card9')
        print("Posts currently loaded: %d, n is %d; n reaching 5 means no new posts can be parsed" % (len(elems), n))
        after = len(elems)
        if after > before:
            n = 0
        if after == before:
            n = n + 1
        if n == 5:
            print("Maximum number of posts for this keyword: %d" % after)
            insert_data(elems, book_name_xls, name, yuedu, taolun)
            break
        if len(elems) > maxWeibo:
            print("The post count has reached %d" % maxWeibo)
            insert_data(elems, book_name_xls, name, yuedu, taolun)
            break
        '''
        if after > timeToSleep:
            print("Scraped more than %d posts; inserting the new rows and sleeping 5 seconds" % timeToSleep)
            timeToSleep = timeToSleep + 100
            insert_data(elems, book_name_xls, name, yuedu, taolun)
            time.sleep(5)
        '''


# run the crawler
def spider(username, password, driver, book_name_xls, sheet_name_xls, keyword, maxWeibo):

    # create the output file if needed
    if os.path.exists(book_name_xls):
        print("File already exists")
    else:
        print("File not found, creating it")
        value_title = [["rid", "用户名称", "微博等级", "微博内容", "微博转发量", "微博评论量", "微博点赞", "发布时间", "搜索关键词", "话题名称", "话题阅读数", "话题讨论数"], ]
        save.write_excel_xls(book_name_xls, sheet_name_xls, value_title)

    # open the login page in the browser
    driver.set_window_size(452, 790)
    driver.get("https://passport.weibo.cn/signin/login")
    print("Starting automatic login; solve the captcha manually if one appears")
    time.sleep(3)

    elem = driver.find_element_by_xpath("//*[@id='loginName']")
    elem.send_keys(username)
    elem = driver.find_element_by_xpath("//*[@id='loginPassword']")
    elem.send_keys(password)
    elem = driver.find_element_by_xpath("//*[@id='loginAction']")
    elem.send_keys(Keys.ENTER)
    print("Pausing 20 seconds for captcha verification")
    time.sleep(20)

    '''
    # add cookies
    cookie = []   # list of cookie dicts, e.g. {'name': ..., 'value': ...}
    for ix in cookie:
        driver.add_cookie(ix)
    driver.get("https://m.weibo.cn")
    '''


    while 1:  # loop until the page is ready
        result = isPresent()
        # work around the case where the captcha blocks the redirect
        driver.get('https://m.weibo.cn/')
        print('Page check: 1 = ready, 0 = not ready; result = %d' % result)
        if result == 1:
            elems = driver.find_elements_by_css_selector('div.line-around.layout-box.mod-pagination > a:nth-child(2) > div > select > option')
            # return elems  # return the page elements if this gets wrapped in a function
            break
        else:
            print('The page has not finished loading yet')
            time.sleep(20)

    time.sleep(2)

    # search for the keyword
    driver.find_element_by_xpath("//*[@class='m-text-cut']").click()
    time.sleep(2)
    elem = driver.find_element_by_xpath("//*[@type='search']")
    elem.send_keys(keyword)
    elem.send_keys(Keys.ENTER)
    time.sleep(5)



    print("Topic link obtained; sleeping 2 seconds")
    time.sleep(2)

    # fetching the read/discussion counts is still a TODO (see README);
    # fixed placeholder numbers are used for now — a reference sketch is
    # appended at the end of this file
    yuedu = '6.6亿'
    taolun = '20.3万'

    time.sleep(2)
    name = keyword
    shishi_element = driver.find_element_by_xpath("//*[@id='app']/div[1]/div[1]/div[3]/div[2]/div[1]/div/div/div/ul/li[2]/span")
    driver.execute_script('arguments[0].click()', shishi_element)
    get_current_weibo_data(elems, book_name_xls, name, yuedu, taolun, maxWeibo)  # scrape the real-time tab
    time.sleep(2)



if __name__ == '__main__':
    username = ""  # your Weibo login name
    password = ""  # your password
    driver = webdriver.Chrome()  # pass the path to chromedriver if it is not on PATH
    book_name_xls = "test.xls"   # output Excel path; the file is created if missing
    sheet_name_xls = '微博数据'   # sheet name
    maxWeibo = 5000              # maximum number of posts to crawl
    keywords = ["#华为向武汉捐赠3000万#", ]  # one or more topics; the # marks are required
    for keyword in keywords:
        spider(username, password, driver, book_name_xls, sheet_name_xls, keyword, maxWeibo)
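# --- note (added for illustration): the fixed yuedu/taolun placeholders above
# could be replaced by reading the counts from the page, the way
# super-topic-spyder.py does. The xpath below is the one from the super-topic
# page, shown for reference only; it will need adapting to the ordinary-topic layout:
#
# yuedu_taolun = driver.find_element_by_xpath(
#     "//*[@id='app']/div[1]/div[1]/div[1]/div[4]/div/div/div/a/div[2]/h4[1]").text
# yuedu = yuedu_taolun.split(" ")[0]
# taolun = yuedu_taolun.split(" ")[1]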
--------------------------------------------------------------------------------
/seg.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import jieba
from collections import Counter
import pandas as pd



def save_seg(filename, cnt):
    # write the 100 most common tokens to a txt file
    with open(filename, 'w', encoding='utf-8') as f_out:
        result = cnt.most_common(100)
        for ix in result:
            f_out.write(ix[0] + "\t出现次数:" + str(ix[1]) + "\n")



STOPWORDS = [u'的', u' ', u'\n', u'他', u'地', u'得', u'而', u'了', u'在', u'是', u'我', u'有', u'和', u'就', u'不', u'人', u'都', u'一', u'一个', u'上', u'也', u'很', u'到', u'说', u'要', u'去', u'你', u'会', u'着', u'没有', u'看', u'好', u'自己', u'这']
PUNCTUATIONS = [u'。', u'#', u',', u'“', u'”', u'…', u'?', u'!', u'、', u';', u'(', u')']


# files to segment
wj = ['mlxg', 'IG+rng', 'igbanlan', 'edg', 'uzi', 'teamwe', 'theshy', '英雄联盟', 'jackeylove']

for file in wj:
    cnt = Counter()  # reset the counter so each txt holds per-file counts
    data = pd.read_excel(file + '.xls', encoding='utf-8')
    for l in data['微博内容'].astype(str):
        seg_list = jieba.cut(l)
        for seg in seg_list:
            # skip stopwords, punctuation, and the file-name tokens themselves
            if seg not in STOPWORDS and seg not in PUNCTUATIONS and seg not in wj:
                cnt[seg] = cnt[seg] + 1


    save_seg("seg_result/" + file + ".txt", cnt)  # save the result
--------------------------------------------------------------------------------
/seg_result/IG+rng.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/IG+rng.txt
--------------------------------------------------------------------------------
/seg_result/edg.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/edg.txt
--------------------------------------------------------------------------------
/seg_result/igbanlan.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/igbanlan.txt
--------------------------------------------------------------------------------
/seg_result/jackeylove.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/jackeylove.txt
--------------------------------------------------------------------------------
/seg_result/mlxg.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/mlxg.txt
--------------------------------------------------------------------------------
/seg_result/result.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/result.txt
--------------------------------------------------------------------------------
/seg_result/teamwe.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/teamwe.txt
--------------------------------------------------------------------------------
/seg_result/theshy.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/theshy.txt
--------------------------------------------------------------------------------
/seg_result/uzi.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/uzi.txt
--------------------------------------------------------------------------------
/seg_result/英雄联盟.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/seg_result/英雄联盟.txt
--------------------------------------------------------------------------------
/super-topic-spyder.py:
--------------------------------------------------------------------------------
import time
import xlrd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import requests
import json

import excelSave as save


# scroll the page down
def Transfer_Clicks(browser):
    time.sleep(5)
    try:
        browser.execute_script("window.scrollBy(0,document.body.scrollHeight)", "")
    except:
        pass
    return "Transfer successfully \n"

# check whether the page has finished loading
def isPresent():
    temp = 1
    try:
        driver.find_elements_by_css_selector('div.line-around.layout-box.mod-pagination > a:nth-child(2) > div > select > option')
    except:
        temp = 0
    return temp

# insert the scraped rows into the Excel file
def insert_data(elems, path, name, yuedu, taolun, num, save_pic):
    for elem in elems:
        workbook = xlrd.open_workbook(path)            # open the workbook
        sheets = workbook.sheet_names()                # list all sheets
        worksheet = workbook.sheet_by_name(sheets[0])  # take the first sheet
        rows_old = worksheet.nrows                     # rows already present
        rid = rows_old
        # user name
        weibo_username = elem.find_elements_by_css_selector('h3.m-text-cut')[0].text
        weibo_userlevel = "普通用户"
        # user verification level
        try:
            weibo_userlevel_color_class = elem.find_elements_by_css_selector("i.m-icon")[0].get_attribute("class").replace("m-icon ", "")
            if weibo_userlevel_color_class == "m-icon-yellowv":
                weibo_userlevel = "黄v"
            if weibo_userlevel_color_class == "m-icon-bluev":
                weibo_userlevel = "蓝v"
            if weibo_userlevel_color_class == "m-icon-goldv-static":
                weibo_userlevel = "金v"
            if weibo_userlevel_color_class == "m-icon-club":
                weibo_userlevel = "微博达人"
        except:
            weibo_userlevel = "普通用户"
        # post text: click "全文" (full text) to get the complete content
        weibo_content = get_all_text(elem)
        # download the post images if requested
        if save_pic:
            num = get_pic(elem, num)
        # shares, comments and likes; a bare button label means zero
        shares = elem.find_elements_by_css_selector('i.m-font.m-font-forward + h4')[0].text
        if shares == '转发':
            shares = '0'
        comments = elem.find_elements_by_css_selector('i.m-font.m-font-comment + h4')[0].text
        if comments == '评论':
            comments = '0'
        likes = elem.find_elements_by_css_selector('i.m-icon.m-icon-like + h4')[0].text
        if likes == '赞':
            likes = '0'

        # publish time
        weibo_time = elem.find_elements_by_css_selector('span.time')[0].text
        '''
        print("user: " + weibo_username + "|"
              "level: " + weibo_userlevel + "|"
              "text: " + weibo_content + "|"
              "shares: " + shares + "|"
              "comments: " + comments + "|"
              "likes: " + likes + "|"
              "time: " + weibo_time + "|"
              "topic: " + name + "|"
              "topic reads: " + yuedu + "|"
              "topic discussions: " + taolun)
        '''
        # keyword is the module-level loop variable from __main__
        value1 = [[rid, weibo_username, weibo_userlevel, weibo_content, shares, comments, likes, weibo_time, keyword, name, yuedu, taolun], ]
        print("Inserting row %d" % rid)
        save.write_excel_xls_append_norepeat(path, value1)

# fetch the full text behind the "全文" (full text) link
def get_all_text(elem):
    try:
        # if the post has a "全文" link, open it in a new tab and read the complete text
        href = elem.find_element_by_link_text('全文').get_attribute('href')
        driver.execute_script('window.open("{}")'.format(href))
        driver.switch_to.window(driver.window_handles[1])
        weibo_content = driver.find_element_by_class_name('weibo-text').text
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
    except:
        weibo_content = elem.find_elements_by_css_selector('div.weibo-text')[0].text
    return weibo_content

def get_pic(elem, num):
    try:
        # collect the image elements of this post, then fetch each image URL and download it
        # case 1: the post carries several images
        if elem.find_elements_by_css_selector('div > div > article > div > div:nth-child(2) > div > ul > li') != []:
            pic_links = elem.find_elements_by_css_selector('div > div > article > div > div:nth-child(2) > div > ul > li')
            for pic_link in pic_links:
                pic_link = pic_link.find_element_by_css_selector('div > img').get_attribute('src')
                response = requests.get(pic_link)
                pic = response.content
                with open(pic_addr + str(num) + '.jpg', 'wb') as file:
                    file.write(pic)
                num += 1
        # case 2: the post carries a single image
        else:
            pic_link = elem.find_element_by_css_selector('div > div > article > div > div:nth-child(2) > div > div > img').get_attribute('src')
            response = requests.get(pic_link)
            pic = response.content
            with open(pic_addr + str(num) + '.jpg', 'wb') as file:
                file.write(pic)
            num += 1
    except Exception as e:
        print(e)
    return num

# scrape the posts currently loaded on the page
def get_current_weibo_data(elems, book_name_xls, name, yuedu, taolun, maxWeibo, num):
    before = 0
    after = 0
    n = 0
    timeToSleep = 100
    while True:
        before = after
        Transfer_Clicks(driver)
        time.sleep(3)
        elems = driver.find_elements_by_css_selector('div.card.m-panel.card9')
        print("Posts currently loaded: %d, n is %d; n reaching 5 means no new posts can be parsed" % (len(elems), n))
        after = len(elems)
        if after > before:
            n = 0
        if after == before:
            n = n + 1
        if n == 5:
            print("Maximum number of posts for this keyword: %d" % after)
            insert_data(elems, book_name_xls, name, yuedu, taolun, num, save_pic)  # save_pic comes from __main__
            break
        if len(elems) > maxWeibo:
            print("The post count has reached %d" % maxWeibo)
            insert_data(elems, book_name_xls, name, yuedu, taolun, num, save_pic)
            break
        '''
        if after > timeToSleep:
            print("Scraped more than %d posts; inserting the new rows and sleeping 5 seconds" % timeToSleep)
            timeToSleep = timeToSleep + 100
            insert_data(elems, book_name_xls, name, yuedu, taolun, num, save_pic)
            time.sleep(5)
        '''


# run the crawler
def spider(username, password, book_name_xls, sheet_name_xls, keyword, maxWeibo, num, save_pic):

    # create the output file if needed
    if os.path.exists(book_name_xls):
        print("File already exists")
    else:
        print("File not found, creating it")
        value_title = [["rid", "用户名称", "微博等级", "微博内容", "微博转发量", "微博评论量", "微博点赞", "发布时间", "搜索关键词", "话题名称", "话题阅读数", "话题讨论数"], ]
        save.write_excel_xls(book_name_xls, sheet_name_xls, value_title)

    # open the site, then the login page
    driver.set_window_size(452, 790)
    driver.get('https://m.weibo.cn')

    driver.get("https://passport.weibo.cn/signin/login")
    print("Starting automatic login; solve the captcha manually if one appears")
    time.sleep(3)

    elem = driver.find_element_by_xpath("//*[@id='loginName']")
    elem.send_keys(username)
    elem = driver.find_element_by_xpath("//*[@id='loginPassword']")
    elem.send_keys(password)
    elem = driver.find_element_by_xpath("//*[@id='loginAction']")
    elem.send_keys(Keys.ENTER)
    print("Pausing 20 seconds for captcha verification")
    time.sleep(20)


    # add cookies
    # save the cookies copied beforehand from the Chrome console into a txt file,
    # convert them into name/value pairs, and hand them to the selenium driver
    # for automatic login
    # if the cookies in the txt were saved by selenium itself, they can be used
    # directly without conversion
    '''
    driver.delete_all_cookies()
    with open(r'./weibocookie.txt') as file:
        cookies = json.loads(file.read())
    for name, value in cookies.items():
        print(name, value)
        driver.add_cookie({'name': name, 'value': value})
    driver.refresh()
    '''



    while 1:  # loop until the page is ready
        result = isPresent()
        # work around the case where the captcha blocks the redirect
        driver.get('https://m.weibo.cn/')
        print('Page check: 1 = ready, 0 = not ready; result = %d' % result)
        if result == 1:
            elems = driver.find_elements_by_css_selector('div.line-around.layout-box.mod-pagination > a:nth-child(2) > div > select > option')
            # return elems  # return the page elements if this gets wrapped in a function
            break
        else:
            print('The page has not finished loading yet')
            time.sleep(20)

    time.sleep(2)

    # search for the keyword
    driver.find_element_by_xpath("//*[@class='m-text-cut']").click()
    time.sleep(2)
    elem = driver.find_element_by_xpath("//*[@type='search']")
    elem.send_keys(keyword)
    elem.send_keys(Keys.ENTER)
    time.sleep(5)

    # elem = driver.find_element_by_xpath("//*[@class='box-left m-box-col m-box-center-a']")
    # changed to clicking the super-topic icon to enter the topic, which is less error-prone
    elem = driver.find_element_by_xpath("//img[@src ='http://simg.s.weibo.com/20181009184948_super_topic_bg_small.png']")
    elem.click()
    print("Super-topic link obtained; sleeping 2 seconds")
    time.sleep(2)
    yuedu_taolun = driver.find_element_by_xpath("//*[@id='app']/div[1]/div[1]/div[1]/div[4]/div/div/div/a/div[2]/h4[1]").text
    yuedu = yuedu_taolun.split(" ")[0]
    taolun = yuedu_taolun.split(" ")[1]
    time.sleep(2)
    name = keyword
    # locate the 帖子 (posts) tab and switch to it
    shishi_element = driver.find_element_by_xpath("//*[@class='scroll-box nav_item']/ul/li/span[text()='帖子']")
    driver.execute_script('arguments[0].click()', shishi_element)  # the original never clicked this element; mirroring normal-topic-spyder.py

    get_current_weibo_data(elems, book_name_xls, name, yuedu, taolun, maxWeibo, num)  # scrape the posts tab
    time.sleep(2)


if __name__ == '__main__':
    username = ""  # your Weibo login name
    password = ""  # your password
    driver = webdriver.Chrome()  # pass the path to chromedriver if it is not on PATH
    driver.implicitly_wait(2)  # implicit wait of 2 seconds
    book_name_xls = "test.xls"  # output Excel path; the file is created if missing
    sheet_name_xls = '微博数据'  # sheet name
    maxWeibo = 10  # maximum number of posts to crawl
    keywords = ["肺炎", ]  # one or more super-topic keywords
    num = 1
    save_pic = False  # also download the post images? off by default
    pic_addr = 'img/'  # where to store downloaded images

    for keyword in keywords:
        spider(username, password, book_name_xls, sheet_name_xls, keyword, maxWeibo, num, save_pic)
--------------------------------------------------------------------------------
/weibodata/IG+rng.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/IG+rng.xls
--------------------------------------------------------------------------------
/weibodata/edg.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/edg.xls
--------------------------------------------------------------------------------
/weibodata/igbanlan.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/igbanlan.xls
--------------------------------------------------------------------------------
/weibodata/jackeylove.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/jackeylove.xls
--------------------------------------------------------------------------------
/weibodata/mlxg.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/mlxg.xls
--------------------------------------------------------------------------------
/weibodata/teamwe.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/teamwe.xls
--------------------------------------------------------------------------------
/weibodata/theshy.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/theshy.xls
--------------------------------------------------------------------------------
/weibodata/uzi.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/uzi.xls
--------------------------------------------------------------------------------
/weibodata/肺炎.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/肺炎.xls
--------------------------------------------------------------------------------
/weibodata/英雄联盟.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/czy1999/weibo-topic-spider/a7e93f1a8fac4146be36b8a594b7977fbac019f0/weibodata/英雄联盟.xls
--------------------------------------------------------------------------------