├── 123.jpg ├── font.ttf ├── 评论词云.png ├── README.md ├── test.py ├── comment_test.py ├── comment_nlp.py ├── comment_get.py └── emotion_analyse.py /123.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melo4/weibo/HEAD/123.jpg -------------------------------------------------------------------------------- /font.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melo4/weibo/HEAD/font.ttf -------------------------------------------------------------------------------- /评论词云.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/melo4/weibo/HEAD/评论词云.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # weibo 2 | 微博评论爬取及nlp情感分析 3 | ## test.py 4 | 微博API简单使用 5 | ## comment_test.py 6 | 利用微博API获取微博评论 7 | ## comment_get.py 8 | 爬虫爬取微博评论 9 | ## comment_nlp.py 10 | 对微博评论进行去重 统计 分析等操作 11 | ## emotion_analyse 12 | python实现简单的情感分析 13 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from weibo import APIClient 3 | import webbrowser #python内置的包 4 | 5 | APP_KEY = 'xxxxxxx'#注意替换这里为自己申请的App信息 6 | APP_SECRET = 'xxxxxxxxxxxxxxxx'#注意替换这里为自己申请的应用密钥 7 | CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'#回调授权页面 8 | 9 | #利用官方微博SDK 10 | client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL) 11 | #得到授权页面的url,利用webbrowser打开这个url 12 | url = client.get_authorize_url() 13 | print url 14 | webbrowser.open_new(url) 15 | 16 | #获取code=后面的内容 17 | print '输入url中code后面的内容后按回车键:' 18 | code = raw_input() 19 | #code = your.web.framework.request.get('code') 20 | #client = 
APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL) 21 | r = client.request_access_token(code) 22 | access_token = r.access_token # 新浪返回的token,类似abc123xyz456 23 | expires_in = r.expires_in 24 | 25 | # 设置得到的access_token 26 | client.set_access_token(access_token, expires_in) 27 | 28 | #可以打印下看看里面都有什么东西 29 | statuses = client.statuses__friends_timeline()['statuses'] #获取当前登录用户以及所关注用户(已授权)的微博 30 | 31 | length = len(statuses) 32 | print length 33 | #输出了部分信息 34 | for i in range(0,length): 35 | print u'昵称:'+statuses[i]['user']['screen_name'] 36 | print u'简介:'+statuses[i]['user']['description'] 37 | print u'位置:'+statuses[i]['user']['location'] 38 | print u'微博:'+statuses[i]['text'] -------------------------------------------------------------------------------- /comment_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from weibo import APIClient 3 | import webbrowser 4 | import pymysql,re,time 5 | 6 | import sys 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') #修改系统默认编码 9 | 10 | APP_KEY = 'xxxxxxxx'#注意替换这里为自己申请的App信息 11 | APP_SECRET = 'xxxxxxxxxxxxxx'#注意这里替换为自己申请的应用信息 12 | CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'#回调授权页面 13 | 14 | #利用官方微博SDK 15 | client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL) 16 | #得到授权页面的url,利用webbrowser打开这个url 17 | url = client.get_authorize_url() 18 | print url 19 | webbrowser.open_new(url) 20 | 21 | #获取code=后面的内容 22 | print '输入url中code后面的内容后按回车键:' 23 | code = raw_input() 24 | #code = your.web.framework.request.get('code') 25 | #client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL) 26 | r = client.request_access_token(code) 27 | access_token = r.access_token # 新浪返回的token,类似abc123xyz456 28 | expires_in = r.expires_in 29 | 30 | # 设置得到的access_token 31 | client.set_access_token(access_token, expires_in) 32 | 33 | comment_num = 1 34 | i = 1 35 | 36 | while True: 37 | r = 
client.comments.show.get(id = 4192500543207482,count = 200,page = i) #pgone 回应微博 38 | if len(r.comments): 39 | print '第 %s 页' % i 40 | for st in r.comments: 41 | print '第 %s 条评论' % comment_num 42 | created_at = st.created_at 43 | comment_id = st.id 44 | text = re.sub('回复.*?:','',str(st.text)) 45 | source = re.sub('<.*?>|','',str(st.source)) 46 | user_name = st.user.screen_name 47 | followers = st.user.followers_count 48 | follow = st.user.friends_count 49 | province = st.user.province 50 | print created_at 51 | print comment_id 52 | print text 53 | print source 54 | print '评论者:%s,粉丝数:%s,关注数:%s,所在省份编号:%s' % (user_name,followers,follow,province) 55 | print '\n' 56 | 57 | 58 | conn = pymysql.connect(host='127.0.0.1',user='root',password='xxxx',charset='utf8',use_unicode=False) 59 | cur = conn.cursor() 60 | sql = "insert into weibo.test(created_at,comment_id,text,source,user_name,followers,follow,province) values(%s,%s,%s,%s,%s,%s,%s,%s)" 61 | param = (created_at,comment_id,text,source,user_name,followers,follow,province) 62 | try: 63 | A = cur.execute(sql,param) 64 | conn.commit() 65 | except Exception,e: 66 | print(e) 67 | conn.rollback() 68 | comment_num+=1 69 | 70 | i+=1 71 | time.sleep(4) 72 | else: 73 | break 74 | -------------------------------------------------------------------------------- /comment_nlp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pymysql,re 4 | import jieba 5 | import jieba.posseg as pseg 6 | import pandas as pd 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from scipy.misc import imread 10 | from snownlp import SnowNLP 11 | from wordcloud import WordCloud,ImageColorGenerator 12 | from collections import Counter 13 | 14 | def readmysql(): 15 | commentlist = [] 16 | textlist = [] 17 | userlist = [] 18 | conn = pymysql.connect(host='127.0.0.1', user='root', password='xxxx',charset='utf8') 19 | with conn: 20 | cur = conn.cursor() 21 | 
cur.execute("SELECT * FROM weibo.response WHERE id < '%d'" % 10000) 22 | rows = cur.fetchall() 23 | for row in rows: 24 | row = list(row) 25 | del row[0] 26 | if row not in commentlist: 27 | commentlist.append([row[0],row[1],row[2],row[3],row[4],row[5]]) 28 | comment_id = row[0] 29 | user_name = row[1] 30 | userlist.append(user_name) 31 | created_at = row[2] 32 | text = row[3] 33 | if text: 34 | textlist.append(text) 35 | like_num = row[4] 36 | source = row[5] 37 | 38 | return commentlist,userlist,textlist 39 | 40 | def word2cloud(textlist): 41 | fulltext = '' 42 | isCN = 1 43 | back_coloring = imread("123.jpg") 44 | cloud = WordCloud(font_path='font.ttf', 45 | background_color="white", 46 | max_words=2000, 47 | mask=back_coloring, 48 | max_font_size=100, 49 | random_state=42, 50 | width=1000,height=860,margin=2) 51 | 52 | for li in textlist: 53 | fulltext += ' '.join(jieba.cut(li,cut_all =False)) 54 | wc = cloud.generate(fulltext) 55 | image_colors = ImageColorGenerator(back_coloring) 56 | plt.figure("wordc") 57 | plt.imshow(wc.recolor(color_func=image_colors)) 58 | wc.to_file('评论词云.png') 59 | 60 | def snowanlaysis(textlist): 61 | sentimentslist = [] 62 | for li in textlist: 63 | s = SnowNLP(li) 64 | 65 | sentimentslist.append(s.sentiments) 66 | 67 | fig1 = plt.figure("sentiment") 68 | plt.hist(sentimentslist,bins=np.arange(0,1,0.02)) 69 | plt.show() 70 | 71 | def emojilist(textlist): 72 | emojilist = [] 73 | for li in textlist: 74 | emojis = re.findall(re.compile(u'(\[.*?\])',re.S),li) 75 | if emojis: 76 | for emoji in emojis: 77 | emojilist.append(emoji) 78 | emojidict = Counter(emojilist) 79 | print(emojidict) 80 | 81 | def follows(textlist): 82 | userdict = Counter(userlist) 83 | print(userdict.most_common(20)) 84 | 85 | if __name__ == '__main__': 86 | commentlist,userlist,textlist = readmysql() 87 | word2cloud(textlist) 88 | snowanlaysis(textlist) 89 | emojilist(textlist) 90 | follows(textlist) 91 | 92 | 93 | 94 | 95 | 
-------------------------------------------------------------------------------- /comment_get.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pymysql,re,time,requests 3 | 4 | #Pgone回应微博ID:4192500543207482 5 | 6 | weibo_id = 4192500543207482 # input('输入单条微博ID:') 7 | # url='https://m.weibo.cn/single/rcList?format=cards&id=4192500543207482&type=comment&hot=1&page={}' #爬热门评论 8 | url='https://m.weibo.cn/api/comments/show?id=4192500543207482&page={}' #爬时间排序评论 9 | headers = { 10 | 'User-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0', 11 | 'Host' : 'm.weibo.cn', 12 | 'Accept' : 'application/json, text/plain, */*', 13 | 'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 14 | 'Accept-Encoding' : 'gzip, deflate, br', 15 | 'Referer' : 'https://m.weibo.cn/status/4192500543207482', 16 | 'Cookie' : 'SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W50ic.eDLvgYsrQXyffhbay5JpX5K-hUgL.FoeN1hMNe0e7eo-2dJLoI7_SdgHV9NyDqgvae5tt;SUHB=02MvZbDGrK2xQh;SUB=_2A253vYJuDeRhGeVJ41UW8y3MyTmIHXVVQS4mrDV6PUJbktBeLRnNkW1NT8Uxylr_X7ihbRPAsGoHvyMe-m1VDguR;SCF=AjbvCvnn6cFfvStw42Unvmy0fnmfrzDjUEgoGOutGhzsTXeUK1Ozd-6flNTG4uPUSUGT4tD-yuLOa9uyDQGhko0.;M_WEIBOCN_PARAMS=uicode%3D20000174%26featurecode%3D20000320%26fid%3Dhotword;H5_INDEX_TITLE=Mmengxiaoo0;H5_INDEX=0_all;ALF=1524726828;_T_WM=136847b9a3ccf2b122897afe9e7d254e', 17 | 'DNT' : '1', 18 | 'Connection' : 'keep-alive', 19 | } 20 | i = 120 21 | comment_num = 1 22 | while True: 23 | # if i==1: # 爬热门评论 24 | # r = requests.get(url = url.format(i),headers = headers) 25 | # comment_page = r.json()[1]['card_group'] 26 | # else: 27 | # r = requests.get(url = url.format(i),headers = headers) 28 | # comment_page = r.json()[0]['card_group'] 29 | r = requests.get(url=url.format(i), headers=headers) # 爬时间排序评论 30 | try: 31 | comment_page = r.json()['data']['data'] 32 | except Exception as e: 33 | print (e.args) 34 | print (r) 35 | print (r.text) 
36 | 37 | 38 | 39 | if r.status_code == 200: 40 | try: 41 | print('正在读取第 %s 页评论:' % i) 42 | for j in range(0,len(comment_page)): 43 | print('第 %s 条评论' % comment_num) 44 | user = comment_page[j] 45 | print(user) 46 | comment_id = user['user']['id'] 47 | print(comment_id) 48 | user_name = user['user']['screen_name'] 49 | print(user_name) 50 | created_at = user['created_at'] 51 | print(created_at) 52 | text = re.sub('<.*?>|回复<.*?>:|[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]', '', user['text']) 53 | print(text) 54 | like_num = user['like_counts'] 55 | print(like_num) 56 | source = re.sub('[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]', '', user['source']) 57 | print(source + '\r\n') 58 | conn = pymysql.connect(host='127.0.0.1', user='root', password='xxxxx', charset='utf8', use_unicode=False) 59 | cur = conn.cursor() 60 | sql = "insert into weibo.response(comment_id,user_name,created_at,text,like_num,source) values(%s,%s,%s,%s,%s,%s)" 61 | param = (comment_id,user_name,created_at,text,like_num,source) 62 | try: 63 | A = cur.execute(sql, param) 64 | conn.commit() 65 | except Exception as e: 66 | print(e) 67 | conn.rollback() 68 | comment_num+=1 69 | 70 | i+=1 71 | time.sleep(0.2) 72 | except: 73 | i+1 74 | pass 75 | else: 76 | break -------------------------------------------------------------------------------- /emotion_analyse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import jieba 3 | import numpy as np 4 | 5 | def open_dict(Dict = 'open',path = r'/Users/mengxiao/PycharmProjects/emotion/'): 6 | path = path + '%s.txt' % Dict 7 | dictionary = open(path,'r',encoding='utf-8') 8 | dict = [] 9 | for word in dictionary: 10 | word = word.strip('\n') 11 | dict.append(word) 12 | return dict #返回列表 13 | 14 | def judge(num): 15 | if (num % 2) == 0: 16 | return 'even' 17 | else: 18 | return 'odd' 19 | 20 | deny_word = open_dict(Dict = '否定词',path = r'/Users/mengxiao/Textming/') 21 | 
posdict = open_dict(Dict='positive', path=r'/Users/mengxiao/Textming/')
negdict = open_dict(Dict='negative', path=r'/Users/mengxiao/Textming/')
degree_word = open_dict(Dict='程度级别词语', path=r'/Users/mengxiao/Textming/')

# Degree adverbs grouped by weight, strongest first.  The dictionary file marks
# each group with a header line: 'extreme', 'very', 'more', 'ish', 'last'.
mostdict = degree_word[degree_word.index('extreme') + 1: degree_word.index('very')]  # weight x4.0
verydict = degree_word[degree_word.index('very') + 1: degree_word.index('more')]     # weight x3.0
moredict = degree_word[degree_word.index('more') + 1: degree_word.index('ish')]      # weight x2.0
ishdict = degree_word[degree_word.index('ish') + 1: degree_word.index('last')]       # weight x0.5


def sentiment_score_list(dataset):
    """Score dataset sentence by sentence (split on '。').

    Each sentiment word starts at score 1, is scaled by any degree adverbs found
    between the previous sentiment word and this one, and is flipped in sign
    when an odd number of negation words precedes it.  An exclamation mark adds
    an emphasis bonus when a sentiment word occurs anywhere in the sentence.
    Returns a list with one [[pos, neg]] entry per sentence, both non-negative.
    """
    seg_sentence = dataset.split('。')

    count1 = []
    count2 = []
    for sen in seg_sentence:
        segtmp = jieba.lcut(sen, cut_all=False)  # word list for this sentence
        i = 0          # position of the word being scanned
        j = 0          # position just after the previous sentiment word
        poscount = 0   # current positive word's raw score
        poscount2 = 0  # positive score after negation flip
        poscount3 = 0  # accumulated positive score (incl. exclamation bonus)
        negcount = 0
        negcount2 = 0
        negcount3 = 0

        for word in segtmp:
            if word in posdict:  # positive sentiment word
                poscount += 1
                c = 0  # number of negation words seen before this word
                # Scan modifiers between the previous sentiment word and here.
                for w in segtmp[j:i]:
                    if w in mostdict:
                        poscount *= 4.0
                    elif w in verydict:
                        poscount *= 3.0
                    elif w in moredict:
                        poscount *= 2.0
                    elif w in ishdict:
                        poscount *= 0.5
                    elif w in deny_word:
                        c += 1
                if judge(c) == 'odd':  # odd number of negations flips polarity
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0
                else:
                    poscount3 = poscount + poscount2 + poscount3
                    poscount = 0
                j = i + 1  # next modifier window starts after this word

            elif word in negdict:  # negative sentiment word (mirror logic)
                negcount += 1
                d = 0
                for w in segtmp[j:i]:
                    if w in mostdict:
                        negcount *= 4.0
                    elif w in verydict:
                        negcount *= 3.0
                    elif w in moredict:
                        negcount *= 2.0
                    elif w in ishdict:
                        negcount *= 0.5
                    elif w in deny_word:
                        d += 1
                if judge(d) == 'odd':
                    negcount *= -1.0
                    negcount2 += negcount
                    negcount = 0
                    negcount3 = negcount + negcount2 + negcount3
                    negcount2 = 0
                else:
                    negcount3 = negcount + negcount2 + negcount3
                    negcount = 0
                j = i + 1

            elif word == '!' or word == '!':  # exclamation mark (half/full width)
                # Emphasis bonus if the sentence contains any sentiment word.
                # BUG FIX: the original tested "w2 in posdict or negdict",
                # which is always true because negdict is a non-empty list.
                for w2 in segtmp[::-1]:
                    if w2 in posdict or w2 in negdict:
                        poscount3 += 2
                        negcount3 += 2
                        break
            i += 1  # advance the scan position

        # Fold any negative accumulations into the opposite polarity so the
        # returned pair is always non-negative.
        pos_count = 0
        neg_count = 0
        if poscount3 < 0 and negcount3 > 0:
            neg_count += negcount3 - poscount3
            pos_count = 0
        elif negcount3 < 0 and poscount3 > 0:
            pos_count = poscount3 - negcount3
            neg_count = 0
        elif poscount3 < 0 and negcount3 < 0:
            neg_count = -poscount3
            pos_count = -negcount3
        else:
            pos_count = poscount3
            neg_count = negcount3

        count1.append([pos_count, neg_count])
        count2.append(count1)
        count1 = []

    return count2


def sentiment_score(senti_score_list):
    """Collapse each sentence's [[pos, neg]] entry into one [Pos, Neg] pair."""
    score = []
    for review in senti_score_list:
        score_array = np.array(review)
        Pos = np.sum(score_array[:, 0])
        Neg = np.sum(score_array[:, 1])
        score.append([Pos, Neg])
    return score


data1 = '你就是个王八蛋,混账玩意!你们的手机真不好用!我非常生气!!!!'
data2 = '我好开心啊,非常非常非常高兴!今天我得了一百分,我很兴奋开心,愉快,开心!'

print(sentiment_score(sentiment_score_list(data1)))
print(sentiment_score(sentiment_score_list(data2)))