├── .DS_Store
├── .gitattributes
├── README.md
├── bg.jpg
├── font.ttf
├── weiboAPI.py
├── xuenlp.py
└── xueweibo.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/otakurice/weibonlp/4bb4294acd0aac7fc4386efcb5daaf1dcd949351/.DS_Store


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # weibonlp
 2 | #### 功能介绍
 3 | weiboAPI.py功能包含：
 4 | - 通过调用微博API的方法将微博评论写入数据库
 5 | 
 6 | xueweibo.py功能包含：
 7 | - 爬取微博评论并写入数据库
 8 | 
 9 | xuenlp.py功能包含：
10 | - 读取数据库并进行数据去重
11 | - 对微博评论进行情感分析并生成统计结果
12 | - 统计微博评论中的表情排行
13 | - 统计微博评论中的粉丝排行前20
14 | 
15 | #### 博客
16 | - 代码可结合文章阅读：https://mp.weixin.qq.com/s/a0904t-7Yvhi0VO_n-6CEw
17 | 
18 | #### 关于我
19 | - 一个特别擅长不务正业的人
20 | - 一个脱离了高级趣味的人
21 | - 一个努力改变生活的人
22 | - 一个逗比&懵逼的妹几
23 | 
24 | #### 2017.11
25 | 由于初次使用nlp以及能力有限，代码比较简陋，希望各位喜欢。
26 | 
27 | #### 其他
28 | 
29 | 我的公众平台，欢迎关注：
30 | *   微信公众号：大吉大利小米酱
31 | ![公众号](https://upload-images.jianshu.io/upload_images/5588611-5c96affae52d5082.jpg?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
32 | *   简书：[大吉大利小米酱](https://www.jianshu.com/u/8e45f2f3b6c1)
33 | *   知乎：[大吉大利小米酱](https://www.zhihu.com/people/otakurice/activities)
34 | *   CSDN：[大吉大利小米酱](https://blog.csdn.net/vermilion1990)
35 | *   GitHub：[https://github.com/otakurice](https://github.com/otakurice)
36 | 
37 | 有想法交流的可以由下面几种方式联系我：
38 | *   知识星球：
39 | ![知识星球](https://upload-images.jianshu.io/upload_images/5588611-c0e4068f081d8fce.jpg?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
40 | *   Email：vermilion1990@126.com
41 | *   QQ：616814925(请注明联系目的)
42 | 
43 | 如果你觉得我写的东西对你有所帮助，可以扫描下面的打(yao)赏(fan)二维码给我微信转账支持我，谢谢~
44 | ![](https://upload-images.jianshu.io/upload_images/5588611-d4bd4e040ed0e02c.jpeg?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
45 | 


--------------------------------------------------------------------------------
/bg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/otakurice/weibonlp/4bb4294acd0aac7fc4386efcb5daaf1dcd949351/bg.jpg


--------------------------------------------------------------------------------
/font.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/otakurice/weibonlp/4bb4294acd0aac7fc4386efcb5daaf1dcd949351/font.ttf


--------------------------------------------------------------------------------
/weiboAPI.py:
--------------------------------------------------------------------------------
 1 | # encoding:UTF-8
 2 | from weibo import APIClient 
 3 | import webbrowser
 4 | import pymysql,re,time
 5 | 
 6 | import sys
 7 | reload(sys)
 8 | sys.setdefaultencoding('utf-8')
 9 | 
10 | APP_KEY = '你的App Key ' #获取的App Key 
11 | APP_SECRET = '你的AppSecret' #获取的AppSecret 
12 | CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html' #回调链接 
13 | 
14 | client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL) 
15 | url = client.get_authorize_url() 
16 | webbrowser.open_new(url) #打开默认浏览器获取code参数 
17 | 
18 | print '输入url中code后面的内容后按回车键：'
19 | 
20 | code = raw_input()
21 | r = client.request_access_token(code)
22 | access_token = r.access_token
23 | expires_in = r.expires_in
24 | client.set_access_token(access_token, expires_in)
25 | 
26 | comment_num = 1
27 | i = 1
28 | 
29 | while True:
30 | 	r = client.comments.show.get(id = 4154417035431509,count = 200,page = i)
31 | 	if len(r.comments):
32 | 		print '第 %s 页' % i
33 | 		for st in r.comments:
34 | 			print '第 %s 条评论' % comment_num
35 | 			created_at = st.created_at
36 | 			comment_id = st.id
37 | 			text = re.sub('回复.*?:','',str(st.text))
38 | 			source = re.sub('<.*?>|</a>','',str(st.source))
39 | 			user_name = st.user.screen_name
40 | 			followers = st.user.followers_count
41 | 			follow = st.user.friends_count
42 | 			province = st.user.province
43 | 			print created_at
44 | 			print comment_id
45 | 			print text
46 | 			print source
47 | 			print '评论者：%s,粉丝数：%s,关注数：%s,所在省份编号：%s' % (user_name,followers,follow,province)
48 | 			print '\n'
49 | 			conn =pymysql.connect(host='127.0.0.1',user='root',password='1314',charset="utf8",use_unicode = False)
50 | 			cur = conn.cursor()
51 | 			sql = "insert into xue.xueresponse(created_at,comment_id,text,source,user_name,followers,follow,province) values(%s,%s,%s,%s,%s,%s,%s,%s)"
52 | 			param = (created_at,comment_id,text,source,user_name,followers,follow,province)
53 | 			try:
54 | 			    A = cur.execute(sql,param)
55 | 			    conn.commit()
56 | 			except Exception,e:
57 | 			    print(e)
58 | 			    conn.rollback()
59 | 			comment_num+=1
60 | 		i+=1
61 | 		time.sleep(4)
62 | 	else:
63 | 		break
64 | 


--------------------------------------------------------------------------------
/xuenlp.py:
--------------------------------------------------------------------------------
 1 | # encoding:UTF-8
 2 | import pymysql,re
 3 | import jieba
 4 | import jieba.posseg as pseg
 5 | import pandas as pd
 6 | import numpy as np
 7 | import matplotlib.pyplot as plt
 8 | from scipy.misc import imread
 9 | from snownlp import SnowNLP
10 | from wordcloud import WordCloud,ImageColorGenerator
11 | from collections import Counter
12 | 
def readmysql(max_id=10000):
    """Load comments from ``nlp.love_guan`` and drop duplicate rows.

    The first column of every row (the table's ``id``) is ignored; the next
    six columns are treated as ``comment_id, user_name, created_at, text,
    likenum, source``.  Two rows are duplicates when those six values are
    all equal; only the first occurrence is kept.

    Args:
        max_id: Only rows whose ``id`` is strictly below this value are read.
            Defaults to 10000, the limit that used to be hard-coded.

    Returns:
        A ``(commentlist, userlist, textlist)`` tuple where ``commentlist``
        holds one six-element list per unique row, ``userlist`` the user name
        of every unique row, and ``textlist`` the non-empty comment texts.
    """
    commentlist = []
    textlist = []
    userlist = []
    seen = set()  # O(1) duplicate detection instead of scanning commentlist
    conn = pymysql.connect(host='服务器IP',user='用户名',password='密码',charset="utf8")    # connect to the server
    try:
        cur = conn.cursor()
        # Let the driver bind the value instead of formatting it into the SQL text.
        cur.execute("SELECT * FROM nlp.love_guan WHERE id < %s", (max_id,))
        for row in cur.fetchall():
            record = tuple(row[1:7])  # skip the id column
            if record in seen:
                continue
            seen.add(record)
            # Unpacking also fails loudly if the row has fewer columns than expected.
            _comment_id, user_name, _created_at, text, _likenum, _source = record
            commentlist.append(list(record))
            userlist.append(user_name)
            if text:
                textlist.append(text)
    finally:
        # Close explicitly rather than relying on ``with conn:``, whose meaning
        # differs between PyMySQL releases.
        conn.close()
    return commentlist,userlist,textlist
38 | 
def wordtocloud(textlist):
    """Build a word cloud from the comments and save it as ``微博评论词云.png``.

    Every comment is segmented with jieba, the tokens are joined with spaces,
    and the result is rendered with ``bg.jpg`` as both the mask and the
    colour source.  The image is also drawn on the matplotlib figure "wordc".

    Args:
        textlist: Iterable of comment strings.
    """
    back_coloring = imread("bg.jpg")
    cloud = WordCloud(font_path='font.ttf', # required for Chinese text; without it the characters render as empty boxes
            background_color="white",  # background colour
            max_words=2000,  # maximum number of words shown in the cloud
            mask=back_coloring,  # mask / background image
            max_font_size=100,  # largest font size
            random_state=42,
            width=1000, height=860, margin=2,# default image size; with a mask image the saved picture takes the mask's size. margin is the spacing around words
            )
    # Separate comments with a space as well: without it the last token of one
    # comment and the first token of the next fuse into a single bogus word.
    fulltext = ' '.join(
        ' '.join(jieba.cut(li,cut_all = False)) for li in textlist
    )
    wc = cloud.generate(fulltext)
    image_colors = ImageColorGenerator(back_coloring)
    plt.figure("wordc")
    plt.imshow(wc.recolor(color_func=image_colors))
    wc.to_file('微博评论词云.png')
58 | 
def snowanalysis(textlist):
    """Plot a histogram of SnowNLP sentiment scores for the comments.

    Args:
        textlist: Iterable of comment strings; each one is scored with
            ``SnowNLP(text).sentiments``.
    """
    sentimentslist = [SnowNLP(li).sentiments for li in textlist]
    plt.figure("sentiment")
    # 51 edges give 50 bins of width 0.02 over the whole range 0..1.
    # np.arange(0, 1, 0.02) stops at 0.98, so scores above 0.98 fell outside
    # the last bin and were silently left out of the plot.
    plt.hist(sentimentslist,bins=np.linspace(0,1,51))
    plt.show()
69 | 
def emojilist(textlist):
    """Count the bracketed tokens (e.g. ``[doge]``) found in the comments and print the tally.

    Args:
        textlist: Iterable of comment strings.

    Prints:
        A ``collections.Counter`` mapping each bracketed token (brackets
        included) to the number of times it occurs.
    """
    # Raw string: "\[" in a normal string literal is an invalid escape sequence.
    # Compiled once here instead of once per comment.
    emoji_pattern = re.compile(r'(\[.*?\])',re.S)
    emojidict = Counter()
    for li in textlist:
        emojidict.update(emoji_pattern.findall(li))
    print(emojidict)
79 | 
def follows(textlist):
    """Print the 20 most frequent entries of the given list with their counts.

    The entry point passes the list of commenter names, so the output is the
    ranking of the most active commenters.

    Args:
        textlist: Iterable of hashable items to rank.  The parameter name is
            kept for backward compatibility; it is expected to hold user
            names, not comment texts.
    """
    # Count the argument itself.  Reading the module-level ``userlist`` made
    # the function raise NameError whenever the module was imported.
    userdict = Counter(textlist)
    print(userdict.most_common(20))


if __name__=='__main__':
    # Run the whole analysis pipeline.
    commentlist,userlist,textlist = readmysql()
    wordtocloud(textlist)
    snowanalysis(textlist)
    emojilist(textlist)
    follows(userlist)
91 | 


--------------------------------------------------------------------------------
/xueweibo.py:
--------------------------------------------------------------------------------
 1 | # encoding:UTF-8
 2 | import pymysql,re,time,requests,urllib.request
 3 | from collections import OrderedDict
 4 | 
 5 | #薛回应P图 4154417035431509
 6 | #李转账捐款 4155545118733236
 7 | #鹿晗微博 4160547165300149
 8 | #关晓彤微博 4160547694498927
 9 | weibo_id = input('输入单条微博ID：')
10 | # url='https://m.weibo.cn/single/rcList?format=cards&id=' + weibo_id + '&type=comment&hot=1&page={}' #爬热门评论
11 | url='https://m.weibo.cn/api/comments/show?id=' + weibo_id + '&page={}' #爬时间排序评论
12 | headers = {
13 |     'User-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
14 |     'Host' : 'm.weibo.cn',
15 |     'Accept' : 'application/json, text/plain, */*',
16 |     'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
17 |     'Accept-Encoding' : 'gzip, deflate, br',
18 |     'Referer' : 'https://m.weibo.cn/status/' + weibo_id,
19 |     'Cookie' : '_T_WM=e25a28bec35b27c72d37ae2104433873; WEIBOCN_WM=3349; H5_wentry=H5; backURL=http%3A%2F%2Fm.weibo.cn%2F; SUB=_2A250zXayDeThGeVJ7VYV8SnJyTuIHXVUThr6rDV6PUJbkdBeLRDzkW1FrGCo75fsx_qRR822fcI2HoErRQ..; SUHB=0sqRDiYRHXFJdM; SCF=Ag4UgBbd7u4DMdyvdAjGRMgi7lfo6vB4Or8nQI4-9HQ4cLYm_RgdaeTdAH_68X4EbewMK-X4JMj5IQeuQUymxxc.; SSOLoginState=1506346722; M_WEIBOCN_PARAMS=featurecode%3D20000320%26oid%3D3638527344076162%26luicode%3D10000011%26lfid%3D1076031239246050; H5_INDEX=3; H5_INDEX_TITLE=%E8%8A%82cao%E9%85%B1',
20 |     'DNT' : '1',
21 |     'Connection' : 'keep-alive',
22 |     }
23 | i = 0
24 | comment_num = 1
25 | while True:
26 |     # if i==1:     #26-31行 爬热门评论
27 |     #     r = requests.get(url = url.format(i),headers = headers)
28 |     #     comment_page = r.json()[1]['card_group']
29 |     # else:
30 |     #     r = requests.get(url = url.format(i),headers = headers)
31 |     #     comment_page = r.json()[0]['card_group']
32 |     r = requests.get(url = url.format(i),headers = headers)  #32-33行 爬时间排序评论
33 |     comment_page = r.json()['data']
34 |     if r.status_code ==200:
35 |         try:
36 |             print('正在读取第 %s 页评论：' % i)
37 |             for j in range(0,len(comment_page)):
38 |                 print('第 %s 条评论' % comment_num)
39 |                 user = comment_page[j]
40 |                 comment_id = user['user']['id']
41 |                 print(comment_id)
42 |                 user_name = user['user']['screen_name']
43 |                 print(user_name)
44 |                 created_at = user['created_at']
45 |                 print(created_at)
46 |                 text = re.sub('<.*?>|回复<.*?>:|[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]','',user['text'])
47 |                 print(text)
48 |                 likenum = user['like_counts']
49 |                 print(likenum)
50 |                 source = re.sub('[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]','',user['source'])
51 |                 print(source + '\r\n')
52 |                 conn =pymysql.connect(host='服务器IP(默认是127.0.0.1)',user='服务器名(默认是root)',password='服务器密码',charset="utf8",use_unicode = False)    #连接服务器
53 |                 cur = conn.cursor()
54 |                 sql = "insert into nlp.love_guan(comment_id,user_name,created_at,text,likenum,source) values(%s,%s,%s,%s,%s,%s)"
55 |                 param = (comment_id,user_name,created_at,text,likenum,source)
56 |                 try:
57 |                     A = cur.execute(sql,param)
58 |                     conn.commit()
59 |                 except Exception as e:
60 |                     print(e)
61 |                     conn.rollback()
62 |                 comment_num+=1
63 |             i+=1
64 |             time.sleep(3)
65 |         except:
66 |             i+1
67 |             pass
68 |     else:
69 |         break


--------------------------------------------------------------------------------