├── .DS_Store ├── .gitattributes ├── README.md ├── bg.jpg ├── font.ttf ├── weiboAPI.py ├── xuenlp.py └── xueweibo.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/otakurice/weibonlp/4bb4294acd0aac7fc4386efcb5daaf1dcd949351/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # weibonlp 2 | #### 功能介绍 3 | weiboAPI.py功能包含: 4 | - 通过调用微博API的方法将微博评论写入数据库 5 | 6 | xueweibo.py功能包含: 7 | - 爬取微博评论并写入数据库 8 | 9 | xuenlp.py功能包含: 10 | - 读取数据库并进行数据去重 11 | - 对微博评论进行情感分析并生成统计结果 12 | - 统计微博评论中的表情排行 13 | - 统计微博评论中的粉丝排行前20 14 | 15 | #### 博客 16 | - 代码可结合文章阅读:https://mp.weixin.qq.com/s/a0904t-7Yvhi0VO_n-6CEw 17 | 18 | #### 关于我 19 | - 一个特别擅长不务正业的人 20 | - 一个脱离了高级趣味的人 21 | - 一个努力改变生活的人 22 | - 一个逗比&懵逼的妹几 23 | 24 | #### 2017.11 25 | 由于初次使用nlp以及能力有限,代码比较简陋,希望各位喜欢。 26 | 27 | #### 其他 28 | 29 | 我的公众平台,欢迎关注: 30 | * 微信公众号:大吉大利小米酱 31 | ![公众号](https://upload-images.jianshu.io/upload_images/5588611-5c96affae52d5082.jpg?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 32 | * 简书:[大吉大利小米酱](https://www.jianshu.com/u/8e45f2f3b6c1) 33 | * 知乎:[大吉大利小米酱](https://www.zhihu.com/people/otakurice/activities) 34 | * CSDN:[大吉大利小米酱](https://blog.csdn.net/vermilion1990) 35 | * GitHub:[https://github.com/otakurice](https://github.com/otakurice) 36 | 37 | 有想法交流的可以由下面几种方式联系我: 38 | * 知识星球: 39 | ![知识星球](https://upload-images.jianshu.io/upload_images/5588611-c0e4068f081d8fce.jpg?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 40 | * Email:vermilion1990@126.com 41 | * QQ:616814925(请注明联系目的) 42 | 43 | 如果你觉得我写的东西对你有所帮助,可以扫描下面的打(yao)赏(fan)二维码给我微信转账支持我,谢谢~ 44 | 
# encoding:UTF-8
# ----------------------------------------------------------------------------
# Repository-dump residue that shared these lines with the code below.
# Kept verbatim as comments so no information is lost.
#
# /README.md (last line):
#   ![](https://upload-images.jianshu.io/upload_images/5588611-d4bd4e040ed0e02c.jpeg?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
# /bg.jpg:
#   https://raw.githubusercontent.com/otakurice/weibonlp/4bb4294acd0aac7fc4386efcb5daaf1dcd949351/bg.jpg
# /font.ttf:
#   https://raw.githubusercontent.com/otakurice/weibonlp/4bb4294acd0aac7fc4386efcb5daaf1dcd949351/font.ttf
# ----------------------------------------------------------------------------


# ============================================================================
# /weiboAPI.py
#
# Fetch the comments of one Weibo post through the official API and insert
# each comment into the MySQL table `xue.xueresponse`.
# Written so that it runs on both Python 2 and Python 3.
# ============================================================================
from weibo import APIClient
import webbrowser
import pymysql
import re
import time

import sys

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str()/unicode conversions use UTF-8 so that
    # str(st.text) below does not fail on Chinese text.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

APP_KEY = '你的App Key '  # the App Key issued for your application
APP_SECRET = '你的AppSecret'  # the App Secret issued for your application
CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'  # OAuth callback URL

client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL)
url = client.get_authorize_url()
webbrowser.open_new(url)  # open the default browser to obtain the `code` parameter

print('输入url中code后面的内容后按回车键:')

code = _read_line()
token = client.request_access_token(code)
access_token = token.access_token
expires_in = token.expires_in
client.set_access_token(access_token, expires_in)

comment_num = 1  # running counter over all comments, used only for progress output
i = 1            # page number requested from the API

insert_sql = "insert into xue.xueresponse(created_at,comment_id,text,source,user_name,followers,follow,province) values(%s,%s,%s,%s,%s,%s,%s,%s)"

# One connection for the whole run, always closed at the end.  The previous
# version opened a new connection for every single comment and never closed it.
api_conn = pymysql.connect(host='127.0.0.1', user='root', password='1314', charset="utf8", use_unicode=False)
try:
    api_cur = api_conn.cursor()
    while True:
        page = client.comments.show.get(id=4154417035431509, count=200, page=i)
        if not page.comments:
            # An empty page means there is nothing left to fetch.
            break
        print('第 %s 页' % i)
        for st in page.comments:
            print('第 %s 条评论' % comment_num)
            created_at = st.created_at
            comment_id = st.id
            # Drop the leading "回复...:" (reply-to) prefix from the comment text.
            text = re.sub('回复.*?:', '', str(st.text))
            # Strip markup tags from the client/source field.
            source = re.sub('<.*?>|', '', str(st.source))
            user_name = st.user.screen_name
            followers = st.user.followers_count
            follow = st.user.friends_count
            province = st.user.province
            print(created_at)
            print(comment_id)
            print(text)
            print(source)
            print('评论者:%s,粉丝数:%s,关注数:%s,所在省份编号:%s' % (user_name, followers, follow, province))
            print('\n')
            param = (created_at, comment_id, text, source, user_name, followers, follow, province)
            try:
                api_cur.execute(insert_sql, param)
                api_conn.commit()
            except Exception as e:
                # Best effort: report the failed row, undo it, keep going.
                print(e)
                api_conn.rollback()
            comment_num += 1
        i += 1
        time.sleep(4)  # pause between pages
finally:
    api_conn.close()


# ============================================================================
# /xuenlp.py
#
# Read stored comments from MySQL, de-duplicate them, then build a word cloud,
# a sentiment histogram, an emoji count and a top-20 commenter ranking.
# ============================================================================
# encoding:UTF-8
import pymysql
import re
import jieba
import jieba.posseg as pseg
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread no longer exists in current SciPy releases;
    # matplotlib's reader (already a dependency of this file) is used instead.
    from matplotlib.pyplot import imread
from snownlp import SnowNLP
from wordcloud import WordCloud, ImageColorGenerator
from collections import Counter


def readmysql():  # read the database
    """Load comments from `nlp.love_guan` and drop exact duplicates.

    Returns:
        A tuple ``(commentlist, userlist, textlist)`` where
        ``commentlist`` holds one 6-item list per unique row
        (columns 1..6 of the table, i.e. everything after the leading id),
        ``userlist`` holds the second of those items for every unique row, and
        ``textlist`` holds the fourth item for every unique row where it is
        non-empty.

    NOTE(review): the source this was recovered from had lost its indentation;
    the user/text collection is assumed to belong inside the de-duplication
    branch -- confirm against the original file.
    """
    commentlist = []
    textlist = []
    userlist = []
    seen = set()  # tuples of already-collected rows: O(1) duplicate test

    conn = pymysql.connect(host='服务器IP', user='用户名', password='密码', charset="utf8")  # connect to the server
    try:
        cur = conn.cursor()
        cur.execute("SELECT * FROM nlp.love_guan WHERE id < '%d'" % 10000)
        rows = cur.fetchall()
    finally:
        # Close explicitly so the connection is released on every PyMySQL version.
        conn.close()

    for row in rows:
        record = list(row[1:7])  # skip the leading id column
        key = tuple(record)
        if key in seen:
            continue
        seen.add(key)
        commentlist.append(record)
        # record layout: comment_id, user_name, created_at, text, likenum, source
        userlist.append(record[1])
        text = record[3]
        if text:
            textlist.append(text)
    return commentlist, userlist, textlist


def wordtocloud(textlist):
    """Segment every comment with jieba and save a word cloud image.

    Args:
        textlist: iterable of comment strings.

    Writes '微博评论词云.png' and draws the recoloured cloud on the
    matplotlib figure named "wordc".  Does nothing when there is no text.
    """
    back_coloring = imread("bg.jpg")
    cloud = WordCloud(
        font_path='font.ttf',      # needed for Chinese text, otherwise glyphs render as empty boxes
        background_color="white",  # background colour
        max_words=2000,            # maximum number of words shown
        mask=back_coloring,        # background image used as the mask
        max_font_size=100,         # largest font size
        random_state=42,
        # default image size; with a mask image the saved picture takes the
        # mask's size instead. margin is the spacing around each word.
        width=1000, height=860, margin=2,
    )
    # Join tokens with a space *between* comments as well: the previous
    # `fulltext += ' '.join(...)` glued the last token of one comment to the
    # first token of the next.
    fulltext = ' '.join(
        ' '.join(jieba.cut(li, cut_all=False)) for li in textlist
    )
    if not fulltext.strip():
        # Nothing to draw.
        return
    wc = cloud.generate(fulltext)
    image_colors = ImageColorGenerator(back_coloring)
    plt.figure("wordc")
    plt.imshow(wc.recolor(color_func=image_colors))
    wc.to_file('微博评论词云.png')


def snowanalysis(textlist):
    """Plot a histogram of SnowNLP sentiment scores for the given comments.

    Args:
        textlist: iterable of comment strings.
    """
    sentimentslist = [SnowNLP(li).sentiments for li in textlist]
    plt.figure("sentiment")
    # 50 bins of width 0.02 covering the whole [0, 1] range.  The previous
    # np.arange(0, 1, 0.02) stopped at 0.98, so scores above 0.98 were
    # silently left out of the plot.
    plt.hist(sentimentslist, bins=np.linspace(0, 1, 51))
    plt.show()


def emojilist(textlist):
    """Count the bracketed tokens such as ``[xxx]`` in the comments and print the tally.

    Args:
        textlist: iterable of comment strings.
    """
    bracket_token = re.compile(r'(\[.*?\])', re.S)
    emojidict = Counter(
        emoji for li in textlist for emoji in bracket_token.findall(li)
    )
    print(emojidict)


def follows(textlist, users=None):
    """Print the 20 most frequent commenter names.

    Args:
        textlist: accepted for call compatibility; not used.
        users: optional iterable of user names.  When omitted, the
            module-level ``userlist`` created by the ``__main__`` block is
            used, which is what the original implementation always did.
    """
    if users is None:
        users = userlist
    userdict = Counter(users)
    print(userdict.most_common(20))


if __name__ == '__main__':
    # run
    commentlist, userlist, textlist = readmysql()
    wordtocloud(textlist)
    snowanalysis(textlist)
    emojilist(textlist)
    follows(textlist)

# ----------------------------------------------------------------------------
# /xueweibo.py: (next file in the dump follows)
# ----------------------------------------------------------------------------
# encoding:UTF-8
# Crawl the comments of a single Weibo post from m.weibo.cn page by page and
# insert each comment into the MySQL table `nlp.love_guan`.
import pymysql
import re
import time
import requests
import urllib.request
from collections import OrderedDict

# Post IDs the author used:
#   Xue's response about the edited picture   4154417035431509
#   Li's transfer / donation                  4155545118733236
#   Lu Han's post                             4160547165300149
#   Guan Xiaotong's post                      4160547694498927
weibo_id = input('输入单条微博ID:')
# Alternative endpoint for *hot* comments (kept for reference):
# url='https://m.weibo.cn/single/rcList?format=cards&id=' + weibo_id + '&type=comment&hot=1&page={}'
url = 'https://m.weibo.cn/api/comments/show?id=' + weibo_id + '&page={}'  # comments ordered by time
headers = {
    'User-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
    'Host' : 'm.weibo.cn',
    'Accept' : 'application/json, text/plain, */*',
    'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding' : 'gzip, deflate, br',
    'Referer' : 'https://m.weibo.cn/status/' + weibo_id,
    # NOTE(review): a real session cookie is hard-coded here. It should be
    # supplied at run time (environment/config) rather than committed.
    'Cookie' : '_T_WM=e25a28bec35b27c72d37ae2104433873; WEIBOCN_WM=3349; H5_wentry=H5; backURL=http%3A%2F%2Fm.weibo.cn%2F; SUB=_2A250zXayDeThGeVJ7VYV8SnJyTuIHXVUThr6rDV6PUJbkdBeLRDzkW1FrGCo75fsx_qRR822fcI2HoErRQ..; SUHB=0sqRDiYRHXFJdM; SCF=Ag4UgBbd7u4DMdyvdAjGRMgi7lfo6vB4Or8nQI4-9HQ4cLYm_RgdaeTdAH_68X4EbewMK-X4JMj5IQeuQUymxxc.; SSOLoginState=1506346722; M_WEIBOCN_PARAMS=featurecode%3D20000320%26oid%3D3638527344076162%26luicode%3D10000011%26lfid%3D1076031239246050; H5_INDEX=3; H5_INDEX_TITLE=%E8%8A%82cao%E9%85%B1',
    'DNT' : '1',
    'Connection' : 'keep-alive',
}

# Patterns compiled once instead of on every comment.
# TEXT_CLEANUP removes markup tags, the "回复<...>:" reply prefix and
# characters outside the Basic Multilingual Plane (e.g. emoji).
TEXT_CLEANUP = re.compile('<.*?>|回复<.*?>:|[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]')
# SOURCE_CLEANUP removes only the non-BMP characters.
SOURCE_CLEANUP = re.compile('[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]')

insert_sql = "insert into nlp.love_guan(comment_id,user_name,created_at,text,likenum,source) values(%s,%s,%s,%s,%s,%s)"

i = 0            # page number substituted into the URL
comment_num = 1  # running counter over all comments, used only for progress output

# One connection for the whole crawl, always closed at the end.  The previous
# version opened a new connection for every comment and never closed any.
conn = pymysql.connect(host='服务器IP(默认是127.0.0.1)', user='服务器名(默认是root)', password='服务器密码', charset="utf8", use_unicode=False)  # connect to the server
try:
    cur = conn.cursor()
    while True:
        # For the hot-comment endpoint the payload would be read from
        # r.json()[1]['card_group'] on page 1 and r.json()[0]['card_group']
        # on the other pages (per the original commented-out code).
        r = requests.get(url=url.format(i), headers=headers)
        if r.status_code != 200:
            break
        try:
            comment_page = r.json()['data']
        except (ValueError, KeyError, TypeError):
            # Body is not JSON or carries no 'data' entry: nothing more to
            # read.  Previously this lookup ran before the status check and
            # outside any handler, so the crawl ended with a traceback.
            break

        print('正在读取第 %s 页评论:' % i)
        for user in comment_page:
            print('第 %s 条评论' % comment_num)
            try:
                comment_id = user['user']['id']
                print(comment_id)
                user_name = user['user']['screen_name']
                print(user_name)
                created_at = user['created_at']
                print(created_at)
                text = TEXT_CLEANUP.sub('', user['text'])
                print(text)
                likenum = user['like_counts']
                print(likenum)
                source = SOURCE_CLEANUP.sub('', user['source'])
                print(source + '\r\n')
            except (KeyError, TypeError) as e:
                # Malformed entry: report and skip it.  The old bare `except:`
                # (whose `i+1` was a no-op) made the loop re-request the same
                # page forever without any delay, and also swallowed Ctrl-C.
                print(e)
                continue
            param = (comment_id, user_name, created_at, text, likenum, source)
            try:
                cur.execute(insert_sql, param)
                conn.commit()
            except Exception as e:
                # Best effort: report the failed row, undo it, keep going.
                print(e)
                conn.rollback()
            comment_num += 1
        i += 1
        time.sleep(3)  # pause between pages
finally:
    conn.close()