├── .idea
├── Micro-blogReview.iml
├── inspectionProfiles
│ └── Project_Default.xml
├── misc.xml
├── modules.xml
├── other.xml
└── vcs.xml
├── DataAnalysis.py
├── README.md
├── WeiboApi
├── readme.md
├── sinaweibopy3.py
└── weiboAPI.py
├── modelTraining.py
├── outPut
├── final.html
├── sentiment.marshal.3
└── things.txt
├── requirements.txt
├── start.py
└── visualization.py
/.idea/Micro-blogReview.iml:
--------------------------------------------------------------------------------
1 |
2 | 应用信息>高级信息>OAuth2.0 授权设置
中的授权回调页
填写http://api.weibo.com/oauth2/default.html
8 | * 获取access_token:
9 | 将得到的App Key及App Secret值填入本程序./WeiboApi/weiboAPI.py
中对应的值,运行后将自动打开浏览器的微博OAuth2.0
页面,在url中,包含一个code=*****
,复制code值,回到本程序,将code的值填入程序运行后的输入栏,运行后输出access_token
10 | * 根据开发者文档的评论接口页面的接口说明,使用获得的access_token及相关微博文章的ID构造API链接,访问,并复制cookie,将./start.py
中链接及cookie替换。
11 | * 执行pip3 install -r requirements.txt
安装引用到的库,或直接安装[Anaconda](https://www.anaconda.com/)
12 | * 运行./start.py
,数据会自动格式化并保存至./outPut/评论.csv
13 | ## 更新日志
14 | ### 2019年12月5日
15 | * 创建项目
16 | * 分析微博手机端H5页面,得到地址爬取评论
17 | * 太难了,地址得手动获取,每页50条,且反爬机制TQL!
18 | ### 2019年12月6日
19 | * 申请了微博API
20 | * 添加了通过AppKey及AppSecret获取access_token的程序(./WeiboApi):[olwolf/sinaweibopy3](https://github.com/olwolf/sinaweibopy3)
21 | * 根据[开发者文档](https://open.weibo.com/wiki/2/comments/show)构建获取评论的链接
22 | https://api.weibo.com/2/comments/show.json?access_token=[your_access_token]&id=[微博的ID]&count=[1~200]200&page=1
23 | ### 2019年12月10日
24 | * 更新README.md,增加运行过程说明
25 | * 爬取香港相关微博的评论,数据保存为./outPut/getInfo.json
及评论内容./outPut/评论.csv
26 |
--------------------------------------------------------------------------------
/WeiboApi/readme.md:
--------------------------------------------------------------------------------
1 | # 此处是获取weiboAPI的access_token的代码
2 | 该代码引用自[github@olwolf的sinaweibopy3项目](https://github.com/olwolf/sinaweibopy3)
--------------------------------------------------------------------------------
/WeiboApi/sinaweibopy3.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import urllib.request
4 | import urllib.parse
5 | import json
6 | import logging
7 | import time
8 |
9 |
10 | # General JSON object that can bind any fields but also acts as a dict.
11 | # The class inherits from dict, so a value can be read as d['key'] or d.key.
class JsonDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``d.key`` is equivalent to ``d['key']`` for both lookup and assignment.
    """

    def __getattr__(self, attr):
        # __getattr__ only runs after normal attribute lookup has failed, so
        # real dict methods are never shadowed by keys.
        try:
            return self[attr]
        except KeyError:
            # The attribute protocol (hasattr, getattr with a default, copy,
            # pickle) expects AttributeError for a missing name, not KeyError.
            raise AttributeError(
                "'JsonDict' object has no attribute %r" % attr) from None

    def __setattr__(self, attr, value):
        # Attribute assignment is stored as a dict item, never in __dict__.
        self[attr] = value
18 |
19 |
20 | # convert json object to python object.
def _obj_hook(paris):
    """Build a JsonDict from a decoded JSON object, coercing each key to str.

    Args:
        paris: Mapping of key/value pairs produced by the JSON decoder.

    Returns:
        A JsonDict holding the same values under ``str(key)``.
    """
    return JsonDict((str(name), item) for name, item in paris.items())
26 |
27 |
28 | # Encode parameters
def _encode_params(**kw):
    """Serialize keyword arguments into a ``k=v&k=v`` query string.

    str values are UTF-8 encoded and every other value is passed through
    ``str()``; the result is percent-quoted with ``urllib.parse.quote``.
    Keys are emitted as given, without quoting.
    """

    def _pair(name, raw):
        # One "name=quoted-value" fragment.
        encoded = raw.encode('utf-8') if isinstance(raw, str) else str(raw)
        return '%s=%s' % (name, urllib.parse.quote(encoded))

    return '&'.join(_pair(name, raw) for name, raw in kw.items())
35 |
36 |
37 | # Build a multipart/form-data body with generated random boundary.
def _encode_multipart(**kw):
    """Encode keyword arguments as a multipart/form-data body.

    Values exposing ``read`` are treated as file uploads: their content is
    read, decoded as ISO-8859-1, and sent with a content type guessed from
    the extension of their ``name`` attribute. Other values are sent as plain
    fields (bytes are decoded as UTF-8).

    Returns:
        A ``(body, boundary)`` tuple; ``boundary`` is derived from the current
        time in milliseconds.
    """
    boundary = '----------%s' % hex(int(time.time() * 1000))
    parts = []
    for field, payload in kw.items():
        parts.append('--%s' % boundary)
        if not hasattr(payload, 'read'):
            # Plain form field.
            parts.extend([
                'Content-Disposition: form-data; name="%s"\r\n' % field,
                payload if isinstance(payload, str) else payload.decode('utf-8'),
            ])
            continue
        # File-like field: the extension is everything from the last dot on.
        filename = getattr(payload, 'name', '')
        dot = filename.rfind('.')
        ext = "" if dot == -1 else filename[dot:].lower()
        content = payload.read().decode('ISO-8859-1')
        parts.extend([
            'Content-Disposition: form-data; name="%s"; filename="hidden"' % field,
            'Content-Length: %d' % len(content),
            'Content-Type: %s\r\n' % _guess_content_type(ext),
            content,
        ])
    # Closing delimiter.
    parts.append('--%s--\r\n' % boundary)
    return '\r\n'.join(parts), boundary
59 |
60 |
# MIME types for uploads, keyed by lower-case file extension (dot included).
_CONTENT_TYPES = {
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.jpe': 'image/jpeg',
}


def _guess_content_type(ext):
    """Return the MIME type for *ext*, or 'application/octet-stream' if unknown."""
    try:
        return _CONTENT_TYPES[ext]
    except KeyError:
        return 'application/octet-stream'
67 |
68 |
# Request-kind codes passed as the ``method`` argument of _http_request.
_HTTP_GET = 0  # parameters are appended to the URL as a query string
_HTTP_POST = 1  # parameters are sent in the request body
_HTTP_UPLOAD = 2  # parameters are sent as a multipart/form-data body
72 |
73 |
def _http_get(url, authorization=None, **kw):
    """Log and perform a GET request; returns whatever _http_request returns."""
    message = 'GET %s' % url
    logging.info(message)
    response = _http_request(url, _HTTP_GET, authorization, **kw)
    return response
77 |
78 |
def _http_post(url, authorization=None, **kw):
    """Log and perform a POST request; returns whatever _http_request returns."""
    message = 'POST %s' % url
    logging.info(message)
    response = _http_request(url, _HTTP_POST, authorization, **kw)
    return response
82 |
83 |
def _http_upload(url, authorization=None, **kw):
    """Log and perform a multipart upload; returns whatever _http_request returns."""
    message = 'UPLOAD %s' % url
    logging.info(message)
    response = _http_request(url, _HTTP_UPLOAD, authorization, **kw)
    return response
87 |
88 |
89 | # send an http request and expect to return a json object if no error.
def _http_request(url, method, authorization, **kw):
    """Send one HTTP request and return the decoded JSON response.

    Args:
        url: Endpoint URL without a query string.
        method: One of _HTTP_GET, _HTTP_POST or _HTTP_UPLOAD.
        authorization: Token sent as ``Authorization: OAuth2 <token>``;
            a falsy value omits the header.
        **kw: Request parameters. They go into the query string for GET and
            into the request body otherwise.

    Returns:
        The response body parsed by ``json.loads`` with ``_obj_hook`` applied
        to every JSON object. A response containing ``error_code`` is logged
        and still returned, so callers can inspect it.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` on transport or
            HTTP-status failures.
        ValueError: If the response body is not valid JSON.
    """
    boundary = None
    if method == _HTTP_UPLOAD:
        params, boundary = _encode_multipart(**kw)
    else:
        params = _encode_params(**kw)

    if method == _HTTP_GET:
        http_url = '%s?%s' % (url, params)
        http_para = None
    else:
        http_url = url
        # NOTE(review): for uploads, _encode_multipart decodes file content as
        # ISO-8859-1, so re-encoding as UTF-8 here changes any byte >= 0x80.
        # Confirm whether binary uploads are actually expected to work.
        http_para = params.encode(encoding='utf-8')

    req = urllib.request.Request(http_url, data=http_para)
    if authorization:
        req.add_header('Authorization', 'OAuth2 %s' % authorization)
    if boundary:
        req.add_header('Content-Type', 'multipart/form-data; boundary=%s' % boundary)

    # Close the connection deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req) as resp:
        body = resp.read().decode("utf-8")

    result = json.loads(body, object_hook=_obj_hook)
    if 'error_code' in result:
        # Report which endpoint failed and the error payload itself.
        logging.error('API error from %s: %s', url, result)
    return result
111 |
112 |
class HttpObject(object):
    """Turn attribute access into an API call for one HTTP method.

    ``obj.statuses__public_timeline(count=5)`` requests
    ``<client.api_url>statuses/public_timeline.json`` with the method given
    at construction time; every ``__`` in the attribute name becomes ``/``.
    """

    def __init__(self, client, method):
        self.client = client
        self.method = method

    def __getattr__(self, attr):
        """Return a callable that performs the API call named by *attr*.

        Raises:
            AttributeError: For dunder names, which are protocol probes
                (copy, pickle, ...) rather than API endpoints.
        """
        if attr.startswith('__') and attr.endswith('__'):
            raise AttributeError(attr)

        def wrap(**kw):
            """Issue the request with *kw* as its parameters.

            Raises:
                AttributeError: If the client's token is missing or expired.
            """
            if self.client.is_expires():
                # Same exception type as before; now it says what went wrong.
                raise AttributeError(
                    'access token is missing or expired; cannot call %r' % attr)
            endpoint = '%s%s.json' % (self.client.api_url, attr.replace('__', '/'))
            return _http_request(endpoint, self.method,
                                 self.client.access_token, **kw)

        return wrap
129 |
130 |
131 | # APIClient class
class APIClient(object):
    """Small client for the OAuth2 authorization flow and the JSON REST API.

    API endpoints can be called dynamically through ``client.get``,
    ``client.post`` and ``client.upload`` (see HttpObject).
    """

    def __init__(self, app_key, app_secret, redirect_uri=None, response_type='code', domain='api.weibo.com',
                 version='2'):
        """Store the application credentials and derive the base URLs.

        Args:
            app_key: Application key, sent as ``client_id``.
            app_secret: Application secret, sent as ``client_secret``.
            redirect_uri: OAuth2 callback URL registered for the application.
            response_type: OAuth2 response type used in the authorize URL.
            domain: API host name.
            version: API version path segment.
        """
        self.client_id = app_key
        self.client_secret = app_secret
        self.redirect_uri = redirect_uri
        self.response_type = response_type
        self.auth_url = 'https://%s/oauth2/' % domain
        self.api_url = 'https://%s/%s/' % (domain, version)
        self.access_token = None
        self.expires = 0.0  # absolute expiry time of the token, epoch seconds
        self.get = HttpObject(self, _HTTP_GET)
        self.post = HttpObject(self, _HTTP_POST)
        self.upload = HttpObject(self, _HTTP_UPLOAD)

    def get_authorize_url(self):
        """Return the URL the user must open to authorize the application.

        Built from ``self.auth_url`` and ``self.response_type`` so that a
        custom ``domain`` or ``response_type`` is honored, with every
        parameter URL-encoded so a redirect URI containing ``?`` or ``&``
        cannot corrupt the query string.
        """
        query = urllib.parse.urlencode([
            ('response_type', self.response_type),
            ('client_id', self.client_id),
            ('redirect_uri', self.redirect_uri),
        ])
        return '%sauthorize?%s' % (self.auth_url, query)

    def request_access_token(self, code):
        """Exchange an authorization *code* for an access token.

        Returns:
            The decoded token response. When it contains ``expires_in``, that
            value is converted from a lifetime in seconds to an absolute
            epoch timestamp. A response without it (for example an error
            payload) is returned unchanged.
        """
        result = _http_post('%s%s' % (self.auth_url, 'access_token'),
                            client_id=self.client_id,
                            client_secret=self.client_secret,
                            redirect_uri=self.redirect_uri,
                            code=code, grant_type='authorization_code')
        if 'expires_in' in result:
            result.expires_in += int(time.time())
        return result

    def set_access_token(self, access_token, expires_in):
        """Remember the token and its absolute expiry time (epoch seconds)."""
        self.access_token = str(access_token)
        self.expires = float(expires_in)

    def is_expires(self):
        """Return True when no token is set or its expiry time has passed."""
        return not self.access_token or time.time() > self.expires

    def public_timeline(self, count=50, page=1, base_app=0):
        """Fetch the latest public statuses.

        Args:
            count: Number of records per page (default 50).
            page: Page number to fetch (default 1).
            base_app: 1 to restrict results to the current application,
                0 for all data (default 0).

        Returns:
            The decoded JSON response of ``statuses/public_timeline.json``.
        """
        return _http_get(self.api_url + 'statuses/public_timeline.json',
                         access_token=self.access_token,
                         count=count,
                         page=page,
                         base_app=base_app,
                         )
--------------------------------------------------------------------------------
/WeiboApi/weiboAPI.py:
--------------------------------------------------------------------------------
1 | import webbrowser
2 | from WeiboApi import sinaweibopy3
3 |
4 |
def main():
    """Run the interactive OAuth2 flow and print a few sample API responses.

    Opens the authorization page in a browser, reads the ``code`` value from
    standard input, exchanges it for an access token and then queries the
    public timeline and the account uid.
    """
    import os  # local import: only this entry point needs it

    # SECURITY: an app key and secret are committed in this file. Prefer
    # supplying them through WEIBO_APP_KEY / WEIBO_APP_SECRET; the literals
    # remain only as a fallback so existing usage keeps working.
    APP_KEY = os.environ.get('WEIBO_APP_KEY', '2047157240')
    APP_SECRET = os.environ.get('WEIBO_APP_SECRET', '57a21639366669c3f92c9c1672081f10')
    REDIRECT_URL = 'http://api.weibo.com/oauth2/default.html'
    try:
        client = sinaweibopy3.APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=REDIRECT_URL)
        url = client.get_authorize_url()
        webbrowser.open_new(url)
        result = client.request_access_token(
            input("please input code : "))
        print(result)
        client.set_access_token(result.access_token, result.expires_in)
        print(client.public_timeline())
        print(client.get.statuses__public_timeline())
        print(client.get.account__get_uid())

    except ValueError:
        print('pyOauth2Error')


if __name__ == '__main__':
    main()
27 |
--------------------------------------------------------------------------------
/modelTraining.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3.7
2 | # -*- coding: utf-8 -*-
3 | # @Time : 2019/12/5 下午 05:09
4 | # @Email : pasalai@qq.com
5 | # @Github : github.com/laishouchao
6 | # @File : modelTraining.py
7 | # @Software: PyCharm
8 |
9 | import re
10 | # from snownlp import sentiment
11 | # import numpy as np
12 | import pymysql
13 | from snownlp import SnowNLP
14 | # import matplotlib.pyplot as plt
15 | from snownlp import sentiment
16 | # from snownlp.sentiment import Sentiment
17 |
# Connect to the database server.
# NOTE(review): use_unicode=False presumably makes text columns come back as
# bytes, which is why train_model decodes each comment - confirm.
conn = pymysql.connect(host='localhost', user='root', password='root', charset="utf8", use_unicode=False)
with conn:
    cur = conn.cursor()
    cur.execute("SELECT * FROM test.weibo WHERE weiboId < '%d'" % 6000000)
    rows = cur.fetchall()

# Keep only column index 18 of every row.
# NOTE(review): presumably the comment text - verify against the table schema.
comment = [list(row)[18] for row in rows]
27 |
28 |
def train_model(texts):
    """Sort comments into positive and negative corpora by SnowNLP score.

    Each comment is scored after reply prefixes, @mentions and ``[tag]``
    markers are replaced by a comma. Comments scoring above 0.8 are appended
    (unmodified) to ``./outPut/pos.txt`` and those scoring below 0.3 to
    ``./outPut/neg.txt``; anything in between is ignored.

    Args:
        texts: Iterable of comments as UTF-8 ``bytes`` or ``str``. ``None``
            entries are skipped.
    """
    # Compiled once: optional "回复" / "//" prefix + @name + optional colon,
    # or a bracketed tag such as "[xxx]".
    noise = re.compile(r'(?:回复)?(?://)?@[\w\u2E80-\u9FFF]+:?|\[\w+\]')
    # Open both corpora once instead of reopening a file for every comment.
    with open('./outPut/pos.txt', mode='a', encoding='utf-8') as pos_file, \
            open('./outPut/neg.txt', mode='a', encoding='utf-8') as neg_file:
        for li in texts:
            if li is None:
                continue
            comm = li.decode('utf-8') if isinstance(li, bytes) else li
            text = noise.sub(',', comm)
            # Evaluate the score once rather than once per comparison.
            score = SnowNLP(text).sentiments
            if score > 0.8:
                pos_file.write(comm + "\n")
            elif score < 0.3:
                neg_file.write(comm + "\n")
42 |
43 |
44 | train_model(comment)
45 | sentiment.train('./outPut/neg.txt', './outPut/pos.txt')
46 | sentiment.save('./outPut/sentiment.marshal')
--------------------------------------------------------------------------------
/outPut/final.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |