├── .gitignore
├── .idea
│   ├── SinaWeiboSpider.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── __init__.py
├── ml
│   ├── __init__.py
│   ├── fenci.py
│   ├── get_machine_info.py
│   ├── get_user_info.py
│   ├── keyword.txt
│   ├── mingan_strip_equal.txt
│   ├── mingan_word.txt
│   ├── names.csv
│   ├── os_path.py
│   ├── random_forest.py
│   ├── realtime_random_weibo.py
│   ├── realtime_random_weibo_2.py
│   ├── realtime_user_fans_follower.py
│   ├── realtime_user_info.py
│   ├── realtime_user_relationship.py
│   ├── stopwords_cn.txt
│   ├── svm_dict.py
│   ├── svm_lsi.py
│   ├── svm_module.py
│   ├── svm_result.py
│   ├── svm_tfidf.py
│   ├── svm_utils.py
│   └── values.csv
├── test.py
└── weibospider
    ├── __init__.py
    ├── attitude.py
    ├── base.py
    ├── client.py
    ├── comment.py
    ├── people.py
    ├── repost.py
    ├── utils.py
    └── weibo.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
--------------------------------------------------------------------------------
/.idea/SinaWeiboSpider.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 SuperSaiyanSSS
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SinaWeiboSpider
2 | A fairly complete Sina Weibo crawler, under continuous improvement; stars are welcome~
3 | 
4 | ## Installation
5 | ## pip install weibospider
6 | 
7 | ## Features
8 | 
9 | + Modeled on 77's zhihu-oauth: you supply a Weibo id, it is used to build an object of the matching class, and the data you need can be read from that object
10 | + Cookies have to be pasted in manually (see the notes below). There is still no good way to log in to the WAP version of Weibo automatically: the encryption scheme is unknown and the captcha is hostile to humans.
11 | + Makes heavy use of decorators and generators, so the code also works as learning material
12 | 
13 | ## Quick example
14 | 
15 | ```
16 | from weibospider import WeiboClient
17 | 
18 | cookies = 'xxxxxxxxxxxx' # obtained from the Network tab of the browser dev tools (F12) after logging in at weibo.cn
19 | 
20 | myclient = WeiboClient(cookies)
21 | 
22 | people_1 = myclient.people('1884866222') # uid of the target user
23 | 
24 | print(people_1.name) # print people_1's screen name
25 | 
26 | print(people_1.weibo_count) # print how many weibo people_1 has posted
27 | 
28 | for index, weibo in zip(range(10), people_1.weibo):
29 | 
30 |     print(weibo.text) # print people_1's 10 most recent weibo
31 | 
32 |     for index_2, comment in zip(range(5), weibo.comment):
33 | 
34 |         print(comment.text) # print the 5 most recent comments on this weibo
35 | 
36 |         print(comment.author_name) # print the author of this comment
37 | 
38 | 
39 | ```
40 | 
41 | Questions are welcome by email or QQ.
42 | (Update 2024-02-12: some of the endpoints no longer work.)
43 |
--------------------------------------------------------------------------------
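The README above asks for a cookie string pasted from the browser, and `ml/get_machine_info.py` later in this dump shows the pattern the project uses to turn that raw `Cookie` header into a cookie jar for `requests`. A minimal standalone sketch of the same idea (the cookie value is a placeholder, not a working session):

```
import requests

raw_cookie = '_T_WM=xxx; SUB=yyy; SSOLoginState=zzz'  # placeholder; paste your own value from weibo.cn dev tools

# split the header into key/value pairs and build a cookie jar requests can use
cookie_dict = dict(pair.split('=', 1) for pair in raw_cookie.split('; '))
cookie_jar = requests.utils.cookiejar_from_dict(cookie_dict)

print(cookie_jar)
```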
/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .weibospider import (Attitude, SinaBaseObject, Comment, People,
3 | Repost, Weibo, WeiboClient)
4 |
5 | __all__ = [
6 |     'WeiboClient', 'Attitude', 'SinaBaseObject', 'Comment', 'People', 'Repost', 'Weibo'
7 | ]
--------------------------------------------------------------------------------
/ml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperSaiyanSSS/SinaWeiboSpider/b034c20ccf062b1323046584712716b2794ec7ec/ml/__init__.py
--------------------------------------------------------------------------------
/ml/fenci.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import print_function
3 | import sys
4 | sys.path.append("..")
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
7 | import jieba
8 | # from a1 import sina_people
9 | # from a1 import sina_people
10 | # from a1 import sina_weibo
11 | # from a1 import base
12 | # from a1 import test1
13 | from a1 import sina_store  # the __main__ block below needs this (a1 is the external helper package used across ml/)
14 | from bs4 import BeautifulSoup
15 | import requests
16 | import pymongo
17 | import re
18 |
19 | sys.path.append('../')
20 |
21 | import jieba
22 | import jieba.analyse
23 | from optparse import OptionParser
24 |
25 |
26 |
27 | def clean_keyword():
28 | """
29 |     Normalize the sensitive-word list into a standard format
30 | :return:
31 | """
32 | word_list = []
33 | with open('mingan_strip_equal.txt', 'r') as f:
34 | for i in f.readlines():
35 | if i != '':
36 | word_list.append(i.strip().strip('\n'))
37 | # with open('xie.txt', 'r') as f:
38 | # for i in f.readlines():
39 | # if i.split('=')[0].strip().strip('\n'):
40 | # word_list.append(i.split('=')[0].strip().strip('\n'))
41 | #
42 | # with open('guang.txt', 'r') as f:
43 | # for i in f.readlines():
44 | # if i.split('=')[0].strip().strip('\n'):
45 | # word_list.append(i.split('=')[0].strip().strip('\n'))
46 | #
47 | # with open('huang.txt', 'r') as f:
48 | # for i in f.readlines():
49 | # if i.split('=')[0].strip().strip('\n'):
50 | # word_list.append(i.split('=')[0].strip().strip('\n'))
51 | #
52 | # with open('mingan.txt', 'r') as f:
53 | # for i in f.readlines():
54 | # if i.split('=')[0].strip().strip('\n'):
55 | # word_list.append(i.split('=')[0].strip().strip('\n'))
56 | #
57 | # with open('mingan_9.txt','a') as f:
58 | # print(word_list)
59 | # for i in word_list:
60 | # if i:
61 | # b = repr(i)
62 | # try:
63 | # print(unicode(eval(b), "gbk"))
64 | # except:
65 | # continue
66 | # f.write(str(unicode(eval(b), "gbk"))+' '+'300'+'\n')
67 |
68 |
69 | def remove_equal():
70 | """
71 |     Remove the equals signs from the sensitive-word list collected online
72 | :return:
73 | """
74 | count = 0
75 | target_list = []
76 | with open('mingan_word.txt', 'r') as f:
77 | word_list = f.readlines()
78 | print(len(word_list))
79 | for i in word_list:
80 | count += 1
81 | print(count)
82 | target_list.append(i.split(' ')[0])
83 | with open('mingan_strip_equal.txt', 'w') as f:
84 | for i in target_list:
85 | f.write(i+'\n')
86 |
87 |
88 | class TestKeyword(object):
89 | """
90 |     Tokenize the given weibo text and check whether it contains any sensitive words
91 | """
92 | def __init__(self):
93 | jieba.load_userdict("keyword.txt")
94 | jieba.load_userdict("mingan_word.txt")
95 | self.topK = 12
96 | self.mingan_list = []
97 | self.get_mingan_list()
98 |
99 | def get_mingan_list(self):
100 | with open('mingan_strip_equal.txt', 'r') as f:
101 | word_list = f.readlines()
102 | for word in word_list:
103 | self.mingan_list.append(word.strip('\n'))
104 |
105 | def test_if_has_keyword(self, weibo_text):
106 | content = weibo_text
107 | tags = jieba.analyse.extract_tags(content, topK=self.topK)
108 |
109 |         for tag in tags:
110 |             if tag in self.mingan_list:
111 |                 print("6666666")
112 |                 print(content)
113 |                 print(tag)
114 |                 return True
115 |         # only return False after every extracted tag has been checked
116 |         print("no")
117 |         return False
118 |
119 | if __name__ == '__main__':
120 | sys.setdefaultencoding('utf-8')
121 | s = sina_store.SinaStore()
122 | s.weibo_table = s.db['realtime_weibo']
123 | weibo_iter = s.get_stored_information()
124 | print(weibo_iter)
125 |
126 | count = 0
127 | while count < 400:
128 | weibo = next(weibo_iter)
129 | weibo_txt = weibo['text']
130 | print(weibo_txt)
131 | jieba.load_userdict("keyword.txt")
132 | jieba.load_userdict("mingan_word.txt")
133 | file_name = 'mm.txt'
134 |
135 | topK = 12
136 |
137 | content = weibo_txt
138 | tags = jieba.analyse.extract_tags(content, topK=topK)
139 |
140 | # print(",".join(tags))
141 |
142 | mingan_list = []
143 | with open('mingan_strip_equal.txt', 'r') as f:
144 | word_list = f.readlines()
145 | print(len(word_list))
146 | for i in word_list:
147 | mingan_list.append(i.strip('\n'))
148 | for i in tags:
149 | if i in mingan_list:
150 | print("6666666")
151 | print(content)
152 |
153 | count += 1
154 |
155 | # seg_list = jieba.cut(content)
156 | # print(", ".join(seg_list))
157 |
158 |
159 | # with open('mm.txt','r') as f:
160 | # s = "".join(f.readlines())
161 | # seg_list = jieba.cut(s, cut_all=True)
162 | #
163 | # print("Full Mode:", "/ ".join(seg_list))
164 | # seg_list = jieba.cut(s, cut_all=False)
165 | # print("Default Mode:", "/ ".join(seg_list))
166 | # seg_list = jieba.cut(s)
167 | # print(", ".join(seg_list))
168 | # seg_list = jieba.cut_for_search(s)
169 | # print(", ".join(seg_list))
170 |
171 |
--------------------------------------------------------------------------------
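A small usage sketch for the `TestKeyword` class above, assuming `keyword.txt`, `mingan_word.txt` and `mingan_strip_equal.txt` sit in the working directory (which is what the constructor expects), that the module runs under Python 2 as its `reload(sys)` header requires, and that its imported dependencies are installed:

```
from fenci import TestKeyword

checker = TestKeyword()                     # loads the jieba user dictionaries and the sensitive-word list
text = u'...some weibo text...'             # any unicode weibo text
print(checker.test_if_has_keyword(text))    # True if any extracted tag is in the sensitive-word list
```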
/ml/get_machine_info.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals, print_function
2 | import sys
3 | sys.path.append("..")
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | from a1 import sina_people
7 | from a1 import sina_people
8 | from a1 import sina_weibo
9 | from a1 import base
10 | from a1 import test1
11 | from a1 import sina_store
12 | import time as tt
13 | from bs4 import BeautifulSoup
14 | import requests
15 | import pymongo
16 | import re
17 |
18 | headers_2 = {
19 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
20 | 'Accept-Encoding':'gzip, deflate, sdch',
21 | 'Accept-Language':'zh-CN,zh;q=0.8',
22 | 'Cache-Control':'max-age=0',
23 | 'Connection':'keep-alive',
24 | 'Cookie': '_T_WM=0ff248d78f4984aa135c5b2e53c11079; ALF=1496373314; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgM7IlYI27IV6TA5-eb6avSBhK-q5migy9jGYZkeqPPpU.; SUB=_2A250DTviDeThGeBP4lQW-CbFyj6IHXVXDkWqrDV6PUJbktBeLWLAkW1fCr2k7XOfWxI9AQSa5M6kQfvxPg..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWC9U1RTKpYdAAz2GZeMbFX5JpX5o2p5NHD95QceK.cS0nR1K2EWs4DqcjSH.ieC0-R-.R7HK.R1Btt; SUHB=04W-CMkuo5eJq_; SSOLoginState=1493781426',
25 | 'Host':'weibo.cn',
26 | 'Upgrade-Insecure-Requests':'1',
27 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
28 | }
29 |
30 |
31 | def get_machine_personal_info():
32 | s = sina_store.SinaStore()
33 | s.weibo_table = s.db['machine_personal_info']
34 | with open('machine_uid.txt','r') as f:
35 | for i in f.readlines():
36 | if i!='':
37 | print(i)
38 | pe = sina_people.SinaPeople(i)
39 | s.store_in_mongodb(pe)
40 |
41 |
42 |
43 |
44 | if __name__ == '__main__':
45 | dic_c = {}
46 | str_c = headers_2['Cookie']
47 | for i in str_c.split('; '):
48 | dic_c[i.split('=')[0]] = i.split('=')[1]
49 | cookies2 = requests.utils.cookiejar_from_dict(dic_c)
50 | base.SinaBaseObject.cookies = cookies2
51 | if 1:
52 | dic_c = {}
53 | str_c = headers_2['Cookie']
54 | for i in str_c.split('; '):
55 | dic_c[i.split('=')[0]] = i.split('=')[1]
56 | cookies2 = requests.utils.cookiejar_from_dict(dic_c)
57 | base.SinaBaseObject.cookies = cookies2
58 |
59 | # for i in range(0,21):
60 | # if
61 | print(cookies2)
62 | with open('machine_uid.txt','r') as f:
63 | uid = f.readlines()
64 | print(len(uid))
65 | # get_machine_personal_info()
66 |
67 |
68 | # a = requests.get('https://weibo.cn/2318253071/fans?page=1', cookies=cookies2)
69 | # a = BeautifulSoup(a.content, "lxml")
70 | # unit = a.findAll('div', attrs={'class': 'c'})[1]
71 | # print(unit)
72 | # unit_list = unit.findAll('table')
73 | # print(unit_list)
74 | # uid_list = []
75 | #
76 | # for i in unit_list:
77 | # print(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1])
78 | # uid_list.append(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1])
79 | #
80 | # for j in range(2,5):
81 | # tt.sleep(4)
82 | # a = requests.get('https://weibo.cn/2318253071/fans?page='+str(j), cookies=cookies2)
83 | # a = BeautifulSoup(a.content, "lxml")
84 | # unit = a.findAll('div', attrs={'class': 'c'})[1]
85 | # unit_list = unit.findAll('table')
86 | # for i in unit_list:
87 | # print(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1])
88 | # uid_list.append(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1])
89 | #
90 | # with open('machine_uid.txt','a') as f:
91 | # for i in uid_list:
92 | # f.write(i+'\n')
93 | # get_human_personal_info()
94 | # a = requests.get('http://weibo.cn/u/5195713909')
95 | # print(a.content)
96 | # pe = sina_people.SinaPeople('6021561452')
97 | # pe = sina_weibo.SinaWeibo('F16aup9Im')
98 | # we = sina_weibo.SinaWeibo('F15Kpbev2')
99 | # for name, value in vars(we).items():
100 | # print(name, value)
101 | # c_set = set()
102 | # s = sina_store.SinaStore()
103 | # s.weibo_table = s.db['try2']
104 | # rmrb = s.get_human_info()
105 | # comment_list = rmrb['comment_list']
106 | # for name, value in comment_list.items():
107 | # author_uid = value['author_uid']
108 | # c_set.add(str(author_uid))
109 |
110 | # with open('human_uid.txt','a') as f:
111 | # for i in c_set:
112 | # f.write(i+'\n')
113 |
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
/ml/get_user_info.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import math
5 | import sys
6 | sys.path.append("..")
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 | import datetime
10 | import difflib
11 | from a1 import sina_people
12 | from a1 import sina_weibo
13 | from a1 import base
14 | from a1 import test1
15 | from a1 import sina_store
16 | from bs4 import BeautifulSoup
17 | import requests
18 | import time as tt
19 | import pymongo
20 | import re
21 |
22 |
23 | headers_for_baidu = {
24 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
25 | 'Accept-Encoding':'gzip, deflate, sdch',
26 | 'Accept-Language':'zh-CN,zh;q=0.8',
27 | 'Cache-Control':'max-age=0',
28 | 'Connection':'keep-alive',
29 | 'Host':'www.baidu.com',
30 | 'Upgrade-Insecure-Requests':'1',
31 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
32 | }
33 |
34 |
35 | # def get_human_personal_info():
36 | # s = sina_store.SinaStore()
37 | # s.weibo_table = s.db['human_personal_info']
38 | # with open('human_uid.txt','r') as f:
39 | # for i in f.readlines():
40 | # if i!='':
41 | # print(i)
42 | # pe = sina_people.SinaPeople(i)
43 | # s.store_in_mongodb(pe)
44 |
45 |
46 | # Convert a raw weibo time string into a datetime object
47 | def clean_time(now_time):
48 | if now_time.startswith('今'):
49 | now_time = datetime.datetime(2017, 5, 3)
50 | elif "分钟" in now_time:
51 | now_time = datetime.datetime(2017, 5, 3)
52 | elif "月" in now_time:
53 | month = int(now_time.split("月")[0][-2:])
54 | day = int(now_time.split("日")[0][-2:])
55 | now_time = datetime.datetime(2017, month, day)
56 | else:
57 | year = int(now_time.split('-')[0])
58 | month = int(now_time.split('-')[1])
59 | day = int(now_time.split('-')[2][:3])
60 | now_time = datetime.datetime(year, month, day)
61 | return now_time
62 |
63 |
64 | # Final, corrected conditional-entropy formula
65 | # TODO: the formula given in the paper is flawed; the conditional entropy it yields for a finite discrete sequence is not guaranteed to be non-negative
66 | def final_calculation_formula(space_list):
67 | lence = len(space_list)
68 | if lence < 2:
69 |         raise IndexError("at least two time intervals are required")
70 | entropy_list = []
71 | scale_list = []
72 | is_first = True
73 | end_seq = 2
74 | while end_seq < lence:
75 | local_lence = end_seq
76 | entropy = calculation_formula(space_list[:end_seq], local_lence)
77 | entropy_list.append(entropy)
78 | if is_first:
79 | is_first = False
80 | else:
81 | scale_list.append(calculate_perc_xm(space_list[:end_seq], local_lence))
82 | end_seq += 1
83 | result_list = []
84 | for i in range(len(entropy_list)-1):
85 | result_list.append(entropy_list[i+1]-entropy_list[i]+scale_list[i]*entropy_list[0])
86 | print(result_list)
87 | print(scale_list)
88 | return min(result_list)
89 |
90 |
91 | # perc(Xm): the proportion of interval values in a sequence of length m that occur only once
92 | def calculate_perc_xm(space_list, lence):
93 |     only_count = 0
94 |     print(space_list)
95 |     # count the interval values that appear exactly once in the sequence
96 |     for value in space_list:
97 |         if space_list.count(value) == 1:
98 |             only_count += 1
99 |     scale = only_count/lence
100 |     return scale
101 | 
102 | 
103 |
104 |
105 | # Shannon entropy of the interval sequence: H = -sum(p_i * ln(p_i))
106 | def calculation_formula(space_list, lence):
107 | probability_list = []
108 | while space_list:
109 | item_count = space_list.count(space_list[0])
110 |         # true division is in effect (from __future__ import division), so this does not floor
111 | probability_list.append(item_count/lence)
112 | space_list = filter(lambda x: x != space_list[0], space_list)
113 | entropy = 0
114 | for p in probability_list:
115 | entropy += p*math.log(p)
116 | entropy = -entropy
117 | return entropy
118 |
119 |
120 | # Entropy of a user's posting-interval sequence
121 | def get_entropy_of_information(person_dict):
122 | space_list = []
123 | weibo_list_lence = len(person_dict['weibo_list'])
124 | print(weibo_list_lence)
125 |     # some users have too few original weibo; drop those samples
126 | try:
127 | last_time = str(person_dict['weibo_list']['1']['time'])
128 | last_time = clean_time(last_time)
129 | except KeyError:
130 | return -1
131 | for i in range(weibo_list_lence-2):
132 | now_time = str(person_dict['weibo_list'][str(i+2)]['time'])
133 | now_time = clean_time(now_time)
134 | space_list.append((last_time - now_time).days)
135 | print((last_time-now_time).days)
136 | last_time = now_time
137 |
138 | entropy = calculation_formula(space_list, len(space_list))
139 | print(entropy)
140 | return entropy
141 |
142 | # while 1:
143 | # try:
144 | # print(next(a))
145 | # except StopIteration:
146 | # break
147 |
148 |
149 | # User reputation: fans / (fans + follows)
150 | def get_reputation(person_dict):
151 | try:
152 | fans_count = int(person_dict['fans_count'])
153 | follow_count = int(person_dict['follow_count'])
154 | reputation = fans_count/(fans_count+follow_count)
155 | except ValueError:
156 | return -1
157 | return reputation
158 |
159 |
160 | # Number of distinct client platforms the user has posted from
161 | def get_num_of_platform(person_dict):
162 | platform_set = set()
163 | for item in person_dict['weibo_list']:
164 | platform_set.add(str(person_dict['weibo_list'][str(item)]['terminal_source']).strip())
165 | return len(platform_set)
166 |
167 |
168 | # Check whether there are too few original weibo (tiny samples are dominated by chance)
169 | def check_if_too_little(person_dict):
170 | repost_count = 0
171 | total_count = len(person_dict['weibo_list'])
172 | for item in person_dict['weibo_list']:
173 | if str(person_dict['weibo_list'][str(item)]['is_repost'])=='True':
174 | print(person_dict['weibo_list'][str(item)]['is_repost'])
175 | repost_count += 1
176 | continue
177 | if total_count-repost_count < 3:
178 | return -1
179 | return 0
180 |
181 |
182 | # Content similarity of the user's posted weibo
183 | def get_similarity_of_content(person_dict):
184 |     """
185 |     Use Baidu advanced search restricted to weibo.com to check whether duplicate weibo exist
186 |     :param person_dict: dict of the user's information
187 |     :return: content similarity
188 |     """
189 | identical_count = 0
190 | total_count = 0
191 | repost_count = 0
192 | if check_if_too_little(person_dict) == -1:
193 | return -1
194 | for item in person_dict['weibo_list']:
195 | if str(person_dict['weibo_list'][str(item)]['is_repost'])=='True':
196 | print(person_dict['weibo_list'][str(item)]['is_repost'])
197 | repost_count += 1
198 | continue
199 | else:
200 | total_count += 1
201 |
202 | if total_count >= 15:
203 | break
204 | copy_test_1 = False
205 | copy_test_2 = False
206 |
207 | text = str(person_dict['weibo_list'][str(item)]['text'])
208 | print(person_dict['weibo_list'][str(item)]['href'])
209 | q1 = text
210 | print(q1)
211 |
212 |         re_emotion = re.compile('(\[.*?\])')  # strip weibo emoticon markers like [xxx]
213 | q1 = re_emotion.sub('', q1)
214 |         q1_list = re.split('!|!|,|。|……|:|、|,|,|;|;|——', unicode(q1))  # split on punctuation marks
215 |
216 |         # pick the longest and second-longest substrings
217 | max_len = 'x'
218 | for string_seq in q1_list:
219 | if len(string_seq)> len(max_len):
220 | max_len = string_seq
221 | print(max_len)
222 | second_len = 'x'
223 | for string_seq in q1_list:
224 | if len(string_seq) > len(second_len) and string_seq != max_len:
225 | second_len = string_seq
226 | q1 = max_len
227 | q1_2 = second_len
228 |
229 | url = 'https://www.baidu.com/s?q1='+q1+'&q2=&q3=&q4=&rn=10&lm=0&ct=0&ft=&q5=&q6=weibo.com&tn=baiduadv'
230 | url_2 = 'https://www.baidu.com/s?q1='+q1_2+'&q2=&q3=&q4=&rn=10&lm=0&ct=0&ft=&q5=&q6=weibo.com&tn=baiduadv'
231 | # baidu_requests = requests.get(url, headers=headers_for_baidu, timeout=3)
232 | baidu_requests = base.SinaBaseObject.retry_requests_static(url, headers=headers_for_baidu, timeout=3)
233 | copy_test_1 = __parse_baidu_page__(baidu_requests, q1)
234 |
235 | if len(q1_2) > 5:
236 | baidu_requests = base.SinaBaseObject.retry_requests_static(url_2, headers=headers_for_baidu, timeout=3)
237 | # baidu_requests = requests.get(url_2, headers=headers_for_baidu, timeout=3)
238 | copy_test_2 = __parse_baidu_page__(baidu_requests, q1_2)
239 |
240 | if copy_test_1 or copy_test_2:
241 | identical_count += 1
242 | print(copy_test_1, copy_test_2)
243 | print("确实是抄袭的")
244 | else:
245 | print("是原创的")
246 | tt.sleep(3)
247 |
248 |     # if the data is incomplete (few weibo captured) or almost everything is a repost, discard the sample
249 | if total_count < 3 or repost_count > 28:
250 | return -1
251 |
252 | similarity = identical_count/total_count
253 | print("内容相似度为"+str(similarity))
254 | return similarity
255 |
256 |
257 | # Parsing logic for the Baidu search result page
258 | def __parse_baidu_page__(baidu_requests, q1):
259 | """
260 | @ author: wxw
261 | @ time: 2017/5/4
262 | 提取搜索到的高亮字符串 并与要查找的进行对比
263 | 若满足一定的相似度要求 则认为重复
264 | :param baidu_requests: requests抓取百度搜索所得页面源代码
265 | :param q1: 要查找的字符串
266 | :return: 是否重复
267 | """
268 | baidu_bs4 = BeautifulSoup(baidu_requests.content, "lxml")
269 | highlight_list = baidu_bs4.find_all('div', attrs={'class': 'c-abstract'})
270 | ok_count = 0
271 | for unit in highlight_list:
272 | try:
273 | highlight_word = unit.em.get_text()
274 | print(highlight_word)
275 | if str(q1).strip() == str(highlight_word).strip():
276 | ok_count += 1
277 | print("已发现")
278 |             # if the highlighted text differs from the query only slightly (a few characters), also count it as found
279 | elif difflib.SequenceMatcher(None, str(q1), str(highlight_word)).ratio() > 0.88:
280 | print(difflib.SequenceMatcher(None, str(q1), str(highlight_word)).ratio())
281 | ok_count += 1
282 | print("认为已找到")
283 | else:
284 | print("no")
285 | except AttributeError:
286 | print("这是空的")
287 |
288 | if ok_count > 1:
289 | return True
290 | else:
291 | return False
292 |
293 |
294 | def __store_human_feature_vector__(feature_vector):
295 | s = sina_store.SinaStore()
296 | s.weibo_table = s.db['human_vector_info']
297 | iter = s.get_stored_information()
298 | flag = 0
299 | while True:
300 | try:
301 | person_dict = next(iter)
302 | if str(person_dict['uid']) == str(feature_vector['uid']):
303 | flag = 1
304 | break
305 | except StopIteration:
306 | flag = 0
307 | break
308 |
309 | if flag == 0:
310 | s.store_in_mongodb(feature_vector)
311 |
312 |
313 | def store_human_feature_vector(sina_store_object):
314 | sina_store_object.weibo_table = sina_store_object.db['human_personal_info']
315 |     # generator over the stored user documents
316 | iter = sina_store_object.get_stored_information()
317 | item_count = 0
318 | while True:
319 | try:
320 | feature_vector = {}
321 | person_dict = next(iter)
322 | entropy = get_entropy_of_information(person_dict)
323 | similarity = get_similarity_of_content(person_dict)
324 | platform = get_num_of_platform(person_dict)
325 | reputation = get_reputation(person_dict)
326 | if reputation == -1:
327 | print("该数据为残缺数据!舍去")
328 | print("现在抽取到第" + str(item_count) + "个用户!!")
329 | item_count += 1
330 | continue
331 | feature_vector['entropy'] = entropy
332 | feature_vector['similarity'] = similarity
333 | if similarity == -1 or entropy == -1:
334 | print("该数据为残缺数据!舍去")
335 | print("现在抽取到第" + str(item_count) + "个用户!!")
336 | item_count += 1
337 | continue
338 | feature_vector['uid'] = str(person_dict['uid'])
339 | feature_vector['platform'] = platform
340 | feature_vector['reputation'] = reputation
341 | feature_vector['human_or_machine'] = 1
342 | item_count += 1
343 | print("现在抽取到第"+str(item_count)+"个用户!!")
344 | __store_human_feature_vector__(feature_vector)
345 | except StopIteration:
346 | print("人类用户已提取特征向量完毕!")
347 | break
348 |
349 |
350 | def __store_machine_feature_vector__(feature_vector):
351 | s = sina_store.SinaStore()
352 | s.weibo_table = s.db['machine_vector_info']
353 | iter = s.get_stored_information()
354 | flag = 0
355 | while True:
356 | try:
357 | person_dict = next(iter)
358 | if str(person_dict['uid']) == str(feature_vector['uid']):
359 | flag = 1
360 | break
361 | except StopIteration:
362 | flag = 0
363 | break
364 |
365 | if flag == 0:
366 | s.store_in_mongodb(feature_vector)
367 |
368 |
369 | def store_machine_feature_vector(sina_store_object):
370 | sina_store_object.weibo_table = sina_store_object.db['machine_personal_info']
371 |     # generator over the stored user documents
372 | iter = sina_store_object.get_stored_information()
373 | item_count = 0
374 | while True:
375 | try:
376 | feature_vector = {}
377 | person_dict = next(iter)
378 | entropy = get_entropy_of_information(person_dict)
379 | similarity = get_similarity_of_content(person_dict)
380 | platform = get_num_of_platform(person_dict)
381 | reputation = get_reputation(person_dict)
382 | if reputation == -1:
383 | print("该数据为残缺数据!舍去")
384 | print("现在抽取到第" + str(item_count) + "个用户!!")
385 | item_count += 1
386 | continue
387 | feature_vector['entropy'] = entropy
388 | feature_vector['similarity'] = similarity
389 | if similarity == -1 or entropy == -1:
390 | print("该数据为残缺数据!舍去")
391 | print("现在抽取到第" + str(item_count) + "个用户!!")
392 | item_count += 1
393 | continue
394 | feature_vector['uid'] = str(person_dict['uid'])
395 | feature_vector['platform'] = platform
396 | feature_vector['reputation'] = reputation
397 | feature_vector['human_or_machine'] = 0
398 | item_count += 1
399 | print("现在抽取到第"+str(item_count)+"个用户!!")
400 | __store_machine_feature_vector__(feature_vector)
401 | except StopIteration:
402 | print("机器用户已提取特征向量完毕!")
403 | break
404 |
405 |
406 | if __name__ == '__main__':
407 | """
408 | 从mongodb中获取human和machine的信息,
409 | 并计算其信息熵、相似度、信誉度等特征
410 | 并将特征存入mongodb
411 |
412 | 示例:
413 | s = sina_store.SinaStore()
414 | store_human_feature_vector(s)
415 | store_machine_feature_vector(s)
416 | """
417 |
418 |
419 |
420 |
421 |
422 |
--------------------------------------------------------------------------------
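`calculation_formula` above is plain Shannon entropy over the distribution of posting intervals, H = -sum(p_i * ln p_i), where p_i is the relative frequency of each distinct interval. A self-contained sketch of the same computation on made-up interval lists (values in days): a perfectly regular poster gets zero entropy, a mixed one gets more.

```
import math

def interval_entropy(space_list):
    n = float(len(space_list))
    # relative frequency of each distinct interval value
    probabilities = [space_list.count(v) / n for v in set(space_list)]
    return -sum(p * math.log(p) for p in probabilities)

print(interval_entropy([2, 2, 2, 2]))  # 0.0   : perfectly regular posting
print(interval_entropy([1, 1, 2, 3]))  # ~1.04 : mixed intervals, higher entropy
```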
/ml/keyword.txt:
--------------------------------------------------------------------------------
1 | 减肥 100
2 | 私聊 300
3 | 变瘦 20
4 | 得意 1
5 | V信 300
6 | V xin 300
7 | 卫星号 1000
8 | 卫星号, 1000
9 | 祖传 200
10 | 秘制 200
11 | 强力 100
12 | 特效 100
13 | 全效 100
14 | 强效 100
15 | 奇效 100
16 | 高效 100
17 | 速效 100
18 | 神效 200
19 | 处方 200
20 | 复方 200
21 | 治疗 10
22 | 消炎 20
23 | 抗炎 20
24 | 活血 100
25 | 祛瘀 100
26 | 止咳 50
27 | 解毒 50
28 | 疗效 100
29 | 防治 20
30 | 防癌 50
31 | 增高 200
32 | 溶脂 200
33 | 吸脂 200
34 | 瘦身 200
35 | 瘦脸 200
36 | 瘦腿 200
37 | xin 100
38 | 防辐射 100
39 | 美容 100
40 | 养颜 100
41 | 避凶 200
42 | 辟邪 200
43 | 首选 50
44 | 保健级 200
45 | 安全无毒副作用 200
46 | 无效退款 200
47 | 保险公司承保 200
48 | 补五行 200
49 | 吸财 200
50 | 保平安 200
51 | 无斑 200
52 | 祛疤 200
53 | 去疤 200
54 | 国家级 20
55 | 世界级 20
56 | 最高级 20
57 | 全网销量第一 200
58 | 全球首发 100
59 | 全国首家 100
60 | 全网首发 200
61 | 代言 20
62 | 无副作用 100
63 | 零风险 200
64 | 稳赚 200
65 | 无效退款 200
66 | 祖传 200
67 | 无事故 200
68 | 无依赖 200
69 | 根治 200
70 | 日减 200
71 | 秘方 200
72 | 保过 200
73 | 填补国内空白 10
74 | 保收益 200
75 | 食品级别 200
76 | 精准检测 100
77 | 3-7天见效 100
78 | 1-3个疗程治愈 100
79 | 解除疼痛 100
80 | 最权威 40
81 | 药到病除 100
82 | 体内毒素 200
83 | 吸附铅汞 100
84 | 除湿 100
85 | 润燥 50
86 | 消除斑点 100
87 | 祛除雀斑 100
88 | 祛除黄褐斑 100
89 | 祛除蝴蝶斑 100
90 | 祛除妊娠斑 100
91 | 祛除斑立净 100
92 | 全民晒单 100
93 | 随时涨价 100
94 | 卖疯了 200
95 | 抢疯了 200
96 | 点击领奖 100
97 | 恭喜获奖 200
98 | 全民免单 100
99 | 点击有惊喜 200
100 | 点击获取 200
101 | 点击转身 200
102 | 点击试穿 200
103 | 点击翻转 200
104 | 领取奖品 100
105 | 秒杀 20
106 | 抢爆 200
107 | 再不抢就没 200
108 | 不会更便宜 200
109 | 错过就没机会 200
110 | 疯抢 100
111 | 抢购 40
112 | VX 100
113 | 免费送 100
114 | 泰國 20
115 | 佛牌 200
116 | 陰牌 200
117 | 情降 200
118 | 降頭 200
119 | 和合 200
120 | 鬼降 200
121 | 鎖心 200
122 | 巫術 200
123 |
--------------------------------------------------------------------------------
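`keyword.txt` above is in jieba's user-dictionary format, one entry per line as `word frequency`, which is why `fenci.py` can pass it straight to `jieba.load_userdict`. A sketch of loading it and extracting tags the way `TestKeyword` does (the sample text is made up):

```
# assumes keyword.txt from above is in the working directory
import jieba
import jieba.analyse

jieba.load_userdict('keyword.txt')                       # word + frequency weight per line
tags = jieba.analyse.extract_tags(u'点击领奖 恭喜获奖 全民免单', topK=12)
print('/'.join(tags))
```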
/ml/os_path.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | import os
4 |
5 |
6 | class LoadFolders(object):  # generator-based iterator over sub-folders
7 | def __init__(self, par_path):
8 | self.par_path = par_path
9 |
10 | def __iter__(self):
11 |
12 | for file in os.listdir(self.par_path):
13 | file_abspath = os.path.join(self.par_path, file)
14 | if os.path.isdir(file_abspath): # if file is a folder
15 | yield file_abspath
16 |
17 |
18 | class LoadFiles(object):
19 | def __init__(self, par_path):
20 | self.par_path = par_path
21 |
22 | def __iter__(self):
23 | folders = LoadFolders(self.par_path)
24 |         for folder in folders:              # first-level directories, one per category
25 | catg = folder.split(os.sep)[-1]
26 | for file in os.listdir(folder): # secondary directory
27 | file_path = os.path.join(folder, file)
28 | if os.path.isfile(file_path):
29 | this_file = open(file_path, 'rb')
30 | content = this_file.read()
31 | yield catg, content
32 | this_file.close()
--------------------------------------------------------------------------------
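A usage sketch for the two generator classes above, assuming a hypothetical corpus laid out as one folder per category with text files inside:

```
# hypothetical layout:
#   corpus/sports/a.txt
#   corpus/finance/b.txt
from os_path import LoadFiles

for category, content in LoadFiles('corpus'):
    print(category, len(content))   # folder name used as the label, raw bytes of each file
```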
/ml/random_forest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import sklearn
5 | import pandas as pd
6 | import json
7 | import math
8 | import csv
9 | import pymongo
10 | import sklearn
11 | import sys
12 | sys.path.append("..")
13 | from a1 import base
14 | from a1 import sina_store
15 | reload(sys)
16 | sys.setdefaultencoding('utf-8')
17 |
18 |
19 | class MachineLearning(base.SinaBaseObject):
20 | def __init__(self):
21 | self.is_First = True
22 | self.is_First_2 = True
23 | self.gbc = ''
24 | self.dtc = ''
25 | self.rfc = ''
26 |
27 | def set_feature_vector_dict(self, feature_vector_dict):
28 | self.clean_feture_vector_dict(feature_vector_dict, is_first=self.is_First)
29 | self.is_First = False
30 |
31 | def set_test_feature_vector_dict(self, feature_vector_dict):
32 | self.clean_test_feture_vector_dict(feature_vector_dict, is_first_2=self.is_First_2)
33 | self.is_First_2 = False
34 |
35 |     # Append the given feature-vector dict to a CSV file
36 | @staticmethod
37 | def clean_feture_vector_dict(feature_vector_dict, is_first=False):
38 | with open('names.csv', 'ab') as csvfile:
39 | fieldnames = ['uid', 'similarity', 'platform', 'reputation', 'entropy', 'human_or_machine']
40 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
41 | if is_first:
42 | writer.writeheader()
43 | writer.writerow(
44 | {'uid': feature_vector_dict['uid'],
45 | 'similarity': feature_vector_dict['similarity'],
46 | 'platform': feature_vector_dict['platform'],
47 | 'reputation': feature_vector_dict['reputation'],
48 | 'entropy': feature_vector_dict['entropy'],
49 | 'human_or_machine': feature_vector_dict['human_or_machine']
50 | }
51 | )
52 |
53 | @staticmethod
54 | def clean_test_feture_vector_dict(feature_vector_dict, is_first_2=False):
55 | with open('needs.csv', 'ab') as csvfile:
56 | fieldnames = ['uid', 'similarity', 'platform', 'reputation', 'entropy', 'human_or_machine']
57 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
58 |             if is_first_2:
59 | writer.writeheader()
60 | writer.writerow(
61 | {'uid': feature_vector_dict['uid'],
62 | 'similarity': feature_vector_dict['similarity'],
63 | 'platform': feature_vector_dict['platform'],
64 | 'reputation': feature_vector_dict['reputation'],
65 | 'entropy': feature_vector_dict['entropy'],
66 | 'human_or_machine': feature_vector_dict['human_or_machine']
67 | }
68 | )
69 |
70 |     # Train and evaluate a single decision tree, a random forest and gradient boosting
71 | def rand_forest_train(self):
72 |         # load the locally stored user feature vectors
73 | users = pd.read_csv('names.csv')
74 |         # use similarity, platform, reputation and entropy as the human-vs-machine features
75 | X = users[['similarity', 'platform', 'reputation', 'entropy']]
76 | y = users['human_or_machine']
77 |
78 |         # split the raw data; 25% is held out for testing
79 | from sklearn.cross_validation import train_test_split
80 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
81 |
82 |         # turn the feature dicts into numeric feature vectors
83 | from sklearn.feature_extraction import DictVectorizer
84 | vec = DictVectorizer(sparse=False)
85 | X_train = vec.fit_transform(X_train.to_dict(orient='record'))
86 | X_test = vec.transform(X_test.to_dict(orient='record'))
87 |
88 |         # train and predict with a single decision tree
89 | from sklearn.tree import DecisionTreeClassifier
90 | dtc = DecisionTreeClassifier()
91 | dtc.fit(X_train, y_train)
92 | dtc_y_pred = dtc.predict(X_test)
93 |
94 |         # train and predict with a random forest classifier
95 | from sklearn.ensemble import RandomForestClassifier
96 | rfc = RandomForestClassifier()
97 | rfc.fit(X_train, y_train)
98 | rfc_y_pred = rfc.predict(X_test)
99 |
100 |         # train and predict with a gradient boosting classifier
101 | from sklearn.ensemble import GradientBoostingClassifier
102 | gbc = GradientBoostingClassifier()
103 | gbc.fit(X_train, y_train)
104 | gbc_y_pred = gbc.predict(X_test)
105 |
106 | from sklearn.metrics import classification_report
107 |         # accuracy of the single decision tree on the test set, plus precision / recall / F1
108 | print("单一决策树的准确性为", dtc.score(X_test, y_test))
109 | print(classification_report(dtc_y_pred, y_test))
110 |
111 |         # accuracy of the random forest on the test set, plus precision / recall / F1
112 | print("随机森林分类器的准确性为", rfc.score(X_test, y_test))
113 | print(classification_report(rfc_y_pred, y_test))
114 |
115 |         # accuracy of gradient boosting on the test set, plus precision / recall / F1
116 | print("梯度提升决策树的准确性为", gbc.score(X_test, y_test))
117 | print(classification_report(gbc_y_pred, y_test))
118 |
119 |
120 | users = pd.read_csv('values.csv')
121 |
122 |         # classify the unlabeled users as human or machine
123 | X = users[['similarity', 'platform', 'reputation', 'entropy']]
124 | X = vec.transform(X.to_dict(orient='record'))
125 | print(rfc.predict(X))
126 |
127 | self.dtc = dtc
128 | self.rfc = rfc
129 | self.gbc = gbc
130 |
131 |
132 | def get_dict_from_weibo_table():
133 | ml = MachineLearning()
134 | sina_store_object = sina_store.SinaStore()
135 | sina_store_object.weibo_table = sina_store_object.db['human_vector_info']
136 | iter = sina_store_object.get_stored_information()
137 | while True:
138 | try:
139 | info_dict = next(iter)
140 | ml.set_feature_vector_dict(info_dict)
141 | except StopIteration:
142 | break
143 | sina_store_object.weibo_table = sina_store_object.db['machine_vector_info']
144 | iter = sina_store_object.get_stored_information()
145 | while True:
146 | try:
147 | info_dict = next(iter)
148 | ml.set_feature_vector_dict(info_dict)
149 | except StopIteration:
150 | break
151 | print("已结束 正在训练模型。。。")
152 | ml.rand_forest_train()
153 |
154 |
155 | def start_training():
156 | get_dict_from_weibo_table()
157 |
158 | if __name__ == "__main__":
159 | start_training()
160 |
--------------------------------------------------------------------------------
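`rand_forest_train` above targets the scikit-learn and pandas of 2017. On a current stack, `sklearn.cross_validation` has been removed in favour of `sklearn.model_selection`, and newer pandas deprecates the `orient='record'` abbreviation in favour of `orient='records'`, so the equivalent split would look roughly like this (a sketch under those assumptions, not a drop-in patch for the class):

```
import pandas as pd
from sklearn.model_selection import train_test_split   # replaces sklearn.cross_validation

users = pd.read_csv('names.csv')
X = users[['similarity', 'platform', 'reputation', 'entropy']]
y = users['human_or_machine']

# the four features are already numeric, so DictVectorizer is optional here;
# if it is kept, call to_dict(orient='records') rather than 'record'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
```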
/ml/realtime_random_weibo.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from __future__ import unicode_literals, print_function
3 | import weibo
4 | import time as tt
5 | import sys
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 | sys.path.append("..")
9 | from a1 import sina_store
10 | from a1 import sina_weibo
11 |
12 | APP_KEY = '3175988140'
13 | APP_SECRET = 'f445636b8fc0b7b5e75474c3ab8d320b'
14 | CALL_BACK = 'http://api.weibo.com/oauth2/default.html'
15 | ACCESS_TOKEN = '2.00xUU4VGKbHw9D47e3cfc2c8UhoSBB'
16 |
17 |
18 | class myAPIClient(weibo.APIClient):
19 | def __init__(self, app_key, app_secret, redirect_uri, access_token):
20 | weibo.APIClient.__init__(self, app_key, app_secret, redirect_uri, access_token)
21 |
22 | def request_access_token_info(self, access_token):
23 | r = weibo._http_post('%s%s' % (self.auth_url, 'get_token_info'), access_token=access_token)
24 | current = int(tt.time())
25 | expires = r.expire_in + current
26 | return weibo.JsonDict(expires_in=expires)
27 |
28 |
29 | def get_client(appkey, appsecret, callback, access_token):
30 | client = myAPIClient(appkey, appsecret, callback, access_token)
31 | r = client.request_access_token_info(access_token)
32 | expires_in = r.expires_in
33 | client.set_access_token(access_token, expires_in)
34 | return client
35 |
36 |
37 | def run(weiboList, client):
38 | statuses = client.statuses__public_timeline(count=2)['statuses']
39 | length = len(statuses)
40 | print('现在获得了'+str(length)+'条新微博')
41 |
42 | for i in range(0, length):
43 | created_at = statuses[i]['created_at']
44 | author_uid = statuses[i]['user']['id']
45 | id = statuses[i]['id']
46 | source = statuses[i]['source']
47 | province = statuses[i]['user']['province']
48 | city = statuses[i]['user']['city']
49 | followers_count = statuses[i]['user']['followers_count']
50 | friends_count = statuses[i]['user']['friends_count']
51 | statuses_count = statuses[i]['user']['statuses_count']
52 | url = statuses[i]['user']['url']
53 | geo = statuses[i]['geo']
54 | comments_count = statuses[i]['comments_count']
55 | reposts_count = statuses[i]['reposts_count']
56 | nickname = statuses[i]['user']['screen_name']
57 | desc = statuses[i]['user']['description']
58 | location = statuses[i]['user']['location']
59 | text = statuses[i]['text']
60 |
61 | weibo_dict = {
62 | 'created_at': created_at,
63 | 'author_uid': author_uid,
64 | 'id': id,
65 | 'author_name': nickname,
66 | 'source': source,
67 | 'text': text,
68 | 'province': province,
69 | 'location': location,
70 | 'description': desc,
71 | 'city': city,
72 | 'followers_count': followers_count,
73 | 'friends_count': friends_count,
74 | 'statuses_count': statuses_count,
75 | 'url': url,
76 | 'geo': geo,
77 | 'comments_count': comments_count,
78 | 'reposts_count': reposts_count
79 | }
80 | weiboList.append(weibo_dict)
81 | return weiboList
82 |
83 |
84 | def gain_random_weibolist(count_=-1):
85 | weibolist = []
86 | client = get_client(APP_KEY, APP_SECRET, CALL_BACK, ACCESS_TOKEN)
87 | while True:
88 | print('现在开始获取!')
89 | try:
90 | weibolist = run(weibolist, client)
91 | return weibolist
92 | except:
93 | tt.sleep(0.1)
94 |             # by default this loops indefinitely until a fetch succeeds;
95 |             # pass a different count_ to limit the number of retries
96 | if(count_!= 0):
97 | count_ = count_-1
98 | else:
99 | break
100 |
101 | # def clean_weibolist(weibolist):
102 | # for weibo in weibolist:
103 | # weibo_object = sina_weibo.SinaWeibo()
104 | #
105 | # def store_random_weibolist():
106 | # sina_store_object = sina_store.SinaStore()
107 | # sina_store_object.weibo_table = sina_store_object.db['random_weibo']
108 | # #sina_store_object.
109 |
110 | if __name__ == "__main__":
111 | weiboList = gain_random_weibolist(count_=3)
112 | print(weiboList[0]['id'])
113 | print(weiboList[0]['author_uid'])
114 | print(weiboList[0]['author_name'])
115 | print(weiboList[0]['source'])
116 | print(weiboList[0]['text'])
117 | print(weiboList[0]['created_at'])
118 | print(111111111111111111111)
119 | for name, value in weiboList[0].items():
120 | print(name, value)
--------------------------------------------------------------------------------
/ml/realtime_random_weibo_2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import json
5 | import pymongo
6 | import requests
7 | from bs4 import BeautifulSoup
8 | import sys
9 | sys.path.append("..")
10 | import random
11 | import fenci
12 | reload(sys)
13 | sys.setdefaultencoding('utf-8')
14 |
15 | REALTIMEWEIBO = 'realtime719'
16 | REALTIMEWEIBOT = 'realtime719t'
17 |
18 |
19 | location_dict = {
20 | '上海': [121.4648,31.2891],
21 | '东莞': [113.8953,22.901],
22 | '东营': [118.7073,37.5513],
23 | '中山': [113.4229,22.478],
24 | '临汾': [111.4783,36.1615],
25 | '临沂': [118.3118,35.2936],
26 | '丹东': [124.541,40.4242],
27 | '丽水': [119.5642,28.1854],
28 | '乌鲁木齐': [87.9236,43.5883],
29 | '佛山': [112.8955,23.1097],
30 | '保定': [115.0488,39.0948],
31 | '兰州': [103.5901,36.3043],
32 | '包头': [110.3467,41.4899],
33 | '北京': [116.4551,40.2539],
34 | '北海': [109.314,21.6211],
35 | '南京': [118.8062,31.9208],
36 | '南宁': [108.479,23.1152],
37 | '南昌': [116.0046,28.6633],
38 | '南通': [121.1023,32.1625],
39 | '厦门': [118.1689,24.6478],
40 | '台州': [121.1353,28.6688],
41 | '合肥': [117.29,32.0581],
42 | '呼和浩特': [111.4124,40.4901],
43 | '咸阳': [108.4131,34.8706],
44 | '哈尔滨': [127.9688,45.368],
45 | '唐山': [118.4766,39.6826],
46 | '嘉兴': [120.9155,30.6354],
47 | '大同': [113.7854,39.8035],
48 | '大连': [122.2229,39.4409],
49 | '天津': [117.4219,39.4189],
50 | '太原': [112.3352,37.9413],
51 | '威海': [121.9482,37.1393],
52 | '宁波': [121.5967,29.6466],
53 | '宝鸡': [107.1826,34.3433],
54 | '宿迁': [118.5535,33.7775],
55 | '常州': [119.4543,31.5582],
56 | '广州': [113.5107,23.2196],
57 | '廊坊': [116.521,39.0509],
58 | '延安': [109.1052,36.4252],
59 | '张家口': [115.1477,40.8527],
60 | '徐州': [117.5208,34.3268],
61 | '德州': [116.6858,37.2107],
62 | '惠州': [114.6204,23.1647],
63 | '成都': [103.9526,30.7617],
64 | '扬州': [119.4653,32.8162],
65 | '承德': [117.5757,41.4075],
66 | '拉萨': [91.1865,30.1465],
67 | '无锡': [120.3442,31.5527],
68 | '日照': [119.2786,35.5023],
69 | '昆明': [102.9199,25.4663],
70 | '杭州': [119.5313,29.8773],
71 | '枣庄': [117.323,34.8926],
72 | '柳州': [109.3799,24.9774],
73 | '株洲': [113.5327,27.0319],
74 | '武汉': [114.3896,30.6628],
75 | '汕头': [117.1692,23.3405],
76 | '江门': [112.6318,22.1484],
77 | '沈阳': [123.1238,42.1216],
78 | '沧州': [116.8286,38.2104],
79 | '河源': [114.917,23.9722],
80 | '泉州': [118.3228,25.1147],
81 | '泰安': [117.0264,36.0516],
82 | '泰州': [120.0586,32.5525],
83 | '济南': [117.1582,36.8701],
84 | '济宁': [116.8286,35.3375],
85 | '海口': [110.3893,19.8516],
86 | '淄博': [118.0371,36.6064],
87 | '淮安': [118.927,33.4039],
88 | '深圳': [114.5435,22.5439],
89 | '清远': [112.9175,24.3292],
90 | '温州': [120.498,27.8119],
91 | '渭南': [109.7864,35.0299],
92 | '湖州': [119.8608,30.7782],
93 | '湘潭': [112.5439,27.7075],
94 | '滨州': [117.8174,37.4963],
95 | '潍坊': [119.0918,36.524],
96 | '烟台': [120.7397,37.5128],
97 | '玉溪': [101.9312,23.8898],
98 | '珠海': [113.7305,22.1155],
99 | '盐城': [120.2234,33.5577],
100 | '盘锦': [121.9482,41.0449],
101 | '石家庄': [114.4995,38.1006],
102 | '福州': [119.4543,25.9222],
103 | '秦皇岛': [119.2126,40.0232],
104 | '绍兴': [120.564,29.7565],
105 | '聊城': [115.9167,36.4032],
106 | '肇庆': [112.1265,23.5822],
107 | '舟山': [122.2559,30.2234],
108 | '苏州': [120.6519,31.3989],
109 | '莱芜': [117.6526,36.2714],
110 | '菏泽': [115.6201,35.2057],
111 | '营口': [122.4316,40.4297],
112 | '葫芦岛': [120.1575,40.578],
113 | '衡水': [115.8838,37.7161],
114 | '衢州': [118.6853,28.8666],
115 | '西宁': [101.4038,36.8207],
116 | '西安': [109.1162,34.2004],
117 | '贵阳': [106.6992,26.7682],
118 | '连云港': [119.1248,34.552],
119 | '邢台': [114.8071,37.2821],
120 | '邯郸': [114.4775,36.535],
121 | '郑州': [113.4668,34.6234],
122 | '鄂尔多斯': [108.9734,39.2487],
123 | '重庆': [107.7539,30.1904],
124 | '金华': [120.0037,29.1028],
125 | '铜川': [109.0393,35.1947],
126 | '银川': [106.3586,38.1775],
127 | '镇江': [119.4763,31.9702],
128 | '长春': [125.8154,44.2584],
129 | '长沙': [113.0823,28.2568],
130 | '长治': [112.8625,36.4746],
131 | '阳泉': [113.4778,38.0951],
132 | '青岛': [120.4651,36.3373],
133 | '韶关': [113.7964,24.7028]
134 | }
135 |
136 |
137 | class RealtimeRandomWeibo(object):
138 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm'
139 |
140 | def __init__(self, lazy=True):
141 | self.href = 'http://api03.bitspaceman.com:8000/post/weibo?kw=的&apikey=' + self.KEY
142 | self.weibo_list = []
143 | self.weibo_list_all = []
144 | self.weibo_list_threat = []
145 | self.iter_all = None
146 | self.iter_count = 0
147 | if not lazy:
148 | self.get_random_weibo()
149 |
150 |         # connect to MongoDB
151 | self.mongo_client = pymongo.MongoClient('localhost', 27017)
152 | self.db = self.mongo_client['Weibo']
153 |
154 | def get_random_weibo(self):
155 | self.iter_count += 15
156 | requests_get = requests.get(self.href, timeout=15)
157 | requests_content = requests_get.content
158 | requests_dict = json.loads(requests_content)
159 |
160 | weibo_list = self.parse_requests_dict(requests_dict)
161 |
162 | copy_of_weibo_list = self.parse_weibo_list(weibo_list)
163 | self.weibo_list_all = copy_of_weibo_list
164 | self.weibo_list_threat = [weibo for weibo in copy_of_weibo_list if int(weibo['threatened']) > 68]
165 | self.store_to_mongodb()
166 |
167 | def parse_requests_dict(self, requests_dict):
168 | weibo_list = []
169 | count = 0
170 | for item in requests_dict['data']:
171 |
172 | weibo = {}
173 | try:
174 | weibo['is_repost'] = False
175 | weibo['repost_location'] = ''
176 | weibo['text'] = str(item['mblog']['text'])
177 | print(weibo['text'])
178 | weibo['uid'] = str(item['from']['url']).split('/')[-1]
179 | weibo['time'] = str(item['pDate'])
180 | weibo['comment_count'] = str(item['commentCount'])
181 | weibo['author_name'] = str(item['from']['name'])
182 | weibo['author_uid'] = str(item['from']['id'])
183 | weibo['author_fans'] = str(item['from']['fansCount'])
184 | weibo['author_follower'] = str(item['from']['friendCount'])
185 | weibo['location'] = str(item['from']['extend']['location'])
186 | weibo['province'] = ''
187 | print(weibo['author_uid'])
188 | except:
189 | continue
190 |
191 | try:
192 | weibo['terminal_source'] = str(item['mblog']['source']).split('>')[1].split('<')[0]
193 | except IndexError:
194 | weibo['terminal_source'] = '未知'
195 | if item['mblog'].has_key('retweeted_status'):
196 | count += 1
197 | weibo['is_repost'] = True
198 | try:
199 | weibo['repost_location'] = str(item['mblog']['retweeted_status']['user']['location'])
200 | weibo['repost_reposted_count'] = str(item['mblog']['retweeted_status']['reposts_count'])
201 | weibo['repost_text'] = str(item['mblog']['retweeted_status']['text'])
202 | weibo['repost_attitude_count'] = str(item['mblog']['retweeted_status']['attitudes_count'])
203 | print(weibo['repost_location'])
204 | print(weibo['repost_reposted_count'])
205 | print(weibo['repost_text'])
206 | print(weibo['repost_attitude_count'])
207 | except:
208 | pass
209 |
210 | weibo_list.append(weibo)
211 |
212 | print("为转发的微博数: ", str(count))
213 | # self.store_to_mongodb(weibo_list)
214 | self.weibo_list = weibo_list
215 | print(weibo_list)
216 |
217 | return weibo_list
218 |
219 | def parse_weibo_list(self, weibo_list):
220 | """
221 | 分析微博威胁程度与规范地址格式
222 | :param weibo_list: 初始微博列表
223 | :return: 分析后的微博列表
224 | """
225 | for i in weibo_list:
226 | i['location'] = mapped_province(i['location'], weibo=i)
227 | i['repost_location'] = mapped_province(i['repost_location'])
228 | print(i['location'], i['repost_location'])
229 | print('111111111111111111111111111111111111111111111111111')
230 |
231 | copy_of_weibo_list = []
232 |
233 |         # keep only locations that can be plotted on the map
234 | for i in weibo_list:
235 | if i['location'] is None or i['location'] == '':
236 | continue
237 | if i['repost_location'] is None or i['repost_location'] == '':
238 | i['is_repost'] = False
239 |
240 | i['location'] = str(i['location'])
241 | i['repost_location'] = str(i['repost_location'])
242 | copy_of_weibo_list.append(i)
243 |
244 | copy_of_weibo_list = assess_threat_levels(copy_of_weibo_list)
245 | return copy_of_weibo_list
246 |
247 | def store_to_mongodb(self):
248 |
249 | weibo_table = self.db[REALTIMEWEIBO]
250 | for i in self.weibo_list_all:
251 | weibo_table.insert(i)
252 |
253 | weibo_table = self.db[REALTIMEWEIBOT]
254 | for i in self.weibo_list_threat:
255 | weibo_table.insert(i)
256 |
257 | # def get_iter_all(self):
258 | # weibo_table = self.db['realtime719']
259 | # for i in weibo_table.find():
260 | # yield i
261 |
262 | def get_realtime_weibo_from_mongodb(self):
263 | weibo_table = self.db[REALTIMEWEIBO]
264 | count = 0
265 | now_weibo_all = []
266 | for i in weibo_table.find():
267 | if count 1:
335 | if weibo and weibo_location.split(' ')[0] in mapped_dict.keys():
336 | weibo['province'] = weibo_location.split(' ')[0]
337 |
338 | if weibo_location.split(' ')[1] in location_dict.keys():
339 | weibo_location = weibo_location.split(' ')[1]
340 | elif weibo_location.split(' ')[0] in location_dict.keys():
341 | weibo_location = weibo_location.split(' ')[0]
342 | else:
343 | weibo_location = ''
344 |
345 |
346 | else:
347 |
348 | if weibo and weibo_location in mapped_dict.keys():
349 | weibo['province'] = weibo_location.strip()
350 |
351 | if weibo_location.strip() in location_dict.keys():
352 | weibo_location = weibo_location.strip()
353 | else:
354 | if weibo_location.strip() in mapped_dict.keys():
355 | print(weibo_location.strip())
356 | weibo_location = mapped_dict.get(weibo_location.strip())
357 | else:
358 | weibo_location = ''
359 |
360 |
361 | return weibo_location
362 |
363 |
364 | def assess_threat_levels(copy_of_weibo_list):
365 | """
366 | 评估威胁程度
367 | """
368 | check_object = fenci.TestKeyword()
369 |
370 | for weibo in copy_of_weibo_list:
371 | flag = check_object.test_if_has_keyword(weibo['text'])
372 | threat = 0
373 | if weibo['is_repost']:
374 | flag = flag or check_object.test_if_has_keyword(weibo['repost_text'])
375 | if flag:
376 | if weibo.has_key('repost_reposted_count') and weibo['repost_reposted_count']:
377 | if int(weibo['repost_reposted_count']) > 10:
378 | threat += 1
379 | if weibo.has_key('comment_count') and weibo['comment_count']:
380 |                 if int(weibo['comment_count']) > 1:
381 | threat += 1
382 | if weibo.has_key('repost_attitude_count') and weibo['repost_attitude_count']:
383 | if int(weibo['repost_attitude_count']) > 10:
384 | threat += 1
385 | if weibo.has_key('author_fans') and weibo['author_fans']:
386 | if int(weibo['author_fans']) > 100:
387 | threat += 1
388 |
389 | weibo['threatened'] = random.randint(68, 80)
390 |
391 | if threat == 1 or threat == 2:
392 | weibo['threatened'] = random.randint(80, 90)
393 | print('what?????????????????????')
394 | print(weibo['threatened'] )
395 |
396 | if threat > 2:
397 | weibo['threatened'] = random.randint(90, 100)
398 | print('what?????????????????????')
399 | print(weibo['threatened'] )
400 |
401 | print(weibo['time'])
402 | print(weibo['author_uid'])
403 | else:
404 | weibo['threatened'] = random.randint(0, 68)
405 |
406 | return copy_of_weibo_list
407 |
408 |
409 | def start_run():
410 |
411 | realtime_weibo_object = RealtimeRandomWeibo()
412 |
413 | for i in realtime_weibo_object.weibo_list:
414 | i['location'] = mapped_province(i['location'], weibo=i)
415 | i['repost_location'] = mapped_province(i['repost_location'])
416 | print(i['location'], i['repost_location'])
417 | print('111111111111111111111111111111111111111111111111111')
418 |
419 | copy_of_weibo_list = []
420 |
421 |     # keep only locations that can be plotted on the map
422 | for i in realtime_weibo_object.weibo_list:
423 | if i['location'] is None or i['location'] == '':
424 | continue
425 | if i['repost_location'] is None or i['repost_location'] == '':
426 | i['is_repost'] = False
427 |
428 | i['location'] = str(i['location'])
429 | i['repost_location'] = str(i['repost_location'])
430 | copy_of_weibo_list.append(i)
431 |
432 | copy_of_weibo_list = assess_threat_levels(copy_of_weibo_list)
433 | return copy_of_weibo_list
434 |
435 |
436 |
437 |
438 |
439 | if __name__ == '__main__':
440 |
441 | a = RealtimeRandomWeibo()
442 | a.get_random_weibo()
443 | l = a.get_realtime_weibo_from_mongodb()
444 | for i in l:
445 | print(i)
446 | # a = start_run()
447 | # for i in a:
448 | # print(i['location'])
449 | # print(type(i['location']))
450 | # if i['is_repost']:
451 | # print("转发自"+str(i['repost_location']))
--------------------------------------------------------------------------------
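`parse_requests_dict` and `assess_threat_levels` above rely on `dict.has_key`, which exists only in Python 2. If the module were ever moved to Python 3, those checks would use the `in` operator; a minimal illustration of the equivalent test on a made-up record:

```
weibo = {'comment_count': '3'}            # made-up record for illustration

# Python 2 style used above:  weibo.has_key('comment_count')
# Python 3 equivalent:
if 'comment_count' in weibo and weibo['comment_count']:
    print(int(weibo['comment_count']) > 1)
```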
/ml/realtime_user_fans_follower.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import json
5 | import time as tt
6 | import csv
7 | import pymongo
8 | import requests
9 | from bs4 import BeautifulSoup
10 | import sys
11 | sys.path.append("..")
12 | from wbcls.sina_store import SinaStore
13 | import fenci
14 | reload(sys)
15 | sys.setdefaultencoding('utf-8')
16 |
17 | RALATIONTABLE = 'Relation719'
18 |
19 |
20 | class RealtimeUserRealationship(object):
21 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm'
22 |
23 | def __init__(self, user_id, user=True, fans=True, follow=True):
24 | # 连接至mongodb
25 | self.mongo_client = pymongo.MongoClient('localhost', 27017)
26 | self.db = self.mongo_client['Weibo']
27 |
28 | self._session = requests.Session()
29 | self._session.mount('http://', self._create_adapter())
30 |
31 | self.fans_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=3&id='+str(user_id)+'&apikey=' + \
32 | self.KEY + '&size=30'
33 | self.fans_list = []
34 | self.follow_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=2&id='+str(user_id)+'&apikey=' + \
35 | self.KEY+'&size=30'
36 | self.follow_list = []
37 |
38 | self.user_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=1&id='+str(user_id)+'&apikey=' + self.KEY
39 |
40 | self.info_dict = {}
41 | self.get_relationship(user=user, fans=fans, follow=follow)
42 |
43 | def get_relationship(self, user=True, fans=True, follow=False):
44 |
45 | if fans:
46 | requests_get = self._session.get(self.fans_href, timeout=15)
47 | requests_content = requests_get.content
48 | requests_dict = json.loads(requests_content)
49 | self.fans_list = self.parse_requests_dict(requests_dict)
50 | tt.sleep(0.5)
51 | if follow:
52 | requests_get = self._session.get(self.follow_href, timeout=15)
53 | requests_content = requests_get.content
54 | requests_dict = json.loads(requests_content)
55 | self.follow_list = self.parse_requests_dict(requests_dict)
56 | tt.sleep(0.5)
57 |
58 | if user:
59 | requests_get = self._session.get(self.user_href, timeout=15)
60 | requests_content = requests_get.content
61 | requests_dict = json.loads(requests_content)
62 | self.info_dict = self.parse_requests_info_dict(requests_dict)
63 |
64 |
65 | self.store_to_mongodb()
66 |
67 | @staticmethod
68 | def parse_requests_dict(requests_dict):
69 | relationship_list = []
70 | for item in requests_dict['data']:
71 | user_id = str(item['id'])
72 | user = {}
73 | try:
74 | user['id'] = str(user_id)
75 | user['name'] = str(item['userName'])
76 | user['fans_count'] = str(item['fansCount'])
77 | user['follow_count'] = str(item['followCount'])
78 | user['weibo_count'] = str(item['postCount'])
79 | user['location'] = str(item['location'])
80 | user['sex'] = str(item['gender'])
81 | print("粉丝数"+user['fans_count'])
82 | except:
83 | continue
84 |
85 | relationship_list.append(user)
86 |
87 | return relationship_list
88 |
89 | def parse_requests_info_dict(self, requests_dict):
90 | info_dict = {}
91 | try:
92 | for item in requests_dict['data']:
93 | try:
94 | info_dict['fans_count'] = str(item['fansCount'])
95 | info_dict['follow_count'] = str(item['followCount'])
96 | info_dict['weibo_count'] = str(item['postCount'])
97 | info_dict['location'] = str(item['location'])
98 | info_dict['name'] = str(item['userName'])
99 | info_dict['url'] = str(item['url'])
100 | except:
101 | continue
102 | info_dict['fans_list'] = self.fans_list
103 | info_dict['follow_list'] = self.follow_list
104 | print(info_dict['name'])
105 |             print('-' * 30)
106 | return info_dict
107 | except:
108 | print(requests_dict)
109 |
110 | def store_to_mongodb(self):
111 | table = self.db[RALATIONTABLE]
112 | table.insert(self.info_dict)
113 |
114 |
115 | @staticmethod
116 | def _create_adapter():
117 | return requests.adapters.HTTPAdapter(
118 | max_retries=requests.adapters.Retry(
119 | total=5,
120 | status_forcelist=[403, 404, 408, 500, 502],
121 | )
122 | )
123 |
124 |
125 |
126 | def get_relationship_from_mongodb(user_id):
127 | mongo_client = pymongo.MongoClient('localhost', 27017)
128 | db = mongo_client['Weibo']
129 | table = db[RALATIONTABLE]
130 | for i in table.find():
131 | if i['url'] == 'http://weibo.com/u/' + str(user_id):
132 | print(i['name'])
133 |
134 |
135 | if __name__ == '__main__':
136 | a = RealtimeUserRealationship(user_id='2671467531')
137 | get_relationship_from_mongodb('2671467531')
138 | b = {
139 | 'topic':'水滴直播',
140 | 'question_list':
141 | [
142 | {
143 | 'question_name':'如何看待。。问题1',
144 |             'answer_words':
145 | ['好','希拉里','4444'],
146 | 'percent':'8.33'
147 | },
148 | {
149 | 'question_name': '如何看待。。问题2',
150 |             'answer_words':
151 | ['不会', '淳朴', '4444'],
152 | 'percent': '4.44'
153 | },
154 | ]
155 | }
156 |
157 |
158 |
--------------------------------------------------------------------------------
/ml/realtime_user_info.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import json
5 | import csv
6 | import pymongo
7 | import requests
8 | from bs4 import BeautifulSoup
9 | import sys
10 | sys.path.append("..")
11 | from a1 import base
12 | from a1 import sina_store
13 | from a1 import sina_weibo
14 | from a1 import sina_people
15 | reload(sys)
16 | sys.setdefaultencoding('utf-8')
17 |
18 |
19 | class RealtimeRandomWeibo(object):
20 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm'
21 |
22 | def __init__(self):
23 | self.href = 'http://api01.bitspaceman.com:8000/post/weibo?kw=的&apikey='+self.KEY
24 | self.get_random_weibo()
25 |
26 | def get_random_weibo(self):
27 | requests_get = requests.get(self.href, timeout=15)
28 | requests_content = requests_get.content
29 | requests_dict = json.loads(requests_content)
30 | for name, value in requests_dict.items():
31 | print(name, value)
32 | print(len(requests_dict['data']))
33 | self.parse_requests_dict(requests_dict)
34 |
35 | def parse_requests_dict(self, requests_dict):
36 | weibo_list = []
37 | weibo = sina_weibo.SinaWeibo()
38 | count = 0
39 | for i in requests_dict['data']:
40 |             if 'retweeted_status' in i['mblog']:
41 |                 count += 1
42 | print(i['mblog']['retweeted_status']['user']['location'])
43 | print(i['mblog']['retweeted_status']['reposts_count'])
44 | print(i['mblog']['retweeted_status']['user']['id'])
45 | print(i['mblog']['retweeted_status']['text'])
46 |
47 | print(count)
48 |
49 | if __name__ == '__main__':
50 | a = RealtimeRandomWeibo()
--------------------------------------------------------------------------------
/ml/realtime_user_relationship.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import json
5 | import csv
6 | import pymongo
7 | import requests
8 | from bs4 import BeautifulSoup
9 | import sys
10 | sys.path.append("..")
11 | # from a1 import base
12 | from a1 import sina_store
13 | # from a1 import sina_weibo
14 | from a1 import sina_people
15 | import fenci
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class RealtimeUserRealationship(object):
21 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm'
22 |
23 | def __init__(self, user_id, fans=True, follow=False):
24 | self.fans_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=3&id='+str(user_id)+'&apikey=' + \
25 | self.KEY + '&size=50'
26 | self.fans_list = []
27 | self.follow_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=2&id='+str(user_id)+'&apikey=' + \
28 | self.KEY + '&size=50'
29 | self.follow_list = []
30 | self.get_relationship(fans=fans, follow=follow)
31 |
32 | def get_relationship(self, fans=True, follow=False):
33 |         if fans:
34 |             requests_get = requests.get(self.fans_href, timeout=15)
35 |             requests_content = requests_get.content
36 |             requests_dict = json.loads(requests_content)
37 |             self.fans_list = self.parse_requests_dict(requests_dict)
38 |             self.store_to_mongodb(self.fans_list, fans=True, follow=False)
39 |         if follow:
40 |             requests_get = requests.get(self.follow_href, timeout=15)
41 |             requests_content = requests_get.content
42 |             requests_dict = json.loads(requests_content)
43 |             self.follow_list = self.parse_requests_dict(requests_dict)
44 |             self.store_to_mongodb(self.follow_list, fans=False, follow=True)
45 |
46 | @staticmethod
47 | def parse_requests_dict(requests_dict):
48 | relationship_list = []
49 | for item in requests_dict['data']:
50 | user_id = str(item['id'])
51 | user = sina_people.SinaPeople(uid=user_id, lazy=True)
52 | try:
53 | user.name = str(item['userName'])
54 | print(user.name)
55 | user.fans_count = str(item['fansCount'])
56 | user.follow_count = str(item['followCount'])
57 | user.weibo_count = str(item['postCount'])
58 | user.location = str(item['location'])
59 | user.sex = str(item['gender'])
60 | print("粉丝数"+user.fans_count)
61 | except:
62 | continue
63 |
64 | relationship_list.append(user)
65 |
66 | return relationship_list
67 |
68 | @staticmethod
69 | def store_to_mongodb(user_list, fans=True, follow=False):
70 | sina_store_object = sina_store.SinaStore()
71 | if fans:
72 | sina_store_object.weibo_table = sina_store_object.db['realtime_user_fans']
73 | for user in user_list:
74 | sina_store_object.store_in_mongodb(user)
75 | if follow:
76 | sina_store_object.weibo_table = sina_store_object.db['realtime_user_follow']
77 | for user in user_list:
78 | sina_store_object.store_in_mongodb(user)
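
if __name__ == '__main__':
    # Minimal usage sketch (uid borrowed from realtime_user_fans_follower.py's self-test;
    # assumes the data API key above is still valid and the a1 package is importable):
    RealtimeUserRealationship(user_id='2671467531', fans=True, follow=False)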
--------------------------------------------------------------------------------
/ml/stopwords_cn.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperSaiyanSSS/SinaWeiboSpider/b034c20ccf062b1323046584712716b2794ec7ec/ml/stopwords_cn.txt
--------------------------------------------------------------------------------
/ml/svm_dict.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 |
18 | import svm_tfidf
19 | import os_path
20 |
21 | from svm_utils import *
22 |
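# Pipeline note (as wired across the svm_* modules): reduce_dict() builds or loads the gensim
# dictionary, then chains into svm_tfidf.reduce_tfidf -> svm_lsi.reduce_lsi ->
# svm_module.reduce_module -> svm_result.reduce_result, which finally classifies weibo_test.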
23 | def reduce_dict(weibo_test):
24 | dictionary = None
25 | if not os.path.exists(path_tmp):
26 | os.makedirs(path_tmp)
27 | # 若不存在之前创建的词典,则生成词典
28 | if not os.path.exists(path_dictionary):
29 | dictionary = corpora.Dictionary()
30 | files = os_path.LoadFiles(path_doc_root)
31 | for i, msg in enumerate(files):
32 | catg = msg[0]
33 | file = msg[1]
34 | file = convert_doc_to_wordlist(file, cut_all=False)
35 | dictionary.add_documents([file])
36 | # 去掉词典中出现次数过少的词语
37 | small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 5]
38 | dictionary.filter_tokens(small_freq_ids)
39 | dictionary.compactify()
40 | dictionary.save(path_dictionary)
41 | svm_tfidf.reduce_tfidf(dictionary, weibo_test)
42 |
43 | if __name__ == "__main__":
44 | reduce_dict(weibo_test = "小粉红滚!你个傻逼,体育老师教你的?吾问无为谓")
--------------------------------------------------------------------------------
/ml/svm_lsi.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 |
18 | import svm_module
19 | from svm_utils import *
20 |
21 | # path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪
22 | # path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss3\\temp1' # 存放中间结果的位置
23 | # path_dictionary = os.path.join(path_tmp, 'THUNews.dict')
24 | # path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')
25 | # path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus')
26 | # path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl')
27 | # path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl')
28 |
29 |
30 | def reduce_lsi(dictionary, corpus_tfidf, weibo_test):
31 | corpus_lsi = None
32 | lsi_model = None
33 | # # # # 第三阶段, 开始将tfidf转化成lsi
34 | if not os.path.exists(path_tmp_lsi):
35 | print('=== 未检测到有lsi文件夹存在,开始生成lsi向量 ===')
36 | if not dictionary:
37 | dictionary = corpora.Dictionary.load(path_dictionary)
38 | if not corpus_tfidf: # 如果跳过了第二阶段,则从指定位置读取tfidf文档
39 | print('--- 未检测到tfidf文档,开始从磁盘中读取 ---')
40 | # 从对应文件夹中读取所有类别
41 | files = os.listdir(path_tmp_tfidf)
42 | catg_list = []
43 | for file in files:
44 | t = file.split('.')[0]
45 | if t not in catg_list:
46 | catg_list.append(t)
47 |
48 | # 从磁盘中读取corpus
49 | corpus_tfidf = {}
50 | for catg in catg_list:
51 | path = '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg)
52 | corpus = corpora.MmCorpus(path)
53 | corpus_tfidf[catg] = corpus
54 | print('--- tfidf文档读取完毕,开始转化成lsi向量 ---')
55 |
56 | # 生成lsi model
57 | os.makedirs(path_tmp_lsi)
58 | corpus_tfidf_total = []
59 | catgs = list(corpus_tfidf.keys())
60 | for catg in catgs:
61 | tmp = corpus_tfidf.get(catg)
62 | corpus_tfidf_total += tmp
63 | lsi_model = models.LsiModel(corpus=corpus_tfidf_total, id2word=dictionary, num_topics=50)
64 | # 将lsi模型存储到磁盘上
65 | lsi_file = open(path_tmp_lsimodel, 'wb')
66 | pkl.dump(lsi_model, lsi_file)
67 | lsi_file.close()
68 | del corpus_tfidf_total # lsi model已经生成,释放变量空间
69 | print('--- lsi模型已经生成 ---')
70 |
71 | # 生成corpus of lsi, 并逐步去掉 corpus of tfidf
72 | corpus_lsi = {}
73 | for catg in catgs:
74 | corpu = [lsi_model[doc] for doc in corpus_tfidf.get(catg)]
75 | corpus_lsi[catg] = corpu
76 | corpus_tfidf.pop(catg)
77 | corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg),
78 | corpu,
79 | id2word=dictionary)
80 | print('=== lsi向量已经生成 ===')
81 | else:
82 | print('=== 检测到lsi向量已经生成,跳过该阶段 ===')
83 |
84 | svm_module.reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test)
85 |
--------------------------------------------------------------------------------
/ml/svm_module.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 |
18 | import svm_result
19 | from svm_utils import *
20 |
21 |
22 | def reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test):
23 | # # # # 第四阶段, 分类
24 | predictor = None
25 | if not os.path.exists(path_tmp_predictor):
26 | print('=== 未检测到判断器存在,开始进行分类过程 ===')
27 | if not corpus_lsi: # 如果跳过了第三阶段
28 | print('--- 未检测到lsi文档,开始从磁盘中读取 ---')
29 | files = os.listdir(path_tmp_lsi)
30 | catg_list = []
31 | for file in files:
32 | t = file.split('.')[0]
33 | if t not in catg_list:
34 | catg_list.append(t)
35 | # 从磁盘中读取corpus
36 | corpus_lsi = {}
37 | for catg in catg_list:
38 | path = '{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg)
39 | corpus = corpora.MmCorpus(path)
40 | corpus_lsi[catg] = corpus
41 | print('--- lsi文档读取完毕,开始进行分类 ---')
42 |
43 | tag_list = []
44 | doc_num_list = []
45 | corpus_lsi_total = []
46 | catg_list = []
47 | files = os.listdir(path_tmp_lsi)
48 | for file in files:
49 | t = file.split('.')[0]
50 | if t not in catg_list:
51 | catg_list.append(t)
52 | for count, catg in enumerate(catg_list):
53 | tmp = corpus_lsi[catg]
54 | tag_list += [count] * tmp.__len__()
55 | doc_num_list.append(tmp.__len__())
56 | corpus_lsi_total += tmp
57 | corpus_lsi.pop(catg)
58 |
59 | # 将gensim中的mm表示转化成numpy矩阵表示
60 | data = []
61 | rows = []
62 | cols = []
63 | line_count = 0
64 | for line in corpus_lsi_total:
65 | for elem in line:
66 | rows.append(line_count)
67 | cols.append(elem[0])
68 | data.append(elem[1])
69 | line_count += 1
70 | lsi_matrix = csr_matrix((data, (rows, cols))).toarray()
71 | # 生成训练集和测试集
72 | rarray = np.random.random(size=line_count)
73 | train_set = []
74 | train_tag = []
75 | test_set = []
76 | test_tag = []
77 | for i in range(line_count):
78 | if rarray[i] < 0.8:
79 | train_set.append(lsi_matrix[i, :])
80 | train_tag.append(tag_list[i])
81 | else:
82 | test_set.append(lsi_matrix[i, :])
83 | test_tag.append(tag_list[i])
84 |
85 | # 生成分类器
86 | predictor = svm_classify(train_set, train_tag, test_set, test_tag)
87 | x = open(path_tmp_predictor, 'wb')
88 | pkl.dump(predictor, x)
89 | x.close()
90 | else:
91 | print('=== 检测到分类器已经生成,跳过该阶段 ===')
92 |
93 | svm_result.reduce_result(dictionary, lsi_model, predictor, weibo_test)
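
if __name__ == '__main__':
    # Minimal usage sketch (assumes the lsi corpus already exists under path_tmp_lsi so a
    # classifier can be trained or loaded; the sample text below is made up):
    reduce_module(None, None, None, '这是一条待分类的演示微博')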
--------------------------------------------------------------------------------
/ml/svm_result.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 | from svm_utils import *
18 |
19 | path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪
20 | path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss11\\temp1' # 存放中间结果的位置
21 | path_dictionary = os.path.join(path_tmp, 'THUNews.dict')
22 | path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')
23 | path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus')
24 | path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl')
25 | path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl')
26 | n = 2 # n 表示抽样率, n抽1
27 | # def convert_doc_to_wordlist(str_doc,cut_all):
28 | # sent_list = str_doc.split('\n')
29 | # sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000
30 | # word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词
31 | # word_list = sum(word_2dlist,[])
32 | # return word_list
33 | # def rm_tokens(words): # 去掉一些停用次和数字
34 | # words_list = list(words)
35 | # stop_words = get_stop_words()
36 | # for i in range(words_list.__len__())[::-1]:
37 | # if words_list[i] in stop_words: # 去除停用词
38 | # words_list.pop(i)
39 | # elif words_list[i].isdigit():
40 | # words_list.pop(i)
41 | # return words_list
42 | # def get_stop_words(path='stopwords_cn.txt'):
43 | # file = open(path,'rb').read().split('\n')
44 | # return set(file)
45 | # def rm_char(text):
46 | # text = re.sub('\u3000','',text)
47 | # return text
48 | #
49 | # def svm_classify(train_set,train_tag,test_set,test_tag):
50 | #
51 | # clf = svm.LinearSVC()
52 | # clf_res = clf.fit(train_set,train_tag)
53 | # train_pred = clf_res.predict(train_set)
54 | # test_pred = clf_res.predict(test_set)
55 | #
56 | # train_err_num, train_err_ratio = checkPred(train_tag, train_pred)
57 | # test_err_num, test_err_ratio = checkPred(test_tag, test_pred)
58 | #
59 | # print('=== 分类训练完毕,分类结果如下 ===')
60 | # print('训练集误差: {e}'.format(e=train_err_ratio))
61 | # print('检验集误差: {e}'.format(e=test_err_ratio))
62 | #
63 | # return clf_res
64 | #
65 | #
66 | # def checkPred(data_tag, data_pred):
67 | # if data_tag.__len__() != data_pred.__len__():
68 | # raise RuntimeError('The length of data tag and data pred should be the same')
69 | # err_count = 0
70 | # for i in range(data_tag.__len__()):
71 | # if data_tag[i]!=data_pred[i]:
72 | # err_count += 1
73 | # err_ratio = err_count / data_tag.__len__()
74 | # return [err_count, err_ratio]
75 |
76 |
77 | def reduce_result(dictionary, lsi_model, predictor, weibo_test):
78 | # # # # 第五阶段, 对新文本进行判断
79 | if not dictionary:
80 | dictionary = corpora.Dictionary.load(path_dictionary)
81 | if not lsi_model:
82 | lsi_file = open(path_tmp_lsimodel,'rb')
83 | lsi_model = pkl.load(lsi_file)
84 | lsi_file.close()
85 | if not predictor:
86 | x = open(path_tmp_predictor,'rb')
87 | predictor = pkl.load(x)
88 | x.close()
89 | files = os.listdir(path_tmp_lsi)
90 | catg_list = []
91 | for file in files:
92 | t = file.split('.')[0]
93 | if t not in catg_list:
94 | catg_list.append(t)
95 |
96 | demo_doc = weibo_test
97 | print(demo_doc)
98 | demo_doc = list(jieba.cut(demo_doc,cut_all=False))
99 | demo_bow = dictionary.doc2bow(demo_doc)
100 | tfidf_model = models.TfidfModel(dictionary=dictionary)
101 | demo_tfidf = tfidf_model[demo_bow]
102 | demo_lsi = lsi_model[demo_tfidf]
103 | data = []
104 | cols = []
105 | rows = []
106 | for item in demo_lsi:
107 | data.append(item[1])
108 | cols.append(item[0])
109 | rows.append(0)
110 | demo_matrix = csr_matrix((data,(rows,cols))).toarray()
111 | x = predictor.predict(demo_matrix)
112 | print('分类结果为:{x}'.format(x=catg_list[x[0]]))
113 |
114 |
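if __name__ == '__main__':
    # Minimal usage sketch (assumes the dictionary, lsi model and predictor have already been
    # generated under path_tmp by the earlier pipeline stages; the sample text below is made up):
    reduce_result(None, None, None, '这是一条待分类的演示微博')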
--------------------------------------------------------------------------------
/ml/svm_tfidf.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 |
18 | import svm_lsi
19 | import os_path
20 | from svm_utils import *
21 |
22 | # path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪
23 | # path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss1\\temp1' # 存放中间结果的位置
24 | # path_dictionary = os.path.join(path_tmp, 'THUNews.dict')
25 | # path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')
26 | # path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus')
27 | # path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl')
28 | # path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl')
29 |
30 | corpus_lsi = None
31 | lsi_model = None
32 | predictor = None
33 |
34 |
35 | # def convert_doc_to_wordlist(str_doc,cut_all):
36 | # sent_list = str_doc.split('\n')
37 | # sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000
38 | # word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词
39 | # word_list = sum(word_2dlist,[])
40 | # return word_list
41 | #
42 | #
43 | # def rm_tokens(words): # 去掉一些停用次和数字
44 | # words_list = list(words)
45 | # stop_words = get_stop_words()
46 | # for i in range(words_list.__len__())[::-1]:
47 | # if words_list[i] in stop_words: # 去除停用词
48 | # words_list.pop(i)
49 | # elif words_list[i].isdigit():
50 | # words_list.pop(i)
51 | # return words_list
52 | #
53 | #
54 | # def get_stop_words(path='stopwords_cn.txt'):
55 | # file = open(path,'rb').read().split('\n')
56 | # return set(file)
57 | #
58 | #
59 | # def rm_char(text):
60 | # text = re.sub('\u3000','',text)
61 | # return text
62 |
63 |
64 | def reduce_tfidf(dictionary, weibo_test):
65 | corpus_tfidf = None
66 | # # # # 第二阶段, 开始将文档转化成tfidf
67 | if not os.path.exists(path_tmp_tfidf):
68 | print('=== 未检测到有tfidf文件夹存在,开始生成tfidf向量 ===')
69 | # 如果指定的位置没有tfidf文档,则生成一个。如果有,则跳过该阶段
70 | if not dictionary: # 如果跳过了第一阶段,则从指定位置读取词典
71 | dictionary = corpora.Dictionary.load(path_dictionary)
72 | os.makedirs(path_tmp_tfidf)
73 | files = os_path.LoadFiles(path_doc_root)
74 | tfidf_model = models.TfidfModel(dictionary=dictionary)
75 | corpus_tfidf = {}
76 | for i, msg in enumerate(files):
77 | catg = msg[0]
78 | file = msg[1]
79 | word_list = convert_doc_to_wordlist(file, cut_all=False)
80 | file_bow = dictionary.doc2bow(word_list)
81 | file_tfidf = tfidf_model[file_bow]
82 | tmp = corpus_tfidf.get(catg, [])
83 | tmp.append(file_tfidf)
84 | if tmp.__len__() == 1:
85 | corpus_tfidf[catg] = tmp
86 | # 将tfidf中间结果储存起来
87 | catgs = list(corpus_tfidf.keys())
88 | for catg in catgs:
89 | corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg),
90 | corpus_tfidf.get(catg),
91 | id2word=dictionary
92 | )
93 | print('catg {c} has been transformed into tfidf vector'.format(c=catg))
94 | print('=== tfidf向量已经生成 ===')
95 | else:
96 | print('=== 检测到tfidf向量已经生成,跳过该阶段 ===')
97 |
98 | svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test)
--------------------------------------------------------------------------------
/ml/svm_utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function, division
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import os
7 | import re
8 | import jieba
9 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
10 | from sklearn import svm
11 |
12 | path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪
13 | path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss11\\temp1' # 存放中间结果的位置
14 | path_dictionary = os.path.join(path_tmp, 'THUNews.dict')
15 | path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')
16 | path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus')
17 | path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl')
18 | path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl')
19 |
20 | def convert_doc_to_wordlist(str_doc,cut_all):
21 | sent_list = str_doc.split('\n')
22 | sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000
23 | word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词
24 | word_list = sum(word_2dlist,[])
25 | return word_list
26 |
27 |
28 | def rm_tokens(words): # 去掉一些停用次和数字
29 | words_list = list(words)
30 | stop_words = get_stop_words()
31 | for i in range(words_list.__len__())[::-1]:
32 | if words_list[i] in stop_words: # 去除停用词
33 | words_list.pop(i)
34 | elif words_list[i].isdigit():
35 | words_list.pop(i)
36 | return words_list
37 |
38 |
39 | def get_stop_words(path='stopwords_cn.txt'):
40 | file = open(path,'rb').read().split('\n')
41 | return set(file)
42 |
43 |
44 | def rm_char(text):
45 | text = re.sub('\u3000','',text)
46 | return text
47 |
48 |
49 | def svm_classify(train_set, train_tag, test_set, test_tag):
50 | clf = svm.LinearSVC()
51 | clf_res = clf.fit(train_set, train_tag)
52 | train_pred = clf_res.predict(train_set)
53 | test_pred = clf_res.predict(test_set)
54 |
55 | train_err_num, train_err_ratio = checkPred(train_tag, train_pred)
56 | test_err_num, test_err_ratio = checkPred(test_tag, test_pred)
57 |
58 | print('=== 分类训练完毕,分类结果如下 ===')
59 | print('训练集误差: {e}'.format(e=train_err_ratio))
60 | print('检验集误差: {e}'.format(e=test_err_ratio))
61 |
62 | return clf_res
63 |
64 |
65 | def checkPred(data_tag, data_pred):
66 | if data_tag.__len__() != data_pred.__len__():
67 | raise RuntimeError('The length of data tag and data pred should be the same')
68 | err_count = 0
69 | for i in range(data_tag.__len__()):
70 | if data_tag[i]!=data_pred[i]:
71 | err_count += 1
72 | err_ratio = err_count / data_tag.__len__()
73 | return [err_count, err_ratio]
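
if __name__ == '__main__':
    # Minimal self-test (made-up sentence; assumes stopwords_cn.txt sits next to this file):
    print(convert_doc_to_wordlist('这是一条用于演示分词的微博文本', cut_all=False))
    print(checkPred([0, 1, 1, 0], [0, 1, 0, 0]))  # expected: [1, 0.25]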
--------------------------------------------------------------------------------
/ml/values.csv:
--------------------------------------------------------------------------------
1 | uid,similarity,platform,reputation,entropy
2 | 6034008530,0.0,4,0.31343283582089554,1.5167044950347657
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from weibospider import WeiboClient
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 |
7 | #cookies = \
8 | #'ALF=1504271525; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgREMFA0bd3IqV3weK9ydf5NAbDXjKT69Rfndb2m9Ah4I.; SUB=_2A250hb_2DeRhGeNH7VIV9izNwj2IHXVXicG-rDV6PUNbktANLRPhkW1ZeSLr49kFNMgwrWThnh1bPUhWPw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5KMhUgL.Fo-4So5XSozp1K22dJLoI0YLxK.LB.-L1K.LxKML12qLBK5LxKqL1KBLBo.LxK.LB-BL1KBLxKBLB.2LB.2LxK-LBonL1heLxKqLB-eLBKMt; SUHB=03oF_eQuesy4xQ; SSOLoginState=1501679526; _T_WM=544d051d212d2d6f3adece8b6949b373'
9 |
10 | cookies = 'ALF=1512959361; SCF=AlGHrwmWqyhSdpml9a836b5TfwBwT3_aqlPQLm4VGPX5AnF7W-51O8sb-246XgliUA_jtEUQg3I0XisboShzSK4.; SUB=_2A253Ah7JDeRhGeNH7VIV9izNwj2IHXVUDKKBrDV6PUJbktAKLUehkW02ueHV00_NzZ0DwjSbUFYBB6B69g..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5K-hUgL.Fo-4So5XSozp1K22dJLoI0YLxK.LB.-L1K.LxKML12qLBK5LxKqL1KBLBo.LxK.LB-BL1KBLxKBLB.2LB.2LxK-LBonL1heLxKqLB-eLBKMt; SUHB=0M2Flmef-A-AIV; SSOLoginState=1510370969; _T_WM=28de7b7a225087a87cbe9c2ad92df1ea'
11 |
12 | if __name__ == '__main__':
13 | pe0 = WeiboClient(cookies=cookies)
14 | pe2 = pe0.Weibo('E6iRJofK6')
15 | pe4 = pe0.People('1884866222')
16 | print(pe4)
17 | print(pe4.name)
18 | print(pe4.weibo_count)
19 | print(pe4.location)
20 | pe2_people = pe2.author
21 | print('______test________')
22 | print(pe2_people.name)
23 | print(pe2_people.weibo_count)
24 | for i, j in zip(range(3), pe2_people.weibo):
25 | print(j.text)
26 | for ii, jj in zip(range(4), j.repost):
27 | print(jj.author_name)
28 |
29 |
30 |
--------------------------------------------------------------------------------
/weibospider/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from .attitude import Attitude
4 | from .base import SinaBaseObject
5 | from .comment import Comment
6 | from .client import WeiboClient
7 | from .people import People
8 | from .repost import Repost
9 | from .weibo import Weibo
10 |
11 | __all__ = [
12 | 'Attitude', 'SinaBaseObject', 'Comment', 'People', 'Repost', 'Weibo', 'WeiboClient'
13 | ]
--------------------------------------------------------------------------------
/weibospider/attitude.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | from utils import *
4 | import weibo
5 | from base import SinaBaseObject
6 | import sys
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 |
11 | class Attitude(SinaBaseObject):
12 | """
13 | 点赞类,一般不直接使用,而是作为`Answer.attitude`迭代器的返回类型
14 | """
15 |
16 | def __init__(self, id, cache={}):
17 | super(Attitude, self).__init__()
18 | self.uid = str(id)
19 | self._cache = cache
20 | self.author_name = cache['author_name']
21 | self.time = cache['time']
22 |
--------------------------------------------------------------------------------
/weibospider/base.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from __future__ import unicode_literals, print_function
4 | import requests
5 | import json
6 | import time as tt
7 | from bs4 import BeautifulSoup
8 | import sys
9 | import re
10 |
11 | reload(sys)
12 | sys.path.append('../')
13 | sys.setdefaultencoding('utf-8')
14 |
15 |
16 | # headers_for_get = {
17 | # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
18 | # 'Accept-Encoding':'gzip, deflate, sdch',
19 | # 'Accept-Language':'zh-CN,zh;q=0.8',
20 | # 'Cache-Control':'max-age=0',
21 | # 'Connection':'keep-alive',
22 | # 'Cookie': '_T_WM=0ff248d78f4984aa135c5b2e53c11079; ALF=1495630107; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgVvrJ48ic42g3Xqe49zEjKtpWuFcU6KaL2lKIyLzY43s.; SUB=_2A251-YQQDeRhGeNH7VIV9izNwj2IHXVXBSxYrDV6PUJbktBeLUn6kW0ntTSLDvUTciwLCGGI3rSIiDX8jQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5o2p5NHD95Qf1Kq7ShqEeK.pWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSK.cehBceo24eBtt; SUHB=0mxUEyUKiYW96L; SSOLoginState=1493038144',
23 | # 'Host':'weibo.cn',
24 | # 'Upgrade-Insecure-Requests':'1',
25 | # 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
26 | # }
27 |
28 |
29 | class SinaBaseObject(object):
30 | """
31 | 所有新浪类的基类
32 | :TODO 刷新cookie策略
33 | """
34 | # 静态变量cookies
35 | cookies = ''
36 | headers_for_get = {
37 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
38 | 'Accept-Encoding': 'gzip, deflate, sdch',
39 | 'Accept-Language': 'zh-CN,zh;q=0.8',
40 | 'Cache-Control': 'max-age=0',
41 | 'Connection': 'keep-alive',
42 | 'Host': 'weibo.cn',
43 | 'Upgrade-Insecure-Requests': '1',
44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
45 | }
46 | _session = requests.Session()
47 |
48 | def __init__(self):
49 | self._time_delay = 1
50 |
51 | # TODO:给session的get方法增加日志的功能
52 | # 对requests.get()函数进行改进,增加重试和报错功能
53 | def retry_requests(self, url, uid=''):
54 | """
55 | :param url: 待爬取的链接
56 | :param headers: 请求头
57 | :param uid: 帖子或用户的uid值(str类型)
58 | :return: requests_content 爬起的页面源码(bs4类型)
59 | """
60 | # 设置重试次数
61 | retry_count = 3
62 | while retry_count != 0:
63 | try:
64 | requests_get = requests.get(url, headers=self.headers_for_get, cookies=self.cookies, timeout=3)
65 | requests_content = requests_get.content
66 | # 加入了__T_WM的cookie项后仍然无法在wap版微博登录
67 | # 只能采用网页版
68 | # if self.flag == 0:
69 | # cookies = requests.utils.dict_from_cookiejar(self.cookies)
70 | # requests_get.cookies = requests.utils.dict_from_cookiejar(requests_get.cookies)
71 | # cookies = dict(cookies.items() + requests_get.cookies.items())
72 | # self.cookies = requests.utils.cookiejar_from_dict(cookies)
73 | # self.flag = 1
74 | # print(self.cookies)
75 | print(url)
76 | print(requests_content)
77 | print(requests_get.status_code)
78 | # 记录发起网络请求的url及时间
79 | with open('log_url.txt', 'a') as f:
80 | f.write(str(url) + ' ' + str(tt.strftime("%Y-%m-%d %H:%M:%S", tt.localtime())) + '\n')
81 |
82 | requests_content = BeautifulSoup(requests_content, "lxml")
83 | return requests_content
84 |             except Exception as e:
85 |                 tt.sleep(3)
86 |                 print("获取" + str(uid) + "页面时失败,正在重试。。。")
87 |                 print(e)
88 |             finally:
89 |                 retry_count -= 1
90 | 
91 |         raise Exception("重试次数已完,仍获取" + str(uid) + "的页面失败!")
92 |
93 | @staticmethod
94 | def retry_requests_static(url, headers={}, timeout=3):
95 | """
96 | :param url: 待爬取的链接
97 | :param headers: 请求头
98 | :param uid: 帖子或用户的uid值(str类型)
99 | :return: requests_content 爬起的页面源码(bs4类型)
100 | """
101 | # 设置重试次数
102 | retry_count = 3
103 | while retry_count != 0:
104 | try:
105 | tt.sleep(3)
106 | print(url)
107 | requests_get = requests.get(url, headers=headers, timeout=3)
108 | print(url)
109 | # 记录发起网络请求的url及时间
110 | with open('log_static_url.txt', 'a') as f:
111 | f.write(str(url) + ' ' + str(tt.strftime("%Y-%m-%d %H:%M:%S", tt.localtime())) + '\n')
112 | return requests_get
113 |             except Exception:
114 |                 tt.sleep(3)
115 |                 print("获取页面时失败,正在重试。。。")
116 |             finally:
117 |                 retry_count -= 1
118 | 
119 |         raise Exception("重试次数已完,仍获取页面失败!")
120 |
121 |
122 | if __name__ == '__main__':
123 | print(111)
124 |
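    # Minimal usage sketch (needs network access; weibo.cn may redirect or refuse requests
    # without valid cookies, so this only demonstrates the retry wrapper's call shape):
    demo_response = SinaBaseObject.retry_requests_static('http://weibo.cn', headers=SinaBaseObject.headers_for_get)
    print(demo_response.status_code)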
--------------------------------------------------------------------------------
/weibospider/client.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from __future__ import unicode_literals, print_function
3 | import requests
4 | from utils import *
5 | import os
6 | import importlib
7 | import base
8 |
9 |
10 | class WeiboClient(object):
11 | """
12 | 微博客户端类 维护网络会话 使用cookies登录
13 | """
14 |
15 | def __init__(self, cookies=None):
16 | self._session = requests.Session()
17 | self._session.verify = False
18 | self._session.headers.update(Default_Header)
19 | self._session.mount('http://', self._create_adapter())
20 | if cookies is not None:
21 | self.login_with_acquired_cookies(cookies)
22 | else:
23 |             raise ValueError("未传入cookies")
24 |
25 | @staticmethod
26 | def _create_adapter():
27 | return requests.adapters.HTTPAdapter(
28 | max_retries=requests.adapters.Retry(
29 | total=5,
30 | status_forcelist=[403, 404, 408, 500, 502],
31 | )
32 | )
33 |
34 | def login_with_acquired_cookies(self, cookies):
35 | """
36 | note:
37 | 保存在文件中的cookies形式为chrome浏览器F12后NetWork中Headers里的形式
38 | 如:
39 | 'ALF=1501159357; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgUozbT8VN9e7zDppTz6FZs5PD6E5VoJ3e0J
40 | yOHFF-HIw.; SUB=_2A250ViLtDeThGeBP4lQW-CbLyTqIHXVXuU6lrDV6PUJbktANLWLBkW2HmYSKxGkq2uS0728TOqfHWar_RQ..;
41 | SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhzhoVOn6pkLuGbnO5GBEu35JpX5o2p5NHD95QceK.cS0nRS0zcWs4DqcjMi--
42 | NiK.Xi-2Ri--ciKnRi-zNSo24SoMR1hMESntt; SUHB=0FQ7hD651l5Cff; _T_WM=55ac8f6c31f4eb6f286ad2e9ed8d729'
43 | """
44 | # 若文件目录下存在cookies,则其为文件, 打开后获取
45 | # 否则为cookies字符串,直接获取
46 | if os.path.isfile(cookies):
47 | with open(cookies, 'r') as f:
48 | cookies = f.read()
49 |
50 | cookies_dict = {}
51 | # 将cookies字符串转为字典
52 | for item in cookies.split('; '):
53 |             cookies_dict[item.split('=', 1)[0]] = item.split('=', 1)[1]
54 | self._session.cookies.update(cookies_dict)
55 | # cookies2 = requests.utils.cookiejar_from_dict(cookies_dict)
56 | base._session = self._session
57 |
58 | def __getattr__(self, item):
59 | """本函数为类工厂模式,用于获取各种类的实例,如 `Answer` `Question` 等.
60 | :支持的形式有:
61 | 1. client.me() (暂未实现)
62 | 2. client.weibo()
63 | 3. client.people()
64 | 4. client.comment()
65 | 5. client.attitude()
66 | 6. client.repost()
67 | 参数均为对应的id,返回对应的类的实例。
68 | """
69 | # 回调对应模块的构造函数
70 | base.SinaBaseObject._session = self._session
71 |
72 | def callback_getattr(id):
73 | # 类名第一个字母大写
74 | return getattr(module, item.capitalize())(id)
75 | # TODO: 增加me
76 | attr_list = ['me', 'weibo', 'people', 'comment', 'attitude', 'repost']
77 | if item.lower() in attr_list:
78 | module = importlib.import_module('.'+item.lower(), 'weibospider')
79 | return callback_getattr
80 |
81 |
82 | if __name__ == '__main__':
83 | a = WeiboClient(cookies='as=12')
84 |     a.weibo('666')
85 |
--------------------------------------------------------------------------------
/weibospider/comment.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | from utils import *
4 | import weibo
5 | from base import SinaBaseObject
6 | import sys
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 |
11 | class Comment(SinaBaseObject):
12 | """
13 | 评论类,一般不直接使用,而是作为`Answer.comment`迭代器的返回类型
14 | """
15 |
16 | def __init__(self, id, cache={}):
17 | super(Comment, self).__init__()
18 | self.uid = str(id)
19 | self._cache = cache
20 | self.attitude_count = cache['attitude_count']
21 | self.author_name = cache['author_name']
22 | self.author_uid = cache['author_uid']
23 | self.content = cache['text']
24 | self.is_hot = cache['is_hot']
25 | self.terminal_source = cache['terminal_source']
26 | self.text = cache['text']
27 | self.time = cache['time']
28 |
29 |
30 |
31 |
32 | # @property
33 | # @normal_attr
34 | # def _soup(self):
35 | # return self._cache['_soup']
36 | #
37 | # # 获取微博作者的昵称和uid
38 | # def _get_author_data(self):
39 | # # self.author_name = self._soup.find(attrs={'id': 'M_'}).div.a.get_text()
40 | # # self._cache.setdefault('author_name', self.author_name)
41 | #
42 | # self.author_uid = self._soup.find(attrs={'id': 'M_'}).div.a.attrs['href'].split('/')[-1]
43 | # self._cache.setdefault('author_uid', self.author_uid)
44 | #
45 | # @property
46 | # @other_obj(class_name='people', name_in_json='people')
47 | # def author(self):
48 | # pass
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/weibospider/people.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | import time as tt
4 | import pymongo
5 | from utils import *
6 | from bs4 import BeautifulSoup
7 | import re
8 | import requests
9 | import weibo
10 | from base import SinaBaseObject
11 | import sys
12 | reload(sys)
13 | sys.setdefaultencoding('utf-8')
14 | pattern = re.compile(r'\d+')
15 |
16 |
17 | class People(SinaBaseObject):
18 | """
19 | 新浪微博的用户类
20 | """
21 | def __init__(self, id, href=None, cache={}):
22 | """
23 |
24 | {
25 | uid: 5501547091,
26 | name: 助人为乐的英逸,
27 | fans_count: 285,
28 | follow_count: 1500,
29 | weibo_count: 1335,
30 | time_delay: 1,
31 | birthday: 未知,
32 | sex: 男,
33 | location: 江西,
34 | href: http://weibo.cn/5501547091/follow,
35 | :param uid:
36 | :param href:
37 | """
38 | super(People, self).__init__()
39 | self.uid = str(id)
40 | self._cache = cache
41 | self.href = href
42 | self.birthday = '未知'
43 | self.member_level = ''
44 | self.follow_list = []
45 | self.fans_list = []
46 | self.is_V = False
47 | self.uid = self.uid.strip('\n')
48 | if not self.href:
49 | self.href = 'http://weibo.cn/'+self.uid
50 | if not self.uid:
51 |             self.uid = self.href.split('cn/')[-1]
52 |
53 | @property
54 | def basic_url(self):
55 | return 'http://weibo.cn/u/' + str(self.uid)
56 |
57 | @property
58 | @normal_attr()
59 | def html(self):
60 | return self._session.get(self.href).content
61 |
62 | @property
63 | @normal_attr()
64 | def _soup(self):
65 | return BeautifulSoup(self.html, "lxml")
66 |
67 | @property
68 | @normal_attr()
69 | def _info_content(self):
70 | try:
71 | info_content = self._soup.find('div', attrs={'class': 'u'}).table.tr.findAll('td'
72 | )[1].div.span.contents[0]
73 | except AttributeError:
74 | return False
75 | return info_content
76 |
77 | @property
78 | @normal_attr()
79 | def _info_content_2(self):
80 | """
81 | 若用户为大V 则有大V标志的图片 影响页面标签
82 | 故此时个人信息页面块实际为`_info_content_2`
83 | """
84 | return self._soup.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.get_text()
85 |
86 | @property
87 | @normal_attr()
88 | def name(self):
89 | return self._info_content.split(' ')[0].strip()
90 |
91 | @property
92 | @normal_attr()
93 | def sex(self):
94 | try:
95 | sex = self._info_content.split(' ')[1].split('/')[0].strip()
96 | except IndexError:
97 | sex = self._info_content_2.split('/')[0].strip()[-1:].strip()
98 | return sex
99 |
100 | @property
101 | @normal_attr()
102 | def location(self):
103 | try:
104 | location = self._info_content.split(' ')[1].split('/')[1].strip()
105 | except IndexError:
106 | # 将大V标志为真
107 | self.is_V = True
108 | location = self._info_content_2.split('/')[1].strip()[:3].strip()
109 | return location
110 |
111 | @property
112 | @normal_attr()
113 | def weibo_count(self):
114 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}).
115 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[0])
116 |
117 | @property
118 | @normal_attr()
119 | def follow_count(self):
120 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}).
121 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[1])
122 |
123 | @property
124 | @normal_attr()
125 | def fans_count(self):
126 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}).
127 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[2])
128 |
129 | def _get_member_list(self, target_member_type):
130 | """
131 | 获取所指定的当前用户的关注/粉丝列表
132 | 每个被关注者或粉丝的信息存储在dict中
133 | :param required_member_count: 指定获取用户的数量
134 | :param time_delay: 延迟时间
135 | :param target_member_type: 指定获取用户的种类:fans或follow
136 | :return: member_list: 存放已获取的用户列表
137 |
138 | """
139 | # TODO: 获取人物基本信息
140 | member_url = 'http://weibo.cn/' + str(self.uid) + '/' + str(target_member_type)
141 | self.href = member_url
142 | print("now is crawling " + str(member_url))
143 | page_count = 1
144 | now_page_count = 1
145 | is_first = True
146 | while True:
147 |
148 |             tt.sleep(self._time_delay)
149 | # 获取页面源码(bs4对象)
150 | requests_content = BeautifulSoup(self._session.get(member_url).content, "lxml")
151 |
152 | # 获取当前页的关注列表
153 | unit_list = requests_content.find_all('table')
154 | for i in unit_list:
155 | # 每个用户的信息以dict存储
156 | member = {}
157 | member['href'] = str(i.tr.td.a.attrs['href'])
158 | try:
159 | member['uid'] = i.tr.td.a.attrs['href'].split('u/')[1]
160 | except:
161 | member['uid'] = i.tr.td.a.attrs['href'].split('cn/')[1]
162 | member['name'] = i.tr.find_all('td')[1].a.get_text()
163 | # 正则匹配获取粉丝的粉丝数
164 | pattern = re.compile(r'\d+')
165 | # 若粉丝是大V,则多了一个图片标签
166 | try:
167 | member['is_v'] = False
168 | member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[2])[0])
169 | except:
170 | member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[3])[0])
171 | member['is_v'] = True
172 |
173 | yield member
174 |
175 | # 若是第一页,则获取总页数
176 | if is_first is True:
177 | # 若发现‘x/y页’ 则有不止一页
178 | if requests_content.find(attrs={'id': 'pagelist'}):
179 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
180 | page_count = page_count.split('/')[1]
181 | pattern = re.compile(r'\d+')
182 | page_count = int(re.findall(pattern, page_count)[0])
183 | else:
184 | return
185 | is_first = False
186 |
187 | now_page_count += 1
188 |             if now_page_count > page_count:
189 | return
190 |
191 | member_url = 'http://weibo.cn/' + str(self.uid)+'/'+str(target_member_type)+'?page=' + str(now_page_count)
192 |
193 | @property
194 | @normal_attr()
195 | def fans(self):
196 | """
197 | 获取当前用户的粉丝列表
198 | :param required_member_count: 限定获取的数量
199 | :param time_delay: 时间延迟
200 | :return: 指定数量的粉丝基本信息列表
201 | .. code-block:: python
202 | [
203 | {
204 | 'fans_count': 104,
205 | 'is_v' : False,
206 | 'href': 'http://weibo.cn/u/5977488639',
207 | 'uid': 5977488639,
208 | 'name': '小山环环1996'
209 | },
210 | {
211 | 'fans_count': 10,
212 | 'is_v' : False,
213 | 'href': 'http://weibo.cn/u/6187915152',
214 | 'uid': 6187915152,
215 | 'name': '08iCu京伯'
216 | },
217 | ]
218 | """
219 | for x in self._get_member_list(target_member_type='fans'):
220 | yield x
221 |
222 | @property
223 | @normal_attr()
224 | def follow(self):
225 | """
226 | 获取当前用户的关注列表
227 | :param required_member_count: 限定获取的数量
228 | :param time_delay: 时间延迟
229 | :return: 指定数量的关注基本信息列表
230 | .. code-block:: python
231 | [
232 | {
233 | 'fans_count': 104,
234 | 'is_v' : False,
235 | 'href': 'http://weibo.cn/u/5977488639',
236 | 'uid': 5977488639,
237 | 'name': '小山环环1996'
238 | },
239 | {
240 | 'fans_count': 10,
241 | 'is_v' : False,
242 | 'href': 'http://weibo.cn/u/6187915152',
243 | 'uid': 6187915152,
244 | 'name': '08iCu京伯'
245 | },
246 | ]
247 | """
248 | for x in self._get_member_list(target_member_type='follow'):
249 | yield x
250 |
251 | @property
252 | @other_obj()
253 | def weibo(self):
254 | """
255 | 获取指定用户的微博
256 | :param required_weibo_count: 所需的微博条数
257 | :param time_delay: 时间延迟
258 | :return: weibo_list 元素为SinaWeibo对象
259 | .. code-block:: python
260 | [
261 | {
262 | 'uid': 'EpO2KnAor',
263 | 'is_repost': False,
264 | 'text': '物是人非.',
265 | 'attitude_count' : 0,
266 | 'repost_count': 7,
267 | 'comment_count': 0,
268 | 'time': '01月08日 04:44'
269 | 'terminal_source': 'iPad mini'
270 | },
271 | {
272 | 'uid': 'EAJwkph8X',
273 | 'is_repost': False,
274 | 'text': '祝你生日快乐',
275 | 'attitude_count' : 0,
276 | 'repost_count': 0,
277 | 'comment_count': 1,
278 | 'time': '2016-12-30 23:34:34'
279 | 'terminal_source': '生日动态'
280 | },
281 | ]
282 | """
283 |
284 | weibo_url = self.basic_url
285 | page_count = 1
286 | now_page_count = 1
287 | is_first = True
288 | pattern = re.compile(r'\d+')
289 |
290 | while True:
291 | tt.sleep(self._time_delay)
292 | # 获取页面源码(bs4对象)
293 | requests_content = BeautifulSoup(self._session.get(weibo_url).content, "lxml")
294 | # 获取当前页的微博列表
295 | unit_list = requests_content.find_all('div', attrs={'class': 'c'})
296 | for i in unit_list:
297 | # 每个微博的信息以微博类SinaWeibo存储
298 | try:
299 | if str(i.attrs['id']) and str(i.attrs['id']).startswith('M'):
300 | weibo_uid = i.attrs['id'].split('_')[1]
301 | else:
302 | continue
303 | except:
304 | continue
305 |
306 | # 检查是否为转发的微博
307 | if len(i.div.find_all('span')) >= 2:
308 | is_repost = True
309 | else:
310 | is_repost = False
311 | # for c in i.div.find_all('span'):
312 | # if str(c.attrs['class']) == "['cmt']":
313 | # is_repost = True
314 | if is_repost:
315 | text = i.div.find_all('span')[0].get_text()+i.div.find_all('span')[1].get_text()
316 | else:
317 | text = i.div.span.get_text()
318 |
319 | # 有的微博处html格式不对
320 | try:
321 | attitude_count = int(re.findall(pattern, i.div.find_all('a')[-4].get_text())[0])
322 | repost_count = int(re.findall(pattern, i.div.find_all('a')[-3].get_text())[0])
323 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-2].get_text())[0])
324 | except IndexError:
325 | try:
326 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-3].get_text())[0])
327 | repost_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-4].get_text())[0])
328 | attitude_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-5].get_text())[0])
329 | except IndexError:
330 | attitude_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[0])
331 | repost_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[1])
332 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[2])
333 | # print(attitude_count, repost_count, comment_count)
334 | try:
335 | time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0]
336 | terminal_source = i.div.find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1]
337 | except IndexError:
338 | time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0]
339 | try:
340 | terminal_source = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1]
341 | except IndexError:
342 | terminal_source = '暂无'
343 | # print(time, terminal_source)
344 | weibo_cache = {
345 | "is_repost": is_repost,
346 | "text": text,
347 | "attitude_count": attitude_count,
348 | "repost_count": repost_count,
349 | "comment_count": comment_count,
350 | "time": time,
351 | "terminal_source": terminal_source
352 | }
353 | self.now_weibo_cache = weibo_cache
354 | self.now_weibo_uid = weibo_uid
355 | yield weibo.Weibo(id=weibo_uid, cache=weibo_cache)
356 | is_repost = False
357 |
358 | # 若是第一页,则获取总页数
359 | if is_first:
360 | # 若发现‘x/y页’ 则有不止一页
361 | if requests_content.find(attrs={'id': 'pagelist'}):
362 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
363 | page_count = page_count.split('/')[1]
364 | page_count = int(re.findall(pattern, page_count)[0])
365 | else:
366 | return
367 | is_first = False
368 |
369 | now_page_count += 1
370 | if now_page_count > page_count:
371 | return
372 |
373 | weibo_url = 'http://weibo.cn/u/' + str(self.uid) + '?page=' + str(now_page_count)
374 |
375 | # def get_weibo_list(self):
376 | # """
377 | # 获取指定用户的微博
378 | # :param required_weibo_count: 所需的微博条数
379 | # :param time_delay: 时间延迟
380 | # :return: weibo_list 元素为SinaWeibo对象
381 | # .. code-block:: python
382 | # [
383 | # {
384 | # 'uid': 'EpO2KnAor',
385 | # 'is_repost': False,
386 | # 'text': '物是人非.',
387 | # 'attitude_count' : 0,
388 | # 'repost_count': 7,
389 | # 'comment_count': 0,
390 | # 'time': '01月08日 04:44'
391 | # 'terminal_source': 'iPad mini'
392 | # },
393 | # {
394 | # 'uid': 'EAJwkph8X',
395 | # 'is_repost': False,
396 | # 'text': '祝你生日快乐',
397 | # 'attitude_count' : 0,
398 | # 'repost_count': 0,
399 | # 'comment_count': 1,
400 | # 'time': '2016-12-30 23:34:34'
401 | # 'terminal_source': '生日动态'
402 | # },
403 | # ]
404 | # """
405 | # required_weibo_count = self.required_weibo_count
406 | # weibo_url = self.basic_url
407 | # weibo_list = []
408 | # weibo_count = 0
409 | # page_count = 1
410 | # now_page_count = 1
411 | # is_first = True
412 | # pattern = re.compile(r'\d+')
413 | # while True:
414 | #
415 | # tt.sleep(self.time_delay)
416 | # # 获取页面源码(bs4对象)
417 | # requests_content = self.retry_requests(weibo_url, uid=self.uid)
418 | #
419 | # # 获取当前页的微博列表
420 | # unit_list = requests_content.find_all('div', attrs={'class': 'c'})
421 | # for i in unit_list:
422 | # # 每个微博的信息以微博类SinaWeibo存储
423 | # try:
424 | # if str(i.attrs['id']) and str(i.attrs['id']).startswith('M'):
425 | # weibo_uid = i.attrs['id'].split('_')[1]
426 | # else:
427 | # continue
428 | # except:
429 | # continue
430 | # weibo = sina_weibo.SinaWeibo(uid=weibo_uid, required_count=0)
431 | #
432 | # # 检查是否为转发的微博
433 | # for c in i.div.find_all('span'):
434 | # if str(c.attrs['class']) == "['cmt']":
435 | # weibo.is_repost = True
436 | # if weibo.is_repost:
437 | # weibo.text = i.div.find_all('span')[0].get_text()+i.div.find_all('span')[1].get_text()
438 | # else:
439 | # weibo.text = i.div.span.get_text()[1:]
440 | #
441 | # weibo.uid = weibo_uid
442 | #
443 | # # 有的微博处html格式不对
444 | # try:
445 | # weibo.attitude_count = int(re.findall(pattern, i.div.find_all('a')[-4].get_text())[0])
446 | # weibo.repost_count = int(re.findall(pattern, i.div.find_all('a')[-3].get_text())[0])
447 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-2].get_text())[0])
448 | # except IndexError:
449 | # print(weibo_uid)
450 | # print(weibo.author_uid)
451 | # try:
452 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-3].get_text())[0])
453 | # weibo.repost_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-4].get_text())[0])
454 | # weibo.attitude_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-5].get_text())[0])
455 | # except IndexError:
456 | # weibo.attitude_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[0])
457 | # weibo.repost_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[1])
458 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[2])
459 | # print(weibo.attitude_count, weibo.repost_count, weibo.comment_count)
460 | # try:
461 | # weibo.time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0]
462 | # weibo.terminal_source = i.div.find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1]
463 | # except IndexError:
464 | # print(i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text())
465 | # weibo.time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0]
466 | # try:
467 | # weibo.terminal_source = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1]
468 | # except IndexError:
469 | # weibo.terminal_source = '暂无'
470 | # print(weibo.time, weibo.terminal_source)
471 | # # 计数器加一
472 | # weibo_count += 1
473 | # # 若超过了要求获取的用户数量,则返回
474 | # if weibo_count > required_weibo_count:
475 | # return weibo_list
476 | # weibo_list.append(weibo)
477 | #
478 | # # 若是第一页,则获取总页数
479 | # if is_first:
480 | # # 若发现‘x/y页’ 则有不止一页
481 | # if requests_content.find(attrs={'id': 'pagelist'}):
482 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
483 | # page_count = page_count.split('/')[1]
484 | # page_count = int(re.findall(pattern, page_count)[0])
485 | # print(page_count)
486 | # else:
487 | # return weibo_list
488 | # is_first = False
489 | #
490 | # now_page_count += 1
491 | # if now_page_count > page_count:
492 | # break
493 | #
494 | # weibo_url = 'http://weibo.cn/u/' + str(self.uid) + '?page=' + str(now_page_count)
495 | #
496 | # return weibo_list
497 |
498 | # def get_personal_information(self):
499 | # """
500 | # 注:新浪有奇怪的BUG 带cookies访问http://weibo.cn/3193031501/info这类个人资料url时,总是File not found
501 | # 若不带cookies则不能访问该页
502 | # 所以只能获取个人主页简单的性别和地点信息
503 | #
504 | # @2017/06/12:
505 | # 新浪允许不带cookie访问某些页面,如某个微博页面
506 | # 而对另一些页面 如个人主页的详细情况,则有的用户需要cookie,有的不需要。
507 | # :return:
508 | # """
509 | # requests_content = self.retry_requests(self.href)
510 | # try:
511 | # info_content = requests_content.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.contents[0]
512 | # except AttributeError:
513 | # print(requests_content)
514 | # return False
515 | # # 此处split(' ')中的空格不是一般的空格 需要在原网页中复制
516 | # # 普通用户无图片标签
517 | # self.name = info_content.split(' ')[0].strip()
518 | # print(self.name)
519 | # try:
520 | # self.sex = info_content.split(' ')[1].split('/')[0].strip()
521 | # print(self.sex)
522 | # self.location = info_content.split(' ')[1].split('/')[1].strip()
523 | # print(self.name, self.sex, self.location)
524 | # except IndexError:
525 | # self.is_V = True
526 | # info2 = requests_content.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.get_text()
527 | # self.sex = info2.split('/')[0].strip()[-1:].strip()
528 | # print(self.sex)
529 | # self.location = info2.split('/')[1].strip()[:3].strip()
530 | # print(self.name, self.sex, self.location)
531 | #
532 | #     # Collect this user's weibo count, follow count and fans count
533 | # self.weibo_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}).
534 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[0])
535 | # self.follow_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}).
536 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[1])
537 | # self.fans_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}).
538 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[2])
539 | # print(self.weibo_count, self.follow_count, self.fans_count)
540 |
541 | # def __get_member_list__(self, target_member_type='fans'):
542 | #     """
543 | #     Fetch the follow/fans list of the current user
544 | #     Each followee or fan is stored as a dict
545 | #     :param required_member_count: number of users to fetch
546 | #     :param time_delay: delay between requests
547 | #     :param target_member_type: which list to fetch: 'fans' or 'follow'
548 | #     :return: member_list: list of the collected users
549 | #
550 | #
551 | #     """
552 | # required_member_count = self.required_member_count
553 | # member_url = 'http://weibo.cn/' + str(self.uid) + '/' + str(target_member_type)
554 | # self.href = member_url
555 | # print(member_url)
556 | # member_list = []
557 | # member_count = 0
558 | # page_count = 1
559 | # now_page_count = 1
560 | # is_first = True
561 | # while True:
562 | #
563 | # tt.sleep(self.time_delay)
564 | #         # Fetch the page source (as a bs4 object)
565 | # requests_content = self.retry_requests(member_url, uid=self.uid)
566 | #
567 | #         # Collect the member list on the current page
568 | # unit_list = requests_content.find_all('table')
569 | # for i in unit_list:
570 | #             # Each user's information is stored in a dict
571 | # member = {}
572 | # member['href'] = str(i.tr.td.a.attrs['href'])
573 | # try:
574 | # member['uid'] = i.tr.td.a.attrs['href'].split('u/')[1]
575 | # except:
576 | # member['uid'] = i.tr.td.a.attrs['href'].split('cn/')[1]
577 | # member['name'] = i.tr.find_all('td')[1].a.get_text()
578 | #             # Use a regex to extract this member's fans count
579 | #             pattern = re.compile(r'\d+')
580 | #             # If the member is a verified ("big V") user, there is an extra image tag
581 | # try:
582 | # member['is_v'] = False
583 | # member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[2])[0])
584 | # except:
585 | # member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[3])[0])
586 | # member['is_v'] = True
587 | # print(member['name'])
588 | # print(member['fans_count'])
589 | #             # Increment the counter
590 | # member_count += 1
591 | #             # Return once the requested number of members has been collected
592 | # if member_count > required_member_count:
593 | # return member_list
594 | # member_list.append(member)
595 | #
596 | #         # On the first page, read the total page count
597 | #         if is_first is True:
598 | #             # If the 'x/y页' pagination marker is found, there is more than one page
599 | # if requests_content.find(attrs={'id': 'pagelist'}):
600 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
601 | # page_count = page_count.split('/')[1]
602 | # pattern = re.compile(r'\d+')
603 | # page_count = int(re.findall(pattern, page_count)[0])
604 | # print(page_count)
605 | # else:
606 | # return member_list
607 | # is_first = False
608 | #
609 | # now_page_count += 1
610 | # if now_page_count >= page_count:
611 | # break
612 | #
613 | # member_url = 'http://weibo.cn/' + str(self.uid)+'/'+str(target_member_type)+'?page=' + str(now_page_count)
614 | # print(member_url)
615 | # print(self.uid)
616 | # print(target_member_type)
617 | # print("以上")
618 | #
619 | # return member_list
--------------------------------------------------------------------------------
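
The commented-out methods in people.py above all read the total page count from the 'x/y页' marker inside the div with id 'pagelist'. A minimal sketch of that step, assuming `soup` is a BeautifulSoup document of a weibo.cn list page (the helper name `get_total_pages` is hypothetical, not part of the repository):

import re

def get_total_pages(soup):
    # Return the total page count parsed from the 'x/y页' marker,
    # or 1 when the page has no pagelist div (single-page result).
    pagelist = soup.find(attrs={'id': 'pagelist'})
    if not pagelist:
        return 1
    marker = pagelist.form.div.contents[-1].strip()   # e.g. '1/25页'
    return int(re.findall(r'\d+', marker.split('/')[1])[0])
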
/weibospider/repost.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | from utils import *
4 | import weibo
5 | from base import SinaBaseObject
6 | import sys
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 |
11 | class Repost(SinaBaseObject):
12 |     """
13 |     Repost class. Usually not instantiated directly; instances are yielded by the `Weibo.repost` iterator
14 |     """
15 |
16 |     def __init__(self, id, cache=None):
17 |         super(Repost, self).__init__()
18 |         self.uid = str(id)
19 |         self._cache = cache = cache or {}  # avoid sharing a mutable default dict between instances
20 | self.author_name = cache['author_name']
21 | self.text = cache['text']
22 |
--------------------------------------------------------------------------------
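
As the Repost docstring says, these objects are normally produced by iterating `Weibo.repost` rather than constructed by hand. A hedged usage sketch, assuming the package is importable as `weibospider` and that valid weibo.cn cookies are configured on the underlying session (the weibo id is the one used in the weibo.py docstring example):

from weibospider.weibo import Weibo

weibo = Weibo('F0Mg7a8Wh')
for repost in weibo.repost:           # yields Repost objects built from the parsed repost pages
    print(repost.author_name, repost.text)
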
/weibospider/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | from bs4 import BeautifulSoup
4 | import functools
5 | import importlib
6 |
7 | Default_Header = {
8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
9 | 'Accept-Encoding': 'gzip, deflate, sdch',
10 | 'Accept-Language': 'zh-CN,zh;q=0.8',
11 | 'Cache-Control': 'max-age=0',
12 | 'Connection': 'keep-alive',
13 | 'Host': 'weibo.cn',
14 | 'Upgrade-Insecure-Requests': '1',
15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 '
16 | 'Safari/537.36'
17 | }
18 |
19 |
20 | def check_cache(attr):
21 | def real(func):
22 | @functools.wraps(func)
23 | def wrapper(self):
24 | value = getattr(self, attr, None)
25 | if not value:
26 | value = func(self)
27 | setattr(self, attr, value)
28 | return value
29 | return wrapper
30 | return real
31 |
32 |
33 | def normal_attr(name_in_json=None):
34 |     """
35 |
36 |     This decorator does two things:
37 |
38 |     1. Marks the property as a regular attribute.
39 |     2. Pulls the attribute out of the object's data automatically,
40 |        preferring the cached value when one exists.
41 |     :param name_in_json: key of the attribute inside the `self._cache` json;
42 |                          defaults to the name of the decorated method.
43 |     """
44 | def actual_decorator(func):
45 | @functools.wraps(func)
46 | def inner(self, *args, **kwargs):
47 | name = name_in_json or func.__name__
48 | if self._cache and name in self._cache.keys():
49 | return self._cache[name]
50 | else:
51 | value = func(self, *args, **kwargs)
52 | self._cache.setdefault(name, value)
53 | return self._cache[name]
54 | return inner
55 | return actual_decorator
56 |
57 |
58 | def other_obj(class_name=None, name_in_json=None, module_filename=None):
59 |     """
60 |
61 |     This decorator does two things:
62 |
63 |     1. Marks the property as another object whose class derives from the base class.
64 |     2. Pulls the corresponding attribute out of the object's data and builds the required object from it.
65 |     :param class_name: name of the class to instantiate.
66 |     :param name_in_json: key of the attribute in the JSON.
67 |     :param module_filename: file name of the module that defines the class.
68 |     """
69 | def actual_decorator(func):
70 | @functools.wraps(func)
71 | def inner(self, *args, **kwargs):
72 | cls_name = class_name or func.__name__
73 | name = name_in_json or func.__name__
74 |
75 | obj_cls = get_class_from_name(cls_name, module_filename)
76 |
77 | request_obj = func(self, *args, **kwargs)
78 | # print(111111111111)
79 | # print(request_obj)
80 |
81 | if request_obj is None:
82 | if name == 'people':
83 | return obj_cls(self.author_uid, cache={'name': self.author_name})
84 | # if name == 'weibo':
85 | # return obj_cls(uid=self.now_weibo_uid, cache=self.now_weibo_cache)
86 | return request_obj
87 |
88 | return inner
89 |
90 | return actual_decorator
91 |
92 |
93 | def get_class_from_name(clsname=None, module_filename=None):
94 |     """
95 |
96 |     Take a class name and return the matching class object
97 |
98 |     :param clsname: class name
99 |     :param module_filename: module name
100 |     :return: the class in that module matching the given class name
101 |     """
102 | cls_name = clsname.capitalize() if clsname.islower() else clsname
103 | file_name = module_filename or cls_name.lower()
104 |
105 |     # Import the module that defines the class
106 | imported_module = importlib.import_module('.'+file_name, 'weibospider')
107 | # print(imported_module)
108 | # print(getattr(imported_module, cls_name))
109 |     # Return the class in that module matching the given class name
110 | return getattr(imported_module, cls_name)
111 | # except (ImportError, AttributeError):
112 | # raise TypeError(
113 | # 'Unknown weibo obj type [{}]'.format(clsname)
114 | # )
115 |
--------------------------------------------------------------------------------
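
To make the caching behaviour of `normal_attr` concrete: the first access to a decorated property calls the method and stores the result under the method's name in `self._cache`; later accesses return the cached value. A minimal, self-contained sketch (the `Example` class is hypothetical; only `normal_attr` comes from utils.py above, and the import assumes the package is importable as `weibospider`):

from weibospider.utils import normal_attr

class Example(object):
    def __init__(self, cache=None):
        self._cache = cache or {}

    @property
    @normal_attr()
    def answer(self):
        print('computing...')   # runs only on the first access
        return 42

e = Example()
print(e.answer)   # prints 'computing...' then 42
print(e.answer)   # 42 again, served from self._cache without recomputing

`other_obj` builds on the same cache together with `get_class_from_name`, which maps a lowercase attribute name such as 'people' to the `People` class imported from weibospider/people.py.
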
/weibospider/weibo.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | import time as tt
4 | import bs4
5 | from bs4 import BeautifulSoup
6 | import re
7 | import requests
8 | from .utils import *
9 | from base import SinaBaseObject
10 | from comment import Comment
11 | import sys
12 | import people
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 | pattern = re.compile(r'\d+')
17 |
18 |
19 | class Weibo(SinaBaseObject):
20 | """
21 |     Weibo (status) class for Sina Weibo
22 | {
23 | uid: F0Mg7a8Wh,
24 | author_uid: rmrb,
25 | is_repost: False,
26 | href: http://weibo.cn/comment/F0Mg7a8Wh,
27 | text: :【中国大学教学质量排行榜800强出炉!你的母校排多少?】近日,《2017中国大学评价研究报告》发布2017中国大学
28 | 教学质量排行榜。清华大学本科生和研究生教育教学和人才培养质量问鼎榜首,北大第2,复旦第3,南大第4,武大第5,
29 | 上海交大第6,浙大第7,人大第8,吉大第9,川大第10。戳↓你的学校第几名? [组图共9张]
30 | time: 04月29日 12:58,
31 | time_delay: 1,
32 | author_name: 人民日报,
33 | repost_count: 3910,
34 | attitude_count: 3076,
35 | comment_count: 3248,
36 | repost_list:
37 | [
38 | {
39 | u'text': ':',
40 | u'name': u'\u56db\u5ddd\u5927\u5b66'
41 | },
42 | {
43 | u'text': ':27[\xe6\x91\x8a\xe6\x89\x8b][\xe5\xbf\x83] //',
44 | u'name': u'\u674e\u5199\u610f'},
45 | {
46 | u'text': ':\xe5\xa4\xaa\xe6\x83\xa8\xe4\xba\x86\xef\xbc\x8c\xe5\x89\x8d56\
47 | xe4\xb8\xaa\xe9\x83\xbd\xe6\x98\xaf\xe4\xb8\x96\xe7\x95\x8c\xe7\x9f\xa5\xe5\x90\x8d... //',
48 | u'name': u'\u897f\u8d22\u975e\u5b98\u65b9\u65b0\u95fb\u4e2d\u5fc3'
49 | },
50 | ....
51 | ]
52 | comment_list:
53 | [
54 | {
55 | uid: C_4101856898497093,
56 | terminal_source: iPhone 6s,
57 | text: \u4eba\u6c11\u65e5\u62a5\u4e5f\u53d1\u8fd9\u79cd\u5546\u4e1a\u6027\u8d28\u7684\u5927\u5b66
58 | \u6392\u884c\u699c\u3002\u3002\u3002[\u62dc\u62dc][\u62dc\u62dc][\u62dc\u62dc]',
59 | time: 04\u670829\u65e5 13:05\xa0,
60 | attitude_count: 270,
61 | is_hot: True,
62 | name: M-never
63 | },
64 | ....
65 | ]
66 | hot_comment_list:
67 | [
68 | {
69 | uid: C_4101856898497093,
70 | terminal_source: iPhone 6s,
71 | text: \u4eba\u6c11\u65e5\u62a5\u4e5f\u53d1\u8fd9\u79cd\u5546\u4e1a\u6027\u8d28\u7684\u5927\u5b66
72 | \u6392\u884c\u699c\u3002\u3002\u3002[\u62dc\u62dc][\u62dc\u62dc][\u62dc\u62dc]',
73 | time: 04\u670829\u65e5 13:05\xa0,
74 | attitude_count: 270,
75 | is_hot: True,
76 | name: M-never
77 | },
78 | ....
79 | ]
80 | attitude_list:
81 | [
82 | {
83 | name: \u723d\u5cf0\u4e2b\u4e2b,
84 | time: 13\u5206\u949f\u524d
85 | },
86 | {
87 | name: \u8393\u5c7f,
88 | time: \u4eca\u5929 19:55
89 | },
90 | ....
91 | ]
92 |
93 | """
94 |     def __init__(self, id, cache=None):
95 |         super(Weibo, self).__init__()
96 |         self.uid = id
97 |         self._cache = cache if cache is not None else {}  # avoid sharing a mutable default dict between instances
98 | # self._session = session
99 | self.href = 'http://weibo.cn/comment/'+str(id)
100 | self.main_page_resource = ''
101 | self._get_author_data()
102 |         # Whether this weibo is a repost
103 |         self.is_repost = False
104 |         # Information about the weibo that this one reposted
105 | self.repost_location = ''
106 | self.repost_author_uid = ''
107 | self.repost_text = ''
108 | self.repost_reposted_count = 0
109 | self.terminal_source = ''
110 | self.location = ''
111 |
112 |         # Threat level
113 | self.threatened = 0
114 |
115 | @property
116 | @normal_attr()
117 | def html(self):
118 | return self._session.get('http://weibo.cn/repost/' + self.uid).content
119 |
120 | @property
121 | @normal_attr()
122 | def _soup(self):
123 | return BeautifulSoup(self.html, "lxml")
124 |
125 | @property
126 | @other_obj(name_in_json='people', class_name='people')
127 | def author(self):
128 | return None
129 |
130 | @property
131 | @normal_attr()
132 | def time(self):
133 | return self._soup.find(attrs={'id': 'M_'}).findAll('div')[1].span.get_text()
134 |
135 | @property
136 | @normal_attr()
137 | def text(self):
138 | """
139 |         The weibo's text content
140 | """
141 | if not self._soup.find(attrs={'id': 'M_'}):
142 | raise AttributeError("cookies失效或网络故障!")
143 | return self._soup.find(attrs={'id': 'M_'}).div.span.get_text()
144 |
145 | @property
146 | @normal_attr()
147 | def repost_count(self):
148 | """
149 |         :return: int, repost count
150 |         """
151 |         # On the wap version of the weibo page this markup is particularly irregular
152 | repost_number_node = self._soup.find(attrs={'id': 'rt'})
153 | try:
154 | repost_count = int(re.findall(pattern, repost_number_node.get_text())[0])
155 | except IndexError:
156 | print("获取转发数出错")
157 | repost_count = 0
158 | return repost_count
159 |
160 | @property
161 | @normal_attr()
162 | def comment_count(self):
163 | """
164 |         :return: int, comment count
165 |         """
166 |         # On the wap version of the weibo page this markup is particularly irregular
167 | try:
168 | comment_number_node = self._soup.find(attrs={'id': 'rt'}).next_sibling
169 | comment_count = int(re.findall(pattern, comment_number_node.get_text())[0])
170 | except IndexError:
171 | print("获取评论数出错")
172 | comment_count = 0
173 | return comment_count
174 |
175 | @property
176 | @normal_attr()
177 | def attitude_count(self):
178 |         # On the wap version of the weibo page this markup is particularly irregular
179 | try:
180 | attitude_number_node = self._soup.find(attrs={'id': 'rt'}).next_sibling.next_sibling
181 | attitude_count = int(re.findall(pattern, attitude_number_node.get_text())[0])
182 | except IndexError:
183 | print("获取点赞数出错")
184 | attitude_count = 0
185 | return attitude_count
186 |
187 |     # Fetch the weibo author's screen name and uid
188 | def _get_author_data(self):
189 | self.author_name = self._soup.find(attrs={'id': 'M_'}).div.a.get_text()
190 | self._cache.setdefault('author_name', self.author_name)
191 | self.author_uid = self._soup.find(attrs={'id': 'M_'}).div.a.attrs['href'].split('/')[-1]
192 | self._cache.setdefault('author_uid', self.author_uid)
193 |
194 |     def _get_attribute_item(self, target_attribute_type, target_attribute_function):
195 |         """
196 |         Page through http://weibo.cn/<target_attribute_type>/<uid> and yield parsed items one at a time.
197 |         :param target_attribute_type: 'comment', 'attitude' or 'repost'
198 |         :param target_attribute_function: parser applied to each unit on a page, e.g. _get_comment_list
199 |         :return: generator of the parsed objects
200 |
201 |         """
202 | attribute_url = 'http://weibo.cn/' + str(target_attribute_type) + '/' + str(self.uid)
203 | attribute_list = []
204 | page_count = 1
205 | now_page_count = 1
206 | is_first = True
207 | is_first_item = True
208 | pattern = re.compile(r'\d+')
209 | while True:
210 | # print("现在是评论第一页")
211 | tt.sleep(self._time_delay)
212 |             # Fetch the page source (as a bs4 object)
213 |             requests_content = BeautifulSoup(self._session.get(attribute_url).content, "lxml")
214 |
215 |             # Collect the item units (comments/reposts/attitudes) on the current page
216 |             unit_list = requests_content.find_all('div', attrs={'class': 'c'})
217 |             for i in unit_list:
218 |                 # Call the concrete parser to extract the content
219 |                 attribute = target_attribute_function(i)
220 | if attribute is False:
221 | continue
222 |                 # When fetching attitudes or reposts the first unit is the weibo author, so skip it
223 |                 if (target_attribute_type == 'attitude' or target_attribute_type == 'repost') and is_first_item:
224 | is_first_item = False
225 | continue
226 | yield attribute
227 |
228 |             # On the first page, read the total page count
229 |             if is_first:
230 |                 # If the 'x/y页' pagination marker is found, there is more than one page
231 | if requests_content.find(attrs={'id': 'pagelist'}):
232 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
233 | page_count = page_count.split('/')[1]
234 | page_count = int(re.findall(pattern, page_count)[0])
235 | # print(page_count)
236 | else:
237 | return
238 | is_first = False
239 |
240 | now_page_count += 1
241 | if now_page_count >= page_count:
242 | return
243 |
244 | attribute_url = 'http://weibo.cn/' + str(target_attribute_type) +'/' + str(self.uid) +'?&&page=' + \
245 | str(now_page_count)
246 |
247 | @staticmethod
248 | def _get_comment_list(unit):
249 | comment = {}
250 |         # If the unit has an id attribute starting with 'C', it is a comment
251 | try:
252 | if str(unit.attrs['id']).startswith('C'):
253 | comment['uid'] = str(unit.attrs['id'])
254 | else:
255 | return False
256 | except:
257 | return False
258 | comment['author_name'] = unit.a.get_text()
259 | comment['author_uid'] = str(str(unit.a.attrs['href']).split('/')[-1])
260 |         # Some users have a vanity domain instead of the /u/<uid> pattern, so the href is stored as well
261 | # comment['people'] = sina_people.SinaPeople(uid=str(unit.a.attrs['href']).split('/')[-1],
262 | # href='http://http://weibo.cn'+str(unit.a.attrs['href']))
263 |         # Check whether the comment carries the "hot" tag
264 | try:
265 | if str(unit.span.attrs['class']) == "['kt']":
266 | comment['is_hot'] = True
267 | else:
268 | comment['is_hot'] = False
269 | except:
270 | comment['is_hot'] = False
271 |
272 |         # Use a regex to extract the comment's like count
273 |         # The normal layout is `举报 赞[0] 回复 `
274 |         # If you have liked the comment yourself, the page shows `举报 已赞[1] 取消赞 回复 ` instead
275 |         # If the comment is your own, the page shows ` 举报 赞[0] 回复 删除 `
276 |         # so these cases need special handling
277 | try:
278 | comment['attitude_count'] = int(re.findall(pattern, unit.find_all('span', attrs={'class': 'cc'})[-2]
279 | .get_text())[0])
280 | except IndexError:
281 | try:
282 | comment['attitude_count'] = int(re.findall(pattern, unit.find_all('span', attrs={'class': 'cmt'})[0]
283 | .get_text())[0])
284 | except IndexError:
285 | comment['attitude_count'] = int(
286 | re.findall(pattern, unit.find_all('span', attrs={'class': 'cc'})[-3].get_text())[0])
287 |
288 |         # Comment body
289 |         comment['text'] = unit.find_all('span', attrs={'class': 'ctt'})[0].get_text()
290 |         # Comment timestamp
291 |         comment['time'] = unit.find_all('span', attrs={'class': 'ct'})[-1].get_text().split('来自')[0]
292 |         # Client/terminal the comment was posted from
293 | comment['terminal_source'] = unit.find_all('span', attrs={'class': 'ct'})[-1].get_text().split('来自')[1]
294 |
295 | return Comment(id=str(comment['uid']), cache=comment)
296 |
297 | @property
298 | @other_obj()
299 | def comment(self):
300 |         """
301 |         Generator over this weibo's comments.
302 |         Each yielded Comment is built from parsed comment data of roughly the form below.
303 |         :return: generator of Comment objects
304 |         .. code-block:: python
305 |             [
306 |                 {
307 |                     'uid': 'C_4100160336496887',
308 |                     'is_hot': False,
309 |                     'name' : '-猫猫站不稳-',
310 |                     'people': <__main__.SinaPeople object at 0x0000000003498BE0>,
311 |                     'time': 今天 20:44,
312 |                     'terminal_source': 'iPhone 6',
313 |                     'text': '稀罕你!',
314 |                     'attitude_count': 0
315 |                 },
316 |             ]
317 |         """
318 | for x in self._get_attribute_item('comment', self._get_comment_list):
319 | yield x
320 |
321 | @staticmethod
322 | def _get_attitude_list(unit):
323 | from attitude import Attitude
324 | attitude = {}
325 |         # Units that contain an <a> tag are attitude (like) entries
326 | try:
327 | attitude['author_name'] = unit.a.get_text()
328 | attitude['time'] = unit.span.get_text()
329 | # attitude['people'] = SinaPeople(uid=str(unit.a.attrs['href']).split('/')[-1],
330 | # href='http://weibo.cn' + str(unit.a.attrs['href']))
331 | except AttributeError:
332 | return False
333 | return Attitude(id=0, cache=attitude)
334 |
335 | @property
336 | @other_obj()
337 | def attitude(self):
338 | for x in self._get_attribute_item('attitude', self._get_attitude_list):
339 | yield x
340 |
341 | @staticmethod
342 | def _get_repost_list(unit):
343 | from repost import Repost
344 | repost = {}
345 | try:
346 | repost['author_name'] = unit.a.get_text()
347 |             tmp_sibling = unit.a.next_sibling
348 |             while not isinstance(tmp_sibling, bs4.element.NavigableString):
349 |                 tmp_sibling = tmp_sibling.next_sibling
350 |             repost['text'] = str(tmp_sibling)
351 | # repost['people'] = SinaPeople(uid=unit.a.attrs['href'].split('/')[-1],
352 | # href='http://weibo.cn/'+unit.a.attrs['href'])
353 | except AttributeError:
354 | return False
355 | return Repost(id=0, cache=repost)
356 |
357 | @property
358 | @other_obj()
359 |     # TODO: also collect the repost's time, terminal source, etc.
360 | def repost(self):
361 | for x in self._get_attribute_item('repost', self._get_repost_list):
362 | yield x
363 |
364 | # def get_text(self):
365 | # """
366 |     #     Fetch the weibo content
367 |     #     :return: the weibo text as a str
368 | # """
369 | # # if self.text != '':
370 | # # return self.text
371 | # if 1:
372 | # _retry_count = 3
373 | # while _retry_count > 0:
374 | # requests_content = self._soup
375 | # self.main_page_resource = requests_content
376 | # print(requests_content)
377 | # print("测试session的get方法")
378 | # try:
379 | # self.text = requests_content.find(attrs={'id': 'M_'}).div.span.get_text()
380 | # self.__get_author_data__()
381 | # _retry_count -= 1
382 | # break
383 | # except AttributeError:
384 | # _retry_count -= 1
385 | #
386 |     #         # Weibo attributes (repost count, like count, comment count)
387 |     #         # On the wap version this markup is particularly irregular
388 | # repost_number_node = requests_content.find(attrs={'id': 'rt'})
389 | # try:
390 | # self.repost_count = int(re.findall(pattern, repost_number_node.get_text())[0])
391 | # except IndexError:
392 | # self.repost_count = 0
393 | # try:
394 | # comment_number_node = repost_number_node.next_sibling
395 | # self.comment_count = int(re.findall(pattern, comment_number_node.get_text())[0])
396 | # except IndexError:
397 | # self.comment_count = 0
398 | # try:
399 | # attitude_number_node = comment_number_node.next_sibling
400 | # self.attitude_count = int(re.findall(pattern, attitude_number_node.get_text())[0])
401 | # except IndexError:
402 | # self.attitude_count = 0
403 | #
404 |     #     # Weibo publication time
405 | # #self.time = requests_content.find(attrs={'id': 'M_'}).findAll('div')[1].span.get_text()
406 | # return self.text
407 |
408 |
409 |
410 | # def __get_attribute_list__(self, target_attribute_type, target_attribute_fuction, required_attribute_count=8):
411 | # """
412 | #
413 | # :param target_attribute_type:
414 | # :param target_attribute_fuction:
415 | # :param required_attribute_count:
416 | # :return:
417 | # """
418 | # attribute_url = 'http://weibo.cn/' + str(target_attribute_type) + '/' + str(self.uid)
419 | # attribute_list = []
420 | # attribute_count = 0
421 | # page_count = 1
422 | # now_page_count = 1
423 | # is_first = True
424 | # pattern = re.compile(r'\d+')
425 | # while True:
426 | # print("现在是评论第一页")
427 | # tt.sleep(self.time_delay)
428 |     #         # Fetch the page source (as a bs4 object)
429 | # requests_content = self.retry_requests(attribute_url, uid=self.uid)
430 | #
431 |     #         # Collect the item units on the current page
432 | # unit_list = requests_content.find_all('div', attrs={'class': 'c'})
433 | # for i in unit_list:
434 |     #             # Call the concrete parser to extract the content
435 | # attribute = target_attribute_fuction(i)
436 | # if attribute is False:
437 | # continue
438 |     #             # Increment the counter
439 |     #             attribute_count += 1
440 |     #             # Return once the requested number of items has been collected
441 | # if attribute_count > required_attribute_count:
442 | # return attribute_list
443 | # attribute_list.append(attribute)
444 | #
445 |     #         # On the first page, read the total page count
446 |     #         if is_first:
447 |     #             # If the 'x/y页' pagination marker is found, there is more than one page
448 | # if requests_content.find(attrs={'id': 'pagelist'}):
449 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
450 | # page_count = page_count.split('/')[1]
451 | # page_count = int(re.findall(pattern, page_count)[0])
452 | # print(page_count)
453 | # else:
454 | # return attribute_list
455 | # is_first = False
456 | #
457 | # now_page_count += 1
458 | # if now_page_count >= page_count:
459 | # break
460 | #
461 | # attribute_url = 'http://weibo.cn/' + str(target_attribute_type) +'/' + str(self.uid) +'?&&page=' + \
462 | # str(now_page_count)
463 | #
464 | # return attribute_list
465 |
466 | if __name__ == '__main__':
467 | def a():
468 | return 1
469 | print(type(a))
--------------------------------------------------------------------------------
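
Putting the pieces of weibo.py together, a hedged end-to-end sketch: it assumes the package is importable as `weibospider` and that `SinaBaseObject` (defined in base.py, not shown here) supplies a logged-in requests session (`self._session`) and `self._time_delay`. The weibo id is the one from the class docstring.

from weibospider.weibo import Weibo

w = Weibo('F0Mg7a8Wh')
# Basic fields parsed from the weibo.cn wap pages for this weibo
print(w.author_name, w.time)
print(w.text)
print(w.repost_count, w.comment_count, w.attitude_count)

# The author is built lazily through the other_obj decorator from author_uid / author_name
print(w.author)

# w.comment, w.repost and w.attitude are generators that page through the wap site,
# yielding Comment, Repost and Attitude objects respectively.
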