├── .gitignore
├── .idea
│   ├── SinaWeiboSpider.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── __init__.py
├── ml
│   ├── __init__.py
│   ├── fenci.py
│   ├── get_machine_info.py
│   ├── get_user_info.py
│   ├── keyword.txt
│   ├── mingan_strip_equal.txt
│   ├── mingan_word.txt
│   ├── names.csv
│   ├── os_path.py
│   ├── random_forest.py
│   ├── realtime_random_weibo.py
│   ├── realtime_random_weibo_2.py
│   ├── realtime_user_fans_follower.py
│   ├── realtime_user_info.py
│   ├── realtime_user_relationship.py
│   ├── stopwords_cn.txt
│   ├── svm_dict.py
│   ├── svm_lsi.py
│   ├── svm_module.py
│   ├── svm_result.py
│   ├── svm_tfidf.py
│   ├── svm_utils.py
│   └── values.csv
├── test.py
└── weibospider
    ├── __init__.py
    ├── attitude.py
    ├── base.py
    ├── client.py
    ├── comment.py
    ├── people.py
    ├── repost.py
    ├── utils.py
    └── weibo.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
--------------------------------------------------------------------------------
/.idea/SinaWeiboSpider.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 SuperSaiyanSSS
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SinaWeiboSpider
2 | A fairly complete Sina Weibo crawler, under continuous improvement; stars are welcome~
3 | 
4 | ## Installation
5 | ## pip install weibospider
6 | 
7 | ## Features
8 | 
9 | + Modeled on 77's zhihu-oauth: you supply a Weibo id, it is used to build an object of the matching class, and the data you need can be read from that object
10 | + Cookies have to be pasted in manually (see the notes below). There is still no good way to log in to the WAP version of Weibo automatically: the encryption scheme is unknown and the captcha is hostile to humans.
11 | + Makes heavy use of decorators and generators, so the code also works as learning material
12 | 
13 | ## Quick example
14 | 
15 | ```
16 | from weibospider import WeiboClient
17 | 
18 | cookies = 'xxxxxxxxxxxx' # obtained from the Network tab of the browser dev tools (F12) after logging in at weibo.cn
19 | 
20 | myclient = WeiboClient(cookies)
21 | 
22 | people_1 = myclient.people('1884866222') # uid of the target user
23 | 
24 | print(people_1.name) # print people_1's screen name
25 | 
26 | print(people_1.weibo_count) # print how many weibo people_1 has posted
27 | 
28 | for index, weibo in zip(range(10), people_1.weibo):
29 | 
30 |     print(weibo.text) # print people_1's 10 most recent weibo
31 | 
32 |     for index_2, comment in zip(range(5), weibo.comment):
33 | 
34 |         print(comment.text) # print the 5 most recent comments on this weibo
35 | 
36 |         print(comment.author_name) # print the author of this comment
37 | 
38 | 
39 | ```
40 | 
41 | Questions are welcome by email or QQ.
42 | (Update 2024-02-12: some of the endpoints no longer work.)
43 |
--------------------------------------------------------------------------------
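The README above asks for a cookie string pasted from the browser, and `ml/get_machine_info.py` later in this dump shows the pattern the project uses to turn that raw `Cookie` header into a cookie jar for `requests`. A minimal standalone sketch of the same idea (the cookie value is a placeholder, not a working session):

```
import requests

raw_cookie = '_T_WM=xxx; SUB=yyy; SSOLoginState=zzz'  # placeholder; paste your own value from weibo.cn dev tools

# split the header into key/value pairs and build a cookie jar requests can use
cookie_dict = dict(pair.split('=', 1) for pair in raw_cookie.split('; '))
cookie_jar = requests.utils.cookiejar_from_dict(cookie_dict)

print(cookie_jar)
```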
/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .weibospider import (Attitude, SinaBaseObject, Comment, People,
3 | Repost, Weibo, WeiboClient)
4 |
5 | __all__ = [
6 |     'WeiboClient', 'Attitude', 'SinaBaseObject', 'Comment', 'People', 'Repost', 'Weibo'
7 | ]
--------------------------------------------------------------------------------
/ml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperSaiyanSSS/SinaWeiboSpider/b034c20ccf062b1323046584712716b2794ec7ec/ml/__init__.py
--------------------------------------------------------------------------------
/ml/fenci.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import print_function
3 | import sys
4 | sys.path.append("..")
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
7 | import jieba
8 | # from a1 import sina_people
9 | # from a1 import sina_people
10 | # from a1 import sina_weibo
11 | # from a1 import base
12 | # from a1 import test1
13 | from a1 import sina_store  # the __main__ block below needs this (a1 is the external helper package used across ml/)
14 | from bs4 import BeautifulSoup
15 | import requests
16 | import pymongo
17 | import re
18 |
19 | sys.path.append('../')
20 |
21 | import jieba
22 | import jieba.analyse
23 | from optparse import OptionParser
24 |
25 |
26 |
27 | def clean_keyword():
28 | """
29 |     Normalize the sensitive-word list into a standard format
30 | :return:
31 | """
32 | word_list = []
33 | with open('mingan_strip_equal.txt', 'r') as f:
34 | for i in f.readlines():
35 | if i != '':
36 | word_list.append(i.strip().strip('\n'))
37 | # with open('xie.txt', 'r') as f:
38 | # for i in f.readlines():
39 | # if i.split('=')[0].strip().strip('\n'):
40 | # word_list.append(i.split('=')[0].strip().strip('\n'))
41 | #
42 | # with open('guang.txt', 'r') as f:
43 | # for i in f.readlines():
44 | # if i.split('=')[0].strip().strip('\n'):
45 | # word_list.append(i.split('=')[0].strip().strip('\n'))
46 | #
47 | # with open('huang.txt', 'r') as f:
48 | # for i in f.readlines():
49 | # if i.split('=')[0].strip().strip('\n'):
50 | # word_list.append(i.split('=')[0].strip().strip('\n'))
51 | #
52 | # with open('mingan.txt', 'r') as f:
53 | # for i in f.readlines():
54 | # if i.split('=')[0].strip().strip('\n'):
55 | # word_list.append(i.split('=')[0].strip().strip('\n'))
56 | #
57 | # with open('mingan_9.txt','a') as f:
58 | # print(word_list)
59 | # for i in word_list:
60 | # if i:
61 | # b = repr(i)
62 | # try:
63 | # print(unicode(eval(b), "gbk"))
64 | # except:
65 | # continue
66 | # f.write(str(unicode(eval(b), "gbk"))+' '+'300'+'\n')
67 |
68 |
69 | def remove_equal():
70 | """
71 |     Remove the equals signs from the sensitive-word list collected online
72 | :return:
73 | """
74 | count = 0
75 | target_list = []
76 | with open('mingan_word.txt', 'r') as f:
77 | word_list = f.readlines()
78 | print(len(word_list))
79 | for i in word_list:
80 | count += 1
81 | print(count)
82 | target_list.append(i.split(' ')[0])
83 | with open('mingan_strip_equal.txt', 'w') as f:
84 | for i in target_list:
85 | f.write(i+'\n')
86 |
87 |
88 | class TestKeyword(object):
89 | """
90 |     Tokenize the given weibo text and check whether it contains any sensitive words
91 | """
92 | def __init__(self):
93 | jieba.load_userdict("keyword.txt")
94 | jieba.load_userdict("mingan_word.txt")
95 | self.topK = 12
96 | self.mingan_list = []
97 | self.get_mingan_list()
98 |
99 | def get_mingan_list(self):
100 | with open('mingan_strip_equal.txt', 'r') as f:
101 | word_list = f.readlines()
102 | for word in word_list:
103 | self.mingan_list.append(word.strip('\n'))
104 |
105 | def test_if_has_keyword(self, weibo_text):
106 | content = weibo_text
107 | tags = jieba.analyse.extract_tags(content, topK=self.topK)
108 |
109 |         for tag in tags:
110 |             if tag in self.mingan_list:
111 |                 print("6666666")
112 |                 print(content)
113 |                 print(tag)
114 |                 return True
115 |         # only return False after every extracted tag has been checked
116 |         print("no")
117 |         return False
118 |
119 | if __name__ == '__main__':
120 | sys.setdefaultencoding('utf-8')
121 | s = sina_store.SinaStore()
122 | s.weibo_table = s.db['realtime_weibo']
123 | weibo_iter = s.get_stored_information()
124 | print(weibo_iter)
125 |
126 | count = 0
127 | while count < 400:
128 | weibo = next(weibo_iter)
129 | weibo_txt = weibo['text']
130 | print(weibo_txt)
131 | jieba.load_userdict("keyword.txt")
132 | jieba.load_userdict("mingan_word.txt")
133 | file_name = 'mm.txt'
134 |
135 | topK = 12
136 |
137 | content = weibo_txt
138 | tags = jieba.analyse.extract_tags(content, topK=topK)
139 |
140 | # print(",".join(tags))
141 |
142 | mingan_list = []
143 | with open('mingan_strip_equal.txt', 'r') as f:
144 | word_list = f.readlines()
145 | print(len(word_list))
146 | for i in word_list:
147 | mingan_list.append(i.strip('\n'))
148 | for i in tags:
149 | if i in mingan_list:
150 | print("6666666")
151 | print(content)
152 |
153 | count += 1
154 |
155 | # seg_list = jieba.cut(content)
156 | # print(", ".join(seg_list))
157 |
158 |
159 | # with open('mm.txt','r') as f:
160 | # s = "".join(f.readlines())
161 | # seg_list = jieba.cut(s, cut_all=True)
162 | #
163 | # print("Full Mode:", "/ ".join(seg_list))
164 | # seg_list = jieba.cut(s, cut_all=False)
165 | # print("Default Mode:", "/ ".join(seg_list))
166 | # seg_list = jieba.cut(s)
167 | # print(", ".join(seg_list))
168 | # seg_list = jieba.cut_for_search(s)
169 | # print(", ".join(seg_list))
170 |
171 |
--------------------------------------------------------------------------------
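A small usage sketch for the `TestKeyword` class above, assuming `keyword.txt`, `mingan_word.txt` and `mingan_strip_equal.txt` sit in the working directory (which is what the constructor expects), that the module runs under Python 2 as its `reload(sys)` header requires, and that its imported dependencies are installed:

```
from fenci import TestKeyword

checker = TestKeyword()                     # loads the jieba user dictionaries and the sensitive-word list
text = u'...some weibo text...'             # any unicode weibo text
print(checker.test_if_has_keyword(text))    # True if any extracted tag is in the sensitive-word list
```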
/ml/get_machine_info.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals, print_function
2 | import sys
3 | sys.path.append("..")
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | from a1 import sina_people
7 | from a1 import sina_people
8 | from a1 import sina_weibo
9 | from a1 import base
10 | from a1 import test1
11 | from a1 import sina_store
12 | import time as tt
13 | from bs4 import BeautifulSoup
14 | import requests
15 | import pymongo
16 | import re
17 |
18 | headers_2 = {
19 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
20 | 'Accept-Encoding':'gzip, deflate, sdch',
21 | 'Accept-Language':'zh-CN,zh;q=0.8',
22 | 'Cache-Control':'max-age=0',
23 | 'Connection':'keep-alive',
24 | 'Cookie': '_T_WM=0ff248d78f4984aa135c5b2e53c11079; ALF=1496373314; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgM7IlYI27IV6TA5-eb6avSBhK-q5migy9jGYZkeqPPpU.; SUB=_2A250DTviDeThGeBP4lQW-CbFyj6IHXVXDkWqrDV6PUJbktBeLWLAkW1fCr2k7XOfWxI9AQSa5M6kQfvxPg..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWC9U1RTKpYdAAz2GZeMbFX5JpX5o2p5NHD95QceK.cS0nR1K2EWs4DqcjSH.ieC0-R-.R7HK.R1Btt; SUHB=04W-CMkuo5eJq_; SSOLoginState=1493781426',
25 | 'Host':'weibo.cn',
26 | 'Upgrade-Insecure-Requests':'1',
27 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
28 | }
29 |
30 |
31 | def get_machine_personal_info():
32 | s = sina_store.SinaStore()
33 | s.weibo_table = s.db['machine_personal_info']
34 | with open('machine_uid.txt','r') as f:
35 | for i in f.readlines():
36 | if i!='':
37 | print(i)
38 | pe = sina_people.SinaPeople(i)
39 | s.store_in_mongodb(pe)
40 |
41 |
42 |
43 |
44 | if __name__ == '__main__':
45 | dic_c = {}
46 | str_c = headers_2['Cookie']
47 | for i in str_c.split('; '):
48 | dic_c[i.split('=')[0]] = i.split('=')[1]
49 | cookies2 = requests.utils.cookiejar_from_dict(dic_c)
50 | base.SinaBaseObject.cookies = cookies2
51 | if 1:
52 | dic_c = {}
53 | str_c = headers_2['Cookie']
54 | for i in str_c.split('; '):
55 | dic_c[i.split('=')[0]] = i.split('=')[1]
56 | cookies2 = requests.utils.cookiejar_from_dict(dic_c)
57 | base.SinaBaseObject.cookies = cookies2
58 |
59 | # for i in range(0,21):
60 | # if
61 | print(cookies2)
62 | with open('machine_uid.txt','r') as f:
63 | uid = f.readlines()
64 | print(len(uid))
65 | # get_machine_personal_info()
66 |
67 |
68 | # a = requests.get('https://weibo.cn/2318253071/fans?page=1', cookies=cookies2)
69 | # a = BeautifulSoup(a.content, "lxml")
70 | # unit = a.findAll('div', attrs={'class': 'c'})[1]
71 | # print(unit)
72 | # unit_list = unit.findAll('table')
73 | # print(unit_list)
74 | # uid_list = []
75 | #
76 | # for i in unit_list:
77 | # print(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1])
78 | # uid_list.append(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1])
79 | #
80 | # for j in range(2,5):
81 | # tt.sleep(4)
82 | # a = requests.get('https://weibo.cn/2318253071/fans?page='+str(j), cookies=cookies2)
83 | # a = BeautifulSoup(a.content, "lxml")
84 | # unit = a.findAll('div', attrs={'class': 'c'})[1]
85 | # unit_list = unit.findAll('table')
86 | # for i in unit_list:
87 | # print(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1])
88 | # uid_list.append(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1])
89 | #
90 | # with open('machine_uid.txt','a') as f:
91 | # for i in uid_list:
92 | # f.write(i+'\n')
93 | # get_human_personal_info()
94 | # a = requests.get('http://weibo.cn/u/5195713909')
95 | # print(a.content)
96 | # pe = sina_people.SinaPeople('6021561452')
97 | # pe = sina_weibo.SinaWeibo('F16aup9Im')
98 | # we = sina_weibo.SinaWeibo('F15Kpbev2')
99 | # for name, value in vars(we).items():
100 | # print(name, value)
101 | # c_set = set()
102 | # s = sina_store.SinaStore()
103 | # s.weibo_table = s.db['try2']
104 | # rmrb = s.get_human_info()
105 | # comment_list = rmrb['comment_list']
106 | # for name, value in comment_list.items():
107 | # author_uid = value['author_uid']
108 | # c_set.add(str(author_uid))
109 |
110 | # with open('human_uid.txt','a') as f:
111 | # for i in c_set:
112 | # f.write(i+'\n')
113 |
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
/ml/get_user_info.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import math
5 | import sys
6 | sys.path.append("..")
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 | import datetime
10 | import difflib
11 | from a1 import sina_people
12 | from a1 import sina_weibo
13 | from a1 import base
14 | from a1 import test1
15 | from a1 import sina_store
16 | from bs4 import BeautifulSoup
17 | import requests
18 | import time as tt
19 | import pymongo
20 | import re
21 |
22 |
23 | headers_for_baidu = {
24 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
25 | 'Accept-Encoding':'gzip, deflate, sdch',
26 | 'Accept-Language':'zh-CN,zh;q=0.8',
27 | 'Cache-Control':'max-age=0',
28 | 'Connection':'keep-alive',
29 | 'Host':'www.baidu.com',
30 | 'Upgrade-Insecure-Requests':'1',
31 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
32 | }
33 |
34 |
35 | # def get_human_personal_info():
36 | # s = sina_store.SinaStore()
37 | # s.weibo_table = s.db['human_personal_info']
38 | # with open('human_uid.txt','r') as f:
39 | # for i in f.readlines():
40 | # if i!='':
41 | # print(i)
42 | # pe = sina_people.SinaPeople(i)
43 | # s.store_in_mongodb(pe)
44 |
45 |
46 | # Convert a raw weibo time string into a datetime object
47 | def clean_time(now_time):
48 | if now_time.startswith('今'):
49 | now_time = datetime.datetime(2017, 5, 3)
50 | elif "分钟" in now_time:
51 | now_time = datetime.datetime(2017, 5, 3)
52 | elif "月" in now_time:
53 | month = int(now_time.split("月")[0][-2:])
54 | day = int(now_time.split("日")[0][-2:])
55 | now_time = datetime.datetime(2017, month, day)
56 | else:
57 | year = int(now_time.split('-')[0])
58 | month = int(now_time.split('-')[1])
59 | day = int(now_time.split('-')[2][:3])
60 | now_time = datetime.datetime(year, month, day)
61 | return now_time
62 |
63 |
64 | # Final, corrected conditional-entropy formula
65 | # TODO: the formula given in the paper is flawed; the conditional entropy it yields for a finite discrete sequence is not guaranteed to be non-negative
66 | def final_calculation_formula(space_list):
67 | lence = len(space_list)
68 | if lence < 2:
69 |         raise IndexError("at least two time intervals are required")
70 | entropy_list = []
71 | scale_list = []
72 | is_first = True
73 | end_seq = 2
74 | while end_seq < lence:
75 | local_lence = end_seq
76 | entropy = calculation_formula(space_list[:end_seq], local_lence)
77 | entropy_list.append(entropy)
78 | if is_first:
79 | is_first = False
80 | else:
81 | scale_list.append(calculate_perc_xm(space_list[:end_seq], local_lence))
82 | end_seq += 1
83 | result_list = []
84 | for i in range(len(entropy_list)-1):
85 | result_list.append(entropy_list[i+1]-entropy_list[i]+scale_list[i]*entropy_list[0])
86 | print(result_list)
87 | print(scale_list)
88 | return min(result_list)
89 |
90 |
91 | # perc(Xm): the proportion of interval values in a sequence of length m that occur only once
92 | def calculate_perc_xm(space_list, lence):
93 |     only_count = 0
94 |     print(space_list)
95 |     # count the interval values that appear exactly once in the sequence
96 |     for value in space_list:
97 |         if space_list.count(value) == 1:
98 |             only_count += 1
99 |     scale = only_count/lence
100 |     return scale
101 | 
102 | 
103 |
104 |
105 | # Shannon entropy of the interval sequence: H = -sum(p_i * ln(p_i))
106 | def calculation_formula(space_list, lence):
107 | probability_list = []
108 | while space_list:
109 | item_count = space_list.count(space_list[0])
110 |         # true division is in effect (from __future__ import division), so this does not floor
111 | probability_list.append(item_count/lence)
112 | space_list = filter(lambda x: x != space_list[0], space_list)
113 | entropy = 0
114 | for p in probability_list:
115 | entropy += p*math.log(p)
116 | entropy = -entropy
117 | return entropy
118 |
119 |
120 | # Entropy of a user's posting-interval sequence
121 | def get_entropy_of_information(person_dict):
122 | space_list = []
123 | weibo_list_lence = len(person_dict['weibo_list'])
124 | print(weibo_list_lence)
125 |     # some users have too few original weibo; drop those samples
126 | try:
127 | last_time = str(person_dict['weibo_list']['1']['time'])
128 | last_time = clean_time(last_time)
129 | except KeyError:
130 | return -1
131 | for i in range(weibo_list_lence-2):
132 | now_time = str(person_dict['weibo_list'][str(i+2)]['time'])
133 | now_time = clean_time(now_time)
134 | space_list.append((last_time - now_time).days)
135 | print((last_time-now_time).days)
136 | last_time = now_time
137 |
138 | entropy = calculation_formula(space_list, len(space_list))
139 | print(entropy)
140 | return entropy
141 |
142 | # while 1:
143 | # try:
144 | # print(next(a))
145 | # except StopIteration:
146 | # break
147 |
148 |
149 | # User reputation: fans / (fans + follows)
150 | def get_reputation(person_dict):
151 | try:
152 | fans_count = int(person_dict['fans_count'])
153 | follow_count = int(person_dict['follow_count'])
154 | reputation = fans_count/(fans_count+follow_count)
155 | except ValueError:
156 | return -1
157 | return reputation
158 |
159 |
160 | # Number of distinct client platforms the user has posted from
161 | def get_num_of_platform(person_dict):
162 | platform_set = set()
163 | for item in person_dict['weibo_list']:
164 | platform_set.add(str(person_dict['weibo_list'][str(item)]['terminal_source']).strip())
165 | return len(platform_set)
166 |
167 |
168 | # Check whether there are too few original weibo (tiny samples are dominated by chance)
169 | def check_if_too_little(person_dict):
170 | repost_count = 0
171 | total_count = len(person_dict['weibo_list'])
172 | for item in person_dict['weibo_list']:
173 | if str(person_dict['weibo_list'][str(item)]['is_repost'])=='True':
174 | print(person_dict['weibo_list'][str(item)]['is_repost'])
175 | repost_count += 1
176 | continue
177 | if total_count-repost_count < 3:
178 | return -1
179 | return 0
180 |
181 |
182 | # Content similarity of the user's posted weibo
183 | def get_similarity_of_content(person_dict):
184 |     """
185 |     Use Baidu advanced search restricted to weibo.com to check whether duplicate weibo exist
186 |     :param person_dict: dict of the user's information
187 |     :return: content similarity
188 |     """
189 | identical_count = 0
190 | total_count = 0
191 | repost_count = 0
192 | if check_if_too_little(person_dict) == -1:
193 | return -1
194 | for item in person_dict['weibo_list']:
195 | if str(person_dict['weibo_list'][str(item)]['is_repost'])=='True':
196 | print(person_dict['weibo_list'][str(item)]['is_repost'])
197 | repost_count += 1
198 | continue
199 | else:
200 | total_count += 1
201 |
202 | if total_count >= 15:
203 | break
204 | copy_test_1 = False
205 | copy_test_2 = False
206 |
207 | text = str(person_dict['weibo_list'][str(item)]['text'])
208 | print(person_dict['weibo_list'][str(item)]['href'])
209 | q1 = text
210 | print(q1)
211 |
212 |         re_emotion = re.compile('(\[.*?\])')  # strip weibo emoticon markers like [xxx]
213 | q1 = re_emotion.sub('', q1)
214 |         q1_list = re.split('!|!|,|。|……|:|、|,|,|;|;|——', unicode(q1))  # split on punctuation marks
215 |
216 |         # pick the longest and second-longest substrings
217 | max_len = 'x'
218 | for string_seq in q1_list:
219 | if len(string_seq)> len(max_len):
220 | max_len = string_seq
221 | print(max_len)
222 | second_len = 'x'
223 | for string_seq in q1_list:
224 | if len(string_seq) > len(second_len) and string_seq != max_len:
225 | second_len = string_seq
226 | q1 = max_len
227 | q1_2 = second_len
228 |
229 | url = 'https://www.baidu.com/s?q1='+q1+'&q2=&q3=&q4=&rn=10&lm=0&ct=0&ft=&q5=&q6=weibo.com&tn=baiduadv'
230 | url_2 = 'https://www.baidu.com/s?q1='+q1_2+'&q2=&q3=&q4=&rn=10&lm=0&ct=0&ft=&q5=&q6=weibo.com&tn=baiduadv'
231 | # baidu_requests = requests.get(url, headers=headers_for_baidu, timeout=3)
232 | baidu_requests = base.SinaBaseObject.retry_requests_static(url, headers=headers_for_baidu, timeout=3)
233 | copy_test_1 = __parse_baidu_page__(baidu_requests, q1)
234 |
235 | if len(q1_2) > 5:
236 | baidu_requests = base.SinaBaseObject.retry_requests_static(url_2, headers=headers_for_baidu, timeout=3)
237 | # baidu_requests = requests.get(url_2, headers=headers_for_baidu, timeout=3)
238 | copy_test_2 = __parse_baidu_page__(baidu_requests, q1_2)
239 |
240 | if copy_test_1 or copy_test_2:
241 | identical_count += 1
242 | print(copy_test_1, copy_test_2)
243 | print("确实是抄袭的")
244 | else:
245 | print("是原创的")
246 | tt.sleep(3)
247 |
248 |     # if the data is incomplete (few weibo captured) or almost everything is a repost, discard the sample
249 | if total_count < 3 or repost_count > 28:
250 | return -1
251 |
252 | similarity = identical_count/total_count
253 | print("内容相似度为"+str(similarity))
254 | return similarity
255 |
256 |
257 | # Parsing logic for the Baidu search result page
258 | def __parse_baidu_page__(baidu_requests, q1):
259 | """
260 | @ author: wxw
261 | @ time: 2017/5/4
262 | 提取搜索到的高亮字符串 并与要查找的进行对比
263 | 若满足一定的相似度要求 则认为重复
264 | :param baidu_requests: requests抓取百度搜索所得页面源代码
265 | :param q1: 要查找的字符串
266 | :return: 是否重复
267 | """
268 | baidu_bs4 = BeautifulSoup(baidu_requests.content, "lxml")
269 | highlight_list = baidu_bs4.find_all('div', attrs={'class': 'c-abstract'})
270 | ok_count = 0
271 | for unit in highlight_list:
272 | try:
273 | highlight_word = unit.em.get_text()
274 | print(highlight_word)
275 | if str(q1).strip() == str(highlight_word).strip():
276 | ok_count += 1
277 | print("已发现")
278 |             # if the highlighted text differs from the query only slightly (a few characters), also count it as found
279 | elif difflib.SequenceMatcher(None, str(q1), str(highlight_word)).ratio() > 0.88:
280 | print(difflib.SequenceMatcher(None, str(q1), str(highlight_word)).ratio())
281 | ok_count += 1
282 | print("认为已找到")
283 | else:
284 | print("no")
285 | except AttributeError:
286 | print("这是空的")
287 |
288 | if ok_count > 1:
289 | return True
290 | else:
291 | return False
292 |
293 |
294 | def __store_human_feature_vector__(feature_vector):
295 | s = sina_store.SinaStore()
296 | s.weibo_table = s.db['human_vector_info']
297 | iter = s.get_stored_information()
298 | flag = 0
299 | while True:
300 | try:
301 | person_dict = next(iter)
302 | if str(person_dict['uid']) == str(feature_vector['uid']):
303 | flag = 1
304 | break
305 | except StopIteration:
306 | flag = 0
307 | break
308 |
309 | if flag == 0:
310 | s.store_in_mongodb(feature_vector)
311 |
312 |
313 | def store_human_feature_vector(sina_store_object):
314 | sina_store_object.weibo_table = sina_store_object.db['human_personal_info']
315 |     # generator over the stored user documents
316 | iter = sina_store_object.get_stored_information()
317 | item_count = 0
318 | while True:
319 | try:
320 | feature_vector = {}
321 | person_dict = next(iter)
322 | entropy = get_entropy_of_information(person_dict)
323 | similarity = get_similarity_of_content(person_dict)
324 | platform = get_num_of_platform(person_dict)
325 | reputation = get_reputation(person_dict)
326 | if reputation == -1:
327 | print("该数据为残缺数据!舍去")
328 | print("现在抽取到第" + str(item_count) + "个用户!!")
329 | item_count += 1
330 | continue
331 | feature_vector['entropy'] = entropy
332 | feature_vector['similarity'] = similarity
333 | if similarity == -1 or entropy == -1:
334 | print("该数据为残缺数据!舍去")
335 | print("现在抽取到第" + str(item_count) + "个用户!!")
336 | item_count += 1
337 | continue
338 | feature_vector['uid'] = str(person_dict['uid'])
339 | feature_vector['platform'] = platform
340 | feature_vector['reputation'] = reputation
341 | feature_vector['human_or_machine'] = 1
342 | item_count += 1
343 | print("现在抽取到第"+str(item_count)+"个用户!!")
344 | __store_human_feature_vector__(feature_vector)
345 | except StopIteration:
346 | print("人类用户已提取特征向量完毕!")
347 | break
348 |
349 |
350 | def __store_machine_feature_vector__(feature_vector):
351 | s = sina_store.SinaStore()
352 | s.weibo_table = s.db['machine_vector_info']
353 | iter = s.get_stored_information()
354 | flag = 0
355 | while True:
356 | try:
357 | person_dict = next(iter)
358 | if str(person_dict['uid']) == str(feature_vector['uid']):
359 | flag = 1
360 | break
361 | except StopIteration:
362 | flag = 0
363 | break
364 |
365 | if flag == 0:
366 | s.store_in_mongodb(feature_vector)
367 |
368 |
369 | def store_machine_feature_vector(sina_store_object):
370 | sina_store_object.weibo_table = sina_store_object.db['machine_personal_info']
371 |     # generator over the stored user documents
372 | iter = sina_store_object.get_stored_information()
373 | item_count = 0
374 | while True:
375 | try:
376 | feature_vector = {}
377 | person_dict = next(iter)
378 | entropy = get_entropy_of_information(person_dict)
379 | similarity = get_similarity_of_content(person_dict)
380 | platform = get_num_of_platform(person_dict)
381 | reputation = get_reputation(person_dict)
382 | if reputation == -1:
383 | print("该数据为残缺数据!舍去")
384 | print("现在抽取到第" + str(item_count) + "个用户!!")
385 | item_count += 1
386 | continue
387 | feature_vector['entropy'] = entropy
388 | feature_vector['similarity'] = similarity
389 | if similarity == -1 or entropy == -1:
390 | print("该数据为残缺数据!舍去")
391 | print("现在抽取到第" + str(item_count) + "个用户!!")
392 | item_count += 1
393 | continue
394 | feature_vector['uid'] = str(person_dict['uid'])
395 | feature_vector['platform'] = platform
396 | feature_vector['reputation'] = reputation
397 | feature_vector['human_or_machine'] = 0
398 | item_count += 1
399 | print("现在抽取到第"+str(item_count)+"个用户!!")
400 | __store_machine_feature_vector__(feature_vector)
401 | except StopIteration:
402 | print("机器用户已提取特征向量完毕!")
403 | break
404 |
405 |
406 | if __name__ == '__main__':
407 | """
408 | 从mongodb中获取human和machine的信息,
409 | 并计算其信息熵、相似度、信誉度等特征
410 | 并将特征存入mongodb
411 |
412 | 示例:
413 | s = sina_store.SinaStore()
414 | store_human_feature_vector(s)
415 | store_machine_feature_vector(s)
416 | """
417 |
418 |
419 |
420 |
421 |
422 |
--------------------------------------------------------------------------------
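`calculation_formula` above is plain Shannon entropy over the distribution of posting intervals, H = -sum(p_i * ln p_i), where p_i is the relative frequency of each distinct interval. A self-contained sketch of the same computation on made-up interval lists (values in days): a perfectly regular poster gets zero entropy, a mixed one gets more.

```
import math

def interval_entropy(space_list):
    n = float(len(space_list))
    # relative frequency of each distinct interval value
    probabilities = [space_list.count(v) / n for v in set(space_list)]
    return -sum(p * math.log(p) for p in probabilities)

print(interval_entropy([2, 2, 2, 2]))  # 0.0   : perfectly regular posting
print(interval_entropy([1, 1, 2, 3]))  # ~1.04 : mixed intervals, higher entropy
```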
/ml/keyword.txt:
--------------------------------------------------------------------------------
1 | 减肥 100
2 | 私聊 300
3 | 变瘦 20
4 | 得意 1
5 | V信 300
6 | V xin 300
7 | 卫星号 1000
8 | 卫星号, 1000
9 | 祖传 200
10 | 秘制 200
11 | 强力 100
12 | 特效 100
13 | 全效 100
14 | 强效 100
15 | 奇效 100
16 | 高效 100
17 | 速效 100
18 | 神效 200
19 | 处方 200
20 | 复方 200
21 | 治疗 10
22 | 消炎 20
23 | 抗炎 20
24 | 活血 100
25 | 祛瘀 100
26 | 止咳 50
27 | 解毒 50
28 | 疗效 100
29 | 防治 20
30 | 防癌 50
31 | 增高 200
32 | 溶脂 200
33 | 吸脂 200
34 | 瘦身 200
35 | 瘦脸 200
36 | 瘦腿 200
37 | xin 100
38 | 防辐射 100
39 | 美容 100
40 | 养颜 100
41 | 避凶 200
42 | 辟邪 200
43 | 首选 50
44 | 保健级 200
45 | 安全无毒副作用 200
46 | 无效退款 200
47 | 保险公司承保 200
48 | 补五行 200
49 | 吸财 200
50 | 保平安 200
51 | 无斑 200
52 | 祛疤 200
53 | 去疤 200
54 | 国家级 20
55 | 世界级 20
56 | 最高级 20
57 | 全网销量第一 200
58 | 全球首发 100
59 | 全国首家 100
60 | 全网首发 200
61 | 代言 20
62 | 无副作用 100
63 | 零风险 200
64 | 稳赚 200
65 | 无效退款 200
66 | 祖传 200
67 | 无事故 200
68 | 无依赖 200
69 | 根治 200
70 | 日减 200
71 | 秘方 200
72 | 保过 200
73 | 填补国内空白 10
74 | 保收益 200
75 | 食品级别 200
76 | 精准检测 100
77 | 3-7天见效 100
78 | 1-3个疗程治愈 100
79 | 解除疼痛 100
80 | 最权威 40
81 | 药到病除 100
82 | 体内毒素 200
83 | 吸附铅汞 100
84 | 除湿 100
85 | 润燥 50
86 | 消除斑点 100
87 | 祛除雀斑 100
88 | 祛除黄褐斑 100
89 | 祛除蝴蝶斑 100
90 | 祛除妊娠斑 100
91 | 祛除斑立净 100
92 | 全民晒单 100
93 | 随时涨价 100
94 | 卖疯了 200
95 | 抢疯了 200
96 | 点击领奖 100
97 | 恭喜获奖 200
98 | 全民免单 100
99 | 点击有惊喜 200
100 | 点击获取 200
101 | 点击转身 200
102 | 点击试穿 200
103 | 点击翻转 200
104 | 领取奖品 100
105 | 秒杀 20
106 | 抢爆 200
107 | 再不抢就没 200
108 | 不会更便宜 200
109 | 错过就没机会 200
110 | 疯抢 100
111 | 抢购 40
112 | VX 100
113 | 免费送 100
114 | 泰國 20
115 | 佛牌 200
116 | 陰牌 200
117 | 情降 200
118 | 降頭 200
119 | 和合 200
120 | 鬼降 200
121 | 鎖心 200
122 | 巫術 200
123 |
--------------------------------------------------------------------------------
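`keyword.txt` above is in jieba's user-dictionary format, one entry per line as `word frequency`, which is why `fenci.py` can pass it straight to `jieba.load_userdict`. A sketch of loading it and extracting tags the way `TestKeyword` does (the sample text is made up):

```
# assumes keyword.txt from above is in the working directory
import jieba
import jieba.analyse

jieba.load_userdict('keyword.txt')                       # word + frequency weight per line
tags = jieba.analyse.extract_tags(u'点击领奖 恭喜获奖 全民免单', topK=12)
print('/'.join(tags))
```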
/ml/os_path.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | import os
4 |
5 |
6 | class LoadFolders(object):  # generator-based iterator over sub-folders
7 | def __init__(self, par_path):
8 | self.par_path = par_path
9 |
10 | def __iter__(self):
11 |
12 | for file in os.listdir(self.par_path):
13 | file_abspath = os.path.join(self.par_path, file)
14 | if os.path.isdir(file_abspath): # if file is a folder
15 | yield file_abspath
16 |
17 |
18 | class LoadFiles(object):
19 | def __init__(self, par_path):
20 | self.par_path = par_path
21 |
22 | def __iter__(self):
23 | folders = LoadFolders(self.par_path)
24 |         for folder in folders:              # first-level directories, one per category
25 | catg = folder.split(os.sep)[-1]
26 | for file in os.listdir(folder): # secondary directory
27 | file_path = os.path.join(folder, file)
28 | if os.path.isfile(file_path):
29 | this_file = open(file_path, 'rb')
30 | content = this_file.read()
31 | yield catg, content
32 | this_file.close()
--------------------------------------------------------------------------------
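A usage sketch for the two generator classes above, assuming a hypothetical corpus laid out as one folder per category with text files inside:

```
# hypothetical layout:
#   corpus/sports/a.txt
#   corpus/finance/b.txt
from os_path import LoadFiles

for category, content in LoadFiles('corpus'):
    print(category, len(content))   # folder name used as the label, raw bytes of each file
```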
/ml/random_forest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import sklearn
5 | import pandas as pd
6 | import json
7 | import math
8 | import csv
9 | import pymongo
10 | import sklearn
11 | import sys
12 | sys.path.append("..")
13 | from a1 import base
14 | from a1 import sina_store
15 | reload(sys)
16 | sys.setdefaultencoding('utf-8')
17 |
18 |
19 | class MachineLearning(base.SinaBaseObject):
20 | def __init__(self):
21 | self.is_First = True
22 | self.is_First_2 = True
23 | self.gbc = ''
24 | self.dtc = ''
25 | self.rfc = ''
26 |
27 | def set_feature_vector_dict(self, feature_vector_dict):
28 | self.clean_feture_vector_dict(feature_vector_dict, is_first=self.is_First)
29 | self.is_First = False
30 |
31 | def set_test_feature_vector_dict(self, feature_vector_dict):
32 | self.clean_test_feture_vector_dict(feature_vector_dict, is_first_2=self.is_First_2)
33 | self.is_First_2 = False
34 |
35 |     # Append the given feature-vector dict to a CSV file
36 | @staticmethod
37 | def clean_feture_vector_dict(feature_vector_dict, is_first=False):
38 | with open('names.csv', 'ab') as csvfile:
39 | fieldnames = ['uid', 'similarity', 'platform', 'reputation', 'entropy', 'human_or_machine']
40 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
41 | if is_first:
42 | writer.writeheader()
43 | writer.writerow(
44 | {'uid': feature_vector_dict['uid'],
45 | 'similarity': feature_vector_dict['similarity'],
46 | 'platform': feature_vector_dict['platform'],
47 | 'reputation': feature_vector_dict['reputation'],
48 | 'entropy': feature_vector_dict['entropy'],
49 | 'human_or_machine': feature_vector_dict['human_or_machine']
50 | }
51 | )
52 |
53 | @staticmethod
54 | def clean_test_feture_vector_dict(feature_vector_dict, is_first_2=False):
55 | with open('needs.csv', 'ab') as csvfile:
56 | fieldnames = ['uid', 'similarity', 'platform', 'reputation', 'entropy', 'human_or_machine']
57 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
58 |             if is_first_2:
59 | writer.writeheader()
60 | writer.writerow(
61 | {'uid': feature_vector_dict['uid'],
62 | 'similarity': feature_vector_dict['similarity'],
63 | 'platform': feature_vector_dict['platform'],
64 | 'reputation': feature_vector_dict['reputation'],
65 | 'entropy': feature_vector_dict['entropy'],
66 | 'human_or_machine': feature_vector_dict['human_or_machine']
67 | }
68 | )
69 |
70 |     # Train and evaluate a single decision tree, a random forest and gradient boosting
71 | def rand_forest_train(self):
72 |         # load the locally stored user feature vectors
73 | users = pd.read_csv('names.csv')
74 |         # use similarity, platform, reputation and entropy as the human-vs-machine features
75 | X = users[['similarity', 'platform', 'reputation', 'entropy']]
76 | y = users['human_or_machine']
77 |
78 |         # split the raw data; 25% is held out for testing
79 | from sklearn.cross_validation import train_test_split
80 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
81 |
82 |         # turn the feature dicts into numeric feature vectors
83 | from sklearn.feature_extraction import DictVectorizer
84 | vec = DictVectorizer(sparse=False)
85 | X_train = vec.fit_transform(X_train.to_dict(orient='record'))
86 | X_test = vec.transform(X_test.to_dict(orient='record'))
87 |
88 |         # train and predict with a single decision tree
89 | from sklearn.tree import DecisionTreeClassifier
90 | dtc = DecisionTreeClassifier()
91 | dtc.fit(X_train, y_train)
92 | dtc_y_pred = dtc.predict(X_test)
93 |
94 |         # train and predict with a random forest classifier
95 | from sklearn.ensemble import RandomForestClassifier
96 | rfc = RandomForestClassifier()
97 | rfc.fit(X_train, y_train)
98 | rfc_y_pred = rfc.predict(X_test)
99 |
100 |         # train and predict with a gradient boosting classifier
101 | from sklearn.ensemble import GradientBoostingClassifier
102 | gbc = GradientBoostingClassifier()
103 | gbc.fit(X_train, y_train)
104 | gbc_y_pred = gbc.predict(X_test)
105 |
106 | from sklearn.metrics import classification_report
107 |         # accuracy of the single decision tree on the test set, plus precision / recall / F1
108 | print("单一决策树的准确性为", dtc.score(X_test, y_test))
109 | print(classification_report(dtc_y_pred, y_test))
110 |
111 |         # accuracy of the random forest on the test set, plus precision / recall / F1
112 | print("随机森林分类器的准确性为", rfc.score(X_test, y_test))
113 | print(classification_report(rfc_y_pred, y_test))
114 |
115 |         # accuracy of gradient boosting on the test set, plus precision / recall / F1
116 | print("梯度提升决策树的准确性为", gbc.score(X_test, y_test))
117 | print(classification_report(gbc_y_pred, y_test))
118 |
119 |
120 | users = pd.read_csv('values.csv')
121 |
122 |         # classify the unlabeled users as human or machine
123 | X = users[['similarity', 'platform', 'reputation', 'entropy']]
124 | X = vec.transform(X.to_dict(orient='record'))
125 | print(rfc.predict(X))
126 |
127 | self.dtc = dtc
128 | self.rfc = rfc
129 | self.gbc = gbc
130 |
131 |
132 | def get_dict_from_weibo_table():
133 | ml = MachineLearning()
134 | sina_store_object = sina_store.SinaStore()
135 | sina_store_object.weibo_table = sina_store_object.db['human_vector_info']
136 | iter = sina_store_object.get_stored_information()
137 | while True:
138 | try:
139 | info_dict = next(iter)
140 | ml.set_feature_vector_dict(info_dict)
141 | except StopIteration:
142 | break
143 | sina_store_object.weibo_table = sina_store_object.db['machine_vector_info']
144 | iter = sina_store_object.get_stored_information()
145 | while True:
146 | try:
147 | info_dict = next(iter)
148 | ml.set_feature_vector_dict(info_dict)
149 | except StopIteration:
150 | break
151 | print("已结束 正在训练模型。。。")
152 | ml.rand_forest_train()
153 |
154 |
155 | def start_training():
156 | get_dict_from_weibo_table()
157 |
158 | if __name__ == "__main__":
159 | start_training()
160 |
--------------------------------------------------------------------------------
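`rand_forest_train` above targets the scikit-learn and pandas of 2017. On a current stack, `sklearn.cross_validation` has been removed in favour of `sklearn.model_selection`, and newer pandas deprecates the `orient='record'` abbreviation in favour of `orient='records'`, so the equivalent split would look roughly like this (a sketch under those assumptions, not a drop-in patch for the class):

```
import pandas as pd
from sklearn.model_selection import train_test_split   # replaces sklearn.cross_validation

users = pd.read_csv('names.csv')
X = users[['similarity', 'platform', 'reputation', 'entropy']]
y = users['human_or_machine']

# the four features are already numeric, so DictVectorizer is optional here;
# if it is kept, call to_dict(orient='records') rather than 'record'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
```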
/ml/realtime_random_weibo.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from __future__ import unicode_literals, print_function
3 | import weibo
4 | import time as tt
5 | import sys
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 | sys.path.append("..")
9 | from a1 import sina_store
10 | from a1 import sina_weibo
11 |
12 | APP_KEY = '3175988140'
13 | APP_SECRET = 'f445636b8fc0b7b5e75474c3ab8d320b'
14 | CALL_BACK = 'http://api.weibo.com/oauth2/default.html'
15 | ACCESS_TOKEN = '2.00xUU4VGKbHw9D47e3cfc2c8UhoSBB'
16 |
17 |
18 | class myAPIClient(weibo.APIClient):
19 | def __init__(self, app_key, app_secret, redirect_uri, access_token):
20 | weibo.APIClient.__init__(self, app_key, app_secret, redirect_uri, access_token)
21 |
22 | def request_access_token_info(self, access_token):
23 | r = weibo._http_post('%s%s' % (self.auth_url, 'get_token_info'), access_token=access_token)
24 | current = int(tt.time())
25 | expires = r.expire_in + current
26 | return weibo.JsonDict(expires_in=expires)
27 |
28 |
29 | def get_client(appkey, appsecret, callback, access_token):
30 | client = myAPIClient(appkey, appsecret, callback, access_token)
31 | r = client.request_access_token_info(access_token)
32 | expires_in = r.expires_in
33 | client.set_access_token(access_token, expires_in)
34 | return client
35 |
36 |
37 | def run(weiboList, client):
38 | statuses = client.statuses__public_timeline(count=2)['statuses']
39 | length = len(statuses)
40 | print('现在获得了'+str(length)+'条新微博')
41 |
42 | for i in range(0, length):
43 | created_at = statuses[i]['created_at']
44 | author_uid = statuses[i]['user']['id']
45 | id = statuses[i]['id']
46 | source = statuses[i]['source']
47 | province = statuses[i]['user']['province']
48 | city = statuses[i]['user']['city']
49 | followers_count = statuses[i]['user']['followers_count']
50 | friends_count = statuses[i]['user']['friends_count']
51 | statuses_count = statuses[i]['user']['statuses_count']
52 | url = statuses[i]['user']['url']
53 | geo = statuses[i]['geo']
54 | comments_count = statuses[i]['comments_count']
55 | reposts_count = statuses[i]['reposts_count']
56 | nickname = statuses[i]['user']['screen_name']
57 | desc = statuses[i]['user']['description']
58 | location = statuses[i]['user']['location']
59 | text = statuses[i]['text']
60 |
61 | weibo_dict = {
62 | 'created_at': created_at,
63 | 'author_uid': author_uid,
64 | 'id': id,
65 | 'author_name': nickname,
66 | 'source': source,
67 | 'text': text,
68 | 'province': province,
69 | 'location': location,
70 | 'description': desc,
71 | 'city': city,
72 | 'followers_count': followers_count,
73 | 'friends_count': friends_count,
74 | 'statuses_count': statuses_count,
75 | 'url': url,
76 | 'geo': geo,
77 | 'comments_count': comments_count,
78 | 'reposts_count': reposts_count
79 | }
80 | weiboList.append(weibo_dict)
81 | return weiboList
82 |
83 |
84 | def gain_random_weibolist(count_=-1):
85 | weibolist = []
86 | client = get_client(APP_KEY, APP_SECRET, CALL_BACK, ACCESS_TOKEN)
87 | while True:
88 | print('现在开始获取!')
89 | try:
90 | weibolist = run(weibolist, client)
91 | return weibolist
92 | except:
93 | tt.sleep(0.1)
94 |             # by default this loops indefinitely until a fetch succeeds;
95 |             # pass a different count_ to limit the number of retries
96 | if(count_!= 0):
97 | count_ = count_-1
98 | else:
99 | break
100 |
101 | # def clean_weibolist(weibolist):
102 | # for weibo in weibolist:
103 | # weibo_object = sina_weibo.SinaWeibo()
104 | #
105 | # def store_random_weibolist():
106 | # sina_store_object = sina_store.SinaStore()
107 | # sina_store_object.weibo_table = sina_store_object.db['random_weibo']
108 | # #sina_store_object.
109 |
110 | if __name__ == "__main__":
111 | weiboList = gain_random_weibolist(count_=3)
112 | print(weiboList[0]['id'])
113 | print(weiboList[0]['author_uid'])
114 | print(weiboList[0]['author_name'])
115 | print(weiboList[0]['source'])
116 | print(weiboList[0]['text'])
117 | print(weiboList[0]['created_at'])
118 | print(111111111111111111111)
119 | for name, value in weiboList[0].items():
120 | print(name, value)
--------------------------------------------------------------------------------
/ml/realtime_random_weibo_2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import json
5 | import pymongo
6 | import requests
7 | from bs4 import BeautifulSoup
8 | import sys
9 | sys.path.append("..")
10 | import random
11 | import fenci
12 | reload(sys)
13 | sys.setdefaultencoding('utf-8')
14 |
15 | REALTIMEWEIBO = 'realtime719'
16 | REALTIMEWEIBOT = 'realtime719t'
17 |
18 |
19 | location_dict = {
20 | '上海': [121.4648,31.2891],
21 | '东莞': [113.8953,22.901],
22 | '东营': [118.7073,37.5513],
23 | '中山': [113.4229,22.478],
24 | '临汾': [111.4783,36.1615],
25 | '临沂': [118.3118,35.2936],
26 | '丹东': [124.541,40.4242],
27 | '丽水': [119.5642,28.1854],
28 | '乌鲁木齐': [87.9236,43.5883],
29 | '佛山': [112.8955,23.1097],
30 | '保定': [115.0488,39.0948],
31 | '兰州': [103.5901,36.3043],
32 | '包头': [110.3467,41.4899],
33 | '北京': [116.4551,40.2539],
34 | '北海': [109.314,21.6211],
35 | '南京': [118.8062,31.9208],
36 | '南宁': [108.479,23.1152],
37 | '南昌': [116.0046,28.6633],
38 | '南通': [121.1023,32.1625],
39 | '厦门': [118.1689,24.6478],
40 | '台州': [121.1353,28.6688],
41 | '合肥': [117.29,32.0581],
42 | '呼和浩特': [111.4124,40.4901],
43 | '咸阳': [108.4131,34.8706],
44 | '哈尔滨': [127.9688,45.368],
45 | '唐山': [118.4766,39.6826],
46 | '嘉兴': [120.9155,30.6354],
47 | '大同': [113.7854,39.8035],
48 | '大连': [122.2229,39.4409],
49 | '天津': [117.4219,39.4189],
50 | '太原': [112.3352,37.9413],
51 | '威海': [121.9482,37.1393],
52 | '宁波': [121.5967,29.6466],
53 | '宝鸡': [107.1826,34.3433],
54 | '宿迁': [118.5535,33.7775],
55 | '常州': [119.4543,31.5582],
56 | '广州': [113.5107,23.2196],
57 | '廊坊': [116.521,39.0509],
58 | '延安': [109.1052,36.4252],
59 | '张家口': [115.1477,40.8527],
60 | '徐州': [117.5208,34.3268],
61 | '德州': [116.6858,37.2107],
62 | '惠州': [114.6204,23.1647],
63 | '成都': [103.9526,30.7617],
64 | '扬州': [119.4653,32.8162],
65 | '承德': [117.5757,41.4075],
66 | '拉萨': [91.1865,30.1465],
67 | '无锡': [120.3442,31.5527],
68 | '日照': [119.2786,35.5023],
69 | '昆明': [102.9199,25.4663],
70 | '杭州': [119.5313,29.8773],
71 | '枣庄': [117.323,34.8926],
72 | '柳州': [109.3799,24.9774],
73 | '株洲': [113.5327,27.0319],
74 | '武汉': [114.3896,30.6628],
75 | '汕头': [117.1692,23.3405],
76 | '江门': [112.6318,22.1484],
77 | '沈阳': [123.1238,42.1216],
78 | '沧州': [116.8286,38.2104],
79 | '河源': [114.917,23.9722],
80 | '泉州': [118.3228,25.1147],
81 | '泰安': [117.0264,36.0516],
82 | '泰州': [120.0586,32.5525],
83 | '济南': [117.1582,36.8701],
84 | '济宁': [116.8286,35.3375],
85 | '海口': [110.3893,19.8516],
86 | '淄博': [118.0371,36.6064],
87 | '淮安': [118.927,33.4039],
88 | '深圳': [114.5435,22.5439],
89 | '清远': [112.9175,24.3292],
90 | '温州': [120.498,27.8119],
91 | '渭南': [109.7864,35.0299],
92 | '湖州': [119.8608,30.7782],
93 | '湘潭': [112.5439,27.7075],
94 | '滨州': [117.8174,37.4963],
95 | '潍坊': [119.0918,36.524],
96 | '烟台': [120.7397,37.5128],
97 | '玉溪': [101.9312,23.8898],
98 | '珠海': [113.7305,22.1155],
99 | '盐城': [120.2234,33.5577],
100 | '盘锦': [121.9482,41.0449],
101 | '石家庄': [114.4995,38.1006],
102 | '福州': [119.4543,25.9222],
103 | '秦皇岛': [119.2126,40.0232],
104 | '绍兴': [120.564,29.7565],
105 | '聊城': [115.9167,36.4032],
106 | '肇庆': [112.1265,23.5822],
107 | '舟山': [122.2559,30.2234],
108 | '苏州': [120.6519,31.3989],
109 | '莱芜': [117.6526,36.2714],
110 | '菏泽': [115.6201,35.2057],
111 | '营口': [122.4316,40.4297],
112 | '葫芦岛': [120.1575,40.578],
113 | '衡水': [115.8838,37.7161],
114 | '衢州': [118.6853,28.8666],
115 | '西宁': [101.4038,36.8207],
116 | '西安': [109.1162,34.2004],
117 | '贵阳': [106.6992,26.7682],
118 | '连云港': [119.1248,34.552],
119 | '邢台': [114.8071,37.2821],
120 | '邯郸': [114.4775,36.535],
121 | '郑州': [113.4668,34.6234],
122 | '鄂尔多斯': [108.9734,39.2487],
123 | '重庆': [107.7539,30.1904],
124 | '金华': [120.0037,29.1028],
125 | '铜川': [109.0393,35.1947],
126 | '银川': [106.3586,38.1775],
127 | '镇江': [119.4763,31.9702],
128 | '长春': [125.8154,44.2584],
129 | '长沙': [113.0823,28.2568],
130 | '长治': [112.8625,36.4746],
131 | '阳泉': [113.4778,38.0951],
132 | '青岛': [120.4651,36.3373],
133 | '韶关': [113.7964,24.7028]
134 | }
135 |
136 |
137 | class RealtimeRandomWeibo(object):
138 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm'
139 |
140 | def __init__(self, lazy=True):
141 | self.href = 'http://api03.bitspaceman.com:8000/post/weibo?kw=的&apikey=' + self.KEY
142 | self.weibo_list = []
143 | self.weibo_list_all = []
144 | self.weibo_list_threat = []
145 | self.iter_all = None
146 | self.iter_count = 0
147 | if not lazy:
148 | self.get_random_weibo()
149 |
150 |         # connect to MongoDB
151 | self.mongo_client = pymongo.MongoClient('localhost', 27017)
152 | self.db = self.mongo_client['Weibo']
153 |
154 | def get_random_weibo(self):
155 | self.iter_count += 15
156 | requests_get = requests.get(self.href, timeout=15)
157 | requests_content = requests_get.content
158 | requests_dict = json.loads(requests_content)
159 |
160 | weibo_list = self.parse_requests_dict(requests_dict)
161 |
162 | copy_of_weibo_list = self.parse_weibo_list(weibo_list)
163 | self.weibo_list_all = copy_of_weibo_list
164 | self.weibo_list_threat = [weibo for weibo in copy_of_weibo_list if int(weibo['threatened']) > 68]
165 | self.store_to_mongodb()
166 |
167 | def parse_requests_dict(self, requests_dict):
168 | weibo_list = []
169 | count = 0
170 | for item in requests_dict['data']:
171 |
172 | weibo = {}
173 | try:
174 | weibo['is_repost'] = False
175 | weibo['repost_location'] = ''
176 | weibo['text'] = str(item['mblog']['text'])
177 | print(weibo['text'])
178 | weibo['uid'] = str(item['from']['url']).split('/')[-1]
179 | weibo['time'] = str(item['pDate'])
180 | weibo['comment_count'] = str(item['commentCount'])
181 | weibo['author_name'] = str(item['from']['name'])
182 | weibo['author_uid'] = str(item['from']['id'])
183 | weibo['author_fans'] = str(item['from']['fansCount'])
184 | weibo['author_follower'] = str(item['from']['friendCount'])
185 | weibo['location'] = str(item['from']['extend']['location'])
186 | weibo['province'] = ''
187 | print(weibo['author_uid'])
188 | except:
189 | continue
190 |
191 | try:
192 | weibo['terminal_source'] = str(item['mblog']['source']).split('>')[1].split('<')[0]
193 | except IndexError:
194 | weibo['terminal_source'] = '未知'
195 | if item['mblog'].has_key('retweeted_status'):
196 | count += 1
197 | weibo['is_repost'] = True
198 | try:
199 | weibo['repost_location'] = str(item['mblog']['retweeted_status']['user']['location'])
200 | weibo['repost_reposted_count'] = str(item['mblog']['retweeted_status']['reposts_count'])
201 | weibo['repost_text'] = str(item['mblog']['retweeted_status']['text'])
202 | weibo['repost_attitude_count'] = str(item['mblog']['retweeted_status']['attitudes_count'])
203 | print(weibo['repost_location'])
204 | print(weibo['repost_reposted_count'])
205 | print(weibo['repost_text'])
206 | print(weibo['repost_attitude_count'])
207 | except:
208 | pass
209 |
210 | weibo_list.append(weibo)
211 |
212 | print("为转发的微博数: ", str(count))
213 | # self.store_to_mongodb(weibo_list)
214 | self.weibo_list = weibo_list
215 | print(weibo_list)
216 |
217 | return weibo_list
218 |
219 | def parse_weibo_list(self, weibo_list):
220 | """
221 | 分析微博威胁程度与规范地址格式
222 | :param weibo_list: 初始微博列表
223 | :return: 分析后的微博列表
224 | """
225 | for i in weibo_list:
226 | i['location'] = mapped_province(i['location'], weibo=i)
227 | i['repost_location'] = mapped_province(i['repost_location'])
228 | print(i['location'], i['repost_location'])
229 | print('111111111111111111111111111111111111111111111111111')
230 |
231 | copy_of_weibo_list = []
232 |
233 |         # keep only locations that can be plotted on the map
234 | for i in weibo_list:
235 | if i['location'] is None or i['location'] == '':
236 | continue
237 | if i['repost_location'] is None or i['repost_location'] == '':
238 | i['is_repost'] = False
239 |
240 | i['location'] = str(i['location'])
241 | i['repost_location'] = str(i['repost_location'])
242 | copy_of_weibo_list.append(i)
243 |
244 | copy_of_weibo_list = assess_threat_levels(copy_of_weibo_list)
245 | return copy_of_weibo_list
246 |
247 | def store_to_mongodb(self):
248 |
249 | weibo_table = self.db[REALTIMEWEIBO]
250 | for i in self.weibo_list_all:
251 | weibo_table.insert(i)
252 |
253 | weibo_table = self.db[REALTIMEWEIBOT]
254 | for i in self.weibo_list_threat:
255 | weibo_table.insert(i)
256 |
257 | # def get_iter_all(self):
258 | # weibo_table = self.db['realtime719']
259 | # for i in weibo_table.find():
260 | # yield i
261 |
262 | def get_realtime_weibo_from_mongodb(self):
263 | weibo_table = self.db[REALTIMEWEIBO]
264 | count = 0
265 | now_weibo_all = []
266 | for i in weibo_table.find():
267 | if count 1:
335 | if weibo and weibo_location.split(' ')[0] in mapped_dict.keys():
336 | weibo['province'] = weibo_location.split(' ')[0]
337 |
338 | if weibo_location.split(' ')[1] in location_dict.keys():
339 | weibo_location = weibo_location.split(' ')[1]
340 | elif weibo_location.split(' ')[0] in location_dict.keys():
341 | weibo_location = weibo_location.split(' ')[0]
342 | else:
343 | weibo_location = ''
344 |
345 |
346 | else:
347 |
348 | if weibo and weibo_location in mapped_dict.keys():
349 | weibo['province'] = weibo_location.strip()
350 |
351 | if weibo_location.strip() in location_dict.keys():
352 | weibo_location = weibo_location.strip()
353 | else:
354 | if weibo_location.strip() in mapped_dict.keys():
355 | print(weibo_location.strip())
356 | weibo_location = mapped_dict.get(weibo_location.strip())
357 | else:
358 | weibo_location = ''
359 |
360 |
361 | return weibo_location
362 |
363 |
364 | def assess_threat_levels(copy_of_weibo_list):
365 | """
366 | 评估威胁程度
367 | """
368 | check_object = fenci.TestKeyword()
369 |
370 | for weibo in copy_of_weibo_list:
371 | flag = check_object.test_if_has_keyword(weibo['text'])
372 | threat = 0
373 | if weibo['is_repost']:
374 | flag = flag or check_object.test_if_has_keyword(weibo['repost_text'])
375 | if flag:
376 | if weibo.has_key('repost_reposted_count') and weibo['repost_reposted_count']:
377 | if int(weibo['repost_reposted_count']) > 10:
378 | threat += 1
379 | if weibo.has_key('comment_count') and weibo['comment_count']:
380 |                 if int(weibo['comment_count']) > 1:
381 | threat += 1
382 | if weibo.has_key('repost_attitude_count') and weibo['repost_attitude_count']:
383 | if int(weibo['repost_attitude_count']) > 10:
384 | threat += 1
385 | if weibo.has_key('author_fans') and weibo['author_fans']:
386 | if int(weibo['author_fans']) > 100:
387 | threat += 1
388 |
389 | weibo['threatened'] = random.randint(68, 80)
390 |
391 | if threat == 1 or threat == 2:
392 | weibo['threatened'] = random.randint(80, 90)
393 | print('what?????????????????????')
394 | print(weibo['threatened'] )
395 |
396 | if threat > 2:
397 | weibo['threatened'] = random.randint(90, 100)
398 | print('what?????????????????????')
399 | print(weibo['threatened'] )
400 |
401 | print(weibo['time'])
402 | print(weibo['author_uid'])
403 | else:
404 | weibo['threatened'] = random.randint(0, 68)
405 |
406 | return copy_of_weibo_list
407 |
408 |
409 | def start_run():
410 |
411 | realtime_weibo_object = RealtimeRandomWeibo()
412 |
413 | for i in realtime_weibo_object.weibo_list:
414 | i['location'] = mapped_province(i['location'], weibo=i)
415 | i['repost_location'] = mapped_province(i['repost_location'])
416 | print(i['location'], i['repost_location'])
417 | print('111111111111111111111111111111111111111111111111111')
418 |
419 | copy_of_weibo_list = []
420 |
421 |     # keep only locations that can be plotted on the map
422 | for i in realtime_weibo_object.weibo_list:
423 | if i['location'] is None or i['location'] == '':
424 | continue
425 | if i['repost_location'] is None or i['repost_location'] == '':
426 | i['is_repost'] = False
427 |
428 | i['location'] = str(i['location'])
429 | i['repost_location'] = str(i['repost_location'])
430 | copy_of_weibo_list.append(i)
431 |
432 | copy_of_weibo_list = assess_threat_levels(copy_of_weibo_list)
433 | return copy_of_weibo_list
434 |
435 |
436 |
437 |
438 |
439 | if __name__ == '__main__':
440 |
441 | a = RealtimeRandomWeibo()
442 | a.get_random_weibo()
443 | l = a.get_realtime_weibo_from_mongodb()
444 | for i in l:
445 | print(i)
446 | # a = start_run()
447 | # for i in a:
448 | # print(i['location'])
449 | # print(type(i['location']))
450 | # if i['is_repost']:
451 | # print("转发自"+str(i['repost_location']))
--------------------------------------------------------------------------------
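`parse_requests_dict` and `assess_threat_levels` above rely on `dict.has_key`, which exists only in Python 2. If the module were ever moved to Python 3, those checks would use the `in` operator; a minimal illustration of the equivalent test on a made-up record:

```
weibo = {'comment_count': '3'}            # made-up record for illustration

# Python 2 style used above:  weibo.has_key('comment_count')
# Python 3 equivalent:
if 'comment_count' in weibo and weibo['comment_count']:
    print(int(weibo['comment_count']) > 1)
```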
/ml/realtime_user_fans_follower.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import json
5 | import time as tt
6 | import csv
7 | import pymongo
8 | import requests
9 | from bs4 import BeautifulSoup
10 | import sys
11 | sys.path.append("..")
12 | from wbcls.sina_store import SinaStore
13 | import fenci
14 | reload(sys)
15 | sys.setdefaultencoding('utf-8')
16 |
17 | RALATIONTABLE = 'Relation719'
18 |
19 |
20 | class RealtimeUserRealationship(object):
21 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm'
22 |
23 | def __init__(self, user_id, user=True, fans=True, follow=True):
24 | # 连接至mongodb
25 | self.mongo_client = pymongo.MongoClient('localhost', 27017)
26 | self.db = self.mongo_client['Weibo']
27 |
28 | self._session = requests.Session()
29 | self._session.mount('http://', self._create_adapter())
30 |
31 | self.fans_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=3&id='+str(user_id)+'&apikey=' + \
32 | self.KEY + '&size=30'
33 | self.fans_list = []
34 | self.follow_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=2&id='+str(user_id)+'&apikey=' + \
35 | self.KEY+'&size=30'
36 | self.follow_list = []
37 |
38 | self.user_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=1&id='+str(user_id)+'&apikey=' + self.KEY
39 |
40 | self.info_dict = {}
41 | self.get_relationship(user=user, fans=fans, follow=follow)
42 |
43 | def get_relationship(self, user=True, fans=True, follow=False):
44 |
45 | if fans:
46 | requests_get = self._session.get(self.fans_href, timeout=15)
47 | requests_content = requests_get.content
48 | requests_dict = json.loads(requests_content)
49 | self.fans_list = self.parse_requests_dict(requests_dict)
50 | tt.sleep(0.5)
51 | if follow:
52 | requests_get = self._session.get(self.follow_href, timeout=15)
53 | requests_content = requests_get.content
54 | requests_dict = json.loads(requests_content)
55 | self.follow_list = self.parse_requests_dict(requests_dict)
56 | tt.sleep(0.5)
57 |
58 | if user:
59 | requests_get = self._session.get(self.user_href, timeout=15)
60 | requests_content = requests_get.content
61 | requests_dict = json.loads(requests_content)
62 | self.info_dict = self.parse_requests_info_dict(requests_dict)
63 |
64 |
65 | self.store_to_mongodb()
66 |
67 | @staticmethod
68 | def parse_requests_dict(requests_dict):
69 | relationship_list = []
70 | for item in requests_dict['data']:
71 | user_id = str(item['id'])
72 | user = {}
73 | try:
74 | user['id'] = str(user_id)
75 | user['name'] = str(item['userName'])
76 | user['fans_count'] = str(item['fansCount'])
77 | user['follow_count'] = str(item['followCount'])
78 | user['weibo_count'] = str(item['postCount'])
79 | user['location'] = str(item['location'])
80 | user['sex'] = str(item['gender'])
81 | print("粉丝数"+user['fans_count'])
82 | except:
83 | continue
84 |
85 | relationship_list.append(user)
86 |
87 | return relationship_list
88 |
89 | def parse_requests_info_dict(self, requests_dict):
90 | info_dict = {}
91 | try:
92 | for item in requests_dict['data']:
93 | try:
94 | info_dict['fans_count'] = str(item['fansCount'])
95 | info_dict['follow_count'] = str(item['followCount'])
96 | info_dict['weibo_count'] = str(item['postCount'])
97 | info_dict['location'] = str(item['location'])
98 | info_dict['name'] = str(item['userName'])
99 | info_dict['url'] = str(item['url'])
100 | except:
101 | continue
102 | info_dict['fans_list'] = self.fans_list
103 | info_dict['follow_list'] = self.follow_list
104 | print(info_dict['name'])
105 |             print('-' * 30)
106 | return info_dict
107 | except:
108 | print(requests_dict)
109 |
110 | def store_to_mongodb(self):
111 | table = self.db[RALATIONTABLE]
112 | table.insert(self.info_dict)
113 |
114 |
115 | @staticmethod
116 | def _create_adapter():
117 | return requests.adapters.HTTPAdapter(
118 | max_retries=requests.adapters.Retry(
119 | total=5,
120 | status_forcelist=[403, 404, 408, 500, 502],
121 | )
122 | )
123 |
124 |
125 |
126 | def get_relationship_from_mongodb(user_id):
127 | mongo_client = pymongo.MongoClient('localhost', 27017)
128 | db = mongo_client['Weibo']
129 | table = db[RALATIONTABLE]
130 | for i in table.find():
131 | if i['url'] == 'http://weibo.com/u/' + str(user_id):
132 | print(i['name'])
133 |
134 |
135 | if __name__ == '__main__':
136 | a = RealtimeUserRealationship(user_id='2671467531')
137 | get_relationship_from_mongodb('2671467531')
138 | b = {
139 | 'topic':'水滴直播',
140 | 'question_list':
141 | [
142 | {
143 | 'question_name':'如何看待。。问题1',
144 |             'answer_words':
145 | ['好','希拉里','4444'],
146 | 'percent':'8.33'
147 | },
148 | {
149 | 'question_name': '如何看待。。问题2',
150 |             'answer_words':
151 | ['不会', '淳朴', '4444'],
152 | 'percent': '4.44'
153 | },
154 | ]
155 | }
156 |
157 |
158 |
--------------------------------------------------------------------------------
/ml/realtime_user_info.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import json
5 | import csv
6 | import pymongo
7 | import requests
8 | from bs4 import BeautifulSoup
9 | import sys
10 | sys.path.append("..")
11 | from a1 import base
12 | from a1 import sina_store
13 | from a1 import sina_weibo
14 | from a1 import sina_people
15 | reload(sys)
16 | sys.setdefaultencoding('utf-8')
17 |
18 |
19 | class RealtimeRandomWeibo(object):
20 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm'
21 |
22 | def __init__(self):
23 | self.href = 'http://api01.bitspaceman.com:8000/post/weibo?kw=的&apikey='+self.KEY
24 | self.get_random_weibo()
25 |
26 | def get_random_weibo(self):
27 | requests_get = requests.get(self.href, timeout=15)
28 | requests_content = requests_get.content
29 | requests_dict = json.loads(requests_content)
30 | for name, value in requests_dict.items():
31 | print(name, value)
32 | print(len(requests_dict['data']))
33 | self.parse_requests_dict(requests_dict)
34 |
35 | def parse_requests_dict(self, requests_dict):
36 | weibo_list = []
37 | weibo = sina_weibo.SinaWeibo()
38 | count = 0
39 | for i in requests_dict['data']:
40 |             if 'retweeted_status' in i['mblog']:
41 |                 count += 1
42 | print(i['mblog']['retweeted_status']['user']['location'])
43 | print(i['mblog']['retweeted_status']['reposts_count'])
44 | print(i['mblog']['retweeted_status']['user']['id'])
45 | print(i['mblog']['retweeted_status']['text'])
46 |
47 | print(count)
48 |
49 | if __name__ == '__main__':
50 | a = RealtimeRandomWeibo()
--------------------------------------------------------------------------------
/ml/realtime_user_relationship.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import unicode_literals, print_function
3 | from __future__ import division
4 | import json
5 | import csv
6 | import pymongo
7 | import requests
8 | from bs4 import BeautifulSoup
9 | import sys
10 | sys.path.append("..")
11 | # from a1 import base
12 | from a1 import sina_store
13 | # from a1 import sina_weibo
14 | from a1 import sina_people
15 | import fenci
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | class RealtimeUserRealationship(object):
21 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm'
22 |
23 | def __init__(self, user_id, fans=True, follow=False):
24 | self.fans_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=3&id='+str(user_id)+'&apikey=' + \
25 | self.KEY + '&size=50'
26 | self.fans_list = []
27 | self.follow_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=2&id='+str(user_id)+'&apikey=' + \
28 | self.KEY + '&size=50'
29 | self.follow_list = []
30 | self.get_relationship(fans=fans, follow=follow)
31 |
32 | def get_relationship(self, fans=True, follow=False):
33 |         if fans:
34 |             requests_get = requests.get(self.fans_href, timeout=15)
35 |             requests_content = requests_get.content
36 |             requests_dict = json.loads(requests_content)
37 |             self.fans_list = self.parse_requests_dict(requests_dict)
38 |             self.store_to_mongodb(self.fans_list, fans=True, follow=False)
39 |         if follow:
40 |             requests_get = requests.get(self.follow_href, timeout=15)
41 |             requests_content = requests_get.content
42 |             requests_dict = json.loads(requests_content)
43 |             self.follow_list = self.parse_requests_dict(requests_dict)
44 |             self.store_to_mongodb(self.follow_list, fans=False, follow=True)
45 |
46 | @staticmethod
47 | def parse_requests_dict(requests_dict):
48 | relationship_list = []
49 | for item in requests_dict['data']:
50 | user_id = str(item['id'])
51 | user = sina_people.SinaPeople(uid=user_id, lazy=True)
52 | try:
53 | user.name = str(item['userName'])
54 | print(user.name)
55 | user.fans_count = str(item['fansCount'])
56 | user.follow_count = str(item['followCount'])
57 | user.weibo_count = str(item['postCount'])
58 | user.location = str(item['location'])
59 | user.sex = str(item['gender'])
60 | print("粉丝数"+user.fans_count)
61 | except:
62 | continue
63 |
64 | relationship_list.append(user)
65 |
66 | return relationship_list
67 |
68 | @staticmethod
69 | def store_to_mongodb(user_list, fans=True, follow=False):
70 | sina_store_object = sina_store.SinaStore()
71 | if fans:
72 | sina_store_object.weibo_table = sina_store_object.db['realtime_user_fans']
73 | for user in user_list:
74 | sina_store_object.store_in_mongodb(user)
75 | if follow:
76 | sina_store_object.weibo_table = sina_store_object.db['realtime_user_follow']
77 | for user in user_list:
78 | sina_store_object.store_in_mongodb(user)
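
if __name__ == '__main__':
    # Minimal usage sketch (uid borrowed from realtime_user_fans_follower.py's self-test;
    # assumes the data API key above is still valid and the a1 package is importable):
    RealtimeUserRealationship(user_id='2671467531', fans=True, follow=False)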
--------------------------------------------------------------------------------
/ml/stopwords_cn.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperSaiyanSSS/SinaWeiboSpider/b034c20ccf062b1323046584712716b2794ec7ec/ml/stopwords_cn.txt
--------------------------------------------------------------------------------
/ml/svm_dict.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 |
18 | import svm_tfidf
19 | import os_path
20 |
21 | from svm_utils import *
22 |
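# Pipeline note (as wired across the svm_* modules): reduce_dict() builds or loads the gensim
# dictionary, then chains into svm_tfidf.reduce_tfidf -> svm_lsi.reduce_lsi ->
# svm_module.reduce_module -> svm_result.reduce_result, which finally classifies weibo_test.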
23 | def reduce_dict(weibo_test):
24 | dictionary = None
25 | if not os.path.exists(path_tmp):
26 | os.makedirs(path_tmp)
27 | # 若不存在之前创建的词典,则生成词典
28 | if not os.path.exists(path_dictionary):
29 | dictionary = corpora.Dictionary()
30 | files = os_path.LoadFiles(path_doc_root)
31 | for i, msg in enumerate(files):
32 | catg = msg[0]
33 | file = msg[1]
34 | file = convert_doc_to_wordlist(file, cut_all=False)
35 | dictionary.add_documents([file])
36 | # 去掉词典中出现次数过少的词语
37 | small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 5]
38 | dictionary.filter_tokens(small_freq_ids)
39 | dictionary.compactify()
40 | dictionary.save(path_dictionary)
41 | svm_tfidf.reduce_tfidf(dictionary, weibo_test)
42 |
43 | if __name__ == "__main__":
44 | reduce_dict(weibo_test = "小粉红滚!你个傻逼,体育老师教你的?吾问无为谓")
--------------------------------------------------------------------------------
/ml/svm_lsi.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 |
18 | import svm_module
19 | from svm_utils import *
20 |
21 | # path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪
22 | # path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss3\\temp1' # 存放中间结果的位置
23 | # path_dictionary = os.path.join(path_tmp, 'THUNews.dict')
24 | # path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')
25 | # path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus')
26 | # path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl')
27 | # path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl')
28 |
29 |
30 | def reduce_lsi(dictionary, corpus_tfidf, weibo_test):
31 | corpus_lsi = None
32 | lsi_model = None
33 | # # # # 第三阶段, 开始将tfidf转化成lsi
34 | if not os.path.exists(path_tmp_lsi):
35 | print('=== 未检测到有lsi文件夹存在,开始生成lsi向量 ===')
36 | if not dictionary:
37 | dictionary = corpora.Dictionary.load(path_dictionary)
38 | if not corpus_tfidf: # 如果跳过了第二阶段,则从指定位置读取tfidf文档
39 | print('--- 未检测到tfidf文档,开始从磁盘中读取 ---')
40 | # 从对应文件夹中读取所有类别
41 | files = os.listdir(path_tmp_tfidf)
42 | catg_list = []
43 | for file in files:
44 | t = file.split('.')[0]
45 | if t not in catg_list:
46 | catg_list.append(t)
47 |
48 | # 从磁盘中读取corpus
49 | corpus_tfidf = {}
50 | for catg in catg_list:
51 | path = '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg)
52 | corpus = corpora.MmCorpus(path)
53 | corpus_tfidf[catg] = corpus
54 | print('--- tfidf文档读取完毕,开始转化成lsi向量 ---')
55 |
56 | # 生成lsi model
57 | os.makedirs(path_tmp_lsi)
58 | corpus_tfidf_total = []
59 | catgs = list(corpus_tfidf.keys())
60 | for catg in catgs:
61 | tmp = corpus_tfidf.get(catg)
62 | corpus_tfidf_total += tmp
63 | lsi_model = models.LsiModel(corpus=corpus_tfidf_total, id2word=dictionary, num_topics=50)
64 | # 将lsi模型存储到磁盘上
65 | lsi_file = open(path_tmp_lsimodel, 'wb')
66 | pkl.dump(lsi_model, lsi_file)
67 | lsi_file.close()
68 | del corpus_tfidf_total # lsi model已经生成,释放变量空间
69 | print('--- lsi模型已经生成 ---')
70 |
71 | # 生成corpus of lsi, 并逐步去掉 corpus of tfidf
72 | corpus_lsi = {}
73 | for catg in catgs:
74 | corpu = [lsi_model[doc] for doc in corpus_tfidf.get(catg)]
75 | corpus_lsi[catg] = corpu
76 | corpus_tfidf.pop(catg)
77 | corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg),
78 | corpu,
79 | id2word=dictionary)
80 | print('=== lsi向量已经生成 ===')
81 | else:
82 | print('=== 检测到lsi向量已经生成,跳过该阶段 ===')
83 |
84 | svm_module.reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test)
85 |
--------------------------------------------------------------------------------
/ml/svm_module.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 |
18 | import svm_result
19 | from svm_utils import *
20 |
21 |
22 | def reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test):
23 | # # # # 第四阶段, 分类
24 | predictor = None
25 | if not os.path.exists(path_tmp_predictor):
26 | print('=== 未检测到判断器存在,开始进行分类过程 ===')
27 | if not corpus_lsi: # 如果跳过了第三阶段
28 | print('--- 未检测到lsi文档,开始从磁盘中读取 ---')
29 | files = os.listdir(path_tmp_lsi)
30 | catg_list = []
31 | for file in files:
32 | t = file.split('.')[0]
33 | if t not in catg_list:
34 | catg_list.append(t)
35 | # 从磁盘中读取corpus
36 | corpus_lsi = {}
37 | for catg in catg_list:
38 | path = '{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg)
39 | corpus = corpora.MmCorpus(path)
40 | corpus_lsi[catg] = corpus
41 | print('--- lsi文档读取完毕,开始进行分类 ---')
42 |
43 | tag_list = []
44 | doc_num_list = []
45 | corpus_lsi_total = []
46 | catg_list = []
47 | files = os.listdir(path_tmp_lsi)
48 | for file in files:
49 | t = file.split('.')[0]
50 | if t not in catg_list:
51 | catg_list.append(t)
52 | for count, catg in enumerate(catg_list):
53 | tmp = corpus_lsi[catg]
54 | tag_list += [count] * tmp.__len__()
55 | doc_num_list.append(tmp.__len__())
56 | corpus_lsi_total += tmp
57 | corpus_lsi.pop(catg)
58 |
59 | # 将gensim中的mm表示转化成numpy矩阵表示
60 | data = []
61 | rows = []
62 | cols = []
63 | line_count = 0
64 | for line in corpus_lsi_total:
65 | for elem in line:
66 | rows.append(line_count)
67 | cols.append(elem[0])
68 | data.append(elem[1])
69 | line_count += 1
70 | lsi_matrix = csr_matrix((data, (rows, cols))).toarray()
71 | # 生成训练集和测试集
72 | rarray = np.random.random(size=line_count)
73 | train_set = []
74 | train_tag = []
75 | test_set = []
76 | test_tag = []
77 | for i in range(line_count):
78 | if rarray[i] < 0.8:
79 | train_set.append(lsi_matrix[i, :])
80 | train_tag.append(tag_list[i])
81 | else:
82 | test_set.append(lsi_matrix[i, :])
83 | test_tag.append(tag_list[i])
84 |
85 | # 生成分类器
86 | predictor = svm_classify(train_set, train_tag, test_set, test_tag)
87 | x = open(path_tmp_predictor, 'wb')
88 | pkl.dump(predictor, x)
89 | x.close()
90 | else:
91 | print('=== 检测到分类器已经生成,跳过该阶段 ===')
92 |
93 | svm_result.reduce_result(dictionary, lsi_model, predictor, weibo_test)
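
if __name__ == '__main__':
    # Minimal usage sketch (assumes the lsi corpus already exists under path_tmp_lsi so a
    # classifier can be trained or loaded; the sample text below is made up):
    reduce_module(None, None, None, '这是一条待分类的演示微博')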
--------------------------------------------------------------------------------
/ml/svm_result.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 | from svm_utils import *
18 |
19 | path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪
20 | path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss11\\temp1' # 存放中间结果的位置
21 | path_dictionary = os.path.join(path_tmp, 'THUNews.dict')
22 | path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')
23 | path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus')
24 | path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl')
25 | path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl')
26 | n = 2 # n 表示抽样率, n抽1
27 | # def convert_doc_to_wordlist(str_doc,cut_all):
28 | # sent_list = str_doc.split('\n')
29 | # sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000
30 | # word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词
31 | # word_list = sum(word_2dlist,[])
32 | # return word_list
33 | # def rm_tokens(words): # 去掉一些停用次和数字
34 | # words_list = list(words)
35 | # stop_words = get_stop_words()
36 | # for i in range(words_list.__len__())[::-1]:
37 | # if words_list[i] in stop_words: # 去除停用词
38 | # words_list.pop(i)
39 | # elif words_list[i].isdigit():
40 | # words_list.pop(i)
41 | # return words_list
42 | # def get_stop_words(path='stopwords_cn.txt'):
43 | # file = open(path,'rb').read().split('\n')
44 | # return set(file)
45 | # def rm_char(text):
46 | # text = re.sub('\u3000','',text)
47 | # return text
48 | #
49 | # def svm_classify(train_set,train_tag,test_set,test_tag):
50 | #
51 | # clf = svm.LinearSVC()
52 | # clf_res = clf.fit(train_set,train_tag)
53 | # train_pred = clf_res.predict(train_set)
54 | # test_pred = clf_res.predict(test_set)
55 | #
56 | # train_err_num, train_err_ratio = checkPred(train_tag, train_pred)
57 | # test_err_num, test_err_ratio = checkPred(test_tag, test_pred)
58 | #
59 | # print('=== 分类训练完毕,分类结果如下 ===')
60 | # print('训练集误差: {e}'.format(e=train_err_ratio))
61 | # print('检验集误差: {e}'.format(e=test_err_ratio))
62 | #
63 | # return clf_res
64 | #
65 | #
66 | # def checkPred(data_tag, data_pred):
67 | # if data_tag.__len__() != data_pred.__len__():
68 | # raise RuntimeError('The length of data tag and data pred should be the same')
69 | # err_count = 0
70 | # for i in range(data_tag.__len__()):
71 | # if data_tag[i]!=data_pred[i]:
72 | # err_count += 1
73 | # err_ratio = err_count / data_tag.__len__()
74 | # return [err_count, err_ratio]
75 |
76 |
77 | def reduce_result(dictionary, lsi_model, predictor, weibo_test):
78 | # # # # 第五阶段, 对新文本进行判断
79 | if not dictionary:
80 | dictionary = corpora.Dictionary.load(path_dictionary)
81 | if not lsi_model:
82 | lsi_file = open(path_tmp_lsimodel,'rb')
83 | lsi_model = pkl.load(lsi_file)
84 | lsi_file.close()
85 | if not predictor:
86 | x = open(path_tmp_predictor,'rb')
87 | predictor = pkl.load(x)
88 | x.close()
89 | files = os.listdir(path_tmp_lsi)
90 | catg_list = []
91 | for file in files:
92 | t = file.split('.')[0]
93 | if t not in catg_list:
94 | catg_list.append(t)
95 |
96 | demo_doc = weibo_test
97 | print(demo_doc)
98 | demo_doc = list(jieba.cut(demo_doc,cut_all=False))
99 | demo_bow = dictionary.doc2bow(demo_doc)
100 | tfidf_model = models.TfidfModel(dictionary=dictionary)
101 | demo_tfidf = tfidf_model[demo_bow]
102 | demo_lsi = lsi_model[demo_tfidf]
103 | data = []
104 | cols = []
105 | rows = []
106 | for item in demo_lsi:
107 | data.append(item[1])
108 | cols.append(item[0])
109 | rows.append(0)
110 | demo_matrix = csr_matrix((data,(rows,cols))).toarray()
111 | x = predictor.predict(demo_matrix)
112 | print('分类结果为:{x}'.format(x=catg_list[x[0]]))
113 |
114 |
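if __name__ == '__main__':
    # Minimal usage sketch (assumes the dictionary, lsi model and predictor have already been
    # generated under path_tmp by the earlier pipeline stages; the sample text below is made up):
    reduce_result(None, None, None, '这是一条待分类的演示微博')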
--------------------------------------------------------------------------------
/ml/svm_tfidf.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import pandas as pd
7 | import re
8 | import numpy as np
9 | from gensim import corpora, models
10 | from scipy.sparse import csr_matrix
11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
12 | from sklearn import svm
13 | import numpy as np
14 | import os,re,time,logging
15 | import jieba
16 | import pickle as pkl
17 |
18 | import svm_lsi
19 | import os_path
20 | from svm_utils import *
21 |
22 | # path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪
23 | # path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss1\\temp1' # 存放中间结果的位置
24 | # path_dictionary = os.path.join(path_tmp, 'THUNews.dict')
25 | # path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')
26 | # path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus')
27 | # path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl')
28 | # path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl')
29 |
30 | corpus_lsi = None
31 | lsi_model = None
32 | predictor = None
33 |
34 |
35 | # def convert_doc_to_wordlist(str_doc,cut_all):
36 | # sent_list = str_doc.split('\n')
37 | # sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000
38 | # word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词
39 | # word_list = sum(word_2dlist,[])
40 | # return word_list
41 | #
42 | #
43 | # def rm_tokens(words): # 去掉一些停用次和数字
44 | # words_list = list(words)
45 | # stop_words = get_stop_words()
46 | # for i in range(words_list.__len__())[::-1]:
47 | # if words_list[i] in stop_words: # 去除停用词
48 | # words_list.pop(i)
49 | # elif words_list[i].isdigit():
50 | # words_list.pop(i)
51 | # return words_list
52 | #
53 | #
54 | # def get_stop_words(path='stopwords_cn.txt'):
55 | # file = open(path,'rb').read().split('\n')
56 | # return set(file)
57 | #
58 | #
59 | # def rm_char(text):
60 | # text = re.sub('\u3000','',text)
61 | # return text
62 |
63 |
64 | def reduce_tfidf(dictionary, weibo_test):
65 | corpus_tfidf = None
66 | # # # # 第二阶段, 开始将文档转化成tfidf
67 | if not os.path.exists(path_tmp_tfidf):
68 | print('=== 未检测到有tfidf文件夹存在,开始生成tfidf向量 ===')
69 | # 如果指定的位置没有tfidf文档,则生成一个。如果有,则跳过该阶段
70 | if not dictionary: # 如果跳过了第一阶段,则从指定位置读取词典
71 | dictionary = corpora.Dictionary.load(path_dictionary)
72 | os.makedirs(path_tmp_tfidf)
73 | files = os_path.LoadFiles(path_doc_root)
74 | tfidf_model = models.TfidfModel(dictionary=dictionary)
75 | corpus_tfidf = {}
76 | for i, msg in enumerate(files):
77 | catg = msg[0]
78 | file = msg[1]
79 | word_list = convert_doc_to_wordlist(file, cut_all=False)
80 | file_bow = dictionary.doc2bow(word_list)
81 | file_tfidf = tfidf_model[file_bow]
82 | tmp = corpus_tfidf.get(catg, [])
83 | tmp.append(file_tfidf)
84 | if tmp.__len__() == 1:
85 | corpus_tfidf[catg] = tmp
86 | # 将tfidf中间结果储存起来
87 | catgs = list(corpus_tfidf.keys())
88 | for catg in catgs:
89 | corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg),
90 | corpus_tfidf.get(catg),
91 | id2word=dictionary
92 | )
93 | print('catg {c} has been transformed into tfidf vector'.format(c=catg))
94 | print('=== tfidf向量已经生成 ===')
95 | else:
96 | print('=== 检测到tfidf向量已经生成,跳过该阶段 ===')
97 |
98 | svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test)
--------------------------------------------------------------------------------
/ml/svm_utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import print_function, division
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 | import os
7 | import re
8 | import jieba
9 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
10 | from sklearn import svm
11 |
12 | path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪
13 | path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss11\\temp1' # 存放中间结果的位置
14 | path_dictionary = os.path.join(path_tmp, 'THUNews.dict')
15 | path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus')
16 | path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus')
17 | path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl')
18 | path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl')
19 |
20 | def convert_doc_to_wordlist(str_doc,cut_all):
21 | sent_list = str_doc.split('\n')
22 | sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000
23 | word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词
24 | word_list = sum(word_2dlist,[])
25 | return word_list
26 |
27 |
28 | def rm_tokens(words): # 去掉一些停用次和数字
29 | words_list = list(words)
30 | stop_words = get_stop_words()
31 | for i in range(words_list.__len__())[::-1]:
32 | if words_list[i] in stop_words: # 去除停用词
33 | words_list.pop(i)
34 | elif words_list[i].isdigit():
35 | words_list.pop(i)
36 | return words_list
37 |
38 |
39 | def get_stop_words(path='stopwords_cn.txt'):
40 | file = open(path,'rb').read().split('\n')
41 | return set(file)
42 |
43 |
44 | def rm_char(text):
45 | text = re.sub('\u3000','',text)
46 | return text
47 |
48 |
49 | def svm_classify(train_set, train_tag, test_set, test_tag):
50 | clf = svm.LinearSVC()
51 | clf_res = clf.fit(train_set, train_tag)
52 | train_pred = clf_res.predict(train_set)
53 | test_pred = clf_res.predict(test_set)
54 |
55 | train_err_num, train_err_ratio = checkPred(train_tag, train_pred)
56 | test_err_num, test_err_ratio = checkPred(test_tag, test_pred)
57 |
58 | print('=== 分类训练完毕,分类结果如下 ===')
59 | print('训练集误差: {e}'.format(e=train_err_ratio))
60 | print('检验集误差: {e}'.format(e=test_err_ratio))
61 |
62 | return clf_res
63 |
64 |
65 | def checkPred(data_tag, data_pred):
66 | if data_tag.__len__() != data_pred.__len__():
67 | raise RuntimeError('The length of data tag and data pred should be the same')
68 | err_count = 0
69 | for i in range(data_tag.__len__()):
70 | if data_tag[i]!=data_pred[i]:
71 | err_count += 1
72 | err_ratio = err_count / data_tag.__len__()
73 | return [err_count, err_ratio]
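
if __name__ == '__main__':
    # Minimal self-test (made-up sentence; assumes stopwords_cn.txt sits next to this file):
    print(convert_doc_to_wordlist('这是一条用于演示分词的微博文本', cut_all=False))
    print(checkPred([0, 1, 1, 0], [0, 1, 0, 0]))  # expected: [1, 0.25]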
--------------------------------------------------------------------------------
/ml/values.csv:
--------------------------------------------------------------------------------
1 | uid,similarity,platform,reputation,entropy
2 | 6034008530,0.0,4,0.31343283582089554,1.5167044950347657
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from weibospider import WeiboClient
3 | import sys
4 | reload(sys)
5 | sys.setdefaultencoding('utf-8')
6 |
7 | #cookies = \
8 | #'ALF=1504271525; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgREMFA0bd3IqV3weK9ydf5NAbDXjKT69Rfndb2m9Ah4I.; SUB=_2A250hb_2DeRhGeNH7VIV9izNwj2IHXVXicG-rDV6PUNbktANLRPhkW1ZeSLr49kFNMgwrWThnh1bPUhWPw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5KMhUgL.Fo-4So5XSozp1K22dJLoI0YLxK.LB.-L1K.LxKML12qLBK5LxKqL1KBLBo.LxK.LB-BL1KBLxKBLB.2LB.2LxK-LBonL1heLxKqLB-eLBKMt; SUHB=03oF_eQuesy4xQ; SSOLoginState=1501679526; _T_WM=544d051d212d2d6f3adece8b6949b373'
9 |
10 | cookies = 'ALF=1512959361; SCF=AlGHrwmWqyhSdpml9a836b5TfwBwT3_aqlPQLm4VGPX5AnF7W-51O8sb-246XgliUA_jtEUQg3I0XisboShzSK4.; SUB=_2A253Ah7JDeRhGeNH7VIV9izNwj2IHXVUDKKBrDV6PUJbktAKLUehkW02ueHV00_NzZ0DwjSbUFYBB6B69g..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5K-hUgL.Fo-4So5XSozp1K22dJLoI0YLxK.LB.-L1K.LxKML12qLBK5LxKqL1KBLBo.LxK.LB-BL1KBLxKBLB.2LB.2LxK-LBonL1heLxKqLB-eLBKMt; SUHB=0M2Flmef-A-AIV; SSOLoginState=1510370969; _T_WM=28de7b7a225087a87cbe9c2ad92df1ea'
11 |
12 | if __name__ == '__main__':
13 | pe0 = WeiboClient(cookies=cookies)
14 | pe2 = pe0.Weibo('E6iRJofK6')
15 | pe4 = pe0.People('1884866222')
16 | print(pe4)
17 | print(pe4.name)
18 | print(pe4.weibo_count)
19 | print(pe4.location)
20 | pe2_people = pe2.author
21 | print('______test________')
22 | print(pe2_people.name)
23 | print(pe2_people.weibo_count)
24 | for i, j in zip(range(3), pe2_people.weibo):
25 | print(j.text)
26 | for ii, jj in zip(range(4), j.repost):
27 | print(jj.author_name)
28 |
29 |
30 |
--------------------------------------------------------------------------------
/weibospider/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from .attitude import Attitude
4 | from .base import SinaBaseObject
5 | from .comment import Comment
6 | from .client import WeiboClient
7 | from .people import People
8 | from .repost import Repost
9 | from .weibo import Weibo
10 |
11 | __all__ = [
12 | 'Attitude', 'SinaBaseObject', 'Comment', 'People', 'Repost', 'Weibo', 'WeiboClient'
13 | ]
--------------------------------------------------------------------------------
/weibospider/attitude.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | from utils import *
4 | import weibo
5 | from base import SinaBaseObject
6 | import sys
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 |
11 | class Attitude(SinaBaseObject):
12 | """
13 | 点赞类,一般不直接使用,而是作为`Answer.attitude`迭代器的返回类型
14 | """
15 |
16 | def __init__(self, id, cache={}):
17 | super(Attitude, self).__init__()
18 | self.uid = str(id)
19 | self._cache = cache
20 | self.author_name = cache['author_name']
21 | self.time = cache['time']
22 |
--------------------------------------------------------------------------------
/weibospider/base.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from __future__ import unicode_literals, print_function
4 | import requests
5 | import json
6 | import time as tt
7 | from bs4 import BeautifulSoup
8 | import sys
9 | import re
10 |
11 | reload(sys)
12 | sys.path.append('../')
13 | sys.setdefaultencoding('utf-8')
14 |
15 |
16 | # headers_for_get = {
17 | # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
18 | # 'Accept-Encoding':'gzip, deflate, sdch',
19 | # 'Accept-Language':'zh-CN,zh;q=0.8',
20 | # 'Cache-Control':'max-age=0',
21 | # 'Connection':'keep-alive',
22 | # 'Cookie': '_T_WM=0ff248d78f4984aa135c5b2e53c11079; ALF=1495630107; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgVvrJ48ic42g3Xqe49zEjKtpWuFcU6KaL2lKIyLzY43s.; SUB=_2A251-YQQDeRhGeNH7VIV9izNwj2IHXVXBSxYrDV6PUJbktBeLUn6kW0ntTSLDvUTciwLCGGI3rSIiDX8jQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5o2p5NHD95Qf1Kq7ShqEeK.pWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSK.cehBceo24eBtt; SUHB=0mxUEyUKiYW96L; SSOLoginState=1493038144',
23 | # 'Host':'weibo.cn',
24 | # 'Upgrade-Insecure-Requests':'1',
25 | # 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
26 | # }
27 |
28 |
29 | class SinaBaseObject(object):
30 | """
31 | 所有新浪类的基类
32 | :TODO 刷新cookie策略
33 | """
34 | # 静态变量cookies
35 | cookies = ''
36 | headers_for_get = {
37 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
38 | 'Accept-Encoding': 'gzip, deflate, sdch',
39 | 'Accept-Language': 'zh-CN,zh;q=0.8',
40 | 'Cache-Control': 'max-age=0',
41 | 'Connection': 'keep-alive',
42 | 'Host': 'weibo.cn',
43 | 'Upgrade-Insecure-Requests': '1',
44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
45 | }
46 | _session = requests.Session()
47 |
48 | def __init__(self):
49 | self._time_delay = 1
50 |
51 | # TODO:给session的get方法增加日志的功能
52 | # 对requests.get()函数进行改进,增加重试和报错功能
53 | def retry_requests(self, url, uid=''):
54 | """
55 | :param url: 待爬取的链接
56 | :param headers: 请求头
57 | :param uid: 帖子或用户的uid值(str类型)
58 | :return: requests_content 爬起的页面源码(bs4类型)
59 | """
60 | # 设置重试次数
61 | retry_count = 3
62 | while retry_count != 0:
63 | try:
64 | requests_get = requests.get(url, headers=self.headers_for_get, cookies=self.cookies, timeout=3)
65 | requests_content = requests_get.content
66 | # 加入了__T_WM的cookie项后仍然无法在wap版微博登录
67 | # 只能采用网页版
68 | # if self.flag == 0:
69 | # cookies = requests.utils.dict_from_cookiejar(self.cookies)
70 | # requests_get.cookies = requests.utils.dict_from_cookiejar(requests_get.cookies)
71 | # cookies = dict(cookies.items() + requests_get.cookies.items())
72 | # self.cookies = requests.utils.cookiejar_from_dict(cookies)
73 | # self.flag = 1
74 | # print(self.cookies)
75 | print(url)
76 | print(requests_content)
77 | print(requests_get.status_code)
78 | # 记录发起网络请求的url及时间
79 | with open('log_url.txt', 'a') as f:
80 | f.write(str(url) + ' ' + str(tt.strftime("%Y-%m-%d %H:%M:%S", tt.localtime())) + '\n')
81 |
82 | requests_content = BeautifulSoup(requests_content, "lxml")
83 | return requests_content
84 |             except Exception as e:
85 |                 tt.sleep(3)
86 |                 print("获取" + str(uid) + "页面时失败,正在重试。。。")
87 |                 print(e)
88 |             finally:
89 |                 retry_count -= 1
90 | 
91 |         raise Exception("重试次数已完,仍获取" + str(uid) + "的页面失败!")
92 |
93 | @staticmethod
94 | def retry_requests_static(url, headers={}, timeout=3):
95 | """
96 | :param url: 待爬取的链接
97 | :param headers: 请求头
98 | :param uid: 帖子或用户的uid值(str类型)
99 | :return: requests_content 爬起的页面源码(bs4类型)
100 | """
101 | # 设置重试次数
102 | retry_count = 3
103 | while retry_count != 0:
104 | try:
105 | tt.sleep(3)
106 | print(url)
107 | requests_get = requests.get(url, headers=headers, timeout=3)
108 | print(url)
109 | # 记录发起网络请求的url及时间
110 | with open('log_static_url.txt', 'a') as f:
111 | f.write(str(url) + ' ' + str(tt.strftime("%Y-%m-%d %H:%M:%S", tt.localtime())) + '\n')
112 | return requests_get
113 |             except Exception:
114 |                 tt.sleep(3)
115 |                 print("获取页面时失败,正在重试。。。")
116 |             finally:
117 |                 retry_count -= 1
118 | 
119 |         raise Exception("重试次数已完,仍获取页面失败!")
120 |
121 |
122 | if __name__ == '__main__':
123 | print(111)
124 |
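    # Minimal usage sketch (needs network access; weibo.cn may redirect or refuse requests
    # without valid cookies, so this only demonstrates the retry wrapper's call shape):
    demo_response = SinaBaseObject.retry_requests_static('http://weibo.cn', headers=SinaBaseObject.headers_for_get)
    print(demo_response.status_code)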
--------------------------------------------------------------------------------
/weibospider/client.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from __future__ import unicode_literals, print_function
3 | import requests
4 | from utils import *
5 | import os
6 | import importlib
7 | import base
8 |
9 |
10 | class WeiboClient(object):
11 | """
12 | 微博客户端类 维护网络会话 使用cookies登录
13 | """
14 |
15 | def __init__(self, cookies=None):
16 | self._session = requests.Session()
17 | self._session.verify = False
18 | self._session.headers.update(Default_Header)
19 | self._session.mount('http://', self._create_adapter())
20 | if cookies is not None:
21 | self.login_with_acquired_cookies(cookies)
22 | else:
23 |             raise ValueError("未传入cookies")
24 |
25 | @staticmethod
26 | def _create_adapter():
27 | return requests.adapters.HTTPAdapter(
28 | max_retries=requests.adapters.Retry(
29 | total=5,
30 | status_forcelist=[403, 404, 408, 500, 502],
31 | )
32 | )
33 |
34 | def login_with_acquired_cookies(self, cookies):
35 | """
36 | note:
37 | 保存在文件中的cookies形式为chrome浏览器F12后NetWork中Headers里的形式
38 | 如:
39 | 'ALF=1501159357; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgUozbT8VN9e7zDppTz6FZs5PD6E5VoJ3e0J
40 | yOHFF-HIw.; SUB=_2A250ViLtDeThGeBP4lQW-CbLyTqIHXVXuU6lrDV6PUJbktANLWLBkW2HmYSKxGkq2uS0728TOqfHWar_RQ..;
41 | SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhzhoVOn6pkLuGbnO5GBEu35JpX5o2p5NHD95QceK.cS0nRS0zcWs4DqcjMi--
42 | NiK.Xi-2Ri--ciKnRi-zNSo24SoMR1hMESntt; SUHB=0FQ7hD651l5Cff; _T_WM=55ac8f6c31f4eb6f286ad2e9ed8d729'
43 | """
44 | # 若文件目录下存在cookies,则其为文件, 打开后获取
45 | # 否则为cookies字符串,直接获取
46 | if os.path.isfile(cookies):
47 | with open(cookies, 'r') as f:
48 | cookies = f.read()
49 |
50 | cookies_dict = {}
51 | # 将cookies字符串转为字典
52 | for item in cookies.split('; '):
53 |             cookies_dict[item.split('=', 1)[0]] = item.split('=', 1)[1]
54 | self._session.cookies.update(cookies_dict)
55 | # cookies2 = requests.utils.cookiejar_from_dict(cookies_dict)
56 | base._session = self._session
57 |
58 | def __getattr__(self, item):
59 | """本函数为类工厂模式,用于获取各种类的实例,如 `Answer` `Question` 等.
60 | :支持的形式有:
61 | 1. client.me() (暂未实现)
62 | 2. client.weibo()
63 | 3. client.people()
64 | 4. client.comment()
65 | 5. client.attitude()
66 | 6. client.repost()
67 | 参数均为对应的id,返回对应的类的实例。
68 | """
69 | # 回调对应模块的构造函数
70 | base.SinaBaseObject._session = self._session
71 |
72 | def callback_getattr(id):
73 | # 类名第一个字母大写
74 | return getattr(module, item.capitalize())(id)
75 | # TODO: 增加me
76 | attr_list = ['me', 'weibo', 'people', 'comment', 'attitude', 'repost']
77 | if item.lower() in attr_list:
78 | module = importlib.import_module('.'+item.lower(), 'weibospider')
79 | return callback_getattr
80 |
81 |
82 | if __name__ == '__main__':
83 | a = WeiboClient(cookies='as=12')
84 |     a.weibo('666')
85 |
--------------------------------------------------------------------------------
/weibospider/comment.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | from utils import *
4 | import weibo
5 | from base import SinaBaseObject
6 | import sys
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 |
11 | class Comment(SinaBaseObject):
12 | """
13 | 评论类,一般不直接使用,而是作为`Answer.comment`迭代器的返回类型
14 | """
15 |
16 | def __init__(self, id, cache={}):
17 | super(Comment, self).__init__()
18 | self.uid = str(id)
19 | self._cache = cache
20 | self.attitude_count = cache['attitude_count']
21 | self.author_name = cache['author_name']
22 | self.author_uid = cache['author_uid']
23 | self.content = cache['text']
24 | self.is_hot = cache['is_hot']
25 | self.terminal_source = cache['terminal_source']
26 | self.text = cache['text']
27 | self.time = cache['time']
28 |
29 |
30 |
31 |
32 | # @property
33 | # @normal_attr
34 | # def _soup(self):
35 | # return self._cache['_soup']
36 | #
37 | # # 获取微博作者的昵称和uid
38 | # def _get_author_data(self):
39 | # # self.author_name = self._soup.find(attrs={'id': 'M_'}).div.a.get_text()
40 | # # self._cache.setdefault('author_name', self.author_name)
41 | #
42 | # self.author_uid = self._soup.find(attrs={'id': 'M_'}).div.a.attrs['href'].split('/')[-1]
43 | # self._cache.setdefault('author_uid', self.author_uid)
44 | #
45 | # @property
46 | # @other_obj(class_name='people', name_in_json='people')
47 | # def author(self):
48 | # pass
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/weibospider/people.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | import time as tt
4 | import pymongo
5 | from utils import *
6 | from bs4 import BeautifulSoup
7 | import re
8 | import requests
9 | import weibo
10 | from base import SinaBaseObject
11 | import sys
12 | reload(sys)
13 | sys.setdefaultencoding('utf-8')
14 | pattern = re.compile(r'\d+')
15 |
16 |
17 | class People(SinaBaseObject):
18 | """
19 | 新浪微博的用户类
20 | """
21 | def __init__(self, id, href=None, cache={}):
22 | """
23 |
24 | {
25 | uid: 5501547091,
26 | name: 助人为乐的英逸,
27 | fans_count: 285,
28 | follow_count: 1500,
29 | weibo_count: 1335,
30 | time_delay: 1,
31 | birthday: 未知,
32 | sex: 男,
33 | location: 江西,
34 | href: http://weibo.cn/5501547091/follow,
35 | :param uid:
36 | :param href:
37 | """
38 | super(People, self).__init__()
39 | self.uid = str(id)
40 | self._cache = cache
41 | self.href = href
42 | self.birthday = '未知'
43 | self.member_level = ''
44 | self.follow_list = []
45 | self.fans_list = []
46 | self.is_V = False
47 | self.uid = self.uid.strip('\n')
48 | if not self.href:
49 | self.href = 'http://weibo.cn/'+self.uid
50 | if not self.uid:
51 |             self.uid = self.href.split('cn/')[-1]
52 |
53 | @property
54 | def basic_url(self):
55 | return 'http://weibo.cn/u/' + str(self.uid)
56 |
57 | @property
58 | @normal_attr()
59 | def html(self):
60 | return self._session.get(self.href).content
61 |
62 | @property
63 | @normal_attr()
64 | def _soup(self):
65 | return BeautifulSoup(self.html, "lxml")
66 |
67 | @property
68 | @normal_attr()
69 | def _info_content(self):
70 | try:
71 | info_content = self._soup.find('div', attrs={'class': 'u'}).table.tr.findAll('td'
72 | )[1].div.span.contents[0]
73 | except AttributeError:
74 | return False
75 | return info_content
76 |
77 | @property
78 | @normal_attr()
79 | def _info_content_2(self):
80 | """
81 | 若用户为大V 则有大V标志的图片 影响页面标签
82 | 故此时个人信息页面块实际为`_info_content_2`
83 | """
84 | return self._soup.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.get_text()
85 |
86 | @property
87 | @normal_attr()
88 | def name(self):
89 | return self._info_content.split(' ')[0].strip()
90 |
91 | @property
92 | @normal_attr()
93 | def sex(self):
94 | try:
95 | sex = self._info_content.split(' ')[1].split('/')[0].strip()
96 | except IndexError:
97 | sex = self._info_content_2.split('/')[0].strip()[-1:].strip()
98 | return sex
99 |
100 | @property
101 | @normal_attr()
102 | def location(self):
103 | try:
104 | location = self._info_content.split(' ')[1].split('/')[1].strip()
105 | except IndexError:
106 | # 将大V标志为真
107 | self.is_V = True
108 | location = self._info_content_2.split('/')[1].strip()[:3].strip()
109 | return location
110 |
111 | @property
112 | @normal_attr()
113 | def weibo_count(self):
114 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}).
115 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[0])
116 |
117 | @property
118 | @normal_attr()
119 | def follow_count(self):
120 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}).
121 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[1])
122 |
123 | @property
124 | @normal_attr()
125 | def fans_count(self):
126 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}).
127 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[2])
128 |
129 | def _get_member_list(self, target_member_type):
130 | """
131 | 获取所指定的当前用户的关注/粉丝列表
132 | 每个被关注者或粉丝的信息存储在dict中
133 | :param required_member_count: 指定获取用户的数量
134 | :param time_delay: 延迟时间
135 | :param target_member_type: 指定获取用户的种类:fans或follow
136 | :return: member_list: 存放已获取的用户列表
137 |
138 | """
139 | # TODO: 获取人物基本信息
140 | member_url = 'http://weibo.cn/' + str(self.uid) + '/' + str(target_member_type)
141 | self.href = member_url
142 | print("now is crawling " + str(member_url))
143 | page_count = 1
144 | now_page_count = 1
145 | is_first = True
146 | while True:
147 |
148 |             tt.sleep(self._time_delay)
149 | # 获取页面源码(bs4对象)
150 | requests_content = BeautifulSoup(self._session.get(member_url).content, "lxml")
151 |
152 | # 获取当前页的关注列表
153 | unit_list = requests_content.find_all('table')
154 | for i in unit_list:
155 | # 每个用户的信息以dict存储
156 | member = {}
157 | member['href'] = str(i.tr.td.a.attrs['href'])
158 | try:
159 | member['uid'] = i.tr.td.a.attrs['href'].split('u/')[1]
160 | except:
161 | member['uid'] = i.tr.td.a.attrs['href'].split('cn/')[1]
162 | member['name'] = i.tr.find_all('td')[1].a.get_text()
163 | # 正则匹配获取粉丝的粉丝数
164 | pattern = re.compile(r'\d+')
165 | # 若粉丝是大V,则多了一个图片标签
166 | try:
167 | member['is_v'] = False
168 | member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[2])[0])
169 | except:
170 | member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[3])[0])
171 | member['is_v'] = True
172 |
173 | yield member
174 |
175 | # 若是第一页,则获取总页数
176 | if is_first is True:
177 | # 若发现‘x/y页’ 则有不止一页
178 | if requests_content.find(attrs={'id': 'pagelist'}):
179 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
180 | page_count = page_count.split('/')[1]
181 | pattern = re.compile(r'\d+')
182 | page_count = int(re.findall(pattern, page_count)[0])
183 | else:
184 | return
185 | is_first = False
186 |
187 | now_page_count += 1
188 |             if now_page_count > page_count:
189 | return
190 |
191 | member_url = 'http://weibo.cn/' + str(self.uid)+'/'+str(target_member_type)+'?page=' + str(now_page_count)
192 |
193 | @property
194 | @normal_attr()
195 | def fans(self):
196 | """
197 | 获取当前用户的粉丝列表
198 | :param required_member_count: 限定获取的数量
199 | :param time_delay: 时间延迟
200 | :return: 指定数量的粉丝基本信息列表
201 | .. code-block:: python
202 | [
203 | {
204 | 'fans_count': 104,
205 | 'is_v' : False,
206 | 'href': 'http://weibo.cn/u/5977488639',
207 | 'uid': 5977488639,
208 | 'name': '小山环环1996'
209 | },
210 | {
211 | 'fans_count': 10,
212 | 'is_v' : False,
213 | 'href': 'http://weibo.cn/u/6187915152',
214 | 'uid': 6187915152,
215 | 'name': '08iCu京伯'
216 | },
217 | ]
218 | """
219 | for x in self._get_member_list(target_member_type='fans'):
220 | yield x
221 |
222 | @property
223 | @normal_attr()
224 | def follow(self):
225 | """
226 | 获取当前用户的关注列表
227 | :param required_member_count: 限定获取的数量
228 | :param time_delay: 时间延迟
229 | :return: 指定数量的关注基本信息列表
230 | .. code-block:: python
231 | [
232 | {
233 | 'fans_count': 104,
234 | 'is_v' : False,
235 | 'href': 'http://weibo.cn/u/5977488639',
236 | 'uid': 5977488639,
237 | 'name': '小山环环1996'
238 | },
239 | {
240 | 'fans_count': 10,
241 | 'is_v' : False,
242 | 'href': 'http://weibo.cn/u/6187915152',
243 | 'uid': 6187915152,
244 | 'name': '08iCu京伯'
245 | },
246 | ]
247 | """
248 | for x in self._get_member_list(target_member_type='follow'):
249 | yield x
250 |
251 | @property
252 | @other_obj()
253 | def weibo(self):
254 | """
255 | 获取指定用户的微博
256 | :param required_weibo_count: 所需的微博条数
257 | :param time_delay: 时间延迟
258 | :return: weibo_list 元素为SinaWeibo对象
259 | .. code-block:: python
260 | [
261 | {
262 | 'uid': 'EpO2KnAor',
263 | 'is_repost': False,
264 | 'text': '物是人非.',
265 | 'attitude_count' : 0,
266 | 'repost_count': 7,
267 | 'comment_count': 0,
268 | 'time': '01月08日 04:44'
269 | 'terminal_source': 'iPad mini'
270 | },
271 | {
272 | 'uid': 'EAJwkph8X',
273 | 'is_repost': False,
274 | 'text': '祝你生日快乐',
275 | 'attitude_count' : 0,
276 | 'repost_count': 0,
277 | 'comment_count': 1,
278 | 'time': '2016-12-30 23:34:34'
279 | 'terminal_source': '生日动态'
280 | },
281 | ]
282 | """
283 |
284 | weibo_url = self.basic_url
285 | page_count = 1
286 | now_page_count = 1
287 | is_first = True
288 | pattern = re.compile(r'\d+')
289 |
290 | while True:
291 | tt.sleep(self._time_delay)
292 | # 获取页面源码(bs4对象)
293 | requests_content = BeautifulSoup(self._session.get(weibo_url).content, "lxml")
294 | # 获取当前页的微博列表
295 | unit_list = requests_content.find_all('div', attrs={'class': 'c'})
296 | for i in unit_list:
297 | # 每个微博的信息以微博类SinaWeibo存储
298 | try:
299 | if str(i.attrs['id']) and str(i.attrs['id']).startswith('M'):
300 | weibo_uid = i.attrs['id'].split('_')[1]
301 | else:
302 | continue
303 | except:
304 | continue
305 |
306 | # 检查是否为转发的微博
307 | if len(i.div.find_all('span')) >= 2:
308 | is_repost = True
309 | else:
310 | is_repost = False
311 | # for c in i.div.find_all('span'):
312 | # if str(c.attrs['class']) == "['cmt']":
313 | # is_repost = True
314 | if is_repost:
315 | text = i.div.find_all('span')[0].get_text()+i.div.find_all('span')[1].get_text()
316 | else:
317 | text = i.div.span.get_text()
318 |
319 | # 有的微博处html格式不对
320 | try:
321 | attitude_count = int(re.findall(pattern, i.div.find_all('a')[-4].get_text())[0])
322 | repost_count = int(re.findall(pattern, i.div.find_all('a')[-3].get_text())[0])
323 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-2].get_text())[0])
324 | except IndexError:
325 | try:
326 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-3].get_text())[0])
327 | repost_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-4].get_text())[0])
328 | attitude_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-5].get_text())[0])
329 | except IndexError:
330 | attitude_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[0])
331 | repost_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[1])
332 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[2])
333 | # print(attitude_count, repost_count, comment_count)
334 | try:
335 | time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0]
336 | terminal_source = i.div.find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1]
337 | except IndexError:
338 | time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0]
339 | try:
340 | terminal_source = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1]
341 | except IndexError:
342 | terminal_source = '暂无'
343 | # print(time, terminal_source)
344 | weibo_cache = {
345 | "is_repost": is_repost,
346 | "text": text,
347 | "attitude_count": attitude_count,
348 | "repost_count": repost_count,
349 | "comment_count": comment_count,
350 | "time": time,
351 | "terminal_source": terminal_source
352 | }
353 | self.now_weibo_cache = weibo_cache
354 | self.now_weibo_uid = weibo_uid
355 | yield weibo.Weibo(id=weibo_uid, cache=weibo_cache)
356 | is_repost = False
357 |
358 | # 若是第一页,则获取总页数
359 | if is_first:
360 | # 若发现‘x/y页’ 则有不止一页
361 | if requests_content.find(attrs={'id': 'pagelist'}):
362 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
363 | page_count = page_count.split('/')[1]
364 | page_count = int(re.findall(pattern, page_count)[0])
365 | else:
366 | return
367 | is_first = False
368 |
369 | now_page_count += 1
370 | if now_page_count > page_count:
371 | return
372 |
373 | weibo_url = 'http://weibo.cn/u/' + str(self.uid) + '?page=' + str(now_page_count)
374 |
375 | # def get_weibo_list(self):
376 | # """
377 | # 获取指定用户的微博
378 | # :param required_weibo_count: 所需的微博条数
379 | # :param time_delay: 时间延迟
380 | # :return: weibo_list 元素为SinaWeibo对象
381 | # .. code-block:: python
382 | # [
383 | # {
384 | # 'uid': 'EpO2KnAor',
385 | # 'is_repost': False,
386 | # 'text': '物是人非.',
387 | # 'attitude_count' : 0,
388 | # 'repost_count': 7,
389 | # 'comment_count': 0,
390 | # 'time': '01月08日 04:44'
391 | # 'terminal_source': 'iPad mini'
392 | # },
393 | # {
394 | # 'uid': 'EAJwkph8X',
395 | # 'is_repost': False,
396 | # 'text': '祝你生日快乐',
397 | # 'attitude_count' : 0,
398 | # 'repost_count': 0,
399 | # 'comment_count': 1,
400 | # 'time': '2016-12-30 23:34:34'
401 | # 'terminal_source': '生日动态'
402 | # },
403 | # ]
404 | # """
405 | # required_weibo_count = self.required_weibo_count
406 | # weibo_url = self.basic_url
407 | # weibo_list = []
408 | # weibo_count = 0
409 | # page_count = 1
410 | # now_page_count = 1
411 | # is_first = True
412 | # pattern = re.compile(r'\d+')
413 | # while True:
414 | #
415 | # tt.sleep(self.time_delay)
416 | # # 获取页面源码(bs4对象)
417 | # requests_content = self.retry_requests(weibo_url, uid=self.uid)
418 | #
419 | # # 获取当前页的微博列表
420 | # unit_list = requests_content.find_all('div', attrs={'class': 'c'})
421 | # for i in unit_list:
422 | # # 每个微博的信息以微博类SinaWeibo存储
423 | # try:
424 | # if str(i.attrs['id']) and str(i.attrs['id']).startswith('M'):
425 | # weibo_uid = i.attrs['id'].split('_')[1]
426 | # else:
427 | # continue
428 | # except:
429 | # continue
430 | # weibo = sina_weibo.SinaWeibo(uid=weibo_uid, required_count=0)
431 | #
432 | # # 检查是否为转发的微博
433 | # for c in i.div.find_all('span'):
434 | # if str(c.attrs['class']) == "['cmt']":
435 | # weibo.is_repost = True
436 | # if weibo.is_repost:
437 | # weibo.text = i.div.find_all('span')[0].get_text()+i.div.find_all('span')[1].get_text()
438 | # else:
439 | # weibo.text = i.div.span.get_text()[1:]
440 | #
441 | # weibo.uid = weibo_uid
442 | #
443 | # # 有的微博处html格式不对
444 | # try:
445 | # weibo.attitude_count = int(re.findall(pattern, i.div.find_all('a')[-4].get_text())[0])
446 | # weibo.repost_count = int(re.findall(pattern, i.div.find_all('a')[-3].get_text())[0])
447 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-2].get_text())[0])
448 | # except IndexError:
449 | # print(weibo_uid)
450 | # print(weibo.author_uid)
451 | # try:
452 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-3].get_text())[0])
453 | # weibo.repost_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-4].get_text())[0])
454 | # weibo.attitude_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-5].get_text())[0])
455 | # except IndexError:
456 | # weibo.attitude_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[0])
457 | # weibo.repost_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[1])
458 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[2])
459 | # print(weibo.attitude_count, weibo.repost_count, weibo.comment_count)
460 | # try:
461 | # weibo.time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0]
462 | # weibo.terminal_source = i.div.find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1]
463 | # except IndexError:
464 | # print(i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text())
465 | # weibo.time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0]
466 | # try:
467 | # weibo.terminal_source = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1]
468 | # except IndexError:
469 | # weibo.terminal_source = '暂无'
470 | # print(weibo.time, weibo.terminal_source)
471 | # # 计数器加一
472 | # weibo_count += 1
473 | # # 若超过了要求获取的用户数量,则返回
474 | # if weibo_count > required_weibo_count:
475 | # return weibo_list
476 | # weibo_list.append(weibo)
477 | #
478 | # # 若是第一页,则获取总页数
479 | # if is_first:
480 | # # 若发现‘x/y页’ 则有不止一页
481 | # if requests_content.find(attrs={'id': 'pagelist'}):
482 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
483 | # page_count = page_count.split('/')[1]
484 | # page_count = int(re.findall(pattern, page_count)[0])
485 | # print(page_count)
486 | # else:
487 | # return weibo_list
488 | # is_first = False
489 | #
490 | # now_page_count += 1
491 | # if now_page_count > page_count:
492 | # break
493 | #
494 | # weibo_url = 'http://weibo.cn/u/' + str(self.uid) + '?page=' + str(now_page_count)
495 | #
496 | # return weibo_list
497 |
498 | # def get_personal_information(self):
499 | # """
500 | # 注:新浪有奇怪的BUG 带cookies访问http://weibo.cn/3193031501/info这类个人资料url时,总是File not found
501 | # 若不带cookies则不能访问该页
502 | # 所以只能获取个人主页简单的性别和地点信息
503 | #
504 | # @2017/06/12:
505 | # 新浪允许不带cookie访问某些页面,如某个微博页面
506 | # 而对另一些页面 如个人主页的详细情况,则有的用户需要cookie,有的不需要。
507 | # :return:
508 | # """
509 | # requests_content = self.retry_requests(self.href)
510 | # try:
511 | # info_content = requests_content.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.contents[0]
512 | # except AttributeError:
513 | # print(requests_content)
514 | # return False
515 | # # 此处split(' ')中的空格不是一般的空格 需要在原网页中复制
516 | # # 普通用户无图片标签
517 | # self.name = info_content.split(' ')[0].strip()
518 | # print(self.name)
519 | # try:
520 | # self.sex = info_content.split(' ')[1].split('/')[0].strip()
521 | # print(self.sex)
522 | # self.location = info_content.split(' ')[1].split('/')[1].strip()
523 | # print(self.name, self.sex, self.location)
524 | # except IndexError:
525 | # self.is_V = True
526 | # info2 = requests_content.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.get_text()
527 | # self.sex = info2.split('/')[0].strip()[-1:].strip()
528 | # print(self.sex)
529 | # self.location = info2.split('/')[1].strip()[:3].strip()
530 | # print(self.name, self.sex, self.location)
531 | #
532 | #     # Collect this user's weibo count, follow count and fans count
533 | # self.weibo_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}).
534 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[0])
535 | # self.follow_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}).
536 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[1])
537 | # self.fans_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}).
538 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[2])
539 | # print(self.weibo_count, self.follow_count, self.fans_count)
540 |
541 | # def __get_member_list__(self, target_member_type='fans'):
542 | #     """
543 | #     Fetch the follow/fans list of the current user
544 | #     Each followee or fan is stored as a dict
545 | #     :param required_member_count: number of users to fetch
546 | #     :param time_delay: delay between requests
547 | #     :param target_member_type: which list to fetch: 'fans' or 'follow'
548 | #     :return: member_list: list of the collected users
549 | #
550 | #
551 | #     """
552 | # required_member_count = self.required_member_count
553 | # member_url = 'http://weibo.cn/' + str(self.uid) + '/' + str(target_member_type)
554 | # self.href = member_url
555 | # print(member_url)
556 | # member_list = []
557 | # member_count = 0
558 | # page_count = 1
559 | # now_page_count = 1
560 | # is_first = True
561 | # while True:
562 | #
563 | # tt.sleep(self.time_delay)
564 | #         # Fetch the page source (as a bs4 object)
565 | # requests_content = self.retry_requests(member_url, uid=self.uid)
566 | #
567 | #         # Collect the member list on the current page
568 | # unit_list = requests_content.find_all('table')
569 | # for i in unit_list:
570 | #             # Each user's information is stored in a dict
571 | # member = {}
572 | # member['href'] = str(i.tr.td.a.attrs['href'])
573 | # try:
574 | # member['uid'] = i.tr.td.a.attrs['href'].split('u/')[1]
575 | # except:
576 | # member['uid'] = i.tr.td.a.attrs['href'].split('cn/')[1]
577 | # member['name'] = i.tr.find_all('td')[1].a.get_text()
578 | #             # Use a regex to extract this member's fans count
579 | #             pattern = re.compile(r'\d+')
580 | #             # If the member is a verified ("big V") user, there is an extra image tag
581 | # try:
582 | # member['is_v'] = False
583 | # member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[2])[0])
584 | # except:
585 | # member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[3])[0])
586 | # member['is_v'] = True
587 | # print(member['name'])
588 | # print(member['fans_count'])
589 | #             # Increment the counter
590 | # member_count += 1
591 | #             # Return once the requested number of members has been collected
592 | # if member_count > required_member_count:
593 | # return member_list
594 | # member_list.append(member)
595 | #
596 | #         # On the first page, read the total page count
597 | #         if is_first is True:
598 | #             # If the 'x/y页' pagination marker is found, there is more than one page
599 | # if requests_content.find(attrs={'id': 'pagelist'}):
600 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
601 | # page_count = page_count.split('/')[1]
602 | # pattern = re.compile(r'\d+')
603 | # page_count = int(re.findall(pattern, page_count)[0])
604 | # print(page_count)
605 | # else:
606 | # return member_list
607 | # is_first = False
608 | #
609 | # now_page_count += 1
610 | # if now_page_count >= page_count:
611 | # break
612 | #
613 | # member_url = 'http://weibo.cn/' + str(self.uid)+'/'+str(target_member_type)+'?page=' + str(now_page_count)
614 | # print(member_url)
615 | # print(self.uid)
616 | # print(target_member_type)
617 | # print("以上")
618 | #
619 | # return member_list
--------------------------------------------------------------------------------
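
The commented-out methods in people.py above all read the total page count from the 'x/y页' marker inside the div with id 'pagelist'. A minimal sketch of that step, assuming `soup` is a BeautifulSoup document of a weibo.cn list page (the helper name `get_total_pages` is hypothetical, not part of the repository):

import re

def get_total_pages(soup):
    # Return the total page count parsed from the 'x/y页' marker,
    # or 1 when the page has no pagelist div (single-page result).
    pagelist = soup.find(attrs={'id': 'pagelist'})
    if not pagelist:
        return 1
    marker = pagelist.form.div.contents[-1].strip()   # e.g. '1/25页'
    return int(re.findall(r'\d+', marker.split('/')[1])[0])
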
/weibospider/repost.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | from utils import *
4 | import weibo
5 | from base import SinaBaseObject
6 | import sys
7 | reload(sys)
8 | sys.setdefaultencoding('utf-8')
9 |
10 |
11 | class Repost(SinaBaseObject):
12 |     """
13 |     Repost class. Usually not instantiated directly; instances are yielded by the `Weibo.repost` iterator
14 |     """
15 |
16 |     def __init__(self, id, cache=None):
17 |         super(Repost, self).__init__()
18 |         self.uid = str(id)
19 |         self._cache = cache = cache or {}  # avoid sharing a mutable default dict between instances
20 | self.author_name = cache['author_name']
21 | self.text = cache['text']
22 |
--------------------------------------------------------------------------------
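
As the Repost docstring says, these objects are normally produced by iterating `Weibo.repost` rather than constructed by hand. A hedged usage sketch, assuming the package is importable as `weibospider` and that valid weibo.cn cookies are configured on the underlying session (the weibo id is the one used in the weibo.py docstring example):

from weibospider.weibo import Weibo

weibo = Weibo('F0Mg7a8Wh')
for repost in weibo.repost:           # yields Repost objects built from the parsed repost pages
    print(repost.author_name, repost.text)
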
/weibospider/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | from bs4 import BeautifulSoup
4 | import functools
5 | import importlib
6 |
7 | Default_Header = {
8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
9 | 'Accept-Encoding': 'gzip, deflate, sdch',
10 | 'Accept-Language': 'zh-CN,zh;q=0.8',
11 | 'Cache-Control': 'max-age=0',
12 | 'Connection': 'keep-alive',
13 | 'Host': 'weibo.cn',
14 | 'Upgrade-Insecure-Requests': '1',
15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 '
16 | 'Safari/537.36'
17 | }
18 |
19 |
20 | def check_cache(attr):
21 | def real(func):
22 | @functools.wraps(func)
23 | def wrapper(self):
24 | value = getattr(self, attr, None)
25 | if not value:
26 | value = func(self)
27 | setattr(self, attr, value)
28 | return value
29 | return wrapper
30 | return real
31 |
32 |
33 | def normal_attr(name_in_json=None):
34 |     """
35 |
36 |     This decorator does two things:
37 |
38 |     1. Marks the property as a regular attribute.
39 |     2. Pulls the attribute out of the object's data automatically,
40 |        preferring the cached value when one exists.
41 |     :param name_in_json: key of the attribute inside the `self._cache` json;
42 |                          defaults to the name of the decorated method.
43 |     """
44 | def actual_decorator(func):
45 | @functools.wraps(func)
46 | def inner(self, *args, **kwargs):
47 | name = name_in_json or func.__name__
48 | if self._cache and name in self._cache.keys():
49 | return self._cache[name]
50 | else:
51 | value = func(self, *args, **kwargs)
52 | self._cache.setdefault(name, value)
53 | return self._cache[name]
54 | return inner
55 | return actual_decorator
56 |
57 |
58 | def other_obj(class_name=None, name_in_json=None, module_filename=None):
59 |     """
60 |
61 |     This decorator does two things:
62 |
63 |     1. Marks the property as another object whose class derives from the base class.
64 |     2. Pulls the corresponding attribute out of the object's data and builds the required object from it.
65 |     :param class_name: name of the class to instantiate.
66 |     :param name_in_json: key of the attribute in the JSON.
67 |     :param module_filename: file name of the module that defines the class.
68 |     """
69 | def actual_decorator(func):
70 | @functools.wraps(func)
71 | def inner(self, *args, **kwargs):
72 | cls_name = class_name or func.__name__
73 | name = name_in_json or func.__name__
74 |
75 | obj_cls = get_class_from_name(cls_name, module_filename)
76 |
77 | request_obj = func(self, *args, **kwargs)
78 | # print(111111111111)
79 | # print(request_obj)
80 |
81 | if request_obj is None:
82 | if name == 'people':
83 | return obj_cls(self.author_uid, cache={'name': self.author_name})
84 | # if name == 'weibo':
85 | # return obj_cls(uid=self.now_weibo_uid, cache=self.now_weibo_cache)
86 | return request_obj
87 |
88 | return inner
89 |
90 | return actual_decorator
91 |
92 |
93 | def get_class_from_name(clsname=None, module_filename=None):
94 |     """
95 |
96 |     Take a class name and return the matching class object
97 |
98 |     :param clsname: class name
99 |     :param module_filename: module name
100 |     :return: the class in that module matching the given class name
101 |     """
102 | cls_name = clsname.capitalize() if clsname.islower() else clsname
103 | file_name = module_filename or cls_name.lower()
104 |
105 |     # Import the module that defines the class
106 | imported_module = importlib.import_module('.'+file_name, 'weibospider')
107 | # print(imported_module)
108 | # print(getattr(imported_module, cls_name))
109 |     # Return the class in that module matching the given class name
110 | return getattr(imported_module, cls_name)
111 | # except (ImportError, AttributeError):
112 | # raise TypeError(
113 | # 'Unknown weibo obj type [{}]'.format(clsname)
114 | # )
115 |
--------------------------------------------------------------------------------
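
To make the caching behaviour of `normal_attr` concrete: the first access to a decorated property calls the method and stores the result under the method's name in `self._cache`; later accesses return the cached value. A minimal, self-contained sketch (the `Example` class is hypothetical; only `normal_attr` comes from utils.py above, and the import assumes the package is importable as `weibospider`):

from weibospider.utils import normal_attr

class Example(object):
    def __init__(self, cache=None):
        self._cache = cache or {}

    @property
    @normal_attr()
    def answer(self):
        print('computing...')   # runs only on the first access
        return 42

e = Example()
print(e.answer)   # prints 'computing...' then 42
print(e.answer)   # 42 again, served from self._cache without recomputing

`other_obj` builds on the same cache together with `get_class_from_name`, which maps a lowercase attribute name such as 'people' to the `People` class imported from weibospider/people.py.
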
/weibospider/weibo.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from __future__ import unicode_literals, print_function
3 | import time as tt
4 | import bs4
5 | from bs4 import BeautifulSoup
6 | import re
7 | import requests
8 | from .utils import *
9 | from base import SinaBaseObject
10 | from comment import Comment
11 | import sys
12 | import people
13 | reload(sys)
14 | sys.setdefaultencoding('utf-8')
15 |
16 | pattern = re.compile(r'\d+')
17 |
18 |
19 | class Weibo(SinaBaseObject):
20 | """
21 |     Weibo (status) class for Sina Weibo
22 | {
23 | uid: F0Mg7a8Wh,
24 | author_uid: rmrb,
25 | is_repost: False,
26 | href: http://weibo.cn/comment/F0Mg7a8Wh,
27 | text: :【中国大学教学质量排行榜800强出炉!你的母校排多少?】近日,《2017中国大学评价研究报告》发布2017中国大学
28 | 教学质量排行榜。清华大学本科生和研究生教育教学和人才培养质量问鼎榜首,北大第2,复旦第3,南大第4,武大第5,
29 | 上海交大第6,浙大第7,人大第8,吉大第9,川大第10。戳↓你的学校第几名? [组图共9张]
30 | time: 04月29日 12:58,
31 | time_delay: 1,
32 | author_name: 人民日报,
33 | repost_count: 3910,
34 | attitude_count: 3076,
35 | comment_count: 3248,
36 | repost_list:
37 | [
38 | {
39 | u'text': ':',
40 | u'name': u'\u56db\u5ddd\u5927\u5b66'
41 | },
42 | {
43 | u'text': ':27[\xe6\x91\x8a\xe6\x89\x8b][\xe5\xbf\x83] //',
44 | u'name': u'\u674e\u5199\u610f'},
45 | {
46 | u'text': ':\xe5\xa4\xaa\xe6\x83\xa8\xe4\xba\x86\xef\xbc\x8c\xe5\x89\x8d56\
47 | xe4\xb8\xaa\xe9\x83\xbd\xe6\x98\xaf\xe4\xb8\x96\xe7\x95\x8c\xe7\x9f\xa5\xe5\x90\x8d... //',
48 | u'name': u'\u897f\u8d22\u975e\u5b98\u65b9\u65b0\u95fb\u4e2d\u5fc3'
49 | },
50 | ....
51 | ]
52 | comment_list:
53 | [
54 | {
55 | uid: C_4101856898497093,
56 | terminal_source: iPhone 6s,
57 | text: \u4eba\u6c11\u65e5\u62a5\u4e5f\u53d1\u8fd9\u79cd\u5546\u4e1a\u6027\u8d28\u7684\u5927\u5b66
58 | \u6392\u884c\u699c\u3002\u3002\u3002[\u62dc\u62dc][\u62dc\u62dc][\u62dc\u62dc]',
59 | time: 04\u670829\u65e5 13:05\xa0,
60 | attitude_count: 270,
61 | is_hot: True,
62 | name: M-never
63 | },
64 | ....
65 | ]
66 | hot_comment_list:
67 | [
68 | {
69 | uid: C_4101856898497093,
70 | terminal_source: iPhone 6s,
71 | text: \u4eba\u6c11\u65e5\u62a5\u4e5f\u53d1\u8fd9\u79cd\u5546\u4e1a\u6027\u8d28\u7684\u5927\u5b66
72 | \u6392\u884c\u699c\u3002\u3002\u3002[\u62dc\u62dc][\u62dc\u62dc][\u62dc\u62dc]',
73 | time: 04\u670829\u65e5 13:05\xa0,
74 | attitude_count: 270,
75 | is_hot: True,
76 | name: M-never
77 | },
78 | ....
79 | ]
80 | attitude_list:
81 | [
82 | {
83 | name: \u723d\u5cf0\u4e2b\u4e2b,
84 | time: 13\u5206\u949f\u524d
85 | },
86 | {
87 | name: \u8393\u5c7f,
88 | time: \u4eca\u5929 19:55
89 | },
90 | ....
91 | ]
92 |
93 | """
94 |     def __init__(self, id, cache=None):
95 |         super(Weibo, self).__init__()
96 |         self.uid = id
97 |         self._cache = cache if cache is not None else {}  # avoid sharing a mutable default dict between instances
98 | # self._session = session
99 | self.href = 'http://weibo.cn/comment/'+str(id)
100 | self.main_page_resource = ''
101 | self._get_author_data()
102 |         # Whether this weibo is a repost
103 |         self.is_repost = False
104 |         # Information about the weibo that this one reposted
105 | self.repost_location = ''
106 | self.repost_author_uid = ''
107 | self.repost_text = ''
108 | self.repost_reposted_count = 0
109 | self.terminal_source = ''
110 | self.location = ''
111 |
112 |         # Threat level
113 | self.threatened = 0
114 |
115 | @property
116 | @normal_attr()
117 | def html(self):
118 | return self._session.get('http://weibo.cn/repost/' + self.uid).content
119 |
120 | @property
121 | @normal_attr()
122 | def _soup(self):
123 | return BeautifulSoup(self.html, "lxml")
124 |
125 | @property
126 | @other_obj(name_in_json='people', class_name='people')
127 | def author(self):
128 | return None
129 |
130 | @property
131 | @normal_attr()
132 | def time(self):
133 | return self._soup.find(attrs={'id': 'M_'}).findAll('div')[1].span.get_text()
134 |
135 | @property
136 | @normal_attr()
137 | def text(self):
138 | """
139 |         The weibo's text content
140 | """
141 | if not self._soup.find(attrs={'id': 'M_'}):
142 | raise AttributeError("cookies失效或网络故障!")
143 | return self._soup.find(attrs={'id': 'M_'}).div.span.get_text()
144 |
145 | @property
146 | @normal_attr()
147 | def repost_count(self):
148 | """
149 |         :return: int, repost count
150 |         """
151 |         # On the wap version of the weibo page this markup is particularly irregular
152 | repost_number_node = self._soup.find(attrs={'id': 'rt'})
153 | try:
154 | repost_count = int(re.findall(pattern, repost_number_node.get_text())[0])
155 | except IndexError:
156 | print("获取转发数出错")
157 | repost_count = 0
158 | return repost_count
159 |
160 | @property
161 | @normal_attr()
162 | def comment_count(self):
163 | """
164 |         :return: int, comment count
165 |         """
166 |         # On the wap version of the weibo page this markup is particularly irregular
167 | try:
168 | comment_number_node = self._soup.find(attrs={'id': 'rt'}).next_sibling
169 | comment_count = int(re.findall(pattern, comment_number_node.get_text())[0])
170 | except IndexError:
171 | print("获取评论数出错")
172 | comment_count = 0
173 | return comment_count
174 |
175 | @property
176 | @normal_attr()
177 | def attitude_count(self):
178 |         # On the wap version of the weibo page this markup is particularly irregular
179 | try:
180 | attitude_number_node = self._soup.find(attrs={'id': 'rt'}).next_sibling.next_sibling
181 | attitude_count = int(re.findall(pattern, attitude_number_node.get_text())[0])
182 | except IndexError:
183 | print("获取点赞数出错")
184 | attitude_count = 0
185 | return attitude_count
186 |
187 |     # Fetch the weibo author's screen name and uid
188 | def _get_author_data(self):
189 | self.author_name = self._soup.find(attrs={'id': 'M_'}).div.a.get_text()
190 | self._cache.setdefault('author_name', self.author_name)
191 | self.author_uid = self._soup.find(attrs={'id': 'M_'}).div.a.attrs['href'].split('/')[-1]
192 | self._cache.setdefault('author_uid', self.author_uid)
193 |
194 |     def _get_attribute_item(self, target_attribute_type, target_attribute_function):
195 |         """
196 |         Page through http://weibo.cn/<target_attribute_type>/<uid> and yield parsed items one at a time.
197 |         :param target_attribute_type: 'comment', 'attitude' or 'repost'
198 |         :param target_attribute_function: parser applied to each unit on a page, e.g. _get_comment_list
199 |         :return: generator of the parsed objects
200 |
201 |         """
202 | attribute_url = 'http://weibo.cn/' + str(target_attribute_type) + '/' + str(self.uid)
203 | attribute_list = []
204 | page_count = 1
205 | now_page_count = 1
206 | is_first = True
207 | is_first_item = True
208 | pattern = re.compile(r'\d+')
209 | while True:
210 | # print("现在是评论第一页")
211 | tt.sleep(self._time_delay)
212 |             # Fetch the page source (as a bs4 object)
213 |             requests_content = BeautifulSoup(self._session.get(attribute_url).content, "lxml")
214 |
215 |             # Collect the item units (comments/reposts/attitudes) on the current page
216 |             unit_list = requests_content.find_all('div', attrs={'class': 'c'})
217 |             for i in unit_list:
218 |                 # Call the concrete parser to extract the content
219 |                 attribute = target_attribute_function(i)
220 | if attribute is False:
221 | continue
222 |                 # When fetching attitudes or reposts the first unit is the weibo author, so skip it
223 |                 if (target_attribute_type == 'attitude' or target_attribute_type == 'repost') and is_first_item:
224 | is_first_item = False
225 | continue
226 | yield attribute
227 |
228 |             # On the first page, read the total page count
229 |             if is_first:
230 |                 # If the 'x/y页' pagination marker is found, there is more than one page
231 | if requests_content.find(attrs={'id': 'pagelist'}):
232 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
233 | page_count = page_count.split('/')[1]
234 | page_count = int(re.findall(pattern, page_count)[0])
235 | # print(page_count)
236 | else:
237 | return
238 | is_first = False
239 |
240 | now_page_count += 1
241 | if now_page_count >= page_count:
242 | return
243 |
244 | attribute_url = 'http://weibo.cn/' + str(target_attribute_type) +'/' + str(self.uid) +'?&&page=' + \
245 | str(now_page_count)
246 |
247 | @staticmethod
248 | def _get_comment_list(unit):
249 | comment = {}
250 |         # If the unit has an id attribute starting with 'C', it is a comment
251 | try:
252 | if str(unit.attrs['id']).startswith('C'):
253 | comment['uid'] = str(unit.attrs['id'])
254 | else:
255 | return False
256 | except:
257 | return False
258 | comment['author_name'] = unit.a.get_text()
259 | comment['author_uid'] = str(str(unit.a.attrs['href']).split('/')[-1])
260 |         # Some users have a vanity domain instead of the /u/<uid> pattern, so the href is stored as well
261 | # comment['people'] = sina_people.SinaPeople(uid=str(unit.a.attrs['href']).split('/')[-1],
262 | # href='http://http://weibo.cn'+str(unit.a.attrs['href']))
263 |         # Check whether the comment carries the "hot" tag
264 | try:
265 | if str(unit.span.attrs['class']) == "['kt']":
266 | comment['is_hot'] = True
267 | else:
268 | comment['is_hot'] = False
269 | except:
270 | comment['is_hot'] = False
271 |
272 |         # Use a regex to extract the comment's like count
273 |         # The normal layout is `举报 赞[0] 回复 `
274 |         # If you have liked the comment yourself, the page shows `举报 已赞[1] 取消赞 回复 ` instead
275 |         # If the comment is your own, the page shows ` 举报 赞[0] 回复 删除 `
276 |         # so these cases need special handling
277 | try:
278 | comment['attitude_count'] = int(re.findall(pattern, unit.find_all('span', attrs={'class': 'cc'})[-2]
279 | .get_text())[0])
280 | except IndexError:
281 | try:
282 | comment['attitude_count'] = int(re.findall(pattern, unit.find_all('span', attrs={'class': 'cmt'})[0]
283 | .get_text())[0])
284 | except IndexError:
285 | comment['attitude_count'] = int(
286 | re.findall(pattern, unit.find_all('span', attrs={'class': 'cc'})[-3].get_text())[0])
287 |
288 |         # Comment body
289 |         comment['text'] = unit.find_all('span', attrs={'class': 'ctt'})[0].get_text()
290 |         # Comment timestamp
291 |         comment['time'] = unit.find_all('span', attrs={'class': 'ct'})[-1].get_text().split('来自')[0]
292 |         # Client/terminal the comment was posted from
293 | comment['terminal_source'] = unit.find_all('span', attrs={'class': 'ct'})[-1].get_text().split('来自')[1]
294 |
295 | return Comment(id=str(comment['uid']), cache=comment)
296 |
297 | @property
298 | @other_obj()
299 | def comment(self):
300 |         """
301 |         Generator over this weibo's comments.
302 |         Each yielded Comment is built from parsed comment data of roughly the form below.
303 |         :return: generator of Comment objects
304 |         .. code-block:: python
305 |             [
306 |                 {
307 |                     'uid': 'C_4100160336496887',
308 |                     'is_hot': False,
309 |                     'name' : '-猫猫站不稳-',
310 |                     'people': <__main__.SinaPeople object at 0x0000000003498BE0>,
311 |                     'time': 今天 20:44,
312 |                     'terminal_source': 'iPhone 6',
313 |                     'text': '稀罕你!',
314 |                     'attitude_count': 0
315 |                 },
316 |             ]
317 |         """
318 | for x in self._get_attribute_item('comment', self._get_comment_list):
319 | yield x
320 |
321 | @staticmethod
322 | def _get_attitude_list(unit):
323 | from attitude import Attitude
324 | attitude = {}
325 |         # Units that contain an <a> tag are attitude (like) entries
326 | try:
327 | attitude['author_name'] = unit.a.get_text()
328 | attitude['time'] = unit.span.get_text()
329 | # attitude['people'] = SinaPeople(uid=str(unit.a.attrs['href']).split('/')[-1],
330 | # href='http://weibo.cn' + str(unit.a.attrs['href']))
331 | except AttributeError:
332 | return False
333 | return Attitude(id=0, cache=attitude)
334 |
335 | @property
336 | @other_obj()
337 | def attitude(self):
338 | for x in self._get_attribute_item('attitude', self._get_attitude_list):
339 | yield x
340 |
341 | @staticmethod
342 | def _get_repost_list(unit):
343 | from repost import Repost
344 | repost = {}
345 | try:
346 | repost['author_name'] = unit.a.get_text()
347 |             tmp_sibling = unit.a.next_sibling
348 |             while not isinstance(tmp_sibling, bs4.element.NavigableString):
349 |                 tmp_sibling = tmp_sibling.next_sibling
350 |             repost['text'] = str(tmp_sibling)
351 | # repost['people'] = SinaPeople(uid=unit.a.attrs['href'].split('/')[-1],
352 | # href='http://weibo.cn/'+unit.a.attrs['href'])
353 | except AttributeError:
354 | return False
355 | return Repost(id=0, cache=repost)
356 |
357 | @property
358 | @other_obj()
359 |     # TODO: also collect the repost's time, terminal source, etc.
360 | def repost(self):
361 | for x in self._get_attribute_item('repost', self._get_repost_list):
362 | yield x
363 |
364 | # def get_text(self):
365 | # """
366 |     #     Fetch the weibo content
367 |     #     :return: the weibo text as a str
368 | # """
369 | # # if self.text != '':
370 | # # return self.text
371 | # if 1:
372 | # _retry_count = 3
373 | # while _retry_count > 0:
374 | # requests_content = self._soup
375 | # self.main_page_resource = requests_content
376 | # print(requests_content)
377 | # print("测试session的get方法")
378 | # try:
379 | # self.text = requests_content.find(attrs={'id': 'M_'}).div.span.get_text()
380 | # self.__get_author_data__()
381 | # _retry_count -= 1
382 | # break
383 | # except AttributeError:
384 | # _retry_count -= 1
385 | #
386 |     #         # Weibo attributes (repost count, like count, comment count)
387 |     #         # On the wap version this markup is particularly irregular
388 | # repost_number_node = requests_content.find(attrs={'id': 'rt'})
389 | # try:
390 | # self.repost_count = int(re.findall(pattern, repost_number_node.get_text())[0])
391 | # except IndexError:
392 | # self.repost_count = 0
393 | # try:
394 | # comment_number_node = repost_number_node.next_sibling
395 | # self.comment_count = int(re.findall(pattern, comment_number_node.get_text())[0])
396 | # except IndexError:
397 | # self.comment_count = 0
398 | # try:
399 | # attitude_number_node = comment_number_node.next_sibling
400 | # self.attitude_count = int(re.findall(pattern, attitude_number_node.get_text())[0])
401 | # except IndexError:
402 | # self.attitude_count = 0
403 | #
404 |     #     # Weibo publication time
405 | # #self.time = requests_content.find(attrs={'id': 'M_'}).findAll('div')[1].span.get_text()
406 | # return self.text
407 |
408 |
409 |
410 | # def __get_attribute_list__(self, target_attribute_type, target_attribute_fuction, required_attribute_count=8):
411 | # """
412 | #
413 | # :param target_attribute_type:
414 | # :param target_attribute_fuction:
415 | # :param required_attribute_count:
416 | # :return:
417 | # """
418 | # attribute_url = 'http://weibo.cn/' + str(target_attribute_type) + '/' + str(self.uid)
419 | # attribute_list = []
420 | # attribute_count = 0
421 | # page_count = 1
422 | # now_page_count = 1
423 | # is_first = True
424 | # pattern = re.compile(r'\d+')
425 | # while True:
426 | # print("现在是评论第一页")
427 | # tt.sleep(self.time_delay)
428 |     #         # Fetch the page source (as a bs4 object)
429 | # requests_content = self.retry_requests(attribute_url, uid=self.uid)
430 | #
431 |     #         # Collect the item units on the current page
432 | # unit_list = requests_content.find_all('div', attrs={'class': 'c'})
433 | # for i in unit_list:
434 |     #             # Call the concrete parser to extract the content
435 | # attribute = target_attribute_fuction(i)
436 | # if attribute is False:
437 | # continue
438 |     #             # Increment the counter
439 |     #             attribute_count += 1
440 |     #             # Return once the requested number of items has been collected
441 | # if attribute_count > required_attribute_count:
442 | # return attribute_list
443 | # attribute_list.append(attribute)
444 | #
445 |     #         # On the first page, read the total page count
446 |     #         if is_first:
447 |     #             # If the 'x/y页' pagination marker is found, there is more than one page
448 | # if requests_content.find(attrs={'id': 'pagelist'}):
449 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip()
450 | # page_count = page_count.split('/')[1]
451 | # page_count = int(re.findall(pattern, page_count)[0])
452 | # print(page_count)
453 | # else:
454 | # return attribute_list
455 | # is_first = False
456 | #
457 | # now_page_count += 1
458 | # if now_page_count >= page_count:
459 | # break
460 | #
461 | # attribute_url = 'http://weibo.cn/' + str(target_attribute_type) +'/' + str(self.uid) +'?&&page=' + \
462 | # str(now_page_count)
463 | #
464 | # return attribute_list
465 |
466 | if __name__ == '__main__':
467 | def a():
468 | return 1
469 | print(type(a))
--------------------------------------------------------------------------------
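
Putting the pieces of weibo.py together, a hedged end-to-end sketch: it assumes the package is importable as `weibospider` and that `SinaBaseObject` (defined in base.py, not shown here) supplies a logged-in requests session (`self._session`) and `self._time_delay`. The weibo id is the one from the class docstring.

from weibospider.weibo import Weibo

w = Weibo('F0Mg7a8Wh')
# Basic fields parsed from the weibo.cn wap pages for this weibo
print(w.author_name, w.time)
print(w.text)
print(w.repost_count, w.comment_count, w.attitude_count)

# The author is built lazily through the other_obj decorator from author_uid / author_name
print(w.author)

# w.comment, w.repost and w.attitude are generators that page through the wap site,
# yielding Comment, Repost and Attitude objects respectively.
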