├── .gitignore
├── .idea
│   ├── SinaWeiboSpider.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── __init__.py
├── ml
│   ├── __init__.py
│   ├── fenci.py
│   ├── get_machine_info.py
│   ├── get_user_info.py
│   ├── keyword.txt
│   ├── mingan_strip_equal.txt
│   ├── mingan_word.txt
│   ├── names.csv
│   ├── os_path.py
│   ├── random_forest.py
│   ├── realtime_random_weibo.py
│   ├── realtime_random_weibo_2.py
│   ├── realtime_user_fans_follower.py
│   ├── realtime_user_info.py
│   ├── realtime_user_relationship.py
│   ├── stopwords_cn.txt
│   ├── svm_dict.py
│   ├── svm_lsi.py
│   ├── svm_module.py
│   ├── svm_result.py
│   ├── svm_tfidf.py
│   ├── svm_utils.py
│   └── values.csv
├── test.py
└── weibospider
    ├── __init__.py
    ├── attitude.py
    ├── base.py
    ├── client.py
    ├── comment.py
    ├── people.py
    ├── repost.py
    ├── utils.py
    └── weibo.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 SuperSaiyanSSS
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SinaWeiboSpider
2 | A fairly complete Sina Weibo crawler, continuously improved. Stars are welcome~
3 |
4 | ## Installation
5 | `pip install weibospider`
6 |
7 | ## Features
8 |
9 | + Modeled on 77's zhihu-oauth: you supply a Weibo id to build an object of the corresponding class, then read the data you need from it.
10 | + Cookies have to be pasted in by hand (a short how-to follows below), because there is still no good way to log in to the WAP version of Weibo automatically: the encryption scheme is unknown and the CAPTCHA is hostile to humans.
11 | + Makes heavy use of decorators and generators, so the code can also serve as a learning reference.
12 |
13 | ## Quick example
14 |
15 | ```
16 | from weibospider import WeiboClient
17 |
18 | cookies = 'xxxxxxxxxxxx'  # log in at weibo.cn, open the F12 developer tools and copy the cookie from the Network tab
19 |
20 | myclient = WeiboClient(cookies)
21 |
22 | people_1 = myclient.people('1884866222')  # uid of the target user
23 |
24 | print(people_1.name)  # print people_1's username
25 |
26 | print(people_1.weibo_count)  # print the number of weibo people_1 has posted
27 |
28 | for index, weibo in zip(range(10), people_1.weibo):
29 |
30 |     print(weibo.text)  # print people_1's 10 most recent weibo
31 |
32 |     for index_2, comment in zip(range(5), weibo.comment):
33 |
34 |         print(comment.text)  # print the 5 most recent comments on this weibo
35 |
36 |         print(comment.author_name)  # print the author of this comment
37 |
38 |
39 | ```
40 |
41 | If you have questions, feel free to get in touch by email or QQ.
42 | (Updated 2024-02-12: some of the endpoints are no longer available.)
43 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .weibospider import (Attitude, SinaBaseObject, Comment, People,
3 |                           Repost, Weibo, WeiboClient)
4 |
5 | __all__ = [
6 |     'WeiboClient', 'Attitude', 'SinaBaseObject', 'Comment', 'People', 'Repost', 'Weibo'
7 | ]
--------------------------------------------------------------------------------
/ml/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperSaiyanSSS/SinaWeiboSpider/b034c20ccf062b1323046584712716b2794ec7ec/ml/__init__.py
--------------------------------------------------------------------------------
/ml/fenci.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import print_function
3 | import sys
4 | sys.path.append("..")
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
7 | import jieba
8 | # from a1 import sina_people
9 | # from a1 import sina_people
10 | # from a1 import sina_weibo
11 | # from a1 import base
12 | # from a1 import test1
13 | # from a1 import sina_store
14 | from bs4 import BeautifulSoup
15 | import requests
16 | import pymongo
17 | import re
18 |
19 | sys.path.append('../')
20 |
21 | import jieba
22 | import jieba.analyse
23 | from optparse import OptionParser
24 |
25 |
26 |
27 | def clean_keyword():
28 |     """
29 |     将敏感词转化为标准格式
30 |     :return:
31 |     """
32 |     word_list = []
33 |     with open('mingan_strip_equal.txt', 'r') as f:
34 |         for i in f.readlines():
35 |             if i != '':
36 |                 word_list.append(i.strip().strip('\n'))
37 |     # with open('xie.txt', 'r') as f:
38 |     #     for i in f.readlines():
39 |     #         if i.split('=')[0].strip().strip('\n'):
40 |     #             word_list.append(i.split('=')[0].strip().strip('\n'))
41 |     #
42 |     # with open('guang.txt', 'r') as f:
43 |     #     for i in f.readlines():
44 |     #         if i.split('=')[0].strip().strip('\n'):
45 |     #             word_list.append(i.split('=')[0].strip().strip('\n'))
46 |     #
47 |     # with open('huang.txt', 'r') as f:
48 |     #     for i in f.readlines():
49 |     #         if i.split('=')[0].strip().strip('\n'):
50 |     #             word_list.append(i.split('=')[0].strip().strip('\n'))
51 |     #
52 |     # with open('mingan.txt', 'r') as f:
53 |     #     for i in f.readlines():
54 |     #         if i.split('=')[0].strip().strip('\n'):
55 |
# word_list.append(i.split('=')[0].strip().strip('\n')) 56 | # 57 | # with open('mingan_9.txt','a') as f: 58 | # print(word_list) 59 | # for i in word_list: 60 | # if i: 61 | # b = repr(i) 62 | # try: 63 | # print(unicode(eval(b), "gbk")) 64 | # except: 65 | # continue 66 | # f.write(str(unicode(eval(b), "gbk"))+' '+'300'+'\n') 67 | 68 | 69 | def remove_equal(): 70 | """ 71 | 去除网上所得敏感词中的等号 72 | :return: 73 | """ 74 | count = 0 75 | target_list = [] 76 | with open('mingan_word.txt', 'r') as f: 77 | word_list = f.readlines() 78 | print(len(word_list)) 79 | for i in word_list: 80 | count += 1 81 | print(count) 82 | target_list.append(i.split(' ')[0]) 83 | with open('mingan_strip_equal.txt', 'w') as f: 84 | for i in target_list: 85 | f.write(i+'\n') 86 | 87 | 88 | class TestKeyword(object): 89 | """ 90 | 对传入的微博文本分词并检测是否含有敏感词 91 | """ 92 | def __init__(self): 93 | jieba.load_userdict("keyword.txt") 94 | jieba.load_userdict("mingan_word.txt") 95 | self.topK = 12 96 | self.mingan_list = [] 97 | self.get_mingan_list() 98 | 99 | def get_mingan_list(self): 100 | with open('mingan_strip_equal.txt', 'r') as f: 101 | word_list = f.readlines() 102 | for word in word_list: 103 | self.mingan_list.append(word.strip('\n')) 104 | 105 | def test_if_has_keyword(self, weibo_text): 106 | content = weibo_text 107 | tags = jieba.analyse.extract_tags(content, topK=self.topK) 108 | 109 | for tag in tags: 110 | if tag in self.mingan_list: 111 | print("6666666") 112 | print(content) 113 | print(tag) 114 | return True 115 | else: 116 | print("no") 117 | return False 118 | 119 | if __name__ == '__main__': 120 | sys.setdefaultencoding('utf-8') 121 | s = sina_store.SinaStore() 122 | s.weibo_table = s.db['realtime_weibo'] 123 | weibo_iter = s.get_stored_information() 124 | print(weibo_iter) 125 | 126 | count = 0 127 | while count < 400: 128 | weibo = next(weibo_iter) 129 | weibo_txt = weibo['text'] 130 | print(weibo_txt) 131 | jieba.load_userdict("keyword.txt") 132 | jieba.load_userdict("mingan_word.txt") 133 | file_name = 'mm.txt' 134 | 135 | topK = 12 136 | 137 | content = weibo_txt 138 | tags = jieba.analyse.extract_tags(content, topK=topK) 139 | 140 | # print(",".join(tags)) 141 | 142 | mingan_list = [] 143 | with open('mingan_strip_equal.txt', 'r') as f: 144 | word_list = f.readlines() 145 | print(len(word_list)) 146 | for i in word_list: 147 | mingan_list.append(i.strip('\n')) 148 | for i in tags: 149 | if i in mingan_list: 150 | print("6666666") 151 | print(content) 152 | 153 | count += 1 154 | 155 | # seg_list = jieba.cut(content) 156 | # print(", ".join(seg_list)) 157 | 158 | 159 | # with open('mm.txt','r') as f: 160 | # s = "".join(f.readlines()) 161 | # seg_list = jieba.cut(s, cut_all=True) 162 | # 163 | # print("Full Mode:", "/ ".join(seg_list)) 164 | # seg_list = jieba.cut(s, cut_all=False) 165 | # print("Default Mode:", "/ ".join(seg_list)) 166 | # seg_list = jieba.cut(s) 167 | # print(", ".join(seg_list)) 168 | # seg_list = jieba.cut_for_search(s) 169 | # print(", ".join(seg_list)) 170 | 171 | -------------------------------------------------------------------------------- /ml/get_machine_info.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals, print_function 2 | import sys 3 | sys.path.append("..") 4 | reload(sys) 5 | sys.setdefaultencoding('utf-8') 6 | from a1 import sina_people 7 | from a1 import sina_people 8 | from a1 import sina_weibo 9 | from a1 import base 10 | from a1 import test1 11 | from a1 import sina_store 12 | import 
time as tt 13 | from bs4 import BeautifulSoup 14 | import requests 15 | import pymongo 16 | import re 17 | 18 | headers_2 = { 19 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 20 | 'Accept-Encoding':'gzip, deflate, sdch', 21 | 'Accept-Language':'zh-CN,zh;q=0.8', 22 | 'Cache-Control':'max-age=0', 23 | 'Connection':'keep-alive', 24 | 'Cookie': '_T_WM=0ff248d78f4984aa135c5b2e53c11079; ALF=1496373314; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgM7IlYI27IV6TA5-eb6avSBhK-q5migy9jGYZkeqPPpU.; SUB=_2A250DTviDeThGeBP4lQW-CbFyj6IHXVXDkWqrDV6PUJbktBeLWLAkW1fCr2k7XOfWxI9AQSa5M6kQfvxPg..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWC9U1RTKpYdAAz2GZeMbFX5JpX5o2p5NHD95QceK.cS0nR1K2EWs4DqcjSH.ieC0-R-.R7HK.R1Btt; SUHB=04W-CMkuo5eJq_; SSOLoginState=1493781426', 25 | 'Host':'weibo.cn', 26 | 'Upgrade-Insecure-Requests':'1', 27 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' 28 | } 29 | 30 | 31 | def get_machine_personal_info(): 32 | s = sina_store.SinaStore() 33 | s.weibo_table = s.db['machine_personal_info'] 34 | with open('machine_uid.txt','r') as f: 35 | for i in f.readlines(): 36 | if i!='': 37 | print(i) 38 | pe = sina_people.SinaPeople(i) 39 | s.store_in_mongodb(pe) 40 | 41 | 42 | 43 | 44 | if __name__ == '__main__': 45 | dic_c = {} 46 | str_c = headers_2['Cookie'] 47 | for i in str_c.split('; '): 48 | dic_c[i.split('=')[0]] = i.split('=')[1] 49 | cookies2 = requests.utils.cookiejar_from_dict(dic_c) 50 | base.SinaBaseObject.cookies = cookies2 51 | if 1: 52 | dic_c = {} 53 | str_c = headers_2['Cookie'] 54 | for i in str_c.split('; '): 55 | dic_c[i.split('=')[0]] = i.split('=')[1] 56 | cookies2 = requests.utils.cookiejar_from_dict(dic_c) 57 | base.SinaBaseObject.cookies = cookies2 58 | 59 | # for i in range(0,21): 60 | # if 61 | print(cookies2) 62 | with open('machine_uid.txt','r') as f: 63 | uid = f.readlines() 64 | print(len(uid)) 65 | # get_machine_personal_info() 66 | 67 | 68 | # a = requests.get('https://weibo.cn/2318253071/fans?page=1', cookies=cookies2) 69 | # a = BeautifulSoup(a.content, "lxml") 70 | # unit = a.findAll('div', attrs={'class': 'c'})[1] 71 | # print(unit) 72 | # unit_list = unit.findAll('table') 73 | # print(unit_list) 74 | # uid_list = [] 75 | # 76 | # for i in unit_list: 77 | # print(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1]) 78 | # uid_list.append(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1]) 79 | # 80 | # for j in range(2,5): 81 | # tt.sleep(4) 82 | # a = requests.get('https://weibo.cn/2318253071/fans?page='+str(j), cookies=cookies2) 83 | # a = BeautifulSoup(a.content, "lxml") 84 | # unit = a.findAll('div', attrs={'class': 'c'})[1] 85 | # unit_list = unit.findAll('table') 86 | # for i in unit_list: 87 | # print(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1]) 88 | # uid_list.append(str(i.tr.findAll('td')[1].a.attrs['href']).split('/')[-1]) 89 | # 90 | # with open('machine_uid.txt','a') as f: 91 | # for i in uid_list: 92 | # f.write(i+'\n') 93 | # get_human_personal_info() 94 | # a = requests.get('http://weibo.cn/u/5195713909') 95 | # print(a.content) 96 | # pe = sina_people.SinaPeople('6021561452') 97 | # pe = sina_weibo.SinaWeibo('F16aup9Im') 98 | # we = sina_weibo.SinaWeibo('F15Kpbev2') 99 | # for name, value in vars(we).items(): 100 | # print(name, value) 101 | # c_set = set() 102 | # s = sina_store.SinaStore() 103 | # s.weibo_table = s.db['try2'] 104 | # rmrb = s.get_human_info() 105 | # comment_list = 
rmrb['comment_list'] 106 | # for name, value in comment_list.items(): 107 | # author_uid = value['author_uid'] 108 | # c_set.add(str(author_uid)) 109 | 110 | # with open('human_uid.txt','a') as f: 111 | # for i in c_set: 112 | # f.write(i+'\n') 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /ml/get_user_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | from __future__ import unicode_literals, print_function 3 | from __future__ import division 4 | import math 5 | import sys 6 | sys.path.append("..") 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | import datetime 10 | import difflib 11 | from a1 import sina_people 12 | from a1 import sina_weibo 13 | from a1 import base 14 | from a1 import test1 15 | from a1 import sina_store 16 | from bs4 import BeautifulSoup 17 | import requests 18 | import time as tt 19 | import pymongo 20 | import re 21 | 22 | 23 | headers_for_baidu = { 24 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 25 | 'Accept-Encoding':'gzip, deflate, sdch', 26 | 'Accept-Language':'zh-CN,zh;q=0.8', 27 | 'Cache-Control':'max-age=0', 28 | 'Connection':'keep-alive', 29 | 'Host':'www.baidu.com', 30 | 'Upgrade-Insecure-Requests':'1', 31 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' 32 | } 33 | 34 | 35 | # def get_human_personal_info(): 36 | # s = sina_store.SinaStore() 37 | # s.weibo_table = s.db['human_personal_info'] 38 | # with open('human_uid.txt','r') as f: 39 | # for i in f.readlines(): 40 | # if i!='': 41 | # print(i) 42 | # pe = sina_people.SinaPeople(i) 43 | # s.store_in_mongodb(pe) 44 | 45 | 46 | # 将时间转换为标准格式 47 | def clean_time(now_time): 48 | if now_time.startswith('今'): 49 | now_time = datetime.datetime(2017, 5, 3) 50 | elif "分钟" in now_time: 51 | now_time = datetime.datetime(2017, 5, 3) 52 | elif "月" in now_time: 53 | month = int(now_time.split("月")[0][-2:]) 54 | day = int(now_time.split("日")[0][-2:]) 55 | now_time = datetime.datetime(2017, month, day) 56 | else: 57 | year = int(now_time.split('-')[0]) 58 | month = int(now_time.split('-')[1]) 59 | day = int(now_time.split('-')[2][:3]) 60 | now_time = datetime.datetime(year, month, day) 61 | return now_time 62 | 63 | 64 | # 最终修正条件信息熵计算公式 65 | # TODO:论文给定的公式有问题 计算得出的离散有限序列的条件熵不满足非负性 66 | def final_calculation_formula(space_list): 67 | lence = len(space_list) 68 | if lence < 2: 69 | raise IndexError+"时间间隔个数应至少2个!" 
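# The loop below implements the "corrected conditional entropy" feature noted above:
# for every prefix length m (2 <= m < len(space_list)) it computes the Shannon entropy
#     H_m = -sum(p * ln(p))
# of the first m posting intervals via calculation_formula(), collects perc(X_m) -- the
# share of interval values that occur only once in that prefix -- via calculate_perc_xm(),
# and finally returns min over m of (H_{m+1} - H_m + perc(X_{m+1}) * H_2).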
70 | entropy_list = [] 71 | scale_list = [] 72 | is_first = True 73 | end_seq = 2 74 | while end_seq < lence: 75 | local_lence = end_seq 76 | entropy = calculation_formula(space_list[:end_seq], local_lence) 77 | entropy_list.append(entropy) 78 | if is_first: 79 | is_first = False 80 | else: 81 | scale_list.append(calculate_perc_xm(space_list[:end_seq], local_lence)) 82 | end_seq += 1 83 | result_list = [] 84 | for i in range(len(entropy_list)-1): 85 | result_list.append(entropy_list[i+1]-entropy_list[i]+scale_list[i]*entropy_list[0]) 86 | print(result_list) 87 | print(scale_list) 88 | return min(result_list) 89 | 90 | 91 | # pers(Xm):长度为 m 的序列里面只出现过一次的序列所占的比例 92 | def calculate_perc_xm(space_list, lence): 93 | only_count = 0 94 | print(space_list) 95 | for i in range(lence): 96 | for j in range(i+1, lence): 97 | if space_list[i] == space_list[j]: 98 | break 99 | if j == lence-1: 100 | only_count += 1 101 | scale = only_count/lence 102 | return scale 103 | 104 | 105 | # 计算信息熵 106 | def calculation_formula(space_list, lence): 107 | probability_list = [] 108 | while space_list: 109 | item_count = space_list.count(space_list[0]) 110 | # 由于引入了Python 3.x的除法规则,不会取整 111 | probability_list.append(item_count/lence) 112 | space_list = filter(lambda x: x != space_list[0], space_list) 113 | entropy = 0 114 | for p in probability_list: 115 | entropy += p*math.log(p) 116 | entropy = -entropy 117 | return entropy 118 | 119 | 120 | # 获取信息熵 121 | def get_entropy_of_information(person_dict): 122 | space_list = [] 123 | weibo_list_lence = len(person_dict['weibo_list']) 124 | print(weibo_list_lence) 125 | # 有的原创微博太少 舍去该样本 126 | try: 127 | last_time = str(person_dict['weibo_list']['1']['time']) 128 | last_time = clean_time(last_time) 129 | except KeyError: 130 | return -1 131 | for i in range(weibo_list_lence-2): 132 | now_time = str(person_dict['weibo_list'][str(i+2)]['time']) 133 | now_time = clean_time(now_time) 134 | space_list.append((last_time - now_time).days) 135 | print((last_time-now_time).days) 136 | last_time = now_time 137 | 138 | entropy = calculation_formula(space_list, len(space_list)) 139 | print(entropy) 140 | return entropy 141 | 142 | # while 1: 143 | # try: 144 | # print(next(a)) 145 | # except StopIteration: 146 | # break 147 | 148 | 149 | # 获取用户信誉度 150 | def get_reputation(person_dict): 151 | try: 152 | fans_count = int(person_dict['fans_count']) 153 | follow_count = int(person_dict['follow_count']) 154 | reputation = fans_count/(fans_count+follow_count) 155 | except ValueError: 156 | return -1 157 | return reputation 158 | 159 | 160 | # 获取发表微博的平台种类数量 161 | def get_num_of_platform(person_dict): 162 | platform_set = set() 163 | for item in person_dict['weibo_list']: 164 | platform_set.add(str(person_dict['weibo_list'][str(item)]['terminal_source']).strip()) 165 | return len(platform_set) 166 | 167 | 168 | # 检查原创微博是否过少 偶然性影响较大 169 | def check_if_too_little(person_dict): 170 | repost_count = 0 171 | total_count = len(person_dict['weibo_list']) 172 | for item in person_dict['weibo_list']: 173 | if str(person_dict['weibo_list'][str(item)]['is_repost'])=='True': 174 | print(person_dict['weibo_list'][str(item)]['is_repost']) 175 | repost_count += 1 176 | continue 177 | if total_count-repost_count < 3: 178 | return -1 179 | return 0 180 | 181 | 182 | # 获取发表微博的内容相似度 183 | def get_similarity_of_content(person_dict): 184 | """ 185 | 利用百度搞基搜索的site:(weibo.com) 查找是否存在重复微博 186 | :param person_dict: 用户信息的字典 187 | :return: 内容相似度 188 | """ 189 | identical_count = 0 190 | total_count = 0 191 | repost_count 
= 0 192 | if check_if_too_little(person_dict) == -1: 193 | return -1 194 | for item in person_dict['weibo_list']: 195 | if str(person_dict['weibo_list'][str(item)]['is_repost'])=='True': 196 | print(person_dict['weibo_list'][str(item)]['is_repost']) 197 | repost_count += 1 198 | continue 199 | else: 200 | total_count += 1 201 | 202 | if total_count >= 15: 203 | break 204 | copy_test_1 = False 205 | copy_test_2 = False 206 | 207 | text = str(person_dict['weibo_list'][str(item)]['text']) 208 | print(person_dict['weibo_list'][str(item)]['href']) 209 | q1 = text 210 | print(q1) 211 | 212 | re_emotion = re.compile('(\[.*?\])') # 去除微博表情文字 213 | q1 = re_emotion.sub('', q1) 214 | q1_list = re.split('!|!|,|。|……|:|、|,|,|;|;|——', unicode(q1)) # 按标点符号分割 215 | 216 | # 选择最大和第二大子字符串 217 | max_len = 'x' 218 | for string_seq in q1_list: 219 | if len(string_seq)> len(max_len): 220 | max_len = string_seq 221 | print(max_len) 222 | second_len = 'x' 223 | for string_seq in q1_list: 224 | if len(string_seq) > len(second_len) and string_seq != max_len: 225 | second_len = string_seq 226 | q1 = max_len 227 | q1_2 = second_len 228 | 229 | url = 'https://www.baidu.com/s?q1='+q1+'&q2=&q3=&q4=&rn=10&lm=0&ct=0&ft=&q5=&q6=weibo.com&tn=baiduadv' 230 | url_2 = 'https://www.baidu.com/s?q1='+q1_2+'&q2=&q3=&q4=&rn=10&lm=0&ct=0&ft=&q5=&q6=weibo.com&tn=baiduadv' 231 | # baidu_requests = requests.get(url, headers=headers_for_baidu, timeout=3) 232 | baidu_requests = base.SinaBaseObject.retry_requests_static(url, headers=headers_for_baidu, timeout=3) 233 | copy_test_1 = __parse_baidu_page__(baidu_requests, q1) 234 | 235 | if len(q1_2) > 5: 236 | baidu_requests = base.SinaBaseObject.retry_requests_static(url_2, headers=headers_for_baidu, timeout=3) 237 | # baidu_requests = requests.get(url_2, headers=headers_for_baidu, timeout=3) 238 | copy_test_2 = __parse_baidu_page__(baidu_requests, q1_2) 239 | 240 | if copy_test_1 or copy_test_2: 241 | identical_count += 1 242 | print(copy_test_1, copy_test_2) 243 | print("确实是抄袭的") 244 | else: 245 | print("是原创的") 246 | tt.sleep(3) 247 | 248 | # 部分数据有残缺,未能成功抓取到微博或几乎全为转发,则舍去 249 | if total_count < 3 or repost_count > 28: 250 | return -1 251 | 252 | similarity = identical_count/total_count 253 | print("内容相似度为"+str(similarity)) 254 | return similarity 255 | 256 | 257 | # 百度搜索页面处理逻辑 258 | def __parse_baidu_page__(baidu_requests, q1): 259 | """ 260 | @ author: wxw 261 | @ time: 2017/5/4 262 | 提取搜索到的高亮字符串 并与要查找的进行对比 263 | 若满足一定的相似度要求 则认为重复 264 | :param baidu_requests: requests抓取百度搜索所得页面源代码 265 | :param q1: 要查找的字符串 266 | :return: 是否重复 267 | """ 268 | baidu_bs4 = BeautifulSoup(baidu_requests.content, "lxml") 269 | highlight_list = baidu_bs4.find_all('div', attrs={'class': 'c-abstract'}) 270 | ok_count = 0 271 | for unit in highlight_list: 272 | try: 273 | highlight_word = unit.em.get_text() 274 | print(highlight_word) 275 | if str(q1).strip() == str(highlight_word).strip(): 276 | ok_count += 1 277 | print("已发现") 278 | # 若高亮的文本与寻找的文本差异很小(可能少了几个字符)则同样认为是已找到 279 | elif difflib.SequenceMatcher(None, str(q1), str(highlight_word)).ratio() > 0.88: 280 | print(difflib.SequenceMatcher(None, str(q1), str(highlight_word)).ratio()) 281 | ok_count += 1 282 | print("认为已找到") 283 | else: 284 | print("no") 285 | except AttributeError: 286 | print("这是空的") 287 | 288 | if ok_count > 1: 289 | return True 290 | else: 291 | return False 292 | 293 | 294 | def __store_human_feature_vector__(feature_vector): 295 | s = sina_store.SinaStore() 296 | s.weibo_table = s.db['human_vector_info'] 297 | iter = s.get_stored_information() 
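# The scan below deduplicates by uid: it walks every record already stored in the
# human_vector_info collection and only inserts the new feature vector if its uid is absent.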
298 | flag = 0 299 | while True: 300 | try: 301 | person_dict = next(iter) 302 | if str(person_dict['uid']) == str(feature_vector['uid']): 303 | flag = 1 304 | break 305 | except StopIteration: 306 | flag = 0 307 | break 308 | 309 | if flag == 0: 310 | s.store_in_mongodb(feature_vector) 311 | 312 | 313 | def store_human_feature_vector(sina_store_object): 314 | sina_store_object.weibo_table = sina_store_object.db['human_personal_info'] 315 | # 获取返回的生成器 316 | iter = sina_store_object.get_stored_information() 317 | item_count = 0 318 | while True: 319 | try: 320 | feature_vector = {} 321 | person_dict = next(iter) 322 | entropy = get_entropy_of_information(person_dict) 323 | similarity = get_similarity_of_content(person_dict) 324 | platform = get_num_of_platform(person_dict) 325 | reputation = get_reputation(person_dict) 326 | if reputation == -1: 327 | print("该数据为残缺数据!舍去") 328 | print("现在抽取到第" + str(item_count) + "个用户!!") 329 | item_count += 1 330 | continue 331 | feature_vector['entropy'] = entropy 332 | feature_vector['similarity'] = similarity 333 | if similarity == -1 or entropy == -1: 334 | print("该数据为残缺数据!舍去") 335 | print("现在抽取到第" + str(item_count) + "个用户!!") 336 | item_count += 1 337 | continue 338 | feature_vector['uid'] = str(person_dict['uid']) 339 | feature_vector['platform'] = platform 340 | feature_vector['reputation'] = reputation 341 | feature_vector['human_or_machine'] = 1 342 | item_count += 1 343 | print("现在抽取到第"+str(item_count)+"个用户!!") 344 | __store_human_feature_vector__(feature_vector) 345 | except StopIteration: 346 | print("人类用户已提取特征向量完毕!") 347 | break 348 | 349 | 350 | def __store_machine_feature_vector__(feature_vector): 351 | s = sina_store.SinaStore() 352 | s.weibo_table = s.db['machine_vector_info'] 353 | iter = s.get_stored_information() 354 | flag = 0 355 | while True: 356 | try: 357 | person_dict = next(iter) 358 | if str(person_dict['uid']) == str(feature_vector['uid']): 359 | flag = 1 360 | break 361 | except StopIteration: 362 | flag = 0 363 | break 364 | 365 | if flag == 0: 366 | s.store_in_mongodb(feature_vector) 367 | 368 | 369 | def store_machine_feature_vector(sina_store_object): 370 | sina_store_object.weibo_table = sina_store_object.db['machine_personal_info'] 371 | # 获取返回的生成器 372 | iter = sina_store_object.get_stored_information() 373 | item_count = 0 374 | while True: 375 | try: 376 | feature_vector = {} 377 | person_dict = next(iter) 378 | entropy = get_entropy_of_information(person_dict) 379 | similarity = get_similarity_of_content(person_dict) 380 | platform = get_num_of_platform(person_dict) 381 | reputation = get_reputation(person_dict) 382 | if reputation == -1: 383 | print("该数据为残缺数据!舍去") 384 | print("现在抽取到第" + str(item_count) + "个用户!!") 385 | item_count += 1 386 | continue 387 | feature_vector['entropy'] = entropy 388 | feature_vector['similarity'] = similarity 389 | if similarity == -1 or entropy == -1: 390 | print("该数据为残缺数据!舍去") 391 | print("现在抽取到第" + str(item_count) + "个用户!!") 392 | item_count += 1 393 | continue 394 | feature_vector['uid'] = str(person_dict['uid']) 395 | feature_vector['platform'] = platform 396 | feature_vector['reputation'] = reputation 397 | feature_vector['human_or_machine'] = 0 398 | item_count += 1 399 | print("现在抽取到第"+str(item_count)+"个用户!!") 400 | __store_machine_feature_vector__(feature_vector) 401 | except StopIteration: 402 | print("机器用户已提取特征向量完毕!") 403 | break 404 | 405 | 406 | if __name__ == '__main__': 407 | """ 408 | 从mongodb中获取human和machine的信息, 409 | 并计算其信息熵、相似度、信誉度等特征 410 | 并将特征存入mongodb 411 | 412 | 示例: 
413 | s = sina_store.SinaStore() 414 | store_human_feature_vector(s) 415 | store_machine_feature_vector(s) 416 | """ 417 | 418 | 419 | 420 | 421 | 422 | -------------------------------------------------------------------------------- /ml/keyword.txt: -------------------------------------------------------------------------------- 1 | 减肥 100 2 | 私聊 300 3 | 变瘦 20 4 | 得意 1 5 | V信 300 6 | V xin 300 7 | 卫星号 1000 8 | 卫星号, 1000 9 | 祖传 200 10 | 秘制 200 11 | 强力 100 12 | 特效 100 13 | 全效 100 14 | 强效 100 15 | 奇效 100 16 | 高效 100 17 | 速效 100 18 | 神效 200 19 | 处方 200 20 | 复方 200 21 | 治疗 10 22 | 消炎 20 23 | 抗炎 20 24 | 活血 100 25 | 祛瘀 100 26 | 止咳 50 27 | 解毒 50 28 | 疗效 100 29 | 防治 20 30 | 防癌 50 31 | 增高 200 32 | 溶脂 200 33 | 吸脂 200 34 | 瘦身 200 35 | 瘦脸 200 36 | 瘦腿 200 37 | xin 100 38 | 防辐射 100 39 | 美容 100 40 | 养颜 100 41 | 避凶 200 42 | 辟邪 200 43 | 首选 50 44 | 保健级 200 45 | 安全无毒副作用 200 46 | 无效退款 200 47 | 保险公司承保 200 48 | 补五行 200 49 | 吸财 200 50 | 保平安 200 51 | 无斑 200 52 | 祛疤 200 53 | 去疤 200 54 | 国家级 20 55 | 世界级 20 56 | 最高级 20 57 | 全网销量第一 200 58 | 全球首发 100 59 | 全国首家 100 60 | 全网首发 200 61 | 代言 20 62 | 无副作用 100 63 | 零风险 200 64 | 稳赚 200 65 | 无效退款 200 66 | 祖传 200 67 | 无事故 200 68 | 无依赖 200 69 | 根治 200 70 | 日减 200 71 | 秘方 200 72 | 保过 200 73 | 填补国内空白 10 74 | 保收益 200 75 | 食品级别 200 76 | 精准检测 100 77 | 3-7天见效 100 78 | 1-3个疗程治愈 100 79 | 解除疼痛 100 80 | 最权威 40 81 | 药到病除 100 82 | 体内毒素 200 83 | 吸附铅汞 100 84 | 除湿 100 85 | 润燥 50 86 | 消除斑点 100 87 | 祛除雀斑 100 88 | 祛除黄褐斑 100 89 | 祛除蝴蝶斑 100 90 | 祛除妊娠斑 100 91 | 祛除斑立净 100 92 | 全民晒单 100 93 | 随时涨价 100 94 | 卖疯了 200 95 | 抢疯了 200 96 | 点击领奖 100 97 | 恭喜获奖 200 98 | 全民免单 100 99 | 点击有惊喜 200 100 | 点击获取 200 101 | 点击转身 200 102 | 点击试穿 200 103 | 点击翻转 200 104 | 领取奖品 100 105 | 秒杀 20 106 | 抢爆 200 107 | 再不抢就没 200 108 | 不会更便宜 200 109 | 错过就没机会 200 110 | 疯抢 100 111 | 抢购 40 112 | VX 100 113 | 免费送 100 114 | 泰國 20 115 | 佛牌 200 116 | 陰牌 200 117 | 情降 200 118 | 降頭 200 119 | 和合 200 120 | 鬼降 200 121 | 鎖心 200 122 | 巫術 200 123 | -------------------------------------------------------------------------------- /ml/os_path.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals, print_function 3 | import os 4 | 5 | 6 | class LoadFolders(object): # 迭代器 7 | def __init__(self, par_path): 8 | self.par_path = par_path 9 | 10 | def __iter__(self): 11 | 12 | for file in os.listdir(self.par_path): 13 | file_abspath = os.path.join(self.par_path, file) 14 | if os.path.isdir(file_abspath): # if file is a folder 15 | yield file_abspath 16 | 17 | 18 | class LoadFiles(object): 19 | def __init__(self, par_path): 20 | self.par_path = par_path 21 | 22 | def __iter__(self): 23 | folders = LoadFolders(self.par_path) 24 | for folder in folders: # level directory 25 | catg = folder.split(os.sep)[-1] 26 | for file in os.listdir(folder): # secondary directory 27 | file_path = os.path.join(folder, file) 28 | if os.path.isfile(file_path): 29 | this_file = open(file_path, 'rb') 30 | content = this_file.read() 31 | yield catg, content 32 | this_file.close() -------------------------------------------------------------------------------- /ml/random_forest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | from __future__ import unicode_literals, print_function 3 | from __future__ import division 4 | import sklearn 5 | import pandas as pd 6 | import json 7 | import math 8 | import csv 9 | import pymongo 10 | import sklearn 11 | import sys 12 | sys.path.append("..") 13 | from a1 import base 14 | from a1 import 
sina_store 15 | reload(sys) 16 | sys.setdefaultencoding('utf-8') 17 | 18 | 19 | class MachineLearning(base.SinaBaseObject): 20 | def __init__(self): 21 | self.is_First = True 22 | self.is_First_2 = True 23 | self.gbc = '' 24 | self.dtc = '' 25 | self.rfc = '' 26 | 27 | def set_feature_vector_dict(self, feature_vector_dict): 28 | self.clean_feture_vector_dict(feature_vector_dict, is_first=self.is_First) 29 | self.is_First = False 30 | 31 | def set_test_feature_vector_dict(self, feature_vector_dict): 32 | self.clean_test_feture_vector_dict(feature_vector_dict, is_first_2=self.is_First_2) 33 | self.is_First_2 = False 34 | 35 | # 将传入的字典转化为csv文件 36 | @staticmethod 37 | def clean_feture_vector_dict(feature_vector_dict, is_first=False): 38 | with open('names.csv', 'ab') as csvfile: 39 | fieldnames = ['uid', 'similarity', 'platform', 'reputation', 'entropy', 'human_or_machine'] 40 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 41 | if is_first: 42 | writer.writeheader() 43 | writer.writerow( 44 | {'uid': feature_vector_dict['uid'], 45 | 'similarity': feature_vector_dict['similarity'], 46 | 'platform': feature_vector_dict['platform'], 47 | 'reputation': feature_vector_dict['reputation'], 48 | 'entropy': feature_vector_dict['entropy'], 49 | 'human_or_machine': feature_vector_dict['human_or_machine'] 50 | } 51 | ) 52 | 53 | @staticmethod 54 | def clean_test_feture_vector_dict(feature_vector_dict, is_first_2=False): 55 | with open('needs.csv', 'ab') as csvfile: 56 | fieldnames = ['uid', 'similarity', 'platform', 'reputation', 'entropy', 'human_or_machine'] 57 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 58 | if is_first: 59 | writer.writeheader() 60 | writer.writerow( 61 | {'uid': feature_vector_dict['uid'], 62 | 'similarity': feature_vector_dict['similarity'], 63 | 'platform': feature_vector_dict['platform'], 64 | 'reputation': feature_vector_dict['reputation'], 65 | 'entropy': feature_vector_dict['entropy'], 66 | 'human_or_machine': feature_vector_dict['human_or_machine'] 67 | } 68 | ) 69 | 70 | # 进行单一决策树和随机森林的训练模型及检验 71 | def rand_forest_train(self): 72 | # 读取本地用户特征信息 73 | users = pd.read_csv('names.csv') 74 | # 选取similarity、platform、reputation、entropy作为判别人类或机器的特征 75 | X = users[['similarity', 'platform', 'reputation', 'entropy']] 76 | y = users['human_or_machine'] 77 | 78 | # 对原始数据进行分割, 25%的数据用于测试 79 | from sklearn.cross_validation import train_test_split 80 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33) 81 | 82 | # 对类别特征进行转化,成为特征向量 83 | from sklearn.feature_extraction import DictVectorizer 84 | vec = DictVectorizer(sparse=False) 85 | X_train = vec.fit_transform(X_train.to_dict(orient='record')) 86 | X_test = vec.transform(X_test.to_dict(orient='record')) 87 | 88 | # 使用单一决策树进行集成模型的训练及预测分析 89 | from sklearn.tree import DecisionTreeClassifier 90 | dtc = DecisionTreeClassifier() 91 | dtc.fit(X_train, y_train) 92 | dtc_y_pred = dtc.predict(X_test) 93 | 94 | # 使用随机森林分类器进行集成模型的训练及预测分析 95 | from sklearn.ensemble import RandomForestClassifier 96 | rfc = RandomForestClassifier() 97 | rfc.fit(X_train, y_train) 98 | rfc_y_pred = rfc.predict(X_test) 99 | 100 | # 使用梯度提升决策树进行集成模型的训练及预测分析 101 | from sklearn.ensemble import GradientBoostingClassifier 102 | gbc = GradientBoostingClassifier() 103 | gbc.fit(X_train, y_train) 104 | gbc_y_pred = gbc.predict(X_test) 105 | 106 | from sklearn.metrics import classification_report 107 | # 输出单一决策树在测试集上的分类准确性, 以及更加详细的精确率 召回率 F1指标 108 | print("单一决策树的准确性为", dtc.score(X_test, y_test)) 109 | 
print(classification_report(dtc_y_pred, y_test)) 110 | 111 | # 输出随机森林分类器在测试集上的分类准确性,以及更加详细的精确率 召回率 F1指标 112 | print("随机森林分类器的准确性为", rfc.score(X_test, y_test)) 113 | print(classification_report(rfc_y_pred, y_test)) 114 | 115 | # 输出梯度提升决策树在测试集上的分类准确性,以及更加详细的精确率 召回率 F1指标 116 | print("梯度提升决策树的准确性为", gbc.score(X_test, y_test)) 117 | print(classification_report(gbc_y_pred, y_test)) 118 | 119 | 120 | users = pd.read_csv('values.csv') 121 | 122 | # 检验是否为机器或人类 123 | X = users[['similarity', 'platform', 'reputation', 'entropy']] 124 | X = vec.transform(X.to_dict(orient='record')) 125 | print(rfc.predict(X)) 126 | 127 | self.dtc = dtc 128 | self.rfc = rfc 129 | self.gbc = gbc 130 | 131 | 132 | def get_dict_from_weibo_table(): 133 | ml = MachineLearning() 134 | sina_store_object = sina_store.SinaStore() 135 | sina_store_object.weibo_table = sina_store_object.db['human_vector_info'] 136 | iter = sina_store_object.get_stored_information() 137 | while True: 138 | try: 139 | info_dict = next(iter) 140 | ml.set_feature_vector_dict(info_dict) 141 | except StopIteration: 142 | break 143 | sina_store_object.weibo_table = sina_store_object.db['machine_vector_info'] 144 | iter = sina_store_object.get_stored_information() 145 | while True: 146 | try: 147 | info_dict = next(iter) 148 | ml.set_feature_vector_dict(info_dict) 149 | except StopIteration: 150 | break 151 | print("已结束 正在训练模型。。。") 152 | ml.rand_forest_train() 153 | 154 | 155 | def start_training(): 156 | get_dict_from_weibo_table() 157 | 158 | if __name__ == "__main__": 159 | start_training() 160 | -------------------------------------------------------------------------------- /ml/realtime_random_weibo.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from __future__ import unicode_literals, print_function 3 | import weibo 4 | import time as tt 5 | import sys 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | sys.path.append("..") 9 | from a1 import sina_store 10 | from a1 import sina_weibo 11 | 12 | APP_KEY = '3175988140' 13 | APP_SECRET = 'f445636b8fc0b7b5e75474c3ab8d320b' 14 | CALL_BACK = 'http://api.weibo.com/oauth2/default.html' 15 | ACCESS_TOKEN = '2.00xUU4VGKbHw9D47e3cfc2c8UhoSBB' 16 | 17 | 18 | class myAPIClient(weibo.APIClient): 19 | def __init__(self, app_key, app_secret, redirect_uri, access_token): 20 | weibo.APIClient.__init__(self, app_key, app_secret, redirect_uri, access_token) 21 | 22 | def request_access_token_info(self, access_token): 23 | r = weibo._http_post('%s%s' % (self.auth_url, 'get_token_info'), access_token=access_token) 24 | current = int(tt.time()) 25 | expires = r.expire_in + current 26 | return weibo.JsonDict(expires_in=expires) 27 | 28 | 29 | def get_client(appkey, appsecret, callback, access_token): 30 | client = myAPIClient(appkey, appsecret, callback, access_token) 31 | r = client.request_access_token_info(access_token) 32 | expires_in = r.expires_in 33 | client.set_access_token(access_token, expires_in) 34 | return client 35 | 36 | 37 | def run(weiboList, client): 38 | statuses = client.statuses__public_timeline(count=2)['statuses'] 39 | length = len(statuses) 40 | print('现在获得了'+str(length)+'条新微博') 41 | 42 | for i in range(0, length): 43 | created_at = statuses[i]['created_at'] 44 | author_uid = statuses[i]['user']['id'] 45 | id = statuses[i]['id'] 46 | source = statuses[i]['source'] 47 | province = statuses[i]['user']['province'] 48 | city = statuses[i]['user']['city'] 49 | followers_count = statuses[i]['user']['followers_count'] 50 | 
friends_count = statuses[i]['user']['friends_count'] 51 | statuses_count = statuses[i]['user']['statuses_count'] 52 | url = statuses[i]['user']['url'] 53 | geo = statuses[i]['geo'] 54 | comments_count = statuses[i]['comments_count'] 55 | reposts_count = statuses[i]['reposts_count'] 56 | nickname = statuses[i]['user']['screen_name'] 57 | desc = statuses[i]['user']['description'] 58 | location = statuses[i]['user']['location'] 59 | text = statuses[i]['text'] 60 | 61 | weibo_dict = { 62 | 'created_at': created_at, 63 | 'author_uid': author_uid, 64 | 'id': id, 65 | 'author_name': nickname, 66 | 'source': source, 67 | 'text': text, 68 | 'province': province, 69 | 'location': location, 70 | 'description': desc, 71 | 'city': city, 72 | 'followers_count': followers_count, 73 | 'friends_count': friends_count, 74 | 'statuses_count': statuses_count, 75 | 'url': url, 76 | 'geo': geo, 77 | 'comments_count': comments_count, 78 | 'reposts_count': reposts_count 79 | } 80 | weiboList.append(weibo_dict) 81 | return weiboList 82 | 83 | 84 | def gain_random_weibolist(count_=-1): 85 | weibolist = [] 86 | client = get_client(APP_KEY, APP_SECRET, CALL_BACK, ACCESS_TOKEN) 87 | while True: 88 | print('现在开始获取!') 89 | try: 90 | weibolist = run(weibolist, client) 91 | return weibolist 92 | except: 93 | tt.sleep(0.1) 94 | # 默认循环无数次,直到获得成功为止 95 | # 可通过更改参数count_的值来改变循环次数 96 | if(count_!= 0): 97 | count_ = count_-1 98 | else: 99 | break 100 | 101 | # def clean_weibolist(weibolist): 102 | # for weibo in weibolist: 103 | # weibo_object = sina_weibo.SinaWeibo() 104 | # 105 | # def store_random_weibolist(): 106 | # sina_store_object = sina_store.SinaStore() 107 | # sina_store_object.weibo_table = sina_store_object.db['random_weibo'] 108 | # #sina_store_object. 109 | 110 | if __name__ == "__main__": 111 | weiboList = gain_random_weibolist(count_=3) 112 | print(weiboList[0]['id']) 113 | print(weiboList[0]['author_uid']) 114 | print(weiboList[0]['author_name']) 115 | print(weiboList[0]['source']) 116 | print(weiboList[0]['text']) 117 | print(weiboList[0]['created_at']) 118 | print(111111111111111111111) 119 | for name, value in weiboList[0].items(): 120 | print(name, value) -------------------------------------------------------------------------------- /ml/realtime_random_weibo_2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | from __future__ import unicode_literals, print_function 3 | from __future__ import division 4 | import json 5 | import pymongo 6 | import requests 7 | from bs4 import BeautifulSoup 8 | import sys 9 | sys.path.append("..") 10 | import random 11 | import fenci 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | 15 | REALTIMEWEIBO = 'realtime719' 16 | REALTIMEWEIBOT = 'realtime719t' 17 | 18 | 19 | location_dict = { 20 | '上海': [121.4648,31.2891], 21 | '东莞': [113.8953,22.901], 22 | '东营': [118.7073,37.5513], 23 | '中山': [113.4229,22.478], 24 | '临汾': [111.4783,36.1615], 25 | '临沂': [118.3118,35.2936], 26 | '丹东': [124.541,40.4242], 27 | '丽水': [119.5642,28.1854], 28 | '乌鲁木齐': [87.9236,43.5883], 29 | '佛山': [112.8955,23.1097], 30 | '保定': [115.0488,39.0948], 31 | '兰州': [103.5901,36.3043], 32 | '包头': [110.3467,41.4899], 33 | '北京': [116.4551,40.2539], 34 | '北海': [109.314,21.6211], 35 | '南京': [118.8062,31.9208], 36 | '南宁': [108.479,23.1152], 37 | '南昌': [116.0046,28.6633], 38 | '南通': [121.1023,32.1625], 39 | '厦门': [118.1689,24.6478], 40 | '台州': [121.1353,28.6688], 41 | '合肥': [117.29,32.0581], 42 | '呼和浩特': [111.4124,40.4901], 43 | '咸阳': 
[108.4131,34.8706], 44 | '哈尔滨': [127.9688,45.368], 45 | '唐山': [118.4766,39.6826], 46 | '嘉兴': [120.9155,30.6354], 47 | '大同': [113.7854,39.8035], 48 | '大连': [122.2229,39.4409], 49 | '天津': [117.4219,39.4189], 50 | '太原': [112.3352,37.9413], 51 | '威海': [121.9482,37.1393], 52 | '宁波': [121.5967,29.6466], 53 | '宝鸡': [107.1826,34.3433], 54 | '宿迁': [118.5535,33.7775], 55 | '常州': [119.4543,31.5582], 56 | '广州': [113.5107,23.2196], 57 | '廊坊': [116.521,39.0509], 58 | '延安': [109.1052,36.4252], 59 | '张家口': [115.1477,40.8527], 60 | '徐州': [117.5208,34.3268], 61 | '德州': [116.6858,37.2107], 62 | '惠州': [114.6204,23.1647], 63 | '成都': [103.9526,30.7617], 64 | '扬州': [119.4653,32.8162], 65 | '承德': [117.5757,41.4075], 66 | '拉萨': [91.1865,30.1465], 67 | '无锡': [120.3442,31.5527], 68 | '日照': [119.2786,35.5023], 69 | '昆明': [102.9199,25.4663], 70 | '杭州': [119.5313,29.8773], 71 | '枣庄': [117.323,34.8926], 72 | '柳州': [109.3799,24.9774], 73 | '株洲': [113.5327,27.0319], 74 | '武汉': [114.3896,30.6628], 75 | '汕头': [117.1692,23.3405], 76 | '江门': [112.6318,22.1484], 77 | '沈阳': [123.1238,42.1216], 78 | '沧州': [116.8286,38.2104], 79 | '河源': [114.917,23.9722], 80 | '泉州': [118.3228,25.1147], 81 | '泰安': [117.0264,36.0516], 82 | '泰州': [120.0586,32.5525], 83 | '济南': [117.1582,36.8701], 84 | '济宁': [116.8286,35.3375], 85 | '海口': [110.3893,19.8516], 86 | '淄博': [118.0371,36.6064], 87 | '淮安': [118.927,33.4039], 88 | '深圳': [114.5435,22.5439], 89 | '清远': [112.9175,24.3292], 90 | '温州': [120.498,27.8119], 91 | '渭南': [109.7864,35.0299], 92 | '湖州': [119.8608,30.7782], 93 | '湘潭': [112.5439,27.7075], 94 | '滨州': [117.8174,37.4963], 95 | '潍坊': [119.0918,36.524], 96 | '烟台': [120.7397,37.5128], 97 | '玉溪': [101.9312,23.8898], 98 | '珠海': [113.7305,22.1155], 99 | '盐城': [120.2234,33.5577], 100 | '盘锦': [121.9482,41.0449], 101 | '石家庄': [114.4995,38.1006], 102 | '福州': [119.4543,25.9222], 103 | '秦皇岛': [119.2126,40.0232], 104 | '绍兴': [120.564,29.7565], 105 | '聊城': [115.9167,36.4032], 106 | '肇庆': [112.1265,23.5822], 107 | '舟山': [122.2559,30.2234], 108 | '苏州': [120.6519,31.3989], 109 | '莱芜': [117.6526,36.2714], 110 | '菏泽': [115.6201,35.2057], 111 | '营口': [122.4316,40.4297], 112 | '葫芦岛': [120.1575,40.578], 113 | '衡水': [115.8838,37.7161], 114 | '衢州': [118.6853,28.8666], 115 | '西宁': [101.4038,36.8207], 116 | '西安': [109.1162,34.2004], 117 | '贵阳': [106.6992,26.7682], 118 | '连云港': [119.1248,34.552], 119 | '邢台': [114.8071,37.2821], 120 | '邯郸': [114.4775,36.535], 121 | '郑州': [113.4668,34.6234], 122 | '鄂尔多斯': [108.9734,39.2487], 123 | '重庆': [107.7539,30.1904], 124 | '金华': [120.0037,29.1028], 125 | '铜川': [109.0393,35.1947], 126 | '银川': [106.3586,38.1775], 127 | '镇江': [119.4763,31.9702], 128 | '长春': [125.8154,44.2584], 129 | '长沙': [113.0823,28.2568], 130 | '长治': [112.8625,36.4746], 131 | '阳泉': [113.4778,38.0951], 132 | '青岛': [120.4651,36.3373], 133 | '韶关': [113.7964,24.7028] 134 | } 135 | 136 | 137 | class RealtimeRandomWeibo(object): 138 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm' 139 | 140 | def __init__(self, lazy=True): 141 | self.href = 'http://api03.bitspaceman.com:8000/post/weibo?kw=的&apikey=' + self.KEY 142 | self.weibo_list = [] 143 | self.weibo_list_all = [] 144 | self.weibo_list_threat = [] 145 | self.iter_all = None 146 | self.iter_count = 0 147 | if not lazy: 148 | self.get_random_weibo() 149 | 150 | # 连接至mongodb 151 | self.mongo_client = pymongo.MongoClient('localhost', 27017) 152 | self.db = self.mongo_client['Weibo'] 153 | 154 | def get_random_weibo(self): 155 | self.iter_count += 15 156 | requests_get = requests.get(self.href, 
timeout=15) 157 | requests_content = requests_get.content 158 | requests_dict = json.loads(requests_content) 159 | 160 | weibo_list = self.parse_requests_dict(requests_dict) 161 | 162 | copy_of_weibo_list = self.parse_weibo_list(weibo_list) 163 | self.weibo_list_all = copy_of_weibo_list 164 | self.weibo_list_threat = [weibo for weibo in copy_of_weibo_list if int(weibo['threatened']) > 68] 165 | self.store_to_mongodb() 166 | 167 | def parse_requests_dict(self, requests_dict): 168 | weibo_list = [] 169 | count = 0 170 | for item in requests_dict['data']: 171 | 172 | weibo = {} 173 | try: 174 | weibo['is_repost'] = False 175 | weibo['repost_location'] = '' 176 | weibo['text'] = str(item['mblog']['text']) 177 | print(weibo['text']) 178 | weibo['uid'] = str(item['from']['url']).split('/')[-1] 179 | weibo['time'] = str(item['pDate']) 180 | weibo['comment_count'] = str(item['commentCount']) 181 | weibo['author_name'] = str(item['from']['name']) 182 | weibo['author_uid'] = str(item['from']['id']) 183 | weibo['author_fans'] = str(item['from']['fansCount']) 184 | weibo['author_follower'] = str(item['from']['friendCount']) 185 | weibo['location'] = str(item['from']['extend']['location']) 186 | weibo['province'] = '' 187 | print(weibo['author_uid']) 188 | except: 189 | continue 190 | 191 | try: 192 | weibo['terminal_source'] = str(item['mblog']['source']).split('>')[1].split('<')[0] 193 | except IndexError: 194 | weibo['terminal_source'] = '未知' 195 | if item['mblog'].has_key('retweeted_status'): 196 | count += 1 197 | weibo['is_repost'] = True 198 | try: 199 | weibo['repost_location'] = str(item['mblog']['retweeted_status']['user']['location']) 200 | weibo['repost_reposted_count'] = str(item['mblog']['retweeted_status']['reposts_count']) 201 | weibo['repost_text'] = str(item['mblog']['retweeted_status']['text']) 202 | weibo['repost_attitude_count'] = str(item['mblog']['retweeted_status']['attitudes_count']) 203 | print(weibo['repost_location']) 204 | print(weibo['repost_reposted_count']) 205 | print(weibo['repost_text']) 206 | print(weibo['repost_attitude_count']) 207 | except: 208 | pass 209 | 210 | weibo_list.append(weibo) 211 | 212 | print("为转发的微博数: ", str(count)) 213 | # self.store_to_mongodb(weibo_list) 214 | self.weibo_list = weibo_list 215 | print(weibo_list) 216 | 217 | return weibo_list 218 | 219 | def parse_weibo_list(self, weibo_list): 220 | """ 221 | 分析微博威胁程度与规范地址格式 222 | :param weibo_list: 初始微博列表 223 | :return: 分析后的微博列表 224 | """ 225 | for i in weibo_list: 226 | i['location'] = mapped_province(i['location'], weibo=i) 227 | i['repost_location'] = mapped_province(i['repost_location']) 228 | print(i['location'], i['repost_location']) 229 | print('111111111111111111111111111111111111111111111111111') 230 | 231 | copy_of_weibo_list = [] 232 | 233 | # 筛选符合地图显示的地点 234 | for i in weibo_list: 235 | if i['location'] is None or i['location'] == '': 236 | continue 237 | if i['repost_location'] is None or i['repost_location'] == '': 238 | i['is_repost'] = False 239 | 240 | i['location'] = str(i['location']) 241 | i['repost_location'] = str(i['repost_location']) 242 | copy_of_weibo_list.append(i) 243 | 244 | copy_of_weibo_list = assess_threat_levels(copy_of_weibo_list) 245 | return copy_of_weibo_list 246 | 247 | def store_to_mongodb(self): 248 | 249 | weibo_table = self.db[REALTIMEWEIBO] 250 | for i in self.weibo_list_all: 251 | weibo_table.insert(i) 252 | 253 | weibo_table = self.db[REALTIMEWEIBOT] 254 | for i in self.weibo_list_threat: 255 | weibo_table.insert(i) 256 | 257 | # def get_iter_all(self): 
258 | # weibo_table = self.db['realtime719'] 259 | # for i in weibo_table.find(): 260 | # yield i 261 | 262 | def get_realtime_weibo_from_mongodb(self): 263 | weibo_table = self.db[REALTIMEWEIBO] 264 | count = 0 265 | now_weibo_all = [] 266 | for i in weibo_table.find(): 267 | if count 1: 335 | if weibo and weibo_location.split(' ')[0] in mapped_dict.keys(): 336 | weibo['province'] = weibo_location.split(' ')[0] 337 | 338 | if weibo_location.split(' ')[1] in location_dict.keys(): 339 | weibo_location = weibo_location.split(' ')[1] 340 | elif weibo_location.split(' ')[0] in location_dict.keys(): 341 | weibo_location = weibo_location.split(' ')[0] 342 | else: 343 | weibo_location = '' 344 | 345 | 346 | else: 347 | 348 | if weibo and weibo_location in mapped_dict.keys(): 349 | weibo['province'] = weibo_location.strip() 350 | 351 | if weibo_location.strip() in location_dict.keys(): 352 | weibo_location = weibo_location.strip() 353 | else: 354 | if weibo_location.strip() in mapped_dict.keys(): 355 | print(weibo_location.strip()) 356 | weibo_location = mapped_dict.get(weibo_location.strip()) 357 | else: 358 | weibo_location = '' 359 | 360 | 361 | return weibo_location 362 | 363 | 364 | def assess_threat_levels(copy_of_weibo_list): 365 | """ 366 | 评估威胁程度 367 | """ 368 | check_object = fenci.TestKeyword() 369 | 370 | for weibo in copy_of_weibo_list: 371 | flag = check_object.test_if_has_keyword(weibo['text']) 372 | threat = 0 373 | if weibo['is_repost']: 374 | flag = flag or check_object.test_if_has_keyword(weibo['repost_text']) 375 | if flag: 376 | if weibo.has_key('repost_reposted_count') and weibo['repost_reposted_count']: 377 | if int(weibo['repost_reposted_count']) > 10: 378 | threat += 1 379 | if weibo.has_key('comment_count') and weibo['comment_count']: 380 | if int(weibo['comment_count'] > 1): 381 | threat += 1 382 | if weibo.has_key('repost_attitude_count') and weibo['repost_attitude_count']: 383 | if int(weibo['repost_attitude_count']) > 10: 384 | threat += 1 385 | if weibo.has_key('author_fans') and weibo['author_fans']: 386 | if int(weibo['author_fans']) > 100: 387 | threat += 1 388 | 389 | weibo['threatened'] = random.randint(68, 80) 390 | 391 | if threat == 1 or threat == 2: 392 | weibo['threatened'] = random.randint(80, 90) 393 | print('what?????????????????????') 394 | print(weibo['threatened'] ) 395 | 396 | if threat > 2: 397 | weibo['threatened'] = random.randint(90, 100) 398 | print('what?????????????????????') 399 | print(weibo['threatened'] ) 400 | 401 | print(weibo['time']) 402 | print(weibo['author_uid']) 403 | else: 404 | weibo['threatened'] = random.randint(0, 68) 405 | 406 | return copy_of_weibo_list 407 | 408 | 409 | def start_run(): 410 | 411 | realtime_weibo_object = RealtimeRandomWeibo() 412 | 413 | for i in realtime_weibo_object.weibo_list: 414 | i['location'] = mapped_province(i['location'], weibo=i) 415 | i['repost_location'] = mapped_province(i['repost_location']) 416 | print(i['location'], i['repost_location']) 417 | print('111111111111111111111111111111111111111111111111111') 418 | 419 | copy_of_weibo_list = [] 420 | 421 | # 筛选符合地图显示的地点 422 | for i in realtime_weibo_object.weibo_list: 423 | if i['location'] is None or i['location'] == '': 424 | continue 425 | if i['repost_location'] is None or i['repost_location'] == '': 426 | i['is_repost'] = False 427 | 428 | i['location'] = str(i['location']) 429 | i['repost_location'] = str(i['repost_location']) 430 | copy_of_weibo_list.append(i) 431 | 432 | copy_of_weibo_list = assess_threat_levels(copy_of_weibo_list) 433 | 
return copy_of_weibo_list 434 | 435 | 436 | 437 | 438 | 439 | if __name__ == '__main__': 440 | 441 | a = RealtimeRandomWeibo() 442 | a.get_random_weibo() 443 | l = a.get_realtime_weibo_from_mongodb() 444 | for i in l: 445 | print(i) 446 | # a = start_run() 447 | # for i in a: 448 | # print(i['location']) 449 | # print(type(i['location'])) 450 | # if i['is_repost']: 451 | # print("转发自"+str(i['repost_location'])) -------------------------------------------------------------------------------- /ml/realtime_user_fans_follower.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | from __future__ import unicode_literals, print_function 3 | from __future__ import division 4 | import json 5 | import time as tt 6 | import csv 7 | import pymongo 8 | import requests 9 | from bs4 import BeautifulSoup 10 | import sys 11 | sys.path.append("..") 12 | from wbcls.sina_store import SinaStore 13 | import fenci 14 | reload(sys) 15 | sys.setdefaultencoding('utf-8') 16 | 17 | RALATIONTABLE = 'Relation719' 18 | 19 | 20 | class RealtimeUserRealationship(object): 21 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm' 22 | 23 | def __init__(self, user_id, user=True, fans=True, follow=True): 24 | # 连接至mongodb 25 | self.mongo_client = pymongo.MongoClient('localhost', 27017) 26 | self.db = self.mongo_client['Weibo'] 27 | 28 | self._session = requests.Session() 29 | self._session.mount('http://', self._create_adapter()) 30 | 31 | self.fans_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=3&id='+str(user_id)+'&apikey=' + \ 32 | self.KEY + '&size=30' 33 | self.fans_list = [] 34 | self.follow_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=2&id='+str(user_id)+'&apikey=' + \ 35 | self.KEY+'&size=30' 36 | self.follow_list = [] 37 | 38 | self.user_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=1&id='+str(user_id)+'&apikey=' + self.KEY 39 | 40 | self.info_dict = {} 41 | self.get_relationship(user=user, fans=fans, follow=follow) 42 | 43 | def get_relationship(self, user=True, fans=True, follow=False): 44 | 45 | if fans: 46 | requests_get = self._session.get(self.fans_href, timeout=15) 47 | requests_content = requests_get.content 48 | requests_dict = json.loads(requests_content) 49 | self.fans_list = self.parse_requests_dict(requests_dict) 50 | tt.sleep(0.5) 51 | if follow: 52 | requests_get = self._session.get(self.follow_href, timeout=15) 53 | requests_content = requests_get.content 54 | requests_dict = json.loads(requests_content) 55 | self.follow_list = self.parse_requests_dict(requests_dict) 56 | tt.sleep(0.5) 57 | 58 | if user: 59 | requests_get = self._session.get(self.user_href, timeout=15) 60 | requests_content = requests_get.content 61 | requests_dict = json.loads(requests_content) 62 | self.info_dict = self.parse_requests_info_dict(requests_dict) 63 | 64 | 65 | self.store_to_mongodb() 66 | 67 | @staticmethod 68 | def parse_requests_dict(requests_dict): 69 | relationship_list = [] 70 | for item in requests_dict['data']: 71 | user_id = str(item['id']) 72 | user = {} 73 | try: 74 | user['id'] = str(user_id) 75 | user['name'] = str(item['userName']) 76 | user['fans_count'] = str(item['fansCount']) 77 | user['follow_count'] = str(item['followCount']) 78 | user['weibo_count'] = str(item['postCount']) 79 | user['location'] = str(item['location']) 80 | user['sex'] = str(item['gender']) 81 | print("粉丝数"+user['fans_count']) 82 | except: 83 | continue 84 | 85 | relationship_list.append(user) 86 | 87 | 
return relationship_list 88 | 89 | def parse_requests_info_dict(self, requests_dict): 90 | info_dict = {} 91 | try: 92 | for item in requests_dict['data']: 93 | try: 94 | info_dict['fans_count'] = str(item['fansCount']) 95 | info_dict['follow_count'] = str(item['followCount']) 96 | info_dict['weibo_count'] = str(item['postCount']) 97 | info_dict['location'] = str(item['location']) 98 | info_dict['name'] = str(item['userName']) 99 | info_dict['url'] = str(item['url']) 100 | except: 101 | continue 102 | info_dict['fans_list'] = self.fans_list 103 | info_dict['follow_list'] = self.follow_list 104 | print(info_dict['name']) 105 | print(1111111111111111111111) 106 | return info_dict 107 | except: 108 | print(requests_dict) 109 | 110 | def store_to_mongodb(self): 111 | table = self.db[RALATIONTABLE] 112 | table.insert(self.info_dict) 113 | 114 | 115 | @staticmethod 116 | def _create_adapter(): 117 | return requests.adapters.HTTPAdapter( 118 | max_retries=requests.adapters.Retry( 119 | total=5, 120 | status_forcelist=[403, 404, 408, 500, 502], 121 | ) 122 | ) 123 | 124 | 125 | 126 | def get_relationship_from_mongodb(user_id): 127 | mongo_client = pymongo.MongoClient('localhost', 27017) 128 | db = mongo_client['Weibo'] 129 | table = db[RALATIONTABLE] 130 | for i in table.find(): 131 | if i['url'] == 'http://weibo.com/u/' + str(user_id): 132 | print(i['name']) 133 | 134 | 135 | if __name__ == '__main__': 136 | a = RealtimeUserRealationship(user_id='2671467531') 137 | get_relationship_from_mongodb('2671467531') 138 | b = { 139 | 'topic':'水滴直播', 140 | 'question_list': 141 | [ 142 | { 143 | 'question_name':'如何看待。。问题1', 144 | 'anwser_words': 145 | ['好','希拉里','4444'], 146 | 'percent':'8.33' 147 | }, 148 | { 149 | 'question_name': '如何看待。。问题2', 150 | 'anwser_words': 151 | ['不会', '淳朴', '4444'], 152 | 'percent': '4.44' 153 | }, 154 | ] 155 | } 156 | 157 | 158 | -------------------------------------------------------------------------------- /ml/realtime_user_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | from __future__ import unicode_literals, print_function 3 | from __future__ import division 4 | import json 5 | import csv 6 | import pymongo 7 | import requests 8 | from bs4 import BeautifulSoup 9 | import sys 10 | sys.path.append("..") 11 | from a1 import base 12 | from a1 import sina_store 13 | from a1 import sina_weibo 14 | from a1 import sina_people 15 | reload(sys) 16 | sys.setdefaultencoding('utf-8') 17 | 18 | 19 | class RealtimeRandomWeibo(object): 20 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm' 21 | 22 | def __init__(self): 23 | self.href = 'http://api01.bitspaceman.com:8000/post/weibo?kw=的&apikey='+self.KEY 24 | self.get_random_weibo() 25 | 26 | def get_random_weibo(self): 27 | requests_get = requests.get(self.href, timeout=15) 28 | requests_content = requests_get.content 29 | requests_dict = json.loads(requests_content) 30 | for name, value in requests_dict.items(): 31 | print(name, value) 32 | print(len(requests_dict['data'])) 33 | self.parse_requests_dict(requests_dict) 34 | 35 | def parse_requests_dict(self, requests_dict): 36 | weibo_list = [] 37 | weibo = sina_weibo.SinaWeibo() 38 | count = 0 39 | for i in requests_dict['data']: 40 | if i['mblog'].has_key('retweeted_status'): 41 | count+=1 42 | print(i['mblog']['retweeted_status']['user']['location']) 43 | print(i['mblog']['retweeted_status']['reposts_count']) 44 | print(i['mblog']['retweeted_status']['user']['id']) 45 | 
print(i['mblog']['retweeted_status']['text']) 46 | 47 | print(count) 48 | 49 | if __name__ == '__main__': 50 | a = RealtimeRandomWeibo() -------------------------------------------------------------------------------- /ml/realtime_user_relationship.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -* 2 | from __future__ import unicode_literals, print_function 3 | from __future__ import division 4 | import json 5 | import csv 6 | import pymongo 7 | import requests 8 | from bs4 import BeautifulSoup 9 | import sys 10 | sys.path.append("..") 11 | # from a1 import base 12 | # from a1 import sina_store 13 | # from a1 import sina_weibo 14 | # from a1 import sina_people 15 | import fenci 16 | reload(sys) 17 | sys.setdefaultencoding('utf-8') 18 | 19 | 20 | class RealtimeUserRealationship(object): 21 | KEY = '9LF3gnOtYENP26HSoNAxPptHk7eCgxdWjL5ZuSdJXuGALaAcTrLXdGI7TkEYnIQm' 22 | 23 | def __init__(self, user_id, fans=True, follow=False): 24 | self.fans_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=3&id='+str(user_id)+'&apikey=' + \ 25 | self.KEY + '&size=50' 26 | self.fans_list = [] 27 | self.follow_href = 'http://api03.bitspaceman.com:8000/profile/weibo?type=2&id='+str(user_id)+'&apikey=' + \ 28 | self.KEY + '&size=50' 29 | self.follow_list = [] 30 | self.get_relationship(fans=fans, follow=follow) 31 | 32 | def get_relationship(self, fans=True, follow=False): 33 | if fans: 34 | requests_get = requests.get(self.fans_href, timeout=15) 35 | requests_content = requests_get.content 36 | requests_dict = json.loads(requests_content) 37 | self.fans_list = self.parse_requests_dict(requests_dict) 38 | self.store_to_mongodb(self.fans_list, fans=fans, follow=follow) 39 | if follow: 40 | requests_get = requests.get(self.fans_href, timeout=15) 41 | requests_content = requests_get.content 42 | requests_dict = json.loads(requests_content) 43 | self.follow_list = self.parse_requests_dict(requests_dict) 44 | self.store_to_mongodb(self.fans_list, fans=fans, follow=follow) 45 | 46 | @staticmethod 47 | def parse_requests_dict(requests_dict): 48 | relationship_list = [] 49 | for item in requests_dict['data']: 50 | user_id = str(item['id']) 51 | user = sina_people.SinaPeople(uid=user_id, lazy=True) 52 | try: 53 | user.name = str(item['userName']) 54 | print(user.name) 55 | user.fans_count = str(item['fansCount']) 56 | user.follow_count = str(item['followCount']) 57 | user.weibo_count = str(item['postCount']) 58 | user.location = str(item['location']) 59 | user.sex = str(item['gender']) 60 | print("粉丝数"+user.fans_count) 61 | except: 62 | continue 63 | 64 | relationship_list.append(user) 65 | 66 | return relationship_list 67 | 68 | @staticmethod 69 | def store_to_mongodb(user_list, fans=True, follow=False): 70 | sina_store_object = sina_store.SinaStore() 71 | if fans: 72 | sina_store_object.weibo_table = sina_store_object.db['realtime_user_fans'] 73 | for user in user_list: 74 | sina_store_object.store_in_mongodb(user) 75 | if follow: 76 | sina_store_object.weibo_table = sina_store_object.db['realtime_user_follow'] 77 | for user in user_list: 78 | sina_store_object.store_in_mongodb(user) -------------------------------------------------------------------------------- /ml/stopwords_cn.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SuperSaiyanSSS/SinaWeiboSpider/b034c20ccf062b1323046584712716b2794ec7ec/ml/stopwords_cn.txt 
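Both realtime_user_fans_follower.py and realtime_user_relationship.py above call the same bitspaceman profile endpoint and vary only the type query parameter (1 = the user's own profile, 2 = follows, 3 = fans) plus id, apikey and size. A minimal sketch of that request/parse cycle, assuming the endpoint and payload fields shown above; fetch_profile_list and the placeholder API key are illustrative and not part of the repository:

# -*- coding: utf-8 -*-
# Sketch only: mirrors the request/parse pattern used in realtime_user_fans_follower.py.
from __future__ import print_function
import json
import requests

API_KEY = 'YOUR_APIKEY_HERE'  # placeholder; the modules above embed their own key
BASE_URL = 'http://api03.bitspaceman.com:8000/profile/weibo'


def fetch_profile_list(user_id, rel_type, size=30):
    """rel_type: 1 = the user itself, 2 = follows, 3 = fans (as in the modules above)."""
    url = '%s?type=%d&id=%s&apikey=%s&size=%d' % (BASE_URL, rel_type, user_id, API_KEY, size)
    response = requests.get(url, timeout=15)
    payload = json.loads(response.content)
    users = []
    for item in payload.get('data', []):
        try:
            users.append({
                'id': item['id'],
                'name': item['userName'],
                'fans_count': item['fansCount'],
                'follow_count': item['followCount'],
                'weibo_count': item['postCount'],
                'location': item['location'],
                'sex': item['gender'],
            })
        except KeyError:
            # skip records that lack any of the expected fields
            continue
    return users


if __name__ == '__main__':
    for fan in fetch_profile_list('2671467531', rel_type=3):
        print(fan['name'], fan['fans_count'])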
-------------------------------------------------------------------------------- /ml/svm_dict.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import print_function 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding('utf-8') 6 | import pandas as pd 7 | import re 8 | import numpy as np 9 | from gensim import corpora, models 10 | from scipy.sparse import csr_matrix 11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 12 | from sklearn import svm 13 | import numpy as np 14 | import os,re,time,logging 15 | import jieba 16 | import pickle as pkl 17 | 18 | import svm_tfidf 19 | import os_path 20 | 21 | from svm_utils import * 22 | 23 | def reduce_dict(weibo_test): 24 | dictionary = None 25 | if not os.path.exists(path_tmp): 26 | os.makedirs(path_tmp) 27 | # 若不存在之前创建的词典,则生成词典 28 | if not os.path.exists(path_dictionary): 29 | dictionary = corpora.Dictionary() 30 | files = os_path.LoadFiles(path_doc_root) 31 | for i, msg in enumerate(files): 32 | catg = msg[0] 33 | file = msg[1] 34 | file = convert_doc_to_wordlist(file, cut_all=False) 35 | dictionary.add_documents([file]) 36 | # 去掉词典中出现次数过少的词语 37 | small_freq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq < 5] 38 | dictionary.filter_tokens(small_freq_ids) 39 | dictionary.compactify() 40 | dictionary.save(path_dictionary) 41 | svm_tfidf.reduce_tfidf(dictionary, weibo_test) 42 | 43 | if __name__ == "__main__": 44 | reduce_dict(weibo_test = "小粉红滚!你个傻逼,体育老师教你的?吾问无为谓") -------------------------------------------------------------------------------- /ml/svm_lsi.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import print_function 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding('utf-8') 6 | import pandas as pd 7 | import re 8 | import numpy as np 9 | from gensim import corpora, models 10 | from scipy.sparse import csr_matrix 11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 12 | from sklearn import svm 13 | import numpy as np 14 | import os,re,time,logging 15 | import jieba 16 | import pickle as pkl 17 | 18 | import svm_module 19 | from svm_utils import * 20 | 21 | # path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪 22 | # path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss3\\temp1' # 存放中间结果的位置 23 | # path_dictionary = os.path.join(path_tmp, 'THUNews.dict') 24 | # path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus') 25 | # path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus') 26 | # path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl') 27 | # path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl') 28 | 29 | 30 | def reduce_lsi(dictionary, corpus_tfidf, weibo_test): 31 | corpus_lsi = None 32 | lsi_model = None 33 | # # # # 第三阶段, 开始将tfidf转化成lsi 34 | if not os.path.exists(path_tmp_lsi): 35 | print('=== 未检测到有lsi文件夹存在,开始生成lsi向量 ===') 36 | if not dictionary: 37 | dictionary = corpora.Dictionary.load(path_dictionary) 38 | if not corpus_tfidf: # 如果跳过了第二阶段,则从指定位置读取tfidf文档 39 | print('--- 未检测到tfidf文档,开始从磁盘中读取 ---') 40 | # 从对应文件夹中读取所有类别 41 | files = os.listdir(path_tmp_tfidf) 42 | catg_list = [] 43 | for file in files: 44 | t = file.split('.')[0] 45 | if t not in catg_list: 46 | catg_list.append(t) 47 | 48 | # 从磁盘中读取corpus 49 | corpus_tfidf = {} 50 | for catg in catg_list: 51 | path = '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg) 52 | corpus = corpora.MmCorpus(path) 53 | 
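                # each <category>.mm file under path_tmp_tfidf holds the tfidf
                # corpus of one category; the filename stem doubles as the label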
corpus_tfidf[catg] = corpus 54 | print('--- tfidf文档读取完毕,开始转化成lsi向量 ---') 55 | 56 | # 生成lsi model 57 | os.makedirs(path_tmp_lsi) 58 | corpus_tfidf_total = [] 59 | catgs = list(corpus_tfidf.keys()) 60 | for catg in catgs: 61 | tmp = corpus_tfidf.get(catg) 62 | corpus_tfidf_total += tmp 63 | lsi_model = models.LsiModel(corpus=corpus_tfidf_total, id2word=dictionary, num_topics=50) 64 | # 将lsi模型存储到磁盘上 65 | lsi_file = open(path_tmp_lsimodel, 'wb') 66 | pkl.dump(lsi_model, lsi_file) 67 | lsi_file.close() 68 | del corpus_tfidf_total # lsi model已经生成,释放变量空间 69 | print('--- lsi模型已经生成 ---') 70 | 71 | # 生成corpus of lsi, 并逐步去掉 corpus of tfidf 72 | corpus_lsi = {} 73 | for catg in catgs: 74 | corpu = [lsi_model[doc] for doc in corpus_tfidf.get(catg)] 75 | corpus_lsi[catg] = corpu 76 | corpus_tfidf.pop(catg) 77 | corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg), 78 | corpu, 79 | id2word=dictionary) 80 | print('=== lsi向量已经生成 ===') 81 | else: 82 | print('=== 检测到lsi向量已经生成,跳过该阶段 ===') 83 | 84 | svm_module.reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test) 85 | -------------------------------------------------------------------------------- /ml/svm_module.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import print_function 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding('utf-8') 6 | import pandas as pd 7 | import re 8 | import numpy as np 9 | from gensim import corpora, models 10 | from scipy.sparse import csr_matrix 11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 12 | from sklearn import svm 13 | import numpy as np 14 | import os,re,time,logging 15 | import jieba 16 | import pickle as pkl 17 | 18 | import svm_result 19 | from svm_utils import * 20 | 21 | 22 | def reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test): 23 | # # # # 第四阶段, 分类 24 | predictor = None 25 | if not os.path.exists(path_tmp_predictor): 26 | print('=== 未检测到判断器存在,开始进行分类过程 ===') 27 | if not corpus_lsi: # 如果跳过了第三阶段 28 | print('--- 未检测到lsi文档,开始从磁盘中读取 ---') 29 | files = os.listdir(path_tmp_lsi) 30 | catg_list = [] 31 | for file in files: 32 | t = file.split('.')[0] 33 | if t not in catg_list: 34 | catg_list.append(t) 35 | # 从磁盘中读取corpus 36 | corpus_lsi = {} 37 | for catg in catg_list: 38 | path = '{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg) 39 | corpus = corpora.MmCorpus(path) 40 | corpus_lsi[catg] = corpus 41 | print('--- lsi文档读取完毕,开始进行分类 ---') 42 | 43 | tag_list = [] 44 | doc_num_list = [] 45 | corpus_lsi_total = [] 46 | catg_list = [] 47 | files = os.listdir(path_tmp_lsi) 48 | for file in files: 49 | t = file.split('.')[0] 50 | if t not in catg_list: 51 | catg_list.append(t) 52 | for count, catg in enumerate(catg_list): 53 | tmp = corpus_lsi[catg] 54 | tag_list += [count] * tmp.__len__() 55 | doc_num_list.append(tmp.__len__()) 56 | corpus_lsi_total += tmp 57 | corpus_lsi.pop(catg) 58 | 59 | # 将gensim中的mm表示转化成numpy矩阵表示 60 | data = [] 61 | rows = [] 62 | cols = [] 63 | line_count = 0 64 | for line in corpus_lsi_total: 65 | for elem in line: 66 | rows.append(line_count) 67 | cols.append(elem[0]) 68 | data.append(elem[1]) 69 | line_count += 1 70 | lsi_matrix = csr_matrix((data, (rows, cols))).toarray() 71 | # 生成训练集和测试集 72 | rarray = np.random.random(size=line_count) 73 | train_set = [] 74 | train_tag = [] 75 | test_set = [] 76 | test_tag = [] 77 | for i in range(line_count): 78 | if rarray[i] < 0.8: 79 | train_set.append(lsi_matrix[i, :]) 80 | train_tag.append(tag_list[i]) 
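            # rarray holds one uniform(0, 1) draw per document, so roughly 80% of
            # the LSI vectors go to the training set and the rest fall through to
            # the test set below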
81 | else: 82 | test_set.append(lsi_matrix[i, :]) 83 | test_tag.append(tag_list[i]) 84 | 85 | # 生成分类器 86 | predictor = svm_classify(train_set, train_tag, test_set, test_tag) 87 | x = open(path_tmp_predictor, 'wb') 88 | pkl.dump(predictor, x) 89 | x.close() 90 | else: 91 | print('=== 检测到分类器已经生成,跳过该阶段 ===') 92 | 93 | svm_result.reduce_result(dictionary, lsi_model, predictor, weibo_test) -------------------------------------------------------------------------------- /ml/svm_result.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals, print_function 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding('utf-8') 6 | import pandas as pd 7 | import re 8 | import numpy as np 9 | from gensim import corpora, models 10 | from scipy.sparse import csr_matrix 11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 12 | from sklearn import svm 13 | import numpy as np 14 | import os,re,time,logging 15 | import jieba 16 | import pickle as pkl 17 | from svm_utils import * 18 | 19 | path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪 20 | path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss11\\temp1' # 存放中间结果的位置 21 | path_dictionary = os.path.join(path_tmp, 'THUNews.dict') 22 | path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus') 23 | path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus') 24 | path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl') 25 | path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl') 26 | n = 2 # n 表示抽样率, n抽1 27 | # def convert_doc_to_wordlist(str_doc,cut_all): 28 | # sent_list = str_doc.split('\n') 29 | # sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000 30 | # word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词 31 | # word_list = sum(word_2dlist,[]) 32 | # return word_list 33 | # def rm_tokens(words): # 去掉一些停用次和数字 34 | # words_list = list(words) 35 | # stop_words = get_stop_words() 36 | # for i in range(words_list.__len__())[::-1]: 37 | # if words_list[i] in stop_words: # 去除停用词 38 | # words_list.pop(i) 39 | # elif words_list[i].isdigit(): 40 | # words_list.pop(i) 41 | # return words_list 42 | # def get_stop_words(path='stopwords_cn.txt'): 43 | # file = open(path,'rb').read().split('\n') 44 | # return set(file) 45 | # def rm_char(text): 46 | # text = re.sub('\u3000','',text) 47 | # return text 48 | # 49 | # def svm_classify(train_set,train_tag,test_set,test_tag): 50 | # 51 | # clf = svm.LinearSVC() 52 | # clf_res = clf.fit(train_set,train_tag) 53 | # train_pred = clf_res.predict(train_set) 54 | # test_pred = clf_res.predict(test_set) 55 | # 56 | # train_err_num, train_err_ratio = checkPred(train_tag, train_pred) 57 | # test_err_num, test_err_ratio = checkPred(test_tag, test_pred) 58 | # 59 | # print('=== 分类训练完毕,分类结果如下 ===') 60 | # print('训练集误差: {e}'.format(e=train_err_ratio)) 61 | # print('检验集误差: {e}'.format(e=test_err_ratio)) 62 | # 63 | # return clf_res 64 | # 65 | # 66 | # def checkPred(data_tag, data_pred): 67 | # if data_tag.__len__() != data_pred.__len__(): 68 | # raise RuntimeError('The length of data tag and data pred should be the same') 69 | # err_count = 0 70 | # for i in range(data_tag.__len__()): 71 | # if data_tag[i]!=data_pred[i]: 72 | # err_count += 1 73 | # err_ratio = err_count / data_tag.__len__() 74 | # return [err_count, err_ratio] 75 | 76 | 77 | def reduce_result(dictionary, lsi_model, predictor, weibo_test): 78 | # # # # 第五阶段, 对新文本进行判断 79 | if not dictionary: 80 | 
dictionary = corpora.Dictionary.load(path_dictionary) 81 | if not lsi_model: 82 | lsi_file = open(path_tmp_lsimodel,'rb') 83 | lsi_model = pkl.load(lsi_file) 84 | lsi_file.close() 85 | if not predictor: 86 | x = open(path_tmp_predictor,'rb') 87 | predictor = pkl.load(x) 88 | x.close() 89 | files = os.listdir(path_tmp_lsi) 90 | catg_list = [] 91 | for file in files: 92 | t = file.split('.')[0] 93 | if t not in catg_list: 94 | catg_list.append(t) 95 | 96 | demo_doc = weibo_test 97 | print(demo_doc) 98 | demo_doc = list(jieba.cut(demo_doc,cut_all=False)) 99 | demo_bow = dictionary.doc2bow(demo_doc) 100 | tfidf_model = models.TfidfModel(dictionary=dictionary) 101 | demo_tfidf = tfidf_model[demo_bow] 102 | demo_lsi = lsi_model[demo_tfidf] 103 | data = [] 104 | cols = [] 105 | rows = [] 106 | for item in demo_lsi: 107 | data.append(item[1]) 108 | cols.append(item[0]) 109 | rows.append(0) 110 | demo_matrix = csr_matrix((data,(rows,cols))).toarray() 111 | x = predictor.predict(demo_matrix) 112 | print('分类结果为:{x}'.format(x=catg_list[x[0]])) 113 | 114 | -------------------------------------------------------------------------------- /ml/svm_tfidf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import print_function 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding('utf-8') 6 | import pandas as pd 7 | import re 8 | import numpy as np 9 | from gensim import corpora, models 10 | from scipy.sparse import csr_matrix 11 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 12 | from sklearn import svm 13 | import numpy as np 14 | import os,re,time,logging 15 | import jieba 16 | import pickle as pkl 17 | 18 | import svm_lsi 19 | import os_path 20 | from svm_utils import * 21 | 22 | # path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪 23 | # path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss1\\temp1' # 存放中间结果的位置 24 | # path_dictionary = os.path.join(path_tmp, 'THUNews.dict') 25 | # path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus') 26 | # path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus') 27 | # path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl') 28 | # path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl') 29 | 30 | corpus_lsi = None 31 | lsi_model = None 32 | predictor = None 33 | 34 | 35 | # def convert_doc_to_wordlist(str_doc,cut_all): 36 | # sent_list = str_doc.split('\n') 37 | # sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000 38 | # word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词 39 | # word_list = sum(word_2dlist,[]) 40 | # return word_list 41 | # 42 | # 43 | # def rm_tokens(words): # 去掉一些停用次和数字 44 | # words_list = list(words) 45 | # stop_words = get_stop_words() 46 | # for i in range(words_list.__len__())[::-1]: 47 | # if words_list[i] in stop_words: # 去除停用词 48 | # words_list.pop(i) 49 | # elif words_list[i].isdigit(): 50 | # words_list.pop(i) 51 | # return words_list 52 | # 53 | # 54 | # def get_stop_words(path='stopwords_cn.txt'): 55 | # file = open(path,'rb').read().split('\n') 56 | # return set(file) 57 | # 58 | # 59 | # def rm_char(text): 60 | # text = re.sub('\u3000','',text) 61 | # return text 62 | 63 | 64 | def reduce_tfidf(dictionary, weibo_test): 65 | corpus_tfidf = None 66 | # # # # 第二阶段, 开始将文档转化成tfidf 67 | if not os.path.exists(path_tmp_tfidf): 68 | print('=== 未检测到有tfidf文件夹存在,开始生成tfidf向量 ===') 69 | # 如果指定的位置没有tfidf文档,则生成一个。如果有,则跳过该阶段 70 | if not dictionary: # 如果跳过了第一阶段,则从指定位置读取词典 71 | dictionary 
= corpora.Dictionary.load(path_dictionary) 72 | os.makedirs(path_tmp_tfidf) 73 | files = os_path.LoadFiles(path_doc_root) 74 | tfidf_model = models.TfidfModel(dictionary=dictionary) 75 | corpus_tfidf = {} 76 | for i, msg in enumerate(files): 77 | catg = msg[0] 78 | file = msg[1] 79 | word_list = convert_doc_to_wordlist(file, cut_all=False) 80 | file_bow = dictionary.doc2bow(word_list) 81 | file_tfidf = tfidf_model[file_bow] 82 | tmp = corpus_tfidf.get(catg, []) 83 | tmp.append(file_tfidf) 84 | if tmp.__len__() == 1: 85 | corpus_tfidf[catg] = tmp 86 | # 将tfidf中间结果储存起来 87 | catgs = list(corpus_tfidf.keys()) 88 | for catg in catgs: 89 | corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg), 90 | corpus_tfidf.get(catg), 91 | id2word=dictionary 92 | ) 93 | print('catg {c} has been transformed into tfidf vector'.format(c=catg)) 94 | print('=== tfidf向量已经生成 ===') 95 | else: 96 | print('=== 检测到tfidf向量已经生成,跳过该阶段 ===') 97 | 98 | svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test) -------------------------------------------------------------------------------- /ml/svm_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import print_function 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding('utf-8') 6 | import os 7 | import re 8 | import jieba 9 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 10 | from sklearn import svm 11 | 12 | path_doc_root = 'H:\py\workplace\/a2\SogouC.reduced2\\Reduced' # 根目录 即存放按类分类好的问本纪 13 | path_tmp = 'H:\py\workplace\/a2\SogouC.reduced2ss11\\temp1' # 存放中间结果的位置 14 | path_dictionary = os.path.join(path_tmp, 'THUNews.dict') 15 | path_tmp_tfidf = os.path.join(path_tmp, 'tfidf_corpus') 16 | path_tmp_lsi = os.path.join(path_tmp, 'lsi_corpus') 17 | path_tmp_lsimodel = os.path.join(path_tmp, 'lsi_model.pkl') 18 | path_tmp_predictor = os.path.join(path_tmp, 'predictor.pkl') 19 | 20 | def convert_doc_to_wordlist(str_doc,cut_all): 21 | sent_list = str_doc.split('\n') 22 | sent_list = map(rm_char, sent_list) # 去掉一些字符,例如\u3000 23 | word_2dlist = [rm_tokens(jieba.cut(part,cut_all=cut_all)) for part in sent_list] # 分词 24 | word_list = sum(word_2dlist,[]) 25 | return word_list 26 | 27 | 28 | def rm_tokens(words): # 去掉一些停用次和数字 29 | words_list = list(words) 30 | stop_words = get_stop_words() 31 | for i in range(words_list.__len__())[::-1]: 32 | if words_list[i] in stop_words: # 去除停用词 33 | words_list.pop(i) 34 | elif words_list[i].isdigit(): 35 | words_list.pop(i) 36 | return words_list 37 | 38 | 39 | def get_stop_words(path='stopwords_cn.txt'): 40 | file = open(path,'rb').read().split('\n') 41 | return set(file) 42 | 43 | 44 | def rm_char(text): 45 | text = re.sub('\u3000','',text) 46 | return text 47 | 48 | 49 | def svm_classify(train_set, train_tag, test_set, test_tag): 50 | clf = svm.LinearSVC() 51 | clf_res = clf.fit(train_set, train_tag) 52 | train_pred = clf_res.predict(train_set) 53 | test_pred = clf_res.predict(test_set) 54 | 55 | train_err_num, train_err_ratio = checkPred(train_tag, train_pred) 56 | test_err_num, test_err_ratio = checkPred(test_tag, test_pred) 57 | 58 | print('=== 分类训练完毕,分类结果如下 ===') 59 | print('训练集误差: {e}'.format(e=train_err_ratio)) 60 | print('检验集误差: {e}'.format(e=test_err_ratio)) 61 | 62 | return clf_res 63 | 64 | 65 | def checkPred(data_tag, data_pred): 66 | if data_tag.__len__() != data_pred.__len__(): 67 | raise RuntimeError('The length of data tag and data pred should be the same') 68 | err_count = 0 69 | for i in 
range(data_tag.__len__()): 70 | if data_tag[i]!=data_pred[i]: 71 | err_count += 1 72 | err_ratio = err_count / data_tag.__len__() 73 | return [err_count, err_ratio] -------------------------------------------------------------------------------- /ml/values.csv: -------------------------------------------------------------------------------- 1 | uid,similarity,platform,reputation,entropy 2 | 6034008530,0.0,4,0.31343283582089554,1.5167044950347657 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from weibospider import WeiboClient 3 | import sys 4 | reload(sys) 5 | sys.setdefaultencoding('utf-8') 6 | 7 | #cookies = \ 8 | #'ALF=1504271525; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgREMFA0bd3IqV3weK9ydf5NAbDXjKT69Rfndb2m9Ah4I.; SUB=_2A250hb_2DeRhGeNH7VIV9izNwj2IHXVXicG-rDV6PUNbktANLRPhkW1ZeSLr49kFNMgwrWThnh1bPUhWPw..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5KMhUgL.Fo-4So5XSozp1K22dJLoI0YLxK.LB.-L1K.LxKML12qLBK5LxKqL1KBLBo.LxK.LB-BL1KBLxKBLB.2LB.2LxK-LBonL1heLxKqLB-eLBKMt; SUHB=03oF_eQuesy4xQ; SSOLoginState=1501679526; _T_WM=544d051d212d2d6f3adece8b6949b373' 9 | 10 | cookies = 'ALF=1512959361; SCF=AlGHrwmWqyhSdpml9a836b5TfwBwT3_aqlPQLm4VGPX5AnF7W-51O8sb-246XgliUA_jtEUQg3I0XisboShzSK4.; SUB=_2A253Ah7JDeRhGeNH7VIV9izNwj2IHXVUDKKBrDV6PUJbktAKLUehkW02ueHV00_NzZ0DwjSbUFYBB6B69g..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5K-hUgL.Fo-4So5XSozp1K22dJLoI0YLxK.LB.-L1K.LxKML12qLBK5LxKqL1KBLBo.LxK.LB-BL1KBLxKBLB.2LB.2LxK-LBonL1heLxKqLB-eLBKMt; SUHB=0M2Flmef-A-AIV; SSOLoginState=1510370969; _T_WM=28de7b7a225087a87cbe9c2ad92df1ea' 11 | 12 | if __name__ == '__main__': 13 | pe0 = WeiboClient(cookies=cookies) 14 | pe2 = pe0.Weibo('E6iRJofK6') 15 | pe4 = pe0.People('1884866222') 16 | print(pe4) 17 | print(pe4.name) 18 | print(pe4.weibo_count) 19 | print(pe4.location) 20 | pe2_people = pe2.author 21 | print('______test________') 22 | print(pe2_people.name) 23 | print(pe2_people.weibo_count) 24 | for i, j in zip(range(3), pe2_people.weibo): 25 | print(j.text) 26 | for ii, jj in zip(range(4), j.repost): 27 | print(jj.author_name) 28 | 29 | 30 | -------------------------------------------------------------------------------- /weibospider/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from .attitude import Attitude 4 | from .base import SinaBaseObject 5 | from .comment import Comment 6 | from .client import WeiboClient 7 | from .people import People 8 | from .repost import Repost 9 | from .weibo import Weibo 10 | 11 | __all__ = [ 12 | 'Attitude', 'SinaBaseObject', 'Comment', 'People', 'Repost', 'Weibo', 'WeiboClient' 13 | ] -------------------------------------------------------------------------------- /weibospider/attitude.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals, print_function 3 | from utils import * 4 | import weibo 5 | from base import SinaBaseObject 6 | import sys 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | 10 | 11 | class Attitude(SinaBaseObject): 12 | """ 13 | 点赞类,一般不直接使用,而是作为`Answer.attitude`迭代器的返回类型 14 | """ 15 | 16 | def __init__(self, id, cache={}): 17 | super(Attitude, self).__init__() 18 | self.uid = str(id) 19 | self._cache = cache 20 | self.author_name = cache['author_name'] 21 | self.time 
= cache['time'] 22 | -------------------------------------------------------------------------------- /weibospider/base.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from __future__ import unicode_literals, print_function 4 | import requests 5 | import json 6 | import time as tt 7 | from bs4 import BeautifulSoup 8 | import sys 9 | import re 10 | 11 | reload(sys) 12 | sys.path.append('../') 13 | sys.setdefaultencoding('utf-8') 14 | 15 | 16 | # headers_for_get = { 17 | # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 18 | # 'Accept-Encoding':'gzip, deflate, sdch', 19 | # 'Accept-Language':'zh-CN,zh;q=0.8', 20 | # 'Cache-Control':'max-age=0', 21 | # 'Connection':'keep-alive', 22 | # 'Cookie': '_T_WM=0ff248d78f4984aa135c5b2e53c11079; ALF=1495630107; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgVvrJ48ic42g3Xqe49zEjKtpWuFcU6KaL2lKIyLzY43s.; SUB=_2A251-YQQDeRhGeNH7VIV9izNwj2IHXVXBSxYrDV6PUJbktBeLUn6kW0ntTSLDvUTciwLCGGI3rSIiDX8jQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhaydrjX2CLPhFdjQ77gn4P5JpX5o2p5NHD95Qf1Kq7ShqEeK.pWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNSK.cehBceo24eBtt; SUHB=0mxUEyUKiYW96L; SSOLoginState=1493038144', 23 | # 'Host':'weibo.cn', 24 | # 'Upgrade-Insecure-Requests':'1', 25 | # 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' 26 | # } 27 | 28 | 29 | class SinaBaseObject(object): 30 | """ 31 | 所有新浪类的基类 32 | :TODO 刷新cookie策略 33 | """ 34 | # 静态变量cookies 35 | cookies = '' 36 | headers_for_get = { 37 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 38 | 'Accept-Encoding': 'gzip, deflate, sdch', 39 | 'Accept-Language': 'zh-CN,zh;q=0.8', 40 | 'Cache-Control': 'max-age=0', 41 | 'Connection': 'keep-alive', 42 | 'Host': 'weibo.cn', 43 | 'Upgrade-Insecure-Requests': '1', 44 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' 45 | } 46 | _session = requests.Session() 47 | 48 | def __init__(self): 49 | self._time_delay = 1 50 | 51 | # TODO:给session的get方法增加日志的功能 52 | # 对requests.get()函数进行改进,增加重试和报错功能 53 | def retry_requests(self, url, uid=''): 54 | """ 55 | :param url: 待爬取的链接 56 | :param headers: 请求头 57 | :param uid: 帖子或用户的uid值(str类型) 58 | :return: requests_content 爬起的页面源码(bs4类型) 59 | """ 60 | # 设置重试次数 61 | retry_count = 3 62 | while retry_count != 0: 63 | try: 64 | requests_get = requests.get(url, headers=self.headers_for_get, cookies=self.cookies, timeout=3) 65 | requests_content = requests_get.content 66 | # 加入了__T_WM的cookie项后仍然无法在wap版微博登录 67 | # 只能采用网页版 68 | # if self.flag == 0: 69 | # cookies = requests.utils.dict_from_cookiejar(self.cookies) 70 | # requests_get.cookies = requests.utils.dict_from_cookiejar(requests_get.cookies) 71 | # cookies = dict(cookies.items() + requests_get.cookies.items()) 72 | # self.cookies = requests.utils.cookiejar_from_dict(cookies) 73 | # self.flag = 1 74 | # print(self.cookies) 75 | print(url) 76 | print(requests_content) 77 | print(requests_get.status_code) 78 | # 记录发起网络请求的url及时间 79 | with open('log_url.txt', 'a') as f: 80 | f.write(str(url) + ' ' + str(tt.strftime("%Y-%m-%d %H:%M:%S", tt.localtime())) + '\n') 81 | 82 | requests_content = BeautifulSoup(requests_content, "lxml") 83 | return requests_content 84 | except: 85 | tt.sleep(3) 86 | print("获取" + str(uid) + "页面时失败,正在重试。。。") 87 | print(requests_get.status_code) 88 | finally: 89 | retry_count -= 1 90 | if 
retry_count == 0: 91 | raise Exception("重试次数已完,仍获取" + str(uid) + "的页面失败!") 92 | 93 | @staticmethod 94 | def retry_requests_static(url, headers={}, timeout=3): 95 | """ 96 | :param url: 待爬取的链接 97 | :param headers: 请求头 98 | :param uid: 帖子或用户的uid值(str类型) 99 | :return: requests_content 爬起的页面源码(bs4类型) 100 | """ 101 | # 设置重试次数 102 | retry_count = 3 103 | while retry_count != 0: 104 | try: 105 | tt.sleep(3) 106 | print(url) 107 | requests_get = requests.get(url, headers=headers, timeout=3) 108 | print(url) 109 | # 记录发起网络请求的url及时间 110 | with open('log_static_url.txt', 'a') as f: 111 | f.write(str(url) + ' ' + str(tt.strftime("%Y-%m-%d %H:%M:%S", tt.localtime())) + '\n') 112 | return requests_get 113 | except: 114 | tt.sleep(3) 115 | print("获取页面时失败,正在重试。。。") 116 | finally: 117 | retry_count -= 1 118 | if retry_count == 0: 119 | raise Exception, "重试次数已完,仍获取页面失败!" 120 | 121 | 122 | if __name__ == '__main__': 123 | print(111) 124 | -------------------------------------------------------------------------------- /weibospider/client.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from __future__ import unicode_literals, print_function 3 | import requests 4 | from utils import * 5 | import os 6 | import importlib 7 | import base 8 | 9 | 10 | class WeiboClient(object): 11 | """ 12 | 微博客户端类 维护网络会话 使用cookies登录 13 | """ 14 | 15 | def __init__(self, cookies=None): 16 | self._session = requests.Session() 17 | self._session.verify = False 18 | self._session.headers.update(Default_Header) 19 | self._session.mount('http://', self._create_adapter()) 20 | if cookies is not None: 21 | self.login_with_acquired_cookies(cookies) 22 | else: 23 | raise SystemError("未传入cookies") 24 | 25 | @staticmethod 26 | def _create_adapter(): 27 | return requests.adapters.HTTPAdapter( 28 | max_retries=requests.adapters.Retry( 29 | total=5, 30 | status_forcelist=[403, 404, 408, 500, 502], 31 | ) 32 | ) 33 | 34 | def login_with_acquired_cookies(self, cookies): 35 | """ 36 | note: 37 | 保存在文件中的cookies形式为chrome浏览器F12后NetWork中Headers里的形式 38 | 如: 39 | 'ALF=1501159357; SCF=AjsEaVa0e8KjEg3yEjwEx270PLOpYvK-1BhV7AdkMSQgUozbT8VN9e7zDppTz6FZs5PD6E5VoJ3e0J 40 | yOHFF-HIw.; SUB=_2A250ViLtDeThGeBP4lQW-CbLyTqIHXVXuU6lrDV6PUJbktANLWLBkW2HmYSKxGkq2uS0728TOqfHWar_RQ..; 41 | SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhzhoVOn6pkLuGbnO5GBEu35JpX5o2p5NHD95QceK.cS0nRS0zcWs4DqcjMi-- 42 | NiK.Xi-2Ri--ciKnRi-zNSo24SoMR1hMESntt; SUHB=0FQ7hD651l5Cff; _T_WM=55ac8f6c31f4eb6f286ad2e9ed8d729' 43 | """ 44 | # 若文件目录下存在cookies,则其为文件, 打开后获取 45 | # 否则为cookies字符串,直接获取 46 | if os.path.isfile(cookies): 47 | with open(cookies, 'r') as f: 48 | cookies = f.read() 49 | 50 | cookies_dict = {} 51 | # 将cookies字符串转为字典 52 | for item in cookies.split('; '): 53 | cookies_dict[item.split('=')[0]] = item.split('=')[1] 54 | self._session.cookies.update(cookies_dict) 55 | # cookies2 = requests.utils.cookiejar_from_dict(cookies_dict) 56 | base._session = self._session 57 | 58 | def __getattr__(self, item): 59 | """本函数为类工厂模式,用于获取各种类的实例,如 `Answer` `Question` 等. 60 | :支持的形式有: 61 | 1. client.me() (暂未实现) 62 | 2. client.weibo() 63 | 3. client.people() 64 | 4. client.comment() 65 | 5. client.attitude() 66 | 6. 
client.repost() 67 | 参数均为对应的id,返回对应的类的实例。 68 | """ 69 | # 回调对应模块的构造函数 70 | base.SinaBaseObject._session = self._session 71 | 72 | def callback_getattr(id): 73 | # 类名第一个字母大写 74 | return getattr(module, item.capitalize())(id) 75 | # TODO: 增加me 76 | attr_list = ['me', 'weibo', 'people', 'comment', 'attitude', 'repost'] 77 | if item.lower() in attr_list: 78 | module = importlib.import_module('.'+item.lower(), 'weibospider') 79 | return callback_getattr 80 | 81 | 82 | if __name__ == '__main__': 83 | a = WeiboClient(cookies='as=12') 84 | a.sina_weibo('666') 85 | -------------------------------------------------------------------------------- /weibospider/comment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals, print_function 3 | from utils import * 4 | import weibo 5 | from base import SinaBaseObject 6 | import sys 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | 10 | 11 | class Comment(SinaBaseObject): 12 | """ 13 | 评论类,一般不直接使用,而是作为`Answer.comment`迭代器的返回类型 14 | """ 15 | 16 | def __init__(self, id, cache={}): 17 | super(Comment, self).__init__() 18 | self.uid = str(id) 19 | self._cache = cache 20 | self.attitude_count = cache['attitude_count'] 21 | self.author_name = cache['author_name'] 22 | self.author_uid = cache['author_uid'] 23 | self.content = cache['text'] 24 | self.is_hot = cache['is_hot'] 25 | self.terminal_source = cache['terminal_source'] 26 | self.text = cache['text'] 27 | self.time = cache['time'] 28 | 29 | 30 | 31 | 32 | # @property 33 | # @normal_attr 34 | # def _soup(self): 35 | # return self._cache['_soup'] 36 | # 37 | # # 获取微博作者的昵称和uid 38 | # def _get_author_data(self): 39 | # # self.author_name = self._soup.find(attrs={'id': 'M_'}).div.a.get_text() 40 | # # self._cache.setdefault('author_name', self.author_name) 41 | # 42 | # self.author_uid = self._soup.find(attrs={'id': 'M_'}).div.a.attrs['href'].split('/')[-1] 43 | # self._cache.setdefault('author_uid', self.author_uid) 44 | # 45 | # @property 46 | # @other_obj(class_name='people', name_in_json='people') 47 | # def author(self): 48 | # pass 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /weibospider/people.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals, print_function 3 | import time as tt 4 | import pymongo 5 | from utils import * 6 | from bs4 import BeautifulSoup 7 | import re 8 | import requests 9 | import weibo 10 | from base import SinaBaseObject 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf-8') 14 | pattern = re.compile(r'\d+') 15 | 16 | 17 | class People(SinaBaseObject): 18 | """ 19 | 新浪微博的用户类 20 | """ 21 | def __init__(self, id, href=None, cache={}): 22 | """ 23 | 24 | { 25 | uid: 5501547091, 26 | name: 助人为乐的英逸, 27 | fans_count: 285, 28 | follow_count: 1500, 29 | weibo_count: 1335, 30 | time_delay: 1, 31 | birthday: 未知, 32 | sex: 男, 33 | location: 江西, 34 | href: http://weibo.cn/5501547091/follow, 35 | :param uid: 36 | :param href: 37 | """ 38 | super(People, self).__init__() 39 | self.uid = str(id) 40 | self._cache = cache 41 | self.href = href 42 | self.birthday = '未知' 43 | self.member_level = '' 44 | self.follow_list = [] 45 | self.fans_list = [] 46 | self.is_V = False 47 | self.uid = self.uid.strip('\n') 48 | if not self.href: 49 | self.href = 'http://weibo.cn/'+self.uid 50 | if not self.uid: 51 | self.uid = self.href.split('cn/') 
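        # The properties below (html, _soup, name, sex, location, weibo_count,
        # follow_count, fans_count) are wrapped in @normal_attr, so the profile
        # page is fetched and parsed once and later reads come from self._cache.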
52 | 53 | @property 54 | def basic_url(self): 55 | return 'http://weibo.cn/u/' + str(self.uid) 56 | 57 | @property 58 | @normal_attr() 59 | def html(self): 60 | return self._session.get(self.href).content 61 | 62 | @property 63 | @normal_attr() 64 | def _soup(self): 65 | return BeautifulSoup(self.html, "lxml") 66 | 67 | @property 68 | @normal_attr() 69 | def _info_content(self): 70 | try: 71 | info_content = self._soup.find('div', attrs={'class': 'u'}).table.tr.findAll('td' 72 | )[1].div.span.contents[0] 73 | except AttributeError: 74 | return False 75 | return info_content 76 | 77 | @property 78 | @normal_attr() 79 | def _info_content_2(self): 80 | """ 81 | 若用户为大V 则有大V标志的图片 影响页面标签 82 | 故此时个人信息页面块实际为`_info_content_2` 83 | """ 84 | return self._soup.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.get_text() 85 | 86 | @property 87 | @normal_attr() 88 | def name(self): 89 | return self._info_content.split(' ')[0].strip() 90 | 91 | @property 92 | @normal_attr() 93 | def sex(self): 94 | try: 95 | sex = self._info_content.split(' ')[1].split('/')[0].strip() 96 | except IndexError: 97 | sex = self._info_content_2.split('/')[0].strip()[-1:].strip() 98 | return sex 99 | 100 | @property 101 | @normal_attr() 102 | def location(self): 103 | try: 104 | location = self._info_content.split(' ')[1].split('/')[1].strip() 105 | except IndexError: 106 | # 将大V标志为真 107 | self.is_V = True 108 | location = self._info_content_2.split('/')[1].strip()[:3].strip() 109 | return location 110 | 111 | @property 112 | @normal_attr() 113 | def weibo_count(self): 114 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}). 115 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[0]) 116 | 117 | @property 118 | @normal_attr() 119 | def follow_count(self): 120 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}). 121 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[1]) 122 | 123 | @property 124 | @normal_attr() 125 | def fans_count(self): 126 | return int(re.findall(pattern, self._soup.find('div', attrs={'class': 'u'}). 
127 | findAll('div', attrs={'class': 'tip2'})[0].get_text())[2]) 128 | 129 | def _get_member_list(self, target_member_type): 130 | """ 131 | 获取所指定的当前用户的关注/粉丝列表 132 | 每个被关注者或粉丝的信息存储在dict中 133 | :param required_member_count: 指定获取用户的数量 134 | :param time_delay: 延迟时间 135 | :param target_member_type: 指定获取用户的种类:fans或follow 136 | :return: member_list: 存放已获取的用户列表 137 | 138 | """ 139 | # TODO: 获取人物基本信息 140 | member_url = 'http://weibo.cn/' + str(self.uid) + '/' + str(target_member_type) 141 | self.href = member_url 142 | print("now is crawling " + str(member_url)) 143 | page_count = 1 144 | now_page_count = 1 145 | is_first = True 146 | while True: 147 | 148 | tt.sleep(self.time_delay) 149 | # 获取页面源码(bs4对象) 150 | requests_content = BeautifulSoup(self._session.get(member_url).content, "lxml") 151 | 152 | # 获取当前页的关注列表 153 | unit_list = requests_content.find_all('table') 154 | for i in unit_list: 155 | # 每个用户的信息以dict存储 156 | member = {} 157 | member['href'] = str(i.tr.td.a.attrs['href']) 158 | try: 159 | member['uid'] = i.tr.td.a.attrs['href'].split('u/')[1] 160 | except: 161 | member['uid'] = i.tr.td.a.attrs['href'].split('cn/')[1] 162 | member['name'] = i.tr.find_all('td')[1].a.get_text() 163 | # 正则匹配获取粉丝的粉丝数 164 | pattern = re.compile(r'\d+') 165 | # 若粉丝是大V,则多了一个图片标签 166 | try: 167 | member['is_v'] = False 168 | member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[2])[0]) 169 | except: 170 | member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[3])[0]) 171 | member['is_v'] = True 172 | 173 | yield member 174 | 175 | # 若是第一页,则获取总页数 176 | if is_first is True: 177 | # 若发现‘x/y页’ 则有不止一页 178 | if requests_content.find(attrs={'id': 'pagelist'}): 179 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip() 180 | page_count = page_count.split('/')[1] 181 | pattern = re.compile(r'\d+') 182 | page_count = int(re.findall(pattern, page_count)[0]) 183 | else: 184 | return 185 | is_first = False 186 | 187 | now_page_count += 1 188 | if now_page_count >= page_count: 189 | return 190 | 191 | member_url = 'http://weibo.cn/' + str(self.uid)+'/'+str(target_member_type)+'?page=' + str(now_page_count) 192 | 193 | @property 194 | @normal_attr() 195 | def fans(self): 196 | """ 197 | 获取当前用户的粉丝列表 198 | :param required_member_count: 限定获取的数量 199 | :param time_delay: 时间延迟 200 | :return: 指定数量的粉丝基本信息列表 201 | .. code-block:: python 202 | [ 203 | { 204 | 'fans_count': 104, 205 | 'is_v' : False, 206 | 'href': 'http://weibo.cn/u/5977488639', 207 | 'uid': 5977488639, 208 | 'name': '小山环环1996' 209 | }, 210 | { 211 | 'fans_count': 10, 212 | 'is_v' : False, 213 | 'href': 'http://weibo.cn/u/6187915152', 214 | 'uid': 6187915152, 215 | 'name': '08iCu京伯' 216 | }, 217 | ] 218 | """ 219 | for x in self._get_member_list(target_member_type='fans'): 220 | yield x 221 | 222 | @property 223 | @normal_attr() 224 | def follow(self): 225 | """ 226 | 获取当前用户的关注列表 227 | :param required_member_count: 限定获取的数量 228 | :param time_delay: 时间延迟 229 | :return: 指定数量的关注基本信息列表 230 | .. 
code-block:: python 231 | [ 232 | { 233 | 'fans_count': 104, 234 | 'is_v' : False, 235 | 'href': 'http://weibo.cn/u/5977488639', 236 | 'uid': 5977488639, 237 | 'name': '小山环环1996' 238 | }, 239 | { 240 | 'fans_count': 10, 241 | 'is_v' : False, 242 | 'href': 'http://weibo.cn/u/6187915152', 243 | 'uid': 6187915152, 244 | 'name': '08iCu京伯' 245 | }, 246 | ] 247 | """ 248 | for x in self._get_member_list(target_member_type='follow'): 249 | yield x 250 | 251 | @property 252 | @other_obj() 253 | def weibo(self): 254 | """ 255 | 获取指定用户的微博 256 | :param required_weibo_count: 所需的微博条数 257 | :param time_delay: 时间延迟 258 | :return: weibo_list 元素为SinaWeibo对象 259 | .. code-block:: python 260 | [ 261 | { 262 | 'uid': 'EpO2KnAor', 263 | 'is_repost': False, 264 | 'text': '物是人非.', 265 | 'attitude_count' : 0, 266 | 'repost_count': 7, 267 | 'comment_count': 0, 268 | 'time': '01月08日 04:44' 269 | 'terminal_source': 'iPad mini' 270 | }, 271 | { 272 | 'uid': 'EAJwkph8X', 273 | 'is_repost': False, 274 | 'text': '祝你生日快乐', 275 | 'attitude_count' : 0, 276 | 'repost_count': 0, 277 | 'comment_count': 1, 278 | 'time': '2016-12-30 23:34:34' 279 | 'terminal_source': '生日动态' 280 | }, 281 | ] 282 | """ 283 | 284 | weibo_url = self.basic_url 285 | page_count = 1 286 | now_page_count = 1 287 | is_first = True 288 | pattern = re.compile(r'\d+') 289 | 290 | while True: 291 | tt.sleep(self._time_delay) 292 | # 获取页面源码(bs4对象) 293 | requests_content = BeautifulSoup(self._session.get(weibo_url).content, "lxml") 294 | # 获取当前页的微博列表 295 | unit_list = requests_content.find_all('div', attrs={'class': 'c'}) 296 | for i in unit_list: 297 | # 每个微博的信息以微博类SinaWeibo存储 298 | try: 299 | if str(i.attrs['id']) and str(i.attrs['id']).startswith('M'): 300 | weibo_uid = i.attrs['id'].split('_')[1] 301 | else: 302 | continue 303 | except: 304 | continue 305 | 306 | # 检查是否为转发的微博 307 | if len(i.div.find_all('span')) >= 2: 308 | is_repost = True 309 | else: 310 | is_repost = False 311 | # for c in i.div.find_all('span'): 312 | # if str(c.attrs['class']) == "['cmt']": 313 | # is_repost = True 314 | if is_repost: 315 | text = i.div.find_all('span')[0].get_text()+i.div.find_all('span')[1].get_text() 316 | else: 317 | text = i.div.span.get_text() 318 | 319 | # 有的微博处html格式不对 320 | try: 321 | attitude_count = int(re.findall(pattern, i.div.find_all('a')[-4].get_text())[0]) 322 | repost_count = int(re.findall(pattern, i.div.find_all('a')[-3].get_text())[0]) 323 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-2].get_text())[0]) 324 | except IndexError: 325 | try: 326 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-3].get_text())[0]) 327 | repost_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-4].get_text())[0]) 328 | attitude_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-5].get_text())[0]) 329 | except IndexError: 330 | attitude_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[0]) 331 | repost_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[1]) 332 | comment_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[2]) 333 | # print(attitude_count, repost_count, comment_count) 334 | try: 335 | time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0] 336 | terminal_source = i.div.find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1] 337 | except IndexError: 338 | time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0] 339 | 
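                    # some posts have no '来自' (source) segment after the timestamp,
                    # so fall back to '暂无' when the split below fails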
try: 340 | terminal_source = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1] 341 | except IndexError: 342 | terminal_source = '暂无' 343 | # print(time, terminal_source) 344 | weibo_cache = { 345 | "is_repost": is_repost, 346 | "text": text, 347 | "attitude_count": attitude_count, 348 | "repost_count": repost_count, 349 | "comment_count": comment_count, 350 | "time": time, 351 | "terminal_source": terminal_source 352 | } 353 | self.now_weibo_cache = weibo_cache 354 | self.now_weibo_uid = weibo_uid 355 | yield weibo.Weibo(id=weibo_uid, cache=weibo_cache) 356 | is_repost = False 357 | 358 | # 若是第一页,则获取总页数 359 | if is_first: 360 | # 若发现‘x/y页’ 则有不止一页 361 | if requests_content.find(attrs={'id': 'pagelist'}): 362 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip() 363 | page_count = page_count.split('/')[1] 364 | page_count = int(re.findall(pattern, page_count)[0]) 365 | else: 366 | return 367 | is_first = False 368 | 369 | now_page_count += 1 370 | if now_page_count > page_count: 371 | return 372 | 373 | weibo_url = 'http://weibo.cn/u/' + str(self.uid) + '?page=' + str(now_page_count) 374 | 375 | # def get_weibo_list(self): 376 | # """ 377 | # 获取指定用户的微博 378 | # :param required_weibo_count: 所需的微博条数 379 | # :param time_delay: 时间延迟 380 | # :return: weibo_list 元素为SinaWeibo对象 381 | # .. code-block:: python 382 | # [ 383 | # { 384 | # 'uid': 'EpO2KnAor', 385 | # 'is_repost': False, 386 | # 'text': '物是人非.', 387 | # 'attitude_count' : 0, 388 | # 'repost_count': 7, 389 | # 'comment_count': 0, 390 | # 'time': '01月08日 04:44' 391 | # 'terminal_source': 'iPad mini' 392 | # }, 393 | # { 394 | # 'uid': 'EAJwkph8X', 395 | # 'is_repost': False, 396 | # 'text': '祝你生日快乐', 397 | # 'attitude_count' : 0, 398 | # 'repost_count': 0, 399 | # 'comment_count': 1, 400 | # 'time': '2016-12-30 23:34:34' 401 | # 'terminal_source': '生日动态' 402 | # }, 403 | # ] 404 | # """ 405 | # required_weibo_count = self.required_weibo_count 406 | # weibo_url = self.basic_url 407 | # weibo_list = [] 408 | # weibo_count = 0 409 | # page_count = 1 410 | # now_page_count = 1 411 | # is_first = True 412 | # pattern = re.compile(r'\d+') 413 | # while True: 414 | # 415 | # tt.sleep(self.time_delay) 416 | # # 获取页面源码(bs4对象) 417 | # requests_content = self.retry_requests(weibo_url, uid=self.uid) 418 | # 419 | # # 获取当前页的微博列表 420 | # unit_list = requests_content.find_all('div', attrs={'class': 'c'}) 421 | # for i in unit_list: 422 | # # 每个微博的信息以微博类SinaWeibo存储 423 | # try: 424 | # if str(i.attrs['id']) and str(i.attrs['id']).startswith('M'): 425 | # weibo_uid = i.attrs['id'].split('_')[1] 426 | # else: 427 | # continue 428 | # except: 429 | # continue 430 | # weibo = sina_weibo.SinaWeibo(uid=weibo_uid, required_count=0) 431 | # 432 | # # 检查是否为转发的微博 433 | # for c in i.div.find_all('span'): 434 | # if str(c.attrs['class']) == "['cmt']": 435 | # weibo.is_repost = True 436 | # if weibo.is_repost: 437 | # weibo.text = i.div.find_all('span')[0].get_text()+i.div.find_all('span')[1].get_text() 438 | # else: 439 | # weibo.text = i.div.span.get_text()[1:] 440 | # 441 | # weibo.uid = weibo_uid 442 | # 443 | # # 有的微博处html格式不对 444 | # try: 445 | # weibo.attitude_count = int(re.findall(pattern, i.div.find_all('a')[-4].get_text())[0]) 446 | # weibo.repost_count = int(re.findall(pattern, i.div.find_all('a')[-3].get_text())[0]) 447 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-2].get_text())[0]) 448 | # except IndexError: 449 | # print(weibo_uid) 
450 | # print(weibo.author_uid) 451 | # try: 452 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-3].get_text())[0]) 453 | # weibo.repost_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-4].get_text())[0]) 454 | # weibo.attitude_count = int(re.findall(pattern, i.find_all('div')[-1].find_all('a')[-5].get_text())[0]) 455 | # except IndexError: 456 | # weibo.attitude_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[0]) 457 | # weibo.repost_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[1]) 458 | # weibo.comment_count = int(re.findall(pattern, i.find_all('div')[-1].get_text())[2]) 459 | # print(weibo.attitude_count, weibo.repost_count, weibo.comment_count) 460 | # try: 461 | # weibo.time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0] 462 | # weibo.terminal_source = i.div.find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1] 463 | # except IndexError: 464 | # print(i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text()) 465 | # weibo.time = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[0] 466 | # try: 467 | # weibo.terminal_source = i.find_all('div')[-1].find_all('span', attrs={'class': 'ct'})[0].get_text().split('来自')[1] 468 | # except IndexError: 469 | # weibo.terminal_source = '暂无' 470 | # print(weibo.time, weibo.terminal_source) 471 | # # 计数器加一 472 | # weibo_count += 1 473 | # # 若超过了要求获取的用户数量,则返回 474 | # if weibo_count > required_weibo_count: 475 | # return weibo_list 476 | # weibo_list.append(weibo) 477 | # 478 | # # 若是第一页,则获取总页数 479 | # if is_first: 480 | # # 若发现‘x/y页’ 则有不止一页 481 | # if requests_content.find(attrs={'id': 'pagelist'}): 482 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip() 483 | # page_count = page_count.split('/')[1] 484 | # page_count = int(re.findall(pattern, page_count)[0]) 485 | # print(page_count) 486 | # else: 487 | # return weibo_list 488 | # is_first = False 489 | # 490 | # now_page_count += 1 491 | # if now_page_count > page_count: 492 | # break 493 | # 494 | # weibo_url = 'http://weibo.cn/u/' + str(self.uid) + '?page=' + str(now_page_count) 495 | # 496 | # return weibo_list 497 | 498 | # def get_personal_information(self): 499 | # """ 500 | # 注:新浪有奇怪的BUG 带cookies访问http://weibo.cn/3193031501/info这类个人资料url时,总是File not found 501 | # 若不带cookies则不能访问该页 502 | # 所以只能获取个人主页简单的性别和地点信息 503 | # 504 | # @2017/06/12: 505 | # 新浪允许不带cookie访问某些页面,如某个微博页面 506 | # 而对另一些页面 如个人主页的详细情况,则有的用户需要cookie,有的不需要。 507 | # :return: 508 | # """ 509 | # requests_content = self.retry_requests(self.href) 510 | # try: 511 | # info_content = requests_content.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.contents[0] 512 | # except AttributeError: 513 | # print(requests_content) 514 | # return False 515 | # # 此处split(' ')中的空格不是一般的空格 需要在原网页中复制 516 | # # 普通用户无图片标签 517 | # self.name = info_content.split(' ')[0].strip() 518 | # print(self.name) 519 | # try: 520 | # self.sex = info_content.split(' ')[1].split('/')[0].strip() 521 | # print(self.sex) 522 | # self.location = info_content.split(' ')[1].split('/')[1].strip() 523 | # print(self.name, self.sex, self.location) 524 | # except IndexError: 525 | # self.is_V = True 526 | # info2 = requests_content.find('div', attrs={'class': 'u'}).table.tr.findAll('td')[1].div.span.get_text() 527 | # self.sex = info2.split('/')[0].strip()[-1:].strip() 528 | # print(self.sex) 529 | # 
self.location = info2.split('/')[1].strip()[:3].strip() 530 | # print(self.name, self.sex, self.location) 531 | # 532 | # # 获取该用户的微博数 关注数 粉丝数 533 | # self.weibo_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}). 534 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[0]) 535 | # self.follow_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}). 536 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[1]) 537 | # self.fans_count = int(re.findall(pattern, requests_content.find('div', attrs={'class': 'u'}). 538 | # findAll('div', attrs={'class': 'tip2'})[0].get_text())[2]) 539 | # print(self.weibo_count, self.follow_count, self.fans_count) 540 | 541 | # def __get_member_list__(self, target_member_type='fans'): 542 | # """ 543 | # 获取所指定的当前用户的关注/粉丝列表 544 | # 每个被关注者或粉丝的信息存储在dict中 545 | # :param required_member_count: 指定获取用户的数量 546 | # :param time_delay: 延迟时间 547 | # :param target_member_type: 指定获取用户的种类:fans或follow 548 | # :return: member_list: 存放已获取的用户列表 549 | # 550 | # 551 | # """ 552 | # required_member_count = self.required_member_count 553 | # member_url = 'http://weibo.cn/' + str(self.uid) + '/' + str(target_member_type) 554 | # self.href = member_url 555 | # print(member_url) 556 | # member_list = [] 557 | # member_count = 0 558 | # page_count = 1 559 | # now_page_count = 1 560 | # is_first = True 561 | # while True: 562 | # 563 | # tt.sleep(self.time_delay) 564 | # # 获取页面源码(bs4对象) 565 | # requests_content = self.retry_requests(member_url, uid=self.uid) 566 | # 567 | # # 获取当前页的关注列表 568 | # unit_list = requests_content.find_all('table') 569 | # for i in unit_list: 570 | # # 每个用户的信息以dict存储 571 | # member = {} 572 | # member['href'] = str(i.tr.td.a.attrs['href']) 573 | # try: 574 | # member['uid'] = i.tr.td.a.attrs['href'].split('u/')[1] 575 | # except: 576 | # member['uid'] = i.tr.td.a.attrs['href'].split('cn/')[1] 577 | # member['name'] = i.tr.find_all('td')[1].a.get_text() 578 | # # 正则匹配获取粉丝的粉丝数 579 | # pattern = re.compile(r'\d+') 580 | # # 若粉丝是大V,则多了一个图片标签 581 | # try: 582 | # member['is_v'] = False 583 | # member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[2])[0]) 584 | # except: 585 | # member['fans_count'] = int(re.findall(pattern, i.tr.find_all('td')[1].contents[3])[0]) 586 | # member['is_v'] = True 587 | # print(member['name']) 588 | # print(member['fans_count']) 589 | # # 计数器加一 590 | # member_count += 1 591 | # # 若超过了要求获取的用户数量,则返回 592 | # if member_count > required_member_count: 593 | # return member_list 594 | # member_list.append(member) 595 | # 596 | # # 若是第一页,则获取总页数 597 | # if is_first is True: 598 | # # 若发现‘x/y页’ 则有不止一页 599 | # if requests_content.find(attrs={'id': 'pagelist'}): 600 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip() 601 | # page_count = page_count.split('/')[1] 602 | # pattern = re.compile(r'\d+') 603 | # page_count = int(re.findall(pattern, page_count)[0]) 604 | # print(page_count) 605 | # else: 606 | # return member_list 607 | # is_first = False 608 | # 609 | # now_page_count += 1 610 | # if now_page_count >= page_count: 611 | # break 612 | # 613 | # member_url = 'http://weibo.cn/' + str(self.uid)+'/'+str(target_member_type)+'?page=' + str(now_page_count) 614 | # print(member_url) 615 | # print(self.uid) 616 | # print(target_member_type) 617 | # print("以上") 618 | # 619 | # return member_list -------------------------------------------------------------------------------- /weibospider/repost.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals, print_function 3 | from utils import * 4 | import weibo 5 | from base import SinaBaseObject 6 | import sys 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | 10 | 11 | class Repost(SinaBaseObject): 12 | """ 13 | Repost class. Usually not instantiated directly; it is the item type yielded by the `Weibo.repost` generator. 14 | """ 15 | 16 | def __init__(self, id, cache={}): 17 | super(Repost, self).__init__() 18 | self.uid = str(id) 19 | self._cache = cache 20 | self.author_name = cache['author_name'] 21 | self.text = cache['text'] 22 | -------------------------------------------------------------------------------- /weibospider/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals, print_function 3 | from bs4 import BeautifulSoup 4 | import functools 5 | import importlib 6 | 7 | Default_Header = { 8 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 9 | 'Accept-Encoding': 'gzip, deflate, sdch', 10 | 'Accept-Language': 'zh-CN,zh;q=0.8', 11 | 'Cache-Control': 'max-age=0', 12 | 'Connection': 'keep-alive', 13 | 'Host': 'weibo.cn', 14 | 'Upgrade-Insecure-Requests': '1', 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 ' 16 | 'Safari/537.36' 17 | } 18 | 19 | 20 | def check_cache(attr): 21 | def real(func): 22 | @functools.wraps(func) 23 | def wrapper(self): 24 | value = getattr(self, attr, None) 25 | if not value: 26 | value = func(self) 27 | setattr(self, attr, value) 28 | return value 29 | return wrapper 30 | return real 31 | 32 | 33 | def normal_attr(name_in_json=None): 34 | """ 35 | 36 | 本装饰器的作用为: 37 | 38 | 1. 标识这个属性为常规属性。 39 | 2. 自动从当前对象的数据中取出对应属性。 40 | 优先返回缓存中的数据。 41 | :param name_in_json: 要查找的属性在`self._cache`这个json中的名字 42 | 默认值为使用此装饰器的方法名。 43 | """ 44 | def actual_decorator(func): 45 | @functools.wraps(func) 46 | def inner(self, *args, **kwargs): 47 | name = name_in_json or func.__name__ 48 | if self._cache and name in self._cache.keys(): 49 | return self._cache[name] 50 | else: 51 | value = func(self, *args, **kwargs) 52 | self._cache.setdefault(name, value) 53 | return self._cache[name] 54 | return inner 55 | return actual_decorator 56 | 57 | 58 | def other_obj(class_name=None, name_in_json=None, module_filename=None): 59 | """ 60 | 61 | 本装饰器的作用为: 62 | 63 | 1. 标识这个属性为另一个父类为base类的对象。 64 | 2. 
自动从当前对象的数据中取出对应属性,构建成所需要的对象。 65 | :param class_name: 要生成的对象类名。 66 | :param name_in_json: 属性在 JSON 里的键名。 67 | :param module_filename: 所在的模块的文件名。 68 | """ 69 | def actual_decorator(func): 70 | @functools.wraps(func) 71 | def inner(self, *args, **kwargs): 72 | cls_name = class_name or func.__name__ 73 | name = name_in_json or func.__name__ 74 | 75 | obj_cls = get_class_from_name(cls_name, module_filename) 76 | 77 | request_obj = func(self, *args, **kwargs) 78 | # print(111111111111) 79 | # print(request_obj) 80 | 81 | if request_obj is None: 82 | if name == 'people': 83 | return obj_cls(self.author_uid, cache={'name': self.author_name}) 84 | # if name == 'weibo': 85 | # return obj_cls(uid=self.now_weibo_uid, cache=self.now_weibo_cache) 86 | return request_obj 87 | 88 | return inner 89 | 90 | return actual_decorator 91 | 92 | 93 | def get_class_from_name(clsname=None, module_filename=None): 94 | """ 95 | 96 | 接收类名,通过处理返回对应的类 97 | 98 | :param clsname: 类名 99 | :param module_filename: 模块名 100 | :return: 模块中对应传入类名的类 101 | """ 102 | cls_name = clsname.capitalize() if clsname.islower() else clsname 103 | file_name = module_filename or cls_name.lower() 104 | 105 | # 获取引用的模块 如 `` 106 | imported_module = importlib.import_module('.'+file_name, 'weibospider') 107 | # print(imported_module) 108 | # print(getattr(imported_module, cls_name)) 109 | # 返回模块中对应传入类名的类 如 `` 110 | return getattr(imported_module, cls_name) 111 | # except (ImportError, AttributeError): 112 | # raise TypeError( 113 | # 'Unknown weibo obj type [{}]'.format(clsname) 114 | # ) 115 | -------------------------------------------------------------------------------- /weibospider/weibo.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import unicode_literals, print_function 3 | import time as tt 4 | import bs4 5 | from bs4 import BeautifulSoup 6 | import re 7 | import requests 8 | from .utils import * 9 | from base import SinaBaseObject 10 | from comment import Comment 11 | import sys 12 | import people 13 | reload(sys) 14 | sys.setdefaultencoding('utf-8') 15 | 16 | pattern = re.compile(r'\d+') 17 | 18 | 19 | class Weibo(SinaBaseObject): 20 | """ 21 | 新浪微博的微博类 22 | { 23 | uid: F0Mg7a8Wh, 24 | author_uid: rmrb, 25 | is_repost: False, 26 | href: http://weibo.cn/comment/F0Mg7a8Wh, 27 | text: :【中国大学教学质量排行榜800强出炉!你的母校排多少?】近日,《2017中国大学评价研究报告》发布2017中国大学 28 | 教学质量排行榜。清华大学本科生和研究生教育教学和人才培养质量问鼎榜首,北大第2,复旦第3,南大第4,武大第5, 29 | 上海交大第6,浙大第7,人大第8,吉大第9,川大第10。戳↓你的学校第几名? ​​​ [组图共9张] 30 | time: 04月29日 12:58, 31 | time_delay: 1, 32 | author_name: 人民日报, 33 | repost_count: 3910, 34 | attitude_count: 3076, 35 | comment_count: 3248, 36 | repost_list: 37 | [ 38 | { 39 | u'text': ':', 40 | u'name': u'\u56db\u5ddd\u5927\u5b66' 41 | }, 42 | { 43 | u'text': ':27[\xe6\x91\x8a\xe6\x89\x8b][\xe5\xbf\x83] //', 44 | u'name': u'\u674e\u5199\u610f'}, 45 | { 46 | u'text': ':\xe5\xa4\xaa\xe6\x83\xa8\xe4\xba\x86\xef\xbc\x8c\xe5\x89\x8d56\ 47 | xe4\xb8\xaa\xe9\x83\xbd\xe6\x98\xaf\xe4\xb8\x96\xe7\x95\x8c\xe7\x9f\xa5\xe5\x90\x8d... //', 48 | u'name': u'\u897f\u8d22\u975e\u5b98\u65b9\u65b0\u95fb\u4e2d\u5fc3' 49 | }, 50 | .... 
51 | ] 52 | comment_list: 53 | [ 54 | { 55 | uid: C_4101856898497093, 56 | terminal_source: iPhone 6s, 57 | text: \u4eba\u6c11\u65e5\u62a5\u4e5f\u53d1\u8fd9\u79cd\u5546\u4e1a\u6027\u8d28\u7684\u5927\u5b66 58 | \u6392\u884c\u699c\u3002\u3002\u3002[\u62dc\u62dc][\u62dc\u62dc][\u62dc\u62dc]', 59 | time: 04\u670829\u65e5 13:05\xa0, 60 | attitude_count: 270, 61 | is_hot: True, 62 | name: M-never 63 | }, 64 | .... 65 | ] 66 | hot_comment_list: 67 | [ 68 | { 69 | uid: C_4101856898497093, 70 | terminal_source: iPhone 6s, 71 | text: \u4eba\u6c11\u65e5\u62a5\u4e5f\u53d1\u8fd9\u79cd\u5546\u4e1a\u6027\u8d28\u7684\u5927\u5b66 72 | \u6392\u884c\u699c\u3002\u3002\u3002[\u62dc\u62dc][\u62dc\u62dc][\u62dc\u62dc]', 73 | time: 04\u670829\u65e5 13:05\xa0, 74 | attitude_count: 270, 75 | is_hot: True, 76 | name: M-never 77 | }, 78 | .... 79 | ] 80 | attitude_list: 81 | [ 82 | { 83 | name: \u723d\u5cf0\u4e2b\u4e2b, 84 | time: 13\u5206\u949f\u524d 85 | }, 86 | { 87 | name: \u8393\u5c7f, 88 | time: \u4eca\u5929 19:55 89 | }, 90 | .... 91 | ] 92 | 93 | """ 94 | def __init__(self, id, cache={}): 95 | super(Weibo, self).__init__() 96 | self.uid = id 97 | self._cache = cache 98 | # self._session = session 99 | self.href = 'http://weibo.cn/comment/'+str(id) 100 | self.main_page_resource = '' 101 | self._get_author_data() 102 | # 该微博是否为转发 103 | self.is_repost = False 104 | # 该微博转发的微博的信息 105 | self.repost_location = '' 106 | self.repost_author_uid = '' 107 | self.repost_text = '' 108 | self.repost_reposted_count = 0 109 | self.terminal_source = '' 110 | self.location = '' 111 | 112 | # 威胁程度 113 | self.threatened = 0 114 | 115 | @property 116 | @normal_attr() 117 | def html(self): 118 | return self._session.get('http://weibo.cn/repost/' + self.uid).content 119 | 120 | @property 121 | @normal_attr() 122 | def _soup(self): 123 | return BeautifulSoup(self.html, "lxml") 124 | 125 | @property 126 | @other_obj(name_in_json='people', class_name='people') 127 | def author(self): 128 | return None 129 | 130 | @property 131 | @normal_attr() 132 | def time(self): 133 | return self._soup.find(attrs={'id': 'M_'}).findAll('div')[1].span.get_text() 134 | 135 | @property 136 | @normal_attr() 137 | def text(self): 138 | """ 139 | 微博文本 140 | """ 141 | if not self._soup.find(attrs={'id': 'M_'}): 142 | raise AttributeError("cookies失效或网络故障!") 143 | return self._soup.find(attrs={'id': 'M_'}).div.span.get_text() 144 | 145 | @property 146 | @normal_attr() 147 | def repost_count(self): 148 | """ 149 | :return:int 转发数 150 | """ 151 | # wap版的微博页面,此页面内容格式特别不规范 152 | repost_number_node = self._soup.find(attrs={'id': 'rt'}) 153 | try: 154 | repost_count = int(re.findall(pattern, repost_number_node.get_text())[0]) 155 | except IndexError: 156 | print("获取转发数出错") 157 | repost_count = 0 158 | return repost_count 159 | 160 | @property 161 | @normal_attr() 162 | def comment_count(self): 163 | """ 164 | :return:int 评论数 165 | """ 166 | # wap版的微博页面,此页面内容格式特别不规范 167 | try: 168 | comment_number_node = self._soup.find(attrs={'id': 'rt'}).next_sibling 169 | comment_count = int(re.findall(pattern, comment_number_node.get_text())[0]) 170 | except IndexError: 171 | print("获取评论数出错") 172 | comment_count = 0 173 | return comment_count 174 | 175 | @property 176 | @normal_attr() 177 | def attitude_count(self): 178 | # wap版的微博页面,此页面内容格式特别不规范 179 | try: 180 | attitude_number_node = self._soup.find(attrs={'id': 'rt'}).next_sibling.next_sibling 181 | attitude_count = int(re.findall(pattern, attitude_number_node.get_text())[0]) 182 | except IndexError: 183 | print("获取点赞数出错") 
184 | attitude_count = 0 185 | return attitude_count 186 | 187 | # 获取微博作者的昵称和uid 188 | def _get_author_data(self): 189 | self.author_name = self._soup.find(attrs={'id': 'M_'}).div.a.get_text() 190 | self._cache.setdefault('author_name', self.author_name) 191 | self.author_uid = self._soup.find(attrs={'id': 'M_'}).div.a.attrs['href'].split('/')[-1] 192 | self._cache.setdefault('author_uid', self.author_uid) 193 | 194 | def _get_attribute_item(self, target_attribute_type, target_attribute_function): 195 | """ 196 | 197 | :param target_attribute_type: 'comment', 'attitude' or 'repost', the weibo.cn path segment to crawl 198 | :param target_attribute_function: parser called on each unit node; returns an item or False 199 | :return: generator yielding the parsed Comment / Attitude / Repost items 200 | 201 | """ 202 | attribute_url = 'http://weibo.cn/' + str(target_attribute_type) + '/' + str(self.uid) 203 | attribute_list = [] 204 | page_count = 1 205 | now_page_count = 1 206 | is_first = True 207 | is_first_item = True 208 | pattern = re.compile(r'\d+') 209 | while True: 210 | # print("现在是评论第一页") 211 | tt.sleep(self._time_delay) 212 | # fetch the page source (as a bs4 object) 213 | requests_content = BeautifulSoup(self._session.get(attribute_url).content, "lxml") 214 | 215 | # collect the unit nodes (div.c) on the current page 216 | unit_list = requests_content.find_all('div', attrs={'class': 'c'}) 217 | for i in unit_list: 218 | # hand the unit to the concrete parser 219 | attribute = target_attribute_function(i) 220 | if attribute is False: 221 | continue 222 | # attitude/repost pages list the weibo author first, so skip that first unit 223 | if (target_attribute_type == 'attitude' or target_attribute_type == 'repost') and is_first_item: 224 | is_first_item = False 225 | continue 226 | yield attribute 227 | 228 | # on the first page, read the total page count 229 | if is_first: 230 | # an 'x/y页' pager means there is more than one page 231 | if requests_content.find(attrs={'id': 'pagelist'}): 232 | page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip() 233 | page_count = page_count.split('/')[1] 234 | page_count = int(re.findall(pattern, page_count)[0]) 235 | # print(page_count) 236 | else: 237 | return 238 | is_first = False 239 | 240 | now_page_count += 1 241 | if now_page_count > page_count: 242 | return 243 | 244 | attribute_url = 'http://weibo.cn/' + str(target_attribute_type) +'/' + str(self.uid) +'?&&page=' + \ 245 | str(now_page_count) 246 | 247 | @staticmethod 248 | def _get_comment_list(unit): 249 | comment = {} 250 | # 若有id属性且id值以C开头,则证明是评论 251 | try: 252 | if str(unit.attrs['id']).startswith('C'): 253 | comment['uid'] = str(unit.attrs['id']) 254 | else: 255 | return False 256 | except: 257 | return False 258 | comment['author_name'] = unit.a.get_text() 259 | comment['author_uid'] = str(str(unit.a.attrs['href']).split('/')[-1]) 260 | # 有的用户是个性域名,不符合/u/‘uid’的特点,故同时存href 261 | # comment['people'] = sina_people.SinaPeople(uid=str(unit.a.attrs['href']).split('/')[-1], 262 | # href='http://http://weibo.cn'+str(unit.a.attrs['href'])) 263 | # 检查是否有“热门”标签 264 | try: 265 | if str(unit.span.attrs['class']) == "['kt']": 266 | comment['is_hot'] = True 267 | else: 268 | comment['is_hot'] = False 269 | except: 270 | comment['is_hot'] = False 271 | 272 | # 正则匹配获取评论的赞数 273 | # 正常情况为`举报 赞[0] 回复 ` 274 | # 如果自己赞了本条评论 则此页面会变为`举报 已赞[1] 取消赞 回复 ` 275 | # 如果是自己的评论 则此页面会变为` 举报 赞[0] 回复 删除 ` 276 | # 故需要特殊处理 277 | try: 278 | comment['attitude_count'] = int(re.findall(pattern, unit.find_all('span', attrs={'class': 'cc'})[-2] 279 | .get_text())[0]) 280 | except IndexError: 281 | try: 282 | comment['attitude_count'] = int(re.findall(pattern, unit.find_all('span', attrs={'class': 'cmt'})[0] 283 | .get_text())[0]) 284 | except IndexError: 285 | comment['attitude_count'] = int( 286 | re.findall(pattern, unit.find_all('span', attrs={'class': 
'cc'})[-3].get_text())[0]) 287 | 288 | # 获取评论的正文 289 | comment['text'] = unit.find_all('span', attrs={'class': 'ctt'})[0].get_text() 290 | # 获取评论的时间 291 | comment['time'] = unit.find_all('span', attrs={'class': 'ct'})[-1].get_text().split('来自')[0] 292 | # 获取评论的终端来源 293 | comment['terminal_source'] = unit.find_all('span', attrs={'class': 'ct'})[-1].get_text().split('来自')[1] 294 | 295 | return Comment(id=str(comment['uid']), cache=comment) 296 | 297 | @property 298 | @other_obj() 299 | def comment(self): 300 | """ 301 | :param required_comment_count: 指定获取的条数 302 | :param time_delay: 时间延迟 303 | :return: 该微博的评论列表 304 | .. code-block:: python 305 | [ 306 | { 307 | 'uid': 'C_4100160336496887', 308 | 'is_hot': False, 309 | 'name' : '-猫猫站不稳-', 310 | 'people': <__main__.SinaPeople object at 0x0000000003498BE0>, 311 | 'time': 今天 20:44, 312 | 'terminal_source': 'iPhone 6' 313 | 'text': '稀罕你!' 314 | 'attitude_count': 0 315 | }, 316 | ] 317 | """ 318 | for x in self._get_attribute_item('comment', self._get_comment_list): 319 | yield x 320 | 321 | @staticmethod 322 | def _get_attitude_list(unit): 323 | from attitude import Attitude 324 | attitude = {} 325 | # 若有a标签则为点赞的unit 326 | try: 327 | attitude['author_name'] = unit.a.get_text() 328 | attitude['time'] = unit.span.get_text() 329 | # attitude['people'] = SinaPeople(uid=str(unit.a.attrs['href']).split('/')[-1], 330 | # href='http://weibo.cn' + str(unit.a.attrs['href'])) 331 | except AttributeError: 332 | return False 333 | return Attitude(id=0, cache=attitude) 334 | 335 | @property 336 | @other_obj() 337 | def attitude(self): 338 | for x in self._get_attribute_item('attitude', self._get_attitude_list): 339 | yield x 340 | 341 | @staticmethod 342 | def _get_repost_list(unit): 343 | from repost import Repost 344 | repost = {} 345 | try: 346 | repost['author_name'] = unit.a.get_text() 347 | tmp_slibing = unit.a.next_sibling 348 | while not isinstance(tmp_slibing, bs4.element.NavigableString): 349 | tmp_slibing = tmp_slibing.next_sibling 350 | repost['text'] = str(tmp_slibing) 351 | # repost['people'] = SinaPeople(uid=unit.a.attrs['href'].split('/')[-1], 352 | # href='http://weibo.cn/'+unit.a.attrs['href']) 353 | except AttributeError: 354 | return False 355 | return Repost(id=0, cache=repost) 356 | 357 | @property 358 | @other_obj() 359 | # TODO:获取转发的时间、终端等信息 360 | def repost(self): 361 | for x in self._get_attribute_item('repost', self._get_repost_list): 362 | yield x 363 | 364 | # def get_text(self): 365 | # """ 366 | # 获取微博内容 367 | # :return: str类型的微博文本内容 368 | # """ 369 | # # if self.text != '': 370 | # # return self.text 371 | # if 1: 372 | # _retry_count = 3 373 | # while _retry_count > 0: 374 | # requests_content = self._soup 375 | # self.main_page_resource = requests_content 376 | # print(requests_content) 377 | # print("测试session的get方法") 378 | # try: 379 | # self.text = requests_content.find(attrs={'id': 'M_'}).div.span.get_text() 380 | # self.__get_author_data__() 381 | # _retry_count -= 1 382 | # break 383 | # except AttributeError: 384 | # _retry_count -= 1 385 | # 386 | # # 微博属性(转发数、赞数、评论数) 387 | # # wap版的此内容格式特别不规范 388 | # repost_number_node = requests_content.find(attrs={'id': 'rt'}) 389 | # try: 390 | # self.repost_count = int(re.findall(pattern, repost_number_node.get_text())[0]) 391 | # except IndexError: 392 | # self.repost_count = 0 393 | # try: 394 | # comment_number_node = repost_number_node.next_sibling 395 | # self.comment_count = int(re.findall(pattern, comment_number_node.get_text())[0]) 396 | # except IndexError: 397 | # 
self.comment_count = 0 398 | # try: 399 | # attitude_number_node = comment_number_node.next_sibling 400 | # self.attitude_count = int(re.findall(pattern, attitude_number_node.get_text())[0]) 401 | # except IndexError: 402 | # self.attitude_count = 0 403 | # 404 | # # 微博发表时间 405 | # #self.time = requests_content.find(attrs={'id': 'M_'}).findAll('div')[1].span.get_text() 406 | # return self.text 407 | 408 | 409 | 410 | # def __get_attribute_list__(self, target_attribute_type, target_attribute_fuction, required_attribute_count=8): 411 | # """ 412 | # 413 | # :param target_attribute_type: 414 | # :param target_attribute_fuction: 415 | # :param required_attribute_count: 416 | # :return: 417 | # """ 418 | # attribute_url = 'http://weibo.cn/' + str(target_attribute_type) + '/' + str(self.uid) 419 | # attribute_list = [] 420 | # attribute_count = 0 421 | # page_count = 1 422 | # now_page_count = 1 423 | # is_first = True 424 | # pattern = re.compile(r'\d+') 425 | # while True: 426 | # print("现在是评论第一页") 427 | # tt.sleep(self.time_delay) 428 | # # 获取页面源码(bs4对象) 429 | # requests_content = self.retry_requests(attribute_url, uid=self.uid) 430 | # 431 | # # 获取当前页的关注列表 432 | # unit_list = requests_content.find_all('div', attrs={'class': 'c'}) 433 | # for i in unit_list: 434 | # # 调用具体函数提取内容 435 | # attribute = target_attribute_fuction(i) 436 | # if attribute is False: 437 | # continue 438 | # # 计数器加一 439 | # attribute_count += 1 440 | # # 若超过了要求获取的属性数量,则返回 441 | # if attribute_count > required_attribute_count: 442 | # return attribute_list 443 | # attribute_list.append(attribute) 444 | # 445 | # # 若是第一页,则获取总页数 446 | # if is_first: 447 | # # 若发现‘x/y页’ 则有不止一页 448 | # if requests_content.find(attrs={'id': 'pagelist'}): 449 | # page_count = requests_content.find(attrs={'id': 'pagelist'}).form.div.contents[-1].strip() 450 | # page_count = page_count.split('/')[1] 451 | # page_count = int(re.findall(pattern, page_count)[0]) 452 | # print(page_count) 453 | # else: 454 | # return attribute_list 455 | # is_first = False 456 | # 457 | # now_page_count += 1 458 | # if now_page_count >= page_count: 459 | # break 460 | # 461 | # attribute_url = 'http://weibo.cn/' + str(target_attribute_type) +'/' + str(self.uid) +'?&&page=' + \ 462 | # str(now_page_count) 463 | # 464 | # return attribute_list 465 | 466 | if __name__ == '__main__': 467 | def a(): 468 | return 1 469 | print(type(a)) --------------------------------------------------------------------------------
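
A note on the lazy-loading design used above: `normal_attr` in weibospider/utils.py makes each decorated property look in `self._cache` first (that dict is pre-filled when an object is built from a list page) and only call the real fetch-and-parse method when the value is missing, caching the result afterwards. The following self-contained sketch reproduces that pattern outside the project; the names `cached_attr` and `DemoPost` are illustrative, not part of the codebase.

import functools

def cached_attr(name_in_json=None):
    """Illustrative stand-in for utils.normal_attr."""
    def decorator(func):
        @functools.wraps(func)
        def inner(self, *args, **kwargs):
            name = name_in_json or func.__name__
            if self._cache and name in self._cache:
                return self._cache[name]            # prefer the cached value
            value = func(self, *args, **kwargs)     # otherwise compute it once
            self._cache.setdefault(name, value)
            return self._cache[name]
        return inner
    return decorator

class DemoPost(object):
    def __init__(self, cache=None):
        self._cache = cache or {}

    @property
    @cached_attr()
    def repost_count(self):
        return 42    # stands in for a network request plus HTML parsing

print(DemoPost(cache={'repost_count': 7}).repost_count)    # 7, served from the cache
print(DemoPost().repost_count)                             # 42, computed and then cached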
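
The page-walking loops in people.py and weibo.py all follow the same recipe: fetch page 1, look for the element with id="pagelist", read the total page count out of its 'x/y页' label with a \d+ regex, then request ?page=N for the remaining pages. Below is a small sketch of just the page-count step, run against a hard-coded HTML fragment; the markup and the numbers in it are illustrative, not captured from weibo.cn.

import re
from bs4 import BeautifulSoup

PATTERN = re.compile(r'\d+')

def total_pages(soup):
    """Return the page count advertised by a weibo.cn-style pager, or 1 if absent."""
    pagelist = soup.find(attrs={'id': 'pagelist'})
    if not pagelist:
        return 1                                       # no pager means a single page
    label = pagelist.form.div.contents[-1].strip()     # e.g. '1/327页'
    return int(PATTERN.findall(label.split('/')[1])[0])

html = '<div id="pagelist"><form><div><a href="?page=2">下页</a> 1/327页</div></form></div>'
print(total_pages(BeautifulSoup(html, 'lxml')))        # 327

# the crawl loop then appends the page number, roughly:
# for n in range(2, total_pages(first_page_soup) + 1):
#     url = 'http://weibo.cn/comment/{0}?page={1}'.format(weibo_id, n)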
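
Putting the pieces together, a crawl presumably starts from a status id and walks the lazy generators exposed by Weibo. The sketch below assumes that the client/base modules (not shown here) have already attached an authenticated weibo.cn session with valid cookies, and that Comment exposes its cached fields the same way Repost does; treat it as an outline rather than a tested entry point.

# Assumes: a logged-in session wired up by weibospider/client.py and base.py,
# and Comment/Repost items exposing the fields cached by the parsers above.
from __future__ import print_function
from weibospider.weibo import Weibo

post = Weibo('F0Mg7a8Wh')          # the status id used in the class docstring example
print(post.author_name, post.text)
print(post.repost_count, post.comment_count, post.attitude_count)

for c in post.comment:             # generator: crawls comment pages lazily
    print(c.author_name, c.text)   # assumed attributes, mirroring Repost
    break                          # stop after the first comment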