├── a.txt
├── bug.txt
├── data.csv
├── get_weibo.py
├── get_weibo.pyc
├── network_graph.png
├── network_graph.py
├── post_encode.py
├── post_encode.pyc
├── readme.md
├── test.txt
├── time_graph.png
├── time_graph.py
├── weibo_login.py
├── weibo_login.pyc
└── weibo_main.py

/a.txt:
--------------------------------------------------------------------------------
1 | heheeh
--------------------------------------------------------------------------------

/bug.txt:
--------------------------------------------------------------------------------
1 | dddddddd
--------------------------------------------------------------------------------

/get_weibo.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import urllib
 3 | import urllib2
 4 | import re
 5 | import os
 6 | import time
 7 | import random
 8 | import json
 9 | # Use BeautifulSoup to parse the HTML pages
10 | from bs4 import BeautifulSoup
11 | import sys
12 | reload(sys)
13 | sys.setdefaultencoding('utf8')
14 | # This function fetches the repost data of a single page
15 | def get_forward(html, origin_html):
16 |     # Parse the returned JSON
17 |     get_json = json.load(html)
18 |     # Total number of reposts in the returned JSON
19 |     total_forward = get_json['data']['count']
20 |     # Total number of repost pages in the returned JSON
21 |     total_page = get_json['data']['page']['totalpage']
22 |     # Current page number in the returned JSON
23 |     current_page = get_json['data']['page']['pagenum']
24 |     # HTML fragment embedded in the returned JSON
25 |     forward_html = get_json['data']['html']
26 |     soup = BeautifulSoup(forward_html, "lxml")
27 |     # All repost items in the page
28 |     div_forward = soup.find_all(attrs={'action-type' : 'feed_list_item'})
29 |     # uids of first-degree reposters
30 |     uid = []
31 |     # Author of the original weibo
32 |     origin_uid_temp = get_origin_weibo(origin_html)
33 |     # Timestamps of first-degree reposts
34 |     time = []
35 |     # Source uids of second-degree reposts
36 |     origin_uid2 = []
37 |     uid2 = []
38 |     # Timestamps of second-degree reposts
39 |     time2 = []
40 |     for i in range(len(div_forward)):
41 |         a = str(div_forward[i].find(attrs={'node-type' : 'text'}))
42 |         #b = a.find(re.compile("//"))
43 |         p = re.compile('//(<a.*</a>):')
44 |         # If the repost text contains a forwarding chain (forwarded more than once)
45 |         if(p.search(a)):
46 |             temp_uid2 = []
47 |             # the last user in the forwarding chain
48 |             end_uid = "name=" + div_forward[i].find(attrs={'node-type' : 'name'}).get_text()
49 |             soup = BeautifulSoup(p.search(a).group(1), "lxml")
50 |             temp = soup.find_all('a')
51 |             temp_time = div_forward[i].find(attrs={'node-type' : 'feed_list_item_date'}).get('title')
52 |             j = len(temp)-1
53 |             while (j >= 0):
54 |                 if(temp[j].get('usercard')):
55 |                     temp_uid2.append(temp[j].get('usercard').encode('utf-8'))
56 |                 j -= 1
57 | 
58 |             temp_uid2.append(end_uid)
59 |             if (temp_uid2[0] != origin_uid_temp):
60 |                 temp_uid2.insert(0, origin_uid_temp)
61 |             for i in range(0, len(temp_uid2)-1):
62 |                 origin_uid2.append(temp_uid2[i])
63 |                 time2.append(temp_time)
64 |             for i in range(1, len(temp_uid2)):
65 |                 uid2.append(temp_uid2[i])
66 |             continue
67 |         uid.append("name=" + div_forward[i].find(attrs={'node-type' : 'name'}).get_text())
68 |         time.append(div_forward[i].find(attrs={'node-type' : 'feed_list_item_date'}).get('title'))
69 |     return (uid, origin_uid2, uid2, time, time2, total_forward, total_page, current_page)
70 | # This function fetches the uid of the original weibo's author
71 | def get_origin_weibo(original_html):
72 |     # Strip the escaping backslashes from the fetched HTML source (Sina Weibo escapes the embedded page)
73 |     decoded_html = original_html.encode("utf-8").replace("\\", "")
74 |     soup = BeautifulSoup(decoded_html, "lxml")
75 |     div_origin = soup.find(attrs={'name' : 'keywords'})
76 |     origin_uid = div_origin.get('content')
77 |     origin_uid = "name=" + origin_uid[0:-14]
78 |     return origin_uid
79 | ##########################################
80 | # The functions below are unrelated to the analysis; they fetch the actual content of the weibo posts.
81 | def decode_html(original_html):
82 |     # Split the fetched HTML source into lines, because Sina Weibo compresses the page
83 |     lines = original_html.splitlines()
84 |     for line in lines:
85 |         #以