├── a.txt
├── bug.txt
├── data.csv
├── get_weibo.py
├── get_weibo.pyc
├── network_graph.png
├── network_graph.py
├── post_encode.py
├── post_encode.pyc
├── readme.md
├── test.txt
├── time_graph.png
├── time_graph.py
├── weibo_login.py
├── weibo_login.pyc
└── weibo_main.py
/a.txt:
--------------------------------------------------------------------------------
1 | heheeh
--------------------------------------------------------------------------------
/bug.txt:
--------------------------------------------------------------------------------
1 | dddddddd
--------------------------------------------------------------------------------
/get_weibo.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import urllib
3 | import urllib2
4 | import re
5 | import os
6 | import time
7 | import random
8 | import json
9 | # Use BeautifulSoup to parse the HTML pages
10 | from bs4 import BeautifulSoup
11 | import sys
12 | reload(sys)
13 | sys.setdefaultencoding('utf8')
14 | # This function fetches the repost data of a single result page
15 | def get_forward(html, origin_html):
16 |     # Parse the JSON returned by the repost API
17 |     get_json = json.load(html)
18 |     # Total number of reposts reported in the JSON
19 |     total_forward = get_json['data']['count']
20 |     # Total number of result pages reported in the JSON
21 |     total_page = get_json['data']['page']['totalpage']
22 |     # Current page number reported in the JSON
23 |     current_page = get_json['data']['page']['pagenum']
24 |     # HTML fragment embedded in the JSON
25 |     forward_html = get_json['data']['html']
26 |     soup = BeautifulSoup(forward_html, "lxml")
27 |     # Every repost entry in the fragment
28 |     div_forward = soup.find_all(attrs={'action-type' : 'feed_list_item'})
29 |     # uids of first-degree reposts
30 |     uid = []
31 |     # Author of the original weibo
32 |     origin_uid_temp = get_origin_weibo(origin_html)
33 |     # Timestamps of first-degree reposts
34 |     time = []
35 |     # Source uids of second-degree (chained) reposts
36 |     origin_uid2 = []
37 |     uid2 = []
38 |     # Timestamps of second-degree (chained) reposts
39 |     time2 = []
40 |     for i in range(len(div_forward)):
41 |         a = str(div_forward[i].find(attrs={'node-type' : 'text'}))
42 |         #b = a.find(re.compile("//"))
43 |         p = re.compile('//(.*):')
44 |         # A '//...:' chain means the repost passed through several users
45 |         if p.search(a):
46 |             temp_uid2 = []
47 |             # The final reposting user at the end of the chain
48 |             end_uid = "name=" + div_forward[i].find(attrs={'node-type' : 'name'}).get_text()
49 |             soup = BeautifulSoup(p.search(a).group(1), "lxml")
50 |             temp = soup.find_all('a')
51 |             temp_time = div_forward[i].find(attrs={'node-type' : 'feed_list_item_date'}).get('title')
52 |             j = len(temp)-1
53 |             while j >= 0:
54 |                 if temp[j].get('usercard'):
55 |                     temp_uid2.append(temp[j].get('usercard').encode('utf-8'))
56 |                 j -= 1
57 |
58 |             temp_uid2.append(end_uid)
59 |             if temp_uid2[0] != origin_uid_temp:
60 |                 temp_uid2.insert(0, origin_uid_temp)
61 |             for i in range(0, len(temp_uid2)-1):
62 |                 origin_uid2.append(temp_uid2[i])
63 |                 time2.append(temp_time)
64 |             for i in range(1, len(temp_uid2)):
65 |                 uid2.append(temp_uid2[i])
66 |             continue
67 |         uid.append("name=" + div_forward[i].find(attrs={'node-type' : 'name'}).get_text())
68 |         time.append(div_forward[i].find(attrs={'node-type' : 'feed_list_item_date'}).get('title'))
69 |     return (uid, origin_uid2, uid2, time, time2, total_forward, total_page, current_page)
70 | # This function fetches the uid of the original weibo's author
71 | def get_origin_weibo(original_html):
72 |     # Remove the escaping backslashes from the fetched HTML source
73 |     decoded_html = original_html.encode("utf-8").replace("\\", "")
74 |     soup = BeautifulSoup(decoded_html, "lxml")
75 |     div_origin = soup.find(attrs={'name' : 'keywords'})
76 |     origin_uid = div_origin.get('content')
77 |     origin_uid = "name=" + origin_uid[0:-14]
78 |     return origin_uid
79 | ##########################################
80 | # The functions below are unrelated to the analysis; they fetch the actual weibo content.
81 | def decode_html(original_html):
82 |     # Split the fetched HTML source into lines, because Sina Weibo compresses the page source
83 |     lines = original_html.splitlines()
84 |     for line in lines:
85 | #以