├── FinalReport.pdf
├── README.md
├── content.py
├── cookies.txt
├── forward.py
├── uids.txt
└── upload.py
/FinalReport.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SQRPI/weibo-spider/d8c8a527fb0c0995419f310845db9eb0795ad8c8/FinalReport.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# weibo-spider
Final project for a Social Network Mining course: a crawler for the follow and repost relationships of big-data-related Weibo users.


## Implemented features
* Search users by a given profile tag
* Conversion between uid and screen name
* Batch-crawl the profile tags of specified users
* Batch-crawl user follow lists
* Extraction and analysis of Weibo post content
* An anti-anti-crawler mechanism that sustains about 5000 pages per hour

## Usage

* Enter your cookie(s) in cookies.txt (several may be given)
* Enter the uids of the users to crawl in uids.txt
```python
python content.py
```
* For other features, adapt the code in upload.py yourself, or wait until the author finds time to clean it up

## Parameters

* --p  number of pages to crawl per user, default 20
* --m  file write mode, default a (append); for example
```Python
python content.py --m w
```
overwrites the output instead. The default is recommended for large-scale crawls, since it allows an interrupted crawl to resume from its checkpoint (see the example below)
* --u  path of the uid file, default uids.txt
* --c  path of the cookie file, default cookies.txt
* --f  output path, default result.txt
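
For example, to crawl 50 pages per user from a custom uid file into a custom output file (the file names here are only illustrative):
```Python
python content.py --p 50 --u myuids.txt --f bigdata_posts.txt
```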
--------------------------------------------------------------------------------
/content.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 20 14:43:21 2018

@author: SQRPI/Ning Shangyi
"""


import sys
from bs4 import BeautifulSoup
import requests
import time
import argparse

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Cookie': '',
    'Host': 'weibo.cn',
    'Referer': 'https://weibo.cn/2113734951/profile',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'
}


def readCookies(path):
    """Read one cookie per line from path; '#' lines are comments."""
    global cookieNum, cookieCount
    cookieCount = 0
    cookieNum = 0
    cookie = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:        # skip blank lines (checked first to avoid an IndexError)
                continue
            if line[0] == '#':  # skip comment lines
                continue
            cookie.append(line)
            cookieCount += 1
    if cookieCount == 0:
        raise Exception('Input your cookies in cookies.txt!')
    return cookie


def weiboContent(uid, pages=1, uidnum=1):
    """Fetch the posts of one uid page by page, rotating cookies when an account is banned."""
    global cookieNum, cookieCount, uidCount
    startCookie = None
    flag = 0
    toReturn = []
    page = 0
    while page <= pages:
        try:
            headers['Cookie'] = cookie[cookieNum]
            url = 'https://weibo.cn/u/%s?page=%d' % (uid, page + 1)
            s = requests.Session()
            s.headers.update(headers)
            html = s.get(url).content

            soup = BeautifulSoup(html, 'lxml')
            content = soup.find_all('span', {'class': 'ctt'})
            if page == 0 and len(content) >= 3:
                # the first three 'ctt' spans on page 1 are profile fields, not posts
                content = content[3:]
            for i in range(len(content)):
                toReturn.append([uid, page * 10 + i + 1, content[i].getText()])
            if not html:
                # empty response: this cookie is probably banned, rotate to the next one
                if startCookie is None:
                    startCookie = cookieNum  # remember where the rotation started
                    cookieNum = (cookieNum + 1) % cookieCount
                    continue
                cookieNum = (cookieNum + 1) % cookieCount
                if cookieNum != startCookie:
                    continue
                # every cookie has been tried and banned: wait and retry
                sys.stdout.write('\rWarning: All Accounts Banned, trying to reconnect %d, uid %d/%d, page %d/%d' % (flag, uidnum, uidCount, page, args.p))
                time.sleep(20)
                if flag >= 50:
                    sys.stdout.write('\nError 254, uid =%s Page= %d, %d/1000, Connection Failed\n' % (uid, page, uidnum))
                    page += 1
                    flag = 0
                    continue
                flag += 1
                continue
            startCookie = None
            page += 1
            flag = 0
            sys.stdout.write('\rLoaded %3d-th uid, Page %d/%d...\t\t\t\t\t\t\t\t' % (uidnum, page, pages + 1))
        except Exception:
            # network error: back off, retry up to 50 times, then skip the page
            if flag:
                sys.stdout.write('\rWarning: uid =%s Page= %d, %d/1000, trying to reconnect %d\t\t\t\t\t' % (uid, page, uidnum, flag))
                time.sleep(20)
                if flag >= 50:
                    sys.stdout.write('\nError: uid =%s PageId= %d, %d/1000, Connection Failed\n' % (uid, page, uidnum))
                    page += 1
                    flag = 0
                    continue
            flag += 1
            continue
    return toReturn


def readUids(path):
    """Read one uid per line from path; '#' lines are comments."""
    global uidCount
    uidCount = 0
    uids = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:        # skip blank lines (checked first to avoid an IndexError)
                continue
            if line[0] == '#':  # skip comment lines
                continue
            uids.append(line)
            uidCount += 1
    if uidCount == 0:
        raise Exception('Input your uids in uids.txt!')
    return uids


def writeContent(uids, text, pages):
    """Write posts as 'uid<TAB>index<TAB>text' lines, resuming from the last line already in the output file."""
    global uidCount
    start = False
    try:
        # read the last line of an existing output file to find the checkpoint
        f = open(text, 'r')
        for line in f:
            pass
        contentNum = int(line.split('\t')[1])
        startUid = line.split('\t')[0]
        f.close()
    except Exception:
        # no usable output file yet: start from the first uid
        contentNum = 0
        startUid = uids[0]
    f = open(text, args.m)
    for i in range(len(uids)):
        if start is True:
            t = weiboContent(uids[i], pages, uidnum=i+1)
            for item in t:
                f.write('%s\t%d\t%s\n' % (item[0], item[1], item[2]))
        if uids[i] == startUid:
            start = True
            sys.stdout.write('Started! uid = %s, content %d/%d\n' % (startUid, i+1, uidCount))
            t = weiboContent(uids[i], pages, uidnum=i+1)
            for item in t:
                if item[1] >= contentNum:
                    f.write('%s\t%d\t%s\n' % (item[0], item[1], item[2]))
    f.close()
    return


parser = argparse.ArgumentParser()
parser.add_argument('--p', type=int, default=20)
parser.add_argument('--f', type=str, default='result.txt')
parser.add_argument('--c', type=str, default='cookies.txt')
parser.add_argument('--u', type=str, default='uids.txt')
parser.add_argument('--m', type=str, default='a')
args = parser.parse_args()
cookie = readCookies(args.c)
uids = readUids(args.u)
writeContent(uids, args.f, args.p - 1)
sys.stdout.write('\nFinished!\n')
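
# Illustrative sketch, not part of the original script: each line written by
# writeContent above is "uid<TAB>post-index<TAB>text", so a result file produced
# by this crawler can be read back as tuples (the default path is an assumption):
def readResults(path='result.txt'):
    rows = []
    with open(path, 'r') as f:
        for line in f:
            uid, idx, text = line.rstrip('\n').split('\t', 2)
            rows.append((uid, int(idx), text))
    return rows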
--------------------------------------------------------------------------------
/cookies.txt:
--------------------------------------------------------------------------------
# Enter your cookies here, one per line; the cookies may come from different accounts, and the more you provide the faster the crawl
# How to obtain one: press F12 in your browser, open weibo.cn, select the weibo.cn request in the Network tab, and copy its Cookie header
# The line below shows the format of a normal cookie (asterisks mask characters)
#SUB=_2A253E************lEW8yrFzj2IHXVU_bw8**************************6xoDeq2DGsXzFBmn6_s-gZUF13; SUHB=0M*******A7Zvj; SCF=AvkgK***************HPgeGVEJCKV2Fbu0ip_zeKE2NbeFI*****************vqmhStEIu7P--S7i***48.; _T_WM=425295*******************0c8d763
--------------------------------------------------------------------------------
/forward.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 25 12:25:42 2017

@author: Sqrpi
"""


import sys
import time

import requests
from bs4 import BeautifulSoup


def pp(text):
    print(text)

def pl(List):
    for item in List:
        pp(item)


def pltx(List):
    for item in List:
        pp(item.text)

cookie = {"Cookie": 'Your Cookie Here'}
'''
To obtain the cookie: Chrome F12 -> Network, log in to Weibo, open weibo.cn,
click the weibo.cn entry in the Name column, and copy the Cookie header.
Example format: _T_WM=0c6a2e5cd..... SSOLoginState=150090124
'''
pageId = 1
flag = 0
maxPage = 10  # maximum number of repost pages to crawl; adjust as needed
Dict = {}  # key: reposter, value: list of users reposted from
'''
Dict is the final result: A: [B, C] means A reposted from B and from C.
Note that the chain "A reposts B, B reposts C" is recorded only as A: [B].
'''
while pageId < maxPage:
    url = 'https://weibo.cn/repost/FdSmtlQKX?uid=2670306073&rl=1&page=%d' % (pageId)
    html = requests.get(url, cookies=cookie).content

    soup = BeautifulSoup(html, 'lxml')
    forwardList = soup.find_all('div', class_='c')[3:]
    if not html:
        # empty response: the account is probably banned, wait and retry
        sys.stdout.write('\rWarning 251: Account Banned, trying to reconnect %d,page %d\t\t\t' % (flag, pageId))
        time.sleep(20)
        if flag >= 50:
            sys.stdout.write('\nError 254, page %d, Connection Failed\n' % (pageId))
            pageId += 1  # skip the page after 50 failed retries
            flag = 0
            continue
        flag += 1
        continue
    for item in forwardList:
        m = item.find_all('a')
        if len(m) > 2:
            forer = m[0].text      # the user who reposts
            fored = m[1].text[1:]  # the user reposted from (strip the leading '@')
        elif len(m) == 2:
            forer = m[0].text
            fored = '更方更正的物理'  # no repost source shown: attribute to the original poster
        else:
            continue  # not a repost row
        if forer not in Dict:
            Dict[forer] = [fored]
        elif fored not in Dict[forer]:
            Dict[forer].append(fored)
    pageId += 1
    flag = 0
    sys.stdout.write('\rLoaded page %d\t\t\t\t\t' % (pageId))
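
# Illustrative sketch, not part of the original script: Dict maps each reposter
# to the users it reposted from, so it can be flattened into TAB-separated edges
# for network analysis (the output filename here is an assumption):
def writeEdges(d, path='forward_edges.txt'):
    with open(path, 'w', encoding='utf-8') as f:
        for forer, foreds in d.items():
            for fored in foreds:
                f.write('%s\t%s\n' % (forer, fored))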
--------------------------------------------------------------------------------
/uids.txt:
--------------------------------------------------------------------------------
# Enter the UIDs of the users to crawl, one per line
# To find a UID: open the user's profile on weibo.cn; the address shows as e.g. https://weibo.cn/2113734951/info, and the number is the uid
# An example follows
2113734951
--------------------------------------------------------------------------------
/upload.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 06 00:00:27 2017

@author: Ning Shangyi/SQRPI
"""

import re
import sys
import time
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
from lxml import etree


def pp(text):
    print(text)

cookie = {"Cookie": 'INPUT YOUR COOKIE HERE'}
#url = 'http://weibo.cn/u/%d?filter=1&page=1'%user_id

def getIdFromStag(stags, ids=None):
    """Search users by profile tag (stag) and count how often each profile link turns up."""
    if ids is None:  # avoid the mutable-default-argument pitfall of the original `ids={}`
        ids = {}
    for stag in stags:
        m = 0
        flag = 0
        while m < 51:
            pageId = m
            print('Loading PAGE', pageId, 'of Stag', unquote(stag))
            try:
                url = 'https://weibo.cn/search/user/?keyword=%s&sort=0&filter=stag&page=%d' % (stag, pageId)
                html = requests.get(url, cookies=cookie).content

                soup = BeautifulSoup(html, 'lxml')
                content = soup.find_all('a', href=re.compile(r'f=search', re.I))
                for i in range(len(content)):
                    item = content[i]
                    toAppend = item['href'].split('?')[0]
                    if toAppend in ids:
                        ids[toAppend] += 1
                    else:
                        ids[toAppend] = 1
                #time.sleep(1)
                if len(content) <= 0:
                    # empty result page: back off and retry, skip the page after 50 attempts
                    sys.stdout.write('\rWarning 047: len(content) = 0\t%d page, %s stag\t\t' % (m, unquote(stag)))
                    time.sleep(20)
                    if flag >= 50:
                        m += 1
                        flag = 0
                        continue
                    flag += 1
                    continue
                m += 1
                flag = 0
            except Exception:
                break  # the original retry code after this break was unreachable and has been dropped
    return ids


def getUidFromOid(oid):
    # print('Loading uid: oid=', oid)
    url = 'https://weibo.cn%s' % oid
    html = requests.get(url, cookies=cookie).content

    soup = BeautifulSoup(html, 'lxml')
    content = soup.find_all('a', href=re.compile(r'/operation', re.I))[0]
    uid = content['href'].split('/')[1]
    return uid
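
# Illustrative sketch, not part of the original script: the "stag" keywords fed
# to getIdFromStag are percent-encoded UTF-8 (see the tag list at the bottom of
# this file); urllib.parse shows the round trip for the tag '大数据' (big data):
def _stagRoundTripDemo():
    from urllib.parse import quote
    assert quote('大数据') == '%E5%A4%A7%E6%95%B0%E6%8D%AE'
    assert unquote('%E5%A4%A7%E6%95%B0%E6%8D%AE') == '大数据'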
def getStagFromUid(uid):
    url = 'https://weibo.cn/account/privacy/tags/?uid=%s&st=789d9e' % uid
    html = requests.get(url, cookies=cookie).content

    soup = BeautifulSoup(html, 'lxml')
    content = soup.find_all('a', href=re.compile(r'keyword', re.I))
    stags = []
    for item in content:
        istag = item['href'].split('=')[1].split('&')[0]
        stags.append(istag)
    return stags


def getUidsFromOids(oids, num=0, type=1):
    uids = []
    i = 0
    flag = 0
    while i < len(oids):
        if type:
            oid = oids[i]
        else:
            oid = oids[i][0]
        try:
            if i >= num:
                uid = getUidFromOid(oid)
                uids.append(uid)
                # time.sleep(1)
                sys.stdout.write('\rLoading %d-th Uid, oid = %s\t\t\t\t\t' % (i, oid))
            i += 1
            flag = 0
        except IndexError:
            if flag:
                sys.stdout.write('\rWarning 091, oid =%s %d/1000, trying to reconnect %d\t\t\t\t\t' % (oid, i, flag))
                time.sleep(50)
                if flag >= 100:
                    sys.stdout.write('\nError 118, oid =%s %d/1000, Connection Failed\n' % (oid, i))
                    i += 1
                    flag = 0
                    continue
            flag += 1
            continue
    return uids


def allStags(uids):
    stags = {}
    flag = 0
    for uid in uids:
        try:
            flag += 1
            uid = uid.split('\n')[0]
            stag = getStagFromUid(uid)
            #time.sleep(1)
            print('Loading From Uid', uid, 'Number', flag)
            for item in stag:
                if item in stags:
                    stags[item] += 1
                else:
                    stags[item] = 1
        except Exception:
            continue
    return sorted(stags.items(), key=lambda d: d[1], reverse=True)


def unquoteStag(stags):
    output = []
    for item in stags:
        t = item[0]
        output.append(unquote(t))
    return output


def getFollowPages(uids):
    toReturn = []
    i = 0
    flag = 0
    while i < len(uids):
        uid = uids[i]
        html = b''  # so the except branch below can safely test it
        try:
            url = 'https://weibo.cn/%s/follow' % uid
            html = requests.get(url, cookies=cookie).content

            selector = etree.HTML(html)
            # the hidden form input named "mp" on the follow page carries the total page count
            pageNum = int(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
            toReturn.append((uid, pageNum))
            i += 1
            sys.stdout.write('\rLoaded %d-th uid, %d Pages.\t\t\t\t\t' % (i, pageNum))
            flag = 0
        except Exception:
            if html:
                # the page loaded but has no "mp" input: a single-page follow list
                toReturn.append((uid, 0))
                i += 1
                sys.stdout.write('\rLoaded %d-th uid, single page.\t\t\t\t\t' % i)
                flag = 0
                continue
            if flag:
                sys.stdout.write('\rWarning 172, uid =%s %d/1000, trying to reconnect %d\t\t\t\t\t' % (uid, i, flag))
                time.sleep(50)
                if flag >= 40:
                    sys.stdout.write('\nError 175, uid =%s %d/1000, Connection Failed\n' % (uid, i))
                    i += 1
                    flag = 0
                    continue
            flag += 1
            continue
    return toReturn

def getFollowRelationship(followPagedUid):
    ids = []
    m = 0
    flag = 0
    while m < len(followPagedUid):
        uid, pages = followPagedUid[m]
        if pages == 0:
            pages = 1
        pageId = 1
        while pageId <= pages:
            try:
                url = 'https://weibo.cn/%s/follow?page=%d' % (uid, pageId)
                html = requests.get(url, cookies=cookie).content

                soup = BeautifulSoup(html, 'lxml')
                # match both "关注他" (follow him) and "关注她" (follow her);
                # the original `text="关注他"or"关注她"` only ever matched the first string
                content = soup.find_all('a', text=["关注他", "关注她"])
                for i in range(len(content)):
                    item = content[i]
                    followed = item['href'].split('=')[1].split('&')[0]
                    # (uid, item) means uid follows item
                    ids.append((uid, followed))
                if not html:
                    sys.stdout.write('\rWarning 047: Account Banned, trying to reconnect %d' % flag)
                    time.sleep(20)
                    if flag >= 50:
                        sys.stdout.write('\nError 175, uid =%s PageId= %d, %d/%d, Connection Failed\n' % (uid, pageId, m, len(followPagedUid)))
                        pageId += 1
                        flag = 0
                        continue
                    flag += 1
                    continue
                pageId += 1
                flag = 0
                sys.stdout.write('\rLoaded %3d-th uid, Page %d/%d...\t\t\t\t' % (m+1, pageId-1, pages))
            except Exception:
                if flag:
                    sys.stdout.write('\rWarning 172, uid =%s PageId= %d %d/1000, trying to reconnect %d\t\t\t\t\t' % (uid, pageId, m, flag))
                    time.sleep(20)
                    if flag >= 50:
                        sys.stdout.write('\nError 175, uid =%s PageId= %d, %d/1000, Connection Failed\n' % (uid, pageId, m))
                        pageId += 1
                        flag = 0
                        continue
                flag += 1
                continue
        m += 1
    return ids


def weiboContent(uid, pages=20, uidnum=0):
    flag = 0
    toReturn = []
    page = 0
    while page <= pages:
        try:
            url = 'https://weibo.cn/u/%s?page=%d' % (uid, page+1)
            html = requests.get(url, cookies=cookie).content

            soup = BeautifulSoup(html, 'lxml')
            content = soup.find_all('span', {'class': 'ctt'})
            if page == 0 and len(content) >= 3:
                # the first three 'ctt' spans on page 1 are profile fields, not posts
                content = content[3:]
            for i in range(len(content)):
                toReturn.append([uid, page*10+i+1, content[i].getText()])
            if not html:
                sys.stdout.write('\rWarning 251: Account Banned, trying to reconnect %d, uid %d/1000, page %d/20' % (flag, uidnum, page))
                time.sleep(20)
                if flag >= 50:
                    sys.stdout.write('\nError 254, uid =%s Page= %d, %d/1000, Connection Failed\n' % (uid, page, uidnum))
                    page += 1
                    flag = 0
                    continue
                flag += 1
                continue
            page += 1
            flag = 0
            sys.stdout.write('\rLoaded %3d-th uid, Page %d/%d...\t\t\t\t' % (uidnum, page, pages))
        except Exception:
            if flag:
                sys.stdout.write('\rWarning 265, uid =%s Page= %d %d/1000, trying to reconnect %d\t\t\t\t\t' % (uid, page, uidnum, flag))
                time.sleep(20)
                if flag >= 50:
                    sys.stdout.write('\nError 268, uid =%s PageId= %d, %d/1000, Connection Failed\n' % (uid, page, uidnum))
                    page += 1
                    flag = 0
                    continue
            flag += 1
            continue
    return toReturn
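
# Illustrative sketch, not part of the original script: the intended pipeline
# chains the helpers above. The tag keyword, the 1000-user cut-off and the
# variable names are assumptions taken from the commented-out driver code at
# the bottom of this file:
def _crawlPipelineSketch():
    allIds = getIdFromStag(['%E5%A4%A7%E6%95%B0%E6%8D%AE'])              # search users by tag
    usingIds = sorted(allIds.items(), key=lambda d: d[1], reverse=True)  # rank by hit count
    allUids = getUidsFromOids([item[0] for item in usingIds[:1000]])     # profile links -> uids
    pagesWithUid = getFollowPages(allUids)                               # follow-list page counts
    return getFollowRelationship(pagesWithUid)                           # (follower, followed) edges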

def writeContent(uids, text):
    """Write posts as 'uid<TAB>index<TAB>text' lines, resuming from the last line already in the output file."""
    start = False
    try:
        # read the last line of an existing output file to find the checkpoint
        f = open(text, 'r')
        for line in f:
            pass
        contentNum = int(line.split('\t')[1])
        startUid = line.split('\t')[0]
        f.close()
    except Exception:
        # no usable output file yet: start from the first uid
        contentNum = 0
        startUid = uids[0]
    f = open(text, 'a')
    for i in range(len(uids)):
        if start is True:
            t = weiboContent(uids[i], uidnum=i+1)
            for item in t:
                f.write('%s\t%d\t%s\n' % (item[0], item[1], item[2]))
        if uids[i] == startUid:
            start = True
            sys.stdout.write('Started! uid = %s, content %d/200' % (startUid, i+1))
            t = weiboContent(uids[i], uidnum=i+1)
            for item in t:
                if item[1] >= contentNum:
                    f.write('%s\t%d\t%s\n' % (item[0], item[1], item[2]))
    f.close()
    return

# leftover test code from the original script: dump a previously crawled file to stdout
f = open('testContent2.txt', 'r')
i = 0
for line in f:
    print(line)
    i += 1
    if i % 50 == 0:
        time.sleep(2)

#
#for item in dstags:
#    print(item)
#    time.sleep(1)

#
## Get id from DaShuJu
##originIds = getIdFromStag(stag)
# Get id from other stags
#allStag = ['%E5%A4%A7%E6%95%B0%E6%8D%AE',
#           '%E4%BA%91%E8%AE%A1%E7%AE%97',
#           '%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98',
#           '%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0',
#           'Hadoop',
#           '%E7%A7%BB%E5%8A%A8%E4%BA%92%E8%81%94%E7%BD%91',
#           '%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90',
#           '%E5%95%86%E4%B8%9A%E6%99%BA%E8%83%BD',
#           '%E5%AD%98%E5%82%A8',
#           '%E4%BA%91%E5%AD%98%E5%82%A8',
#           '%E4%BF%A1%E6%81%AF%E5%8C%96']
#
#allIds = getIdFromStag(allStag)
#sortedAllIds = sorted(allIds.items(), key=lambda d: d[1], reverse=True)
#usingIds = sortedAllIds[0:1000]
#
##f = open('1000ids.txt', 'w')
##for item in usingIds:
##    f.write(item[0])
##    f.write('\n')
#
# Read Uids From File

#f = open('1000Uids_t.txt', 'r')
#allUids = []
#for line in f:
#    allUids.append(line[0:len(line)-1])

#f = open('1000ids.txt', 'r')
#usingIds = []
#for line in f:
#    usingIds.append(line[0:len(line)-1])
#
### Reading Uids from Oid
#
#allUids = getUidsFromOids(usingIds)
##
#f = open('1000Uids1.txt', 'w')
#for item in allUids:
#    f.write(item)
#    f.write('\n')
#
#pagesWithUid = getFollowPages(allUids)

#f = open('pagesWithUid.txt', 'w')
#for item in pagesWithUid:
#    f.write(item[0])
#    f.write('\t')
#    f.write(str(item[1]))
#    f.write('\n')

#f = open('pagesWithUid.txt', 'r')
#pagesWithUid1 = []
#allUids = []
#for item in f:
#    t = item.split('\t')
#    t[1] = int(t[1])
#    t = tuple(t)
#    pagesWithUid1.append(t)
#    allUids.append(t[0])
#followRelationship = getFollowRelationship(pagesWithUid)
#
##followRelationship = ids
## Write the relationships out
#f = open('followRelationshipP.txt', 'w')
#for item in ids:
#    f.write(item[0])
#    f.write('\t')
#    f.write(item[1])
#    f.write('\n')
#f = open('followRalationship.txt', 'r')
#followRelationship = []
#for item in f:
#    t = item.split('\n')[0].split('\t')
#    t = tuple(t)
#    followRelationship.append(t)
#
#uids = []
#for item in followRelationship:
#    if item[0] not in uids:
#        uids.append(item[0])
#count = {}
#for item in followRelationship:
#    if item[1] in uids:
#        if item[1] not in count:
#            count[item[1]] = 1
#        else:
#            count[item[1]] += 1
#sortedCount = sorted(count.items(), key=lambda d: d[1], reverse=True)
#for i in range(999):
#    if allUids[i] in uids:
#        print(i)

#oids = []
#for i in range(1000):
#    oids.append(usingIds[i][0])
#get


###
--------------------------------------------------------------------------------