├── FinalReport.pdf
├── README.md
├── content.py
├── cookies.txt
├── forward.py
├── uids.txt
└── upload.py
/FinalReport.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SQRPI/weibo-spider/d8c8a527fb0c0995419f310845db9eb0795ad8c8/FinalReport.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# weibo-spider
Final project for a Social Network Mining course: a crawler for the follow and repost relationships of big-data-related Weibo users.


## Implemented features
* Search users by a given profile tag
* Conversion between uid and screen name
* Batch-crawl the profile tags of specified users
* Batch-crawl user follow lists
* Extraction and analysis of Weibo post content
* An anti-anti-crawler mechanism that sustains about 5000 pages per hour

## Usage

* Enter your cookie(s) in cookies.txt (several may be given)
* Enter the uids of the users to crawl in uids.txt
```python
python content.py
```
* For other features, adapt the code in upload.py yourself, or wait until the author finds time to clean it up

## Parameters

* --p  number of pages to crawl per user, default 20
* --m  file write mode, default a (append); for example
```Python
python content.py --m w
```
overwrites the output instead. The default is recommended for large-scale crawls, since it allows an interrupted crawl to resume from its checkpoint (see the example below)
* --u  path of the uid file, default uids.txt
* --c  path of the cookie file, default cookies.txt
* --f  output path, default result.txt
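
For example, to crawl 50 pages per user from a custom uid file into a custom output file (the file names here are only illustrative):
```Python
python content.py --p 50 --u myuids.txt --f bigdata_posts.txt
```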
--------------------------------------------------------------------------------
/content.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 20 14:43:21 2018

@author: SQRPI/Ning Shangyi
"""


import sys
from bs4 import BeautifulSoup
import requests
import time
import argparse

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Cookie': '',
    'Host': 'weibo.cn',
    'Referer': 'https://weibo.cn/2113734951/profile',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'
}


def readCookies(path):
    """Read one cookie per line from path; '#' lines are comments."""
    global cookieNum, cookieCount
    cookieCount = 0
    cookieNum = 0
    cookie = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:        # skip blank lines (checked first to avoid an IndexError)
                continue
            if line[0] == '#':  # skip comment lines
                continue
            cookie.append(line)
            cookieCount += 1
    if cookieCount == 0:
        raise Exception('Input your cookies in cookies.txt!')
    return cookie


def weiboContent(uid, pages=1, uidnum=1):
    """Fetch the posts of one uid page by page, rotating cookies when an account is banned."""
    global cookieNum, cookieCount, uidCount
    startCookie = None
    flag = 0
    toReturn = []
    page = 0
    while page <= pages:
        try:
            headers['Cookie'] = cookie[cookieNum]
            url = 'https://weibo.cn/u/%s?page=%d' % (uid, page + 1)
            s = requests.Session()
            s.headers.update(headers)
            html = s.get(url).content

            soup = BeautifulSoup(html, 'lxml')
            content = soup.find_all('span', {'class': 'ctt'})
            if page == 0 and len(content) >= 3:
                # the first three 'ctt' spans on page 1 are profile fields, not posts
                content = content[3:]
            for i in range(len(content)):
                toReturn.append([uid, page * 10 + i + 1, content[i].getText()])
            if not html:
                # empty response: this cookie is probably banned, rotate to the next one
                if startCookie is None:
                    startCookie = cookieNum  # remember where the rotation started
                    cookieNum = (cookieNum + 1) % cookieCount
                    continue
                cookieNum = (cookieNum + 1) % cookieCount
                if cookieNum != startCookie:
                    continue
                # every cookie has been tried and banned: wait and retry
                sys.stdout.write('\rWarning: All Accounts Banned, trying to reconnect %d, uid %d/%d, page %d/%d' % (flag, uidnum, uidCount, page, args.p))
                time.sleep(20)
                if flag >= 50:
                    sys.stdout.write('\nError 254, uid =%s Page= %d, %d/1000, Connection Failed\n' % (uid, page, uidnum))
                    page += 1
                    flag = 0
                    continue
                flag += 1
                continue
            startCookie = None
            page += 1
            flag = 0
            sys.stdout.write('\rLoaded %3d-th uid, Page %d/%d...\t\t\t\t\t\t\t\t' % (uidnum, page, pages + 1))
        except Exception:
            # network error: back off, retry up to 50 times, then skip the page
            if flag:
                sys.stdout.write('\rWarning: uid =%s Page= %d, %d/1000, trying to reconnect %d\t\t\t\t\t' % (uid, page, uidnum, flag))
                time.sleep(20)
                if flag >= 50:
                    sys.stdout.write('\nError: uid =%s PageId= %d, %d/1000, Connection Failed\n' % (uid, page, uidnum))
                    page += 1
                    flag = 0
                    continue
            flag += 1
            continue
    return toReturn


def readUids(path):
    """Read one uid per line from path; '#' lines are comments."""
    global uidCount
    uidCount = 0
    uids = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:        # skip blank lines (checked first to avoid an IndexError)
                continue
            if line[0] == '#':  # skip comment lines
                continue
            uids.append(line)
            uidCount += 1
    if uidCount == 0:
        raise Exception('Input your uids in uids.txt!')
    return uids


def writeContent(uids, text, pages):
    """Write posts as 'uid<TAB>index<TAB>text' lines, resuming from the last line already in the output file."""
    global uidCount
    start = False
    try:
        # read the last line of an existing output file to find the checkpoint
        f = open(text, 'r')
        for line in f:
            pass
        contentNum = int(line.split('\t')[1])
        startUid = line.split('\t')[0]
        f.close()
    except Exception:
        # no usable output file yet: start from the first uid
        contentNum = 0
        startUid = uids[0]
    f = open(text, args.m)
    for i in range(len(uids)):
        if start is True:
            t = weiboContent(uids[i], pages, uidnum=i+1)
            for item in t:
                f.write('%s\t%d\t%s\n' % (item[0], item[1], item[2]))
        if uids[i] == startUid:
            start = True
            sys.stdout.write('Started! uid = %s, content %d/%d\n' % (startUid, i+1, uidCount))
            t = weiboContent(uids[i], pages, uidnum=i+1)
            for item in t:
                if item[1] >= contentNum:
                    f.write('%s\t%d\t%s\n' % (item[0], item[1], item[2]))
    f.close()
    return


parser = argparse.ArgumentParser()
parser.add_argument('--p', type=int, default=20)
parser.add_argument('--f', type=str, default='result.txt')
parser.add_argument('--c', type=str, default='cookies.txt')
parser.add_argument('--u', type=str, default='uids.txt')
parser.add_argument('--m', type=str, default='a')
args = parser.parse_args()
cookie = readCookies(args.c)
uids = readUids(args.u)
writeContent(uids, args.f, args.p - 1)
sys.stdout.write('\nFinished!\n')
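
# Illustrative sketch, not part of the original script: each line written by
# writeContent above is "uid<TAB>post-index<TAB>text", so a result file produced
# by this crawler can be read back as tuples (the default path is an assumption):
def readResults(path='result.txt'):
    rows = []
    with open(path, 'r') as f:
        for line in f:
            uid, idx, text = line.rstrip('\n').split('\t', 2)
            rows.append((uid, int(idx), text))
    return rows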
--------------------------------------------------------------------------------
/cookies.txt:
--------------------------------------------------------------------------------
# Enter your cookies here, one per line; the cookies may come from different accounts, and the more you provide the faster the crawl
# How to obtain one: press F12 in your browser, open weibo.cn, select the weibo.cn request in the Network tab, and copy its Cookie header
# The line below shows the format of a normal cookie (asterisks mask characters)
#SUB=_2A253E************lEW8yrFzj2IHXVU_bw8**************************6xoDeq2DGsXzFBmn6_s-gZUF13; SUHB=0M*******A7Zvj; SCF=AvkgK***************HPgeGVEJCKV2Fbu0ip_zeKE2NbeFI*****************vqmhStEIu7P--S7i***48.; _T_WM=425295*******************0c8d763
--------------------------------------------------------------------------------
/forward.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 25 12:25:42 2017

@author: Sqrpi
"""


import sys
import time

import requests
from bs4 import BeautifulSoup


def pp(text):
    print(text)

def pl(List):
    for item in List:
        pp(item)


def pltx(List):
    for item in List:
        pp(item.text)

cookie = {"Cookie": 'Your Cookie Here'}
'''
To obtain the cookie: Chrome F12 -> Network, log in to Weibo, open weibo.cn,
click the weibo.cn entry in the Name column, and copy the Cookie header.
Example format: _T_WM=0c6a2e5cd..... SSOLoginState=150090124
'''
pageId = 1
flag = 0
maxPage = 10  # maximum number of repost pages to crawl; adjust as needed
Dict = {}  # key: reposter, value: list of users reposted from
'''
Dict is the final result: A: [B, C] means A reposted from B and from C.
Note that the chain "A reposts B, B reposts C" is recorded only as A: [B].
'''
while pageId < maxPage:
    url = 'https://weibo.cn/repost/FdSmtlQKX?uid=2670306073&rl=1&page=%d' % (pageId)
    html = requests.get(url, cookies=cookie).content

    soup = BeautifulSoup(html, 'lxml')
    forwardList = soup.find_all('div', class_='c')[3:]
    if not html:
        # empty response: the account is probably banned, wait and retry
        sys.stdout.write('\rWarning 251: Account Banned, trying to reconnect %d,page %d\t\t\t' % (flag, pageId))
        time.sleep(20)
        if flag >= 50:
            sys.stdout.write('\nError 254, page %d, Connection Failed\n' % (pageId))
            pageId += 1  # skip the page after 50 failed retries
            flag = 0
            continue
        flag += 1
        continue
    for item in forwardList:
        m = item.find_all('a')
        if len(m) > 2:
            forer = m[0].text      # the user who reposts
            fored = m[1].text[1:]  # the user reposted from (strip the leading '@')
        elif len(m) == 2:
            forer = m[0].text
            fored = '更方更正的物理'  # no repost source shown: attribute to the original poster
        else:
            continue  # not a repost row
        if forer not in Dict:
            Dict[forer] = [fored]
        elif fored not in Dict[forer]:
            Dict[forer].append(fored)
    pageId += 1
    flag = 0
    sys.stdout.write('\rLoaded page %d\t\t\t\t\t' % (pageId))
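
# Illustrative sketch, not part of the original script: Dict maps each reposter
# to the users it reposted from, so it can be flattened into TAB-separated edges
# for network analysis (the output filename here is an assumption):
def writeEdges(d, path='forward_edges.txt'):
    with open(path, 'w', encoding='utf-8') as f:
        for forer, foreds in d.items():
            for fored in foreds:
                f.write('%s\t%s\n' % (forer, fored))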
--------------------------------------------------------------------------------
/uids.txt:
--------------------------------------------------------------------------------
# Enter the UIDs of the users to crawl, one per line
# To find a UID: open the user's profile on weibo.cn; the address shows as e.g. https://weibo.cn/2113734951/info, and the number is the uid
# An example follows
2113734951
--------------------------------------------------------------------------------
/upload.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 06 00:00:27 2017

@author: Ning Shangyi/SQRPI
"""

import re
import sys
import time
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
from lxml import etree


def pp(text):
    print(text)

cookie = {"Cookie": 'INPUT YOUR COOKIE HERE'}
#url = 'http://weibo.cn/u/%d?filter=1&page=1'%user_id

def getIdFromStag(stags, ids=None):
    """Search users by profile tag (stag) and count how often each profile link turns up."""
    if ids is None:  # avoid the mutable-default-argument pitfall of the original `ids={}`
        ids = {}
    for stag in stags:
        m = 0
        flag = 0
        while m < 51:
            pageId = m
            print('Loading PAGE', pageId, 'of Stag', unquote(stag))
            try:
                url = 'https://weibo.cn/search/user/?keyword=%s&sort=0&filter=stag&page=%d' % (stag, pageId)
                html = requests.get(url, cookies=cookie).content

                soup = BeautifulSoup(html, 'lxml')
                content = soup.find_all('a', href=re.compile(r'f=search', re.I))
                for i in range(len(content)):
                    item = content[i]
                    toAppend = item['href'].split('?')[0]
                    if toAppend in ids:
                        ids[toAppend] += 1
                    else:
                        ids[toAppend] = 1
                #time.sleep(1)
                if len(content) <= 0:
                    # empty result page: back off and retry, skip the page after 50 attempts
                    sys.stdout.write('\rWarning 047: len(content) = 0\t%d page, %s stag\t\t' % (m, unquote(stag)))
                    time.sleep(20)
                    if flag >= 50:
                        m += 1
                        flag = 0
                        continue
                    flag += 1
                    continue
                m += 1
                flag = 0
            except Exception:
                break  # the original retry code after this break was unreachable and has been dropped
    return ids


def getUidFromOid(oid):
    # print('Loading uid: oid=', oid)
    url = 'https://weibo.cn%s' % oid
    html = requests.get(url, cookies=cookie).content

    soup = BeautifulSoup(html, 'lxml')
    content = soup.find_all('a', href=re.compile(r'/operation', re.I))[0]
    uid = content['href'].split('/')[1]
    return uid
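
# Illustrative sketch, not part of the original script: the "stag" keywords fed
# to getIdFromStag are percent-encoded UTF-8 (see the tag list at the bottom of
# this file); urllib.parse shows the round trip for the tag '大数据' (big data):
def _stagRoundTripDemo():
    from urllib.parse import quote
    assert quote('大数据') == '%E5%A4%A7%E6%95%B0%E6%8D%AE'
    assert unquote('%E5%A4%A7%E6%95%B0%E6%8D%AE') == '大数据'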
def getStagFromUid(uid):
    url = 'https://weibo.cn/account/privacy/tags/?uid=%s&st=789d9e' % uid
    html = requests.get(url, cookies=cookie).content

    soup = BeautifulSoup(html, 'lxml')
    content = soup.find_all('a', href=re.compile(r'keyword', re.I))
    stags = []
    for item in content:
        istag = item['href'].split('=')[1].split('&')[0]
        stags.append(istag)
    return stags


def getUidsFromOids(oids, num=0, type=1):
    uids = []
    i = 0
    flag = 0
    while i < len(oids):
        if type:
            oid = oids[i]
        else:
            oid = oids[i][0]
        try:
            if i >= num:
                uid = getUidFromOid(oid)
                uids.append(uid)
                # time.sleep(1)
                sys.stdout.write('\rLoading %d-th Uid, oid = %s\t\t\t\t\t' % (i, oid))
            i += 1
            flag = 0
        except IndexError:
            if flag:
                sys.stdout.write('\rWarning 091, oid =%s %d/1000, trying to reconnect %d\t\t\t\t\t' % (oid, i, flag))
                time.sleep(50)
                if flag >= 100:
                    sys.stdout.write('\nError 118, oid =%s %d/1000, Connection Failed\n' % (oid, i))
                    i += 1
                    flag = 0
                    continue
            flag += 1
            continue
    return uids


def allStags(uids):
    stags = {}
    flag = 0
    for uid in uids:
        try:
            flag += 1
            uid = uid.split('\n')[0]
            stag = getStagFromUid(uid)
            #time.sleep(1)
            print('Loading From Uid', uid, 'Number', flag)
            for item in stag:
                if item in stags:
                    stags[item] += 1
                else:
                    stags[item] = 1
        except Exception:
            continue
    return sorted(stags.items(), key=lambda d: d[1], reverse=True)


def unquoteStag(stags):
    output = []
    for item in stags:
        t = item[0]
        output.append(unquote(t))
    return output


def getFollowPages(uids):
    toReturn = []
    i = 0
    flag = 0
    while i < len(uids):
        uid = uids[i]
        html = b''  # so the except branch below can safely test it
        try:
            url = 'https://weibo.cn/%s/follow' % uid
            html = requests.get(url, cookies=cookie).content

            selector = etree.HTML(html)
            # the hidden form input named "mp" on the follow page carries the total page count
            pageNum = int(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
            toReturn.append((uid, pageNum))
            i += 1
            sys.stdout.write('\rLoaded %d-th uid, %d Pages.\t\t\t\t\t' % (i, pageNum))
            flag = 0
        except Exception:
            if html:
                # the page loaded but has no "mp" input: a single-page follow list
                toReturn.append((uid, 0))
                i += 1
                sys.stdout.write('\rLoaded %d-th uid, single page.\t\t\t\t\t' % i)
                flag = 0
                continue
            if flag:
                sys.stdout.write('\rWarning 172, uid =%s %d/1000, trying to reconnect %d\t\t\t\t\t' % (uid, i, flag))
                time.sleep(50)
                if flag >= 40:
                    sys.stdout.write('\nError 175, uid =%s %d/1000, Connection Failed\n' % (uid, i))
                    i += 1
                    flag = 0
                    continue
            flag += 1
            continue
    return toReturn

def getFollowRelationship(followPagedUid):
    ids = []
    m = 0
    flag = 0
    while m < len(followPagedUid):
        uid, pages = followPagedUid[m]
        if pages == 0:
            pages = 1
        pageId = 1
        while pageId <= pages:
            try:
                url = 'https://weibo.cn/%s/follow?page=%d' % (uid, pageId)
                html = requests.get(url, cookies=cookie).content

                soup = BeautifulSoup(html, 'lxml')
                # match both "关注他" (follow him) and "关注她" (follow her);
                # the original `text="关注他"or"关注她"` only ever matched the first string
                content = soup.find_all('a', text=["关注他", "关注她"])
                for i in range(len(content)):
                    item = content[i]
                    followed = item['href'].split('=')[1].split('&')[0]
                    # (uid, item) means uid follows item
                    ids.append((uid, followed))
                if not html:
                    sys.stdout.write('\rWarning 047: Account Banned, trying to reconnect %d' % flag)
                    time.sleep(20)
                    if flag >= 50:
                        sys.stdout.write('\nError 175, uid =%s PageId= %d, %d/%d, Connection Failed\n' % (uid, pageId, m, len(followPagedUid)))
                        pageId += 1
                        flag = 0
                        continue
                    flag += 1
                    continue
                pageId += 1
                flag = 0
                sys.stdout.write('\rLoaded %3d-th uid, Page %d/%d...\t\t\t\t' % (m+1, pageId-1, pages))
            except Exception:
                if flag:
                    sys.stdout.write('\rWarning 172, uid =%s PageId= %d %d/1000, trying to reconnect %d\t\t\t\t\t' % (uid, pageId, m, flag))
                    time.sleep(20)
                    if flag >= 50:
                        sys.stdout.write('\nError 175, uid =%s PageId= %d, %d/1000, Connection Failed\n' % (uid, pageId, m))
                        pageId += 1
                        flag = 0
                        continue
                flag += 1
                continue
        m += 1
    return ids


def weiboContent(uid, pages=20, uidnum=0):
    flag = 0
    toReturn = []
    page = 0
    while page <= pages:
        try:
            url = 'https://weibo.cn/u/%s?page=%d' % (uid, page+1)
            html = requests.get(url, cookies=cookie).content

            soup = BeautifulSoup(html, 'lxml')
            content = soup.find_all('span', {'class': 'ctt'})
            if page == 0 and len(content) >= 3:
                # the first three 'ctt' spans on page 1 are profile fields, not posts
                content = content[3:]
            for i in range(len(content)):
                toReturn.append([uid, page*10+i+1, content[i].getText()])
            if not html:
                sys.stdout.write('\rWarning 251: Account Banned, trying to reconnect %d, uid %d/1000, page %d/20' % (flag, uidnum, page))
                time.sleep(20)
                if flag >= 50:
                    sys.stdout.write('\nError 254, uid =%s Page= %d, %d/1000, Connection Failed\n' % (uid, page, uidnum))
                    page += 1
                    flag = 0
                    continue
                flag += 1
                continue
            page += 1
            flag = 0
            sys.stdout.write('\rLoaded %3d-th uid, Page %d/%d...\t\t\t\t' % (uidnum, page, pages))
        except Exception:
            if flag:
                sys.stdout.write('\rWarning 265, uid =%s Page= %d %d/1000, trying to reconnect %d\t\t\t\t\t' % (uid, page, uidnum, flag))
                time.sleep(20)
                if flag >= 50:
                    sys.stdout.write('\nError 268, uid =%s PageId= %d, %d/1000, Connection Failed\n' % (uid, page, uidnum))
                    page += 1
                    flag = 0
                    continue
            flag += 1
            continue
    return toReturn
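
# Illustrative sketch, not part of the original script: the intended pipeline
# chains the helpers above. The tag keyword, the 1000-user cut-off and the
# variable names are assumptions taken from the commented-out driver code at
# the bottom of this file:
def _crawlPipelineSketch():
    allIds = getIdFromStag(['%E5%A4%A7%E6%95%B0%E6%8D%AE'])              # search users by tag
    usingIds = sorted(allIds.items(), key=lambda d: d[1], reverse=True)  # rank by hit count
    allUids = getUidsFromOids([item[0] for item in usingIds[:1000]])     # profile links -> uids
    pagesWithUid = getFollowPages(allUids)                               # follow-list page counts
    return getFollowRelationship(pagesWithUid)                           # (follower, followed) edges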

def writeContent(uids, text):
    """Write posts as 'uid<TAB>index<TAB>text' lines, resuming from the last line already in the output file."""
    start = False
    try:
        # read the last line of an existing output file to find the checkpoint
        f = open(text, 'r')
        for line in f:
            pass
        contentNum = int(line.split('\t')[1])
        startUid = line.split('\t')[0]
        f.close()
    except Exception:
        # no usable output file yet: start from the first uid
        contentNum = 0
        startUid = uids[0]
    f = open(text, 'a')
    for i in range(len(uids)):
        if start is True:
            t = weiboContent(uids[i], uidnum=i+1)
            for item in t:
                f.write('%s\t%d\t%s\n' % (item[0], item[1], item[2]))
        if uids[i] == startUid:
            start = True
            sys.stdout.write('Started! uid = %s, content %d/200' % (startUid, i+1))
            t = weiboContent(uids[i], uidnum=i+1)
            for item in t:
                if item[1] >= contentNum:
                    f.write('%s\t%d\t%s\n' % (item[0], item[1], item[2]))
    f.close()
    return

# leftover test code from the original script: dump a previously crawled file to stdout
f = open('testContent2.txt', 'r')
i = 0
for line in f:
    print(line)
    i += 1
    if i % 50 == 0:
        time.sleep(2)

#
#for item in dstags:
#    print(item)
#    time.sleep(1)

#
## Get id from DaShuJu
##originIds = getIdFromStag(stag)
# Get id from other stags
#allStag = ['%E5%A4%A7%E6%95%B0%E6%8D%AE',
#           '%E4%BA%91%E8%AE%A1%E7%AE%97',
#           '%E6%95%B0%E6%8D%AE%E6%8C%96%E6%8E%98',
#           '%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0',
#           'Hadoop',
#           '%E7%A7%BB%E5%8A%A8%E4%BA%92%E8%81%94%E7%BD%91',
#           '%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90',
#           '%E5%95%86%E4%B8%9A%E6%99%BA%E8%83%BD',
#           '%E5%AD%98%E5%82%A8',
#           '%E4%BA%91%E5%AD%98%E5%82%A8',
#           '%E4%BF%A1%E6%81%AF%E5%8C%96']
#
#allIds = getIdFromStag(allStag)
#sortedAllIds = sorted(allIds.items(), key=lambda d: d[1], reverse=True)
#usingIds = sortedAllIds[0:1000]
#
##f = open('1000ids.txt', 'w')
##for item in usingIds:
##    f.write(item[0])
##    f.write('\n')
#
# Read Uids From File

#f = open('1000Uids_t.txt', 'r')
#allUids = []
#for line in f:
#    allUids.append(line[0:len(line)-1])

#f = open('1000ids.txt', 'r')
#usingIds = []
#for line in f:
#    usingIds.append(line[0:len(line)-1])
#
### Reading Uids from Oid
#
#allUids = getUidsFromOids(usingIds)
##
#f = open('1000Uids1.txt', 'w')
#for item in allUids:
#    f.write(item)
#    f.write('\n')
#
#pagesWithUid = getFollowPages(allUids)

#f = open('pagesWithUid.txt', 'w')
#for item in pagesWithUid:
#    f.write(item[0])
#    f.write('\t')
#    f.write(str(item[1]))
#    f.write('\n')

#f = open('pagesWithUid.txt', 'r')
#pagesWithUid1 = []
#allUids = []
#for item in f:
#    t = item.split('\t')
#    t[1] = int(t[1])
#    t = tuple(t)
#    pagesWithUid1.append(t)
#    allUids.append(t[0])
#followRelationship = getFollowRelationship(pagesWithUid)
#
##followRelationship = ids
## Write the relationships out
#f = open('followRelationshipP.txt', 'w')
#for item in ids:
#    f.write(item[0])
#    f.write('\t')
#    f.write(item[1])
#    f.write('\n')
#f = open('followRalationship.txt', 'r')
#followRelationship = []
#for item in f:
#    t = item.split('\n')[0].split('\t')
#    t = tuple(t)
#    followRelationship.append(t)
#
#uids = []
#for item in followRelationship:
#    if item[0] not in uids:
#        uids.append(item[0])
#count = {}
#for item in followRelationship:
#    if item[1] in uids:
#        if item[1] not in count:
#            count[item[1]] = 1
#        else:
#            count[item[1]] += 1
#sortedCount = sorted(count.items(), key=lambda d: d[1], reverse=True)
#for i in range(999):
#    if allUids[i] in uids:
#        print(i)

#oids = []
#for i in range(1000):
#    oids.append(usingIds[i][0])
#get


###
--------------------------------------------------------------------------------