├── test
│   ├── __init__.py
│   ├── set_test.py
│   ├── asy_loop_test.py
│   ├── tornado_test.py
│   ├── print_test.py
│   └── asy_test.py
├── analysis_tools
│   ├── demo.pkl
│   ├── retweeted_network.py
│   ├── get_profile_img_url.py
│   ├── word2vec.py
│   ├── predeal_gephi.py
│   └── read_content.py
├── test_tools
│   ├── transform_time.py
│   ├── create_index_forLatestHistory.py
│   └── create_index.py
├── key_config.py
├── File_Interface.py
├── server_config.py
├── verify_proxy.py
├── client_config.py
├── init_redis.py
├── server_data.py
├── README
├── data_transport.py
├── DB_Interface.py
├── server_proxy.py
├── server.py
├── server_database.py
└── client_asy_update.py
/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /analysis_tools/demo.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/multiangle/Distributed_Microblog_Spider/HEAD/analysis_tools/demo.pkl -------------------------------------------------------------------------------- /test/set_test.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | count = 0 4 | try: 5 | count += 1 6 | raise ValueError("klajsdf") 7 | count += 1 8 | except: 9 | pass 10 | print(count) -------------------------------------------------------------------------------- /test_tools/transform_time.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | import time 3 | 4 | def formate_time(timestamp): 5 | print(time.strftime('%Y-%m-%d %H:%M',time.localtime(timestamp))) 6 | 7 | if __name__=='__main__': 8 | formate_time(1461092302) -------------------------------------------------------------------------------- /analysis_tools/retweeted_network.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | from pymongo import MongoClient 3 | 4 | client=MongoClient('localhost',27017) 5 | db=client['microblog_spider'] 6 | table_name='user_2016_03' 7 | table=eval('db.{tname}'.format(tname=table_name)) 8 | -------------------------------------------------------------------------------- /key_config.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # SERVER_URL ='http://server:port' # server address and port 4 | # DATA_SERVER_URL ='http://data_server:port' # data server address and port 5 | 6 | SERVER_URL ='http://127.0.0.1:8000' # server address and port 7 | DATA_SERVER_URL ='http://127.0.0.1:8001' # data server address and port 8 | 9 | GET_PROXY_URL = '' # URL of your proxy provider -------------------------------------------------------------------------------- /analysis_tools/get_profile_img_url.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | 4 | from pymongo import MongoClient 5 | 6 | client=MongoClient('localhost',27017) 7 | db=client['microblog_spider'] 8 | latest_history=db.latest_history 9 | res=latest_history.find({'user.profile_image_url':{'$ne':None}}) 10 | res= [x for x in res] 11 | print(res.__len__()) 12 | 13 | -------------------------------------------------------------------------------- /test_tools/create_index_forLatestHistory.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | from pymongo import MongoClient 4 | import time 5 | import sys 6 | import File_Interface as FI 7 | import matplotlib.pyplot as plt 8 | import pymongo 
9 | 10 | 11 | client=MongoClient('localhost',27017) 12 | db=client['microblog_spider'] 13 | collec = db.latest_history 14 | collec.create_index([('user_id',pymongo.ASCENDING)]) -------------------------------------------------------------------------------- /analysis_tools/word2vec.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | import jieba 4 | import File_Interface as FI 5 | 6 | data = FI.load_pickle('demo.pkl') 7 | user_list = [x['user_name'] for x in data] 8 | text_list = [x['dealed_text']['left_content'] for x in data] 9 | 10 | for line in text_list: 11 | print(line) 12 | res = jieba.cut(line[0],cut_all=False) 13 | # print(list(seg_list)) 14 | res = list(res) 15 | print(res) 16 | 17 | 18 | -------------------------------------------------------------------------------- /test/asy_loop_test.py: -------------------------------------------------------------------------------- 1 | 2 | import asyncio 3 | import threading 4 | import time 5 | 6 | async def asy_test_basic(id,gap): 7 | count = 0 8 | for i in range(5): 9 | count += 1 10 | await asyncio.sleep(gap) 11 | # print('hehe') 12 | print('id:{i}, count: {c}'.format(i=id,c=count)) 13 | 14 | loop = asyncio.get_event_loop() 15 | tasks = [asy_test_basic(i,5) for i in range(5)] 16 | loop.run_until_complete(asyncio.wait(tasks)) 17 | loop.close() 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /File_Interface.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | import csv,pickle 4 | 5 | def read_csv(path): #读入原始csv文件,不做任何变动 6 | file=open(path,'r') 7 | reader=csv.reader(file) 8 | data=[row for row in reader] 9 | return data 10 | def load_pickle(path): #读入pickle文件,不做任何变动 11 | file=open(path,'rb') 12 | data=pickle.load(file) 13 | file.close() 14 | return data 15 | def save_pickle(data,path): 16 | file=open(path,'wb') 17 | pickle.dump(data,file) 18 | file.close() -------------------------------------------------------------------------------- /analysis_tools/predeal_gephi.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | from DB_Interface import MySQL_Interface 4 | import json 5 | import networkx as nx 6 | 7 | dbi=MySQL_Interface() 8 | 9 | # create table (select * from user_info_table order by fans_num limit 1000) 10 | 11 | [web_info,col_info]=dbi.select_all('temp_table2') 12 | select_web=[] 13 | select_user={} 14 | for atte in web_info: 15 | if (atte[1],atte[0]) in web_info: 16 | select_web.append(list(atte)) 17 | select_user[atte[1]]=1 18 | select_user[atte[0]]=1 19 | select_user=select_user.keys() 20 | 21 | G=nx.Graph() 22 | G.add_nodes_from(select_user) 23 | G.add_edges_from(select_web) 24 | nx.write_gexf(G,'weibo_node1000.gexf') -------------------------------------------------------------------------------- /test/tornado_test.py: -------------------------------------------------------------------------------- 1 | import tornado.web 2 | import tornado.ioloop 3 | import tornado.options 4 | from tornado.options import define,options 5 | 6 | import time 7 | 8 | define('port',default=8080,help='run on the given port',type=int) 9 | 10 | class Application(tornado.web.Application): 11 | def __init__(self): 12 | handlers=[ 13 | (r'/test',TestHandler), 14 | ] 15 | settings=dict( 16 | debug=True 17 | ) 18 | tornado.web.Application.__init__(self,handlers,**settings) 19 | 20 | class 
TestHandler(tornado.web.RequestHandler): 21 | def get(self): 22 | # time.sleep(10) 23 | self.write('233333') 24 | self.finish() 25 | 26 | if __name__=='__main__': 27 | tornado.options.parse_command_line() # tornado thread 28 | Application().listen(options.port) 29 | tornado.ioloop.IOLoop.instance().start() 30 | print('ok') 31 | 32 | -------------------------------------------------------------------------------- /server_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | #======= USED IN server_proxy.py ==================================== 4 | import key_config 5 | GET_PROXY_URL = key_config.GET_PROXY_URL 6 | PROXY_POOL_SIZE =600 7 | VERIFY_PROXY_THREAD_NUM =300 8 | PROXY_NORMAL_INFO_PRINT =True 9 | MAX_VALID_PROXY_THREAD_NUM =3 10 | PROXY_MONITOR_GAP =10 # (seconds) every 10 seconds, a process will note the state 11 | # proxy pool 12 | PROXY_SIZE_STATE_LIST_LEN =30 #the len of proxy size state list. For example if you want 13 | # to monitor the state of proxy pool in latest 5 minutes, 14 | # the value of this item be 60*5/PROXY_MONITOR_GAP 15 | HISTORY_TASK_VALVE =15000 # 微博数大于15000的,由本机完成搜索,小于15000的,交由云主机 -------------------------------------------------------------------------------- /verify_proxy.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | #====================================================================== 3 | # 4 | # This program run in a certain server to verify if a proxy is useful 5 | # Version_0.1_ 6 | # if a http get request is sent to here, it means the http proxy is 7 | # useful 8 | # 9 | #====================================================================== 10 | import tornado.web 11 | import tornado.ioloop 12 | import tornado.options 13 | 14 | from tornado.options import define,options 15 | define('port',default=7001,help='run on the given port',type=int) 16 | 17 | class Application(tornado.web.Application): 18 | def __init__(self): 19 | handlers=[ 20 | (r'/verify_proxy',verify_proxy), 21 | ] 22 | settings=dict( 23 | debug=False 24 | ) 25 | tornado.web.Application.__init__(self,handlers) 26 | 27 | class verify_proxy(tornado.web.RequestHandler): 28 | def get(self): 29 | self.write('valid proxy') 30 | 31 | if __name__=='__main__': 32 | tornado.options.parse_command_line() 33 | Application().listen(options.port) 34 | tornado.ioloop.IOLoop.instance().start() -------------------------------------------------------------------------------- /test/print_test.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class printer(): 4 | def gen_timestr(self): 5 | tstr = time.strftime('%Y/%m/%d %H:%M:%S',time.localtime(time.time())) 6 | return tstr 7 | 8 | def gen_center_str(self, content, len=42, frame="|||"): 9 | if type(content)==str: 10 | content = content.split("\n") 11 | # content = [content] 12 | ret = "" 13 | for s in content: 14 | left = len-frame.__len__()*2-s.__len__() 15 | margin_left = left>>1 16 | margin_right = left-margin_left 17 | line = "{fr}{ml}{s}{mr}{fr}".format( 18 | ml = " "*margin_left, 19 | s = s, 20 | mr = " "*margin_right, 21 | fr = frame 22 | ) 23 | ret += line+'\n' 24 | return ret 25 | 26 | def gen_block(self, content, len=42, frame="|||"): 27 | ret = "="*len + '\n' 28 | ret += self.gen_center_str(content,len,frame=frame) 29 | ret += "="*len + '\n' 30 | return ret 31 | 32 | 33 | def gen_block_with_time(self, content, len=42, frame="|||"): 34 | ret = 
"="*len+'\n' 35 | time_s = self.gen_timestr() 36 | timeline = "TIME: "+time_s 37 | ret += self.gen_center_str(timeline,len,frame=frame) 38 | return ret+self.gen_block(content,len,frame=frame) 39 | 40 | p = printer() 41 | print(p.gen_block_with_time("TIME TO GO\nhehehehe")) 42 | -------------------------------------------------------------------------------- /client_config.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | 4 | #======================================================== 5 | # 6 | # This file contains the option info for client. 7 | # 8 | #======================================================== 9 | import key_config 10 | 11 | # 全局参数 global config info 12 | PROCESS_NUM =2 #进程数目 number of process 13 | THREAD_NUM =100 #每个进程最多线程 max thread num per process 14 | NOMAL_INFO_PRINT =False #普通信息显示 if print normal information 15 | KEY_INFO_PRINT =True #关键信息显示 if print key information 16 | DEBUG_INFO_PRINT =True # 调试信息显示 17 | NORMAL_INFO_LOG =True #普通信息日志 if output normal info to log 18 | KEY_INFO_LOG =True #错误信息日志 if output key info to log 19 | LOG_POS ='log\\' #日志存放点 the address of log 20 | DATA_POS ='temp\\' #临时数据存放点 21 | UUID =4 #客户端的型号 或者说id 22 | #代理相关 about proxy 23 | USE_PROXY =True #是否使用代理 if use proxy 24 | PROXY_POOL_SIZE =THREAD_NUM*2 #每个进程维持的代理池的大小 25 | CURRENT_YEAR =2016 26 | LARGEST_TRY_TIMES =3 # 获取页面或解析失败以后,重新尝试的次数 27 | 28 | ####-------------------------------------#### 29 | 30 | SERVER_URL = key_config.SERVER_URL #服务器地址,端口号 31 | DATA_SERVER_URL = key_config.DATA_SERVER_URL #数据服务器地址,端口号 32 | 33 | 34 | -------------------------------------------------------------------------------- /init_redis.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | from DB_Interface import MySQL_Interface 4 | import redis 5 | 6 | class SimpleHash(): 7 | def __init__(self,cap,seed): 8 | self.cap=cap 9 | self.seed=seed 10 | def hash(self,value): 11 | ret=0 12 | for i in range(value.__len__()): 13 | ret+=self.seed*ret+ord(value[i]) 14 | return ((self.cap-1) & ret) 15 | 16 | class BloomFilter(): 17 | def __init__(self): 18 | self.bit_size=1<<15 19 | self.seeds=[5,7,11,13,31,37,61] 20 | self.r=redis.StrictRedis(host='127.0.0.1',port=6379,db=0) 21 | self.hashFunc=[] 22 | for i in range(self.seeds.__len__()): 23 | self.hashFunc.append(SimpleHash(self.bit_size,self.seeds[i])) 24 | 25 | def isContains(self,str_input,name): 26 | if str_input==None: 27 | return False 28 | if str_input.__len__()==0: 29 | return False 30 | ret=True 31 | for f in self.hashFunc: 32 | loc=f.hash(str_input) 33 | ret=ret & self.r.getbit(name,loc) 34 | return ret 35 | 36 | def insert(self,str_input,name): 37 | for f in self.hashFunc: 38 | loc=f.hash(str_input) 39 | self.r.setbit(name,loc,1) 40 | 41 | dbi=MySQL_Interface(dbname='microblog_spider') 42 | r=redis.StrictRedis(host='127.0.0.1',port=6379,db=0) 43 | query='select uid from user_info_table ;' 44 | uid=dbi.select_asQuery(query) 45 | uid=[x[0] for x in uid] 46 | bf=BloomFilter() 47 | for id in uid: 48 | bf.insert(id,'user_info_table') -------------------------------------------------------------------------------- /test/asy_test.py: -------------------------------------------------------------------------------- 1 | 2 | import asyncio 3 | import aiohttp 4 | import threading 5 | 6 | @asyncio.coroutine 7 | async def asy_test(url, id, proxy_used=0): 8 | try: 9 | res = await singleConn(url,id) 10 | print(res) 11 | return res 12 | except 
Exception as e: 13 | if proxy_used<3: 14 | print('{i} current proxy invalid ,switch another'.format(i=id)) 15 | return await asy_test(url, id, proxy_used+1) 16 | else: 17 | # raise RuntimeError('2333333333333') 18 | print('2333333333333') 19 | 20 | @asyncio.coroutine 21 | async def singleConn(url,id,times=0): 22 | headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) ' 23 | 'AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile' 24 | '/12A4345d Safari/600.1.4'} 25 | async with aiohttp.ClientSession() as session : 26 | try: 27 | with aiohttp.Timeout(3): 28 | async with session.get(url,headers=headers) as resp: 29 | content = await resp.read() 30 | content = content.decode('utf8') 31 | print(content) 32 | return content 33 | except Exception as e: 34 | print(e) 35 | print('{i} try again {x}'.format(x=times,i=id)) 36 | if times<3: 37 | return await singleConn(url,id,times+1) 38 | else: 39 | raise RuntimeError('{i} This proxy cannot connect , swith another'.format(i=id)) 40 | 41 | 42 | url = 'http://127.0.0.1:8080/test' 43 | tasks = [singleConn(url,i) for i in range(5)] 44 | loop = asyncio.get_event_loop() 45 | loop.run_until_complete(asyncio.wait(tasks)) 46 | loop.close() 47 | 48 | -------------------------------------------------------------------------------- /test_tools/create_index.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | from pymongo import MongoClient 3 | import time 4 | import sys 5 | import File_Interface as FI 6 | import matplotlib.pyplot as plt 7 | import pymongo 8 | 9 | def create_index_asTable(table_list): 10 | client=MongoClient('localhost',27017) 11 | db=client['microblog_spider'] 12 | for t in table_list: 13 | collec = eval('db.{x}'.format(x=t)) 14 | collec.create_index([('user_id',pymongo.DESCENDING)]) 15 | collec.create_index([('id',pymongo.DESCENDING)]) 16 | print('{t} done'.format(t=t)) 17 | 18 | def create_index_all(): 19 | client=MongoClient('localhost',27017) 20 | db=client['microblog_spider'] 21 | res=db.collection_names() 22 | collec_list=[] 23 | for x in res: 24 | if 'user' in x: 25 | collec_list.append(x) 26 | for item in collec_list: 27 | collec=eval('db.{x}'.format(x=item)) 28 | collec.create_index([('user_id',pymongo.DESCENDING)]) 29 | print(item+' is done') 30 | 31 | def auto_index(): 32 | client = MongoClient('localhost',27017) 33 | db = client['microblog_spider'] 34 | collec_list = [] 35 | res=db.collection_names() 36 | for x in res: 37 | if 'user' in x: 38 | collec_list.append(x) 39 | print('****** start to check the index station of collections in mongodb ******') 40 | for name in collec_list: 41 | collec = db.get_collection(name) 42 | indexs = [x for x in collec.list_indexes()] 43 | if indexs.__len__()<3: # 此时没有索引 44 | print('{n} do not have indexes yet, ready to craete'.format(n=name)) 45 | collec.create_index([('user_id',pymongo.DESCENDING)]) 46 | collec.create_index([('id',pymongo.DESCENDING)]) 47 | else: 48 | # print('{n} has 3 indexs, done'.format(n=name)) 49 | pass 50 | print('****** all indexes is created ******') 51 | 52 | auto_index() -------------------------------------------------------------------------------- /server_data.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | #====================================================================== 4 | #----------------import package-------------------------- 5 | # import python package 6 | import tornado.web 7 | import 
tornado.ioloop 8 | import tornado.options 9 | from tornado.options import define,options 10 | from pymongo import MongoClient 11 | 12 | # import from this folder 13 | #====================================================================== 14 | class DataServer(tornado.web.Application): 15 | def __init__(self): 16 | handlers=[ 17 | (r'/history_data',HistoryDataReturn), 18 | (r'/auth',DataAuth) 19 | ] 20 | setting=dict( 21 | debug=True 22 | ) 23 | tornado.web.Application.__init__(self,handlers,**setting) 24 | 25 | class HistoryDataReturn(tornado.web.RequestHandler): 26 | def post(self): 27 | try: 28 | data=eval(self.get_argument('data')) 29 | current_id=int(self.get_argument('current_id')) 30 | total_num=int(self.get_argument('total_num')) 31 | len=int(self.get_argument('len')) 32 | container_id=self.get_argument('container_id') 33 | self.write('success') 34 | self.finish() 35 | 36 | client=MongoClient('localhost',27017) 37 | db=client['microblog_spider'] 38 | collection=db.assemble_factory 39 | 40 | # store sub pack to assemble factory 41 | mongo_data=dict( 42 | data =data, 43 | current_id =current_id, 44 | total_num =total_num, 45 | len =len, 46 | container_id=container_id, 47 | type ='history' 48 | ) 49 | result=collection.insert(mongo_data) 50 | # print('ServerData->HistoryDataReturn: Success to get data from web') 51 | 52 | except Exception as e: 53 | self.write('fail to return user history') 54 | self.finish() 55 | print('Error:server-HistoryReturn:' 56 | 'Unable to get value from http package,Reason:') 57 | print(e) 58 | return 59 | 60 | class DataAuth(tornado.web.RequestHandler): 61 | def get(self): 62 | self.write('connection success') 63 | self.finish() 64 | 65 | if __name__=='__main__': 66 | DataServer().listen(8001) 67 | tornado.ioloop.IOLoop.instance().start() -------------------------------------------------------------------------------- /analysis_tools/read_content.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | import pymongo 4 | from DB_Interface import MySQL_Interface 5 | from pymongo import MongoClient 6 | import File_Interface as FI 7 | 8 | def read_content_in_mongo(table_name,select={},field=[],limit=-1,sort='',sort_type='up'): 9 | client=MongoClient('localhost',27017) 10 | db=client['microblog_spider'] 11 | collection=eval('db.{name}'.format(name=table_name)) 12 | res=None 13 | if limit==-1: 14 | if field.__len__()==0: 15 | if sort=='': 16 | res=[pop_id(x) for x in collection.find(select)] 17 | else: 18 | if sort_type=='up': 19 | res=[pop_id(x) for x in collection.find(select).sort(sort,pymongo.ASCENDING)] 20 | else: 21 | res=[pop_id(x) for x in collection.find(select).sort(sort,pymongo.DESCENDING)] 22 | else: 23 | f={} 24 | for item in field: 25 | f[item]=1 26 | 27 | if sort=='': 28 | res=[pop_id(x) for x in collection.find(select,f)] 29 | else: 30 | if sort_type=='up': 31 | res=[pop_id(x) for x in collection.find(select,f).sort(sort,pymongo.ASCENDING)] 32 | else: 33 | res=[pop_id(x) for x in collection.find(select,f).sort(sort,pymongo.DESCENDING)] 34 | else: 35 | if field.__len__()==0: 36 | if sort=='': 37 | res=[pop_id(x) for x in collection.find(select).limit(limit)] 38 | else: 39 | if sort_type=='up': 40 | res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.ASCENDING)] 41 | else: 42 | res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.DESCENDING)] 43 | else: 44 | f={} 45 | for item in field: 46 | f[item]=1 47 | 48 | if sort=='': 49 | 
res=[pop_id(x) for x in collection.find(select,f).limit(limit)] 50 | else: 51 | if sort_type=='up': 52 | res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.ASCENDING)] 53 | else: 54 | res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.DESCENDING)] 55 | return res 56 | 57 | def pop_id(data): 58 | data.pop('_id') 59 | return data 60 | 61 | res=read_content_in_mongo('latest_history',{'user_id':'1681029540'},['dealed_text.left_content','created_at','user_name'],-1,'id','down') 62 | 63 | data = FI.load_pickle('demo.pkl') 64 | data = data + res 65 | FI.save_pickle(data,'demo.pkl') 66 | for line in res: 67 | print(line) 68 | print(res.__len__()) 69 | 70 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ========================================================================================================= 2 | INTRODUCTION 3 | 4 | This is a distributed Sina Microblog spider which can be used to collect the social network of users 5 | and the microblogs posted by users. Besides, it can update the latest information every day. 6 | The spider is mainly divided into two parts, the server and the client. The server is responsible 7 | for arranging tasks, collecting data, maintaining the proxy pool and managing databases, while the 8 | client is responsible for collecting data from the website and sending it to the server. 9 | 10 | ========================================================================================================= 11 | DEPENDENCY 12 | 13 | Language : Python 3.5 14 | 15 | External Python Lib : tornado 16 | pymongo 17 | redis 18 | pymysql 19 | 20 | Database : MySQL 21 | Redis 22 | MongoDB 23 | 24 | Others : nginx 25 | 26 | URL : python : https://www.python.org/ 27 | MySQL : http://www.mysql.com/downloads/ 28 | MongoDB : https://www.mongodb.org/ 29 | Redis : http://redis.io/ 30 | tornado : http://www.tornadoweb.org/en/stable/ 31 | nginx : http://nginx.org/ 32 | 33 | ========================================================================================================= 34 | 35 | The components and details of the project (edition _0.9_) 36 | --------------------------------------------------------------------------------------------------------- 37 | This is a Sina Microblog spider built from a server terminal and a client terminal. 38 | The server terminal mainly deals with 4 tasks. 39 | 40 | 1. Get proxies from an http proxy website. Unfortunately, not every proxy scraped from the website 41 | is valid. In fact, only about one quarter of them are valid. So, it is important to check whether 42 | those proxies are valid. One thread of the server manages a proxy pool. 43 | Raw, unchecked proxies are scraped from the website, and this thread distributes these 44 | raw proxies to several sub threads; each sub thread checks whether its assigned proxies are 45 | valid. If so, those proxies are sent to the valid proxy pool, ready for use. 46 | Besides, a valid proxy may expire after some time, so it is also necessary to re-check whether the 47 | proxies in the proxy pool are still valid. 48 | 49 | 2. The second part is communicating with clients. Clients request proxies and tasks from the 50 | server. When a client finishes the task of getting user info, it returns the user info 51 | to the server. On receiving this user info, the server stores it in a cache database. 52 | 53 | 3. The 3rd part is about receiving the data transferred from clients. 
Sometimes a client will 54 | transfer data that is hundreds of MB in size. While the server is receiving this data, queries 55 | from other clients would be denied. So it is necessary to add dedicated data servers which are used 56 | only to receive data. 57 | 58 | 4. The 4th part is dealing with the data in the cache databases. The attends of a user 59 | are checked to see whether they already exist in user_info_table. If not, the user is inserted into 60 | user_info_table. And no matter whether it exists, the user is deleted from the ready_to_get 61 | table. As for the attends, if they are not contained in user_info_table, they are inserted 62 | into the ready_to_get table. 63 | ========================================================================================================== 64 | 65 | UPDATE HISTORY: 66 | _0.9_ : add update function. Can update microblogs 67 | _0.8_ : start to use multiple data servers and nginx 68 | _0.7_ : the function of putting data into MongoDB is moved from server.py to server_database.py, 69 | which improves the response speed of the main server 70 | _0.6_ : add a data server to avoid the main server breaking down. The weibo data is transferred in 71 | small parts 72 | _0.5_ : add the function of scraping the history of a certain user 73 | _0.4_ : if the average proxy pool size is less than 30, refuse to assign tasks to clients 74 | _0.3_ : drop the attends whose fans num is less than 1000 75 | _0.2_ : add the part that monitors the state of the proxy pool 76 | _0.1_ : the initial edition 77 | 78 | ========================================================================================================== 79 | 80 | Startup procedure: 81 | 82 | 1. Start MySQL 83 | run: mysqld --user=mysql 84 | 2. Start MongoDB 85 | run: mongod --dbpath I:\MongoDB\data 86 | 3. Turn off redis persistence 87 | config set save "" 88 | 4. Write the user info table into redis 89 | 5. Start nginx 90 | 6. Start 花生壳 (Oray dynamic DNS) 91 | 7. Start the server 92 | 8. Connect to the cloud hosts via SSL 93 | 9. Start the clients 94 | 95 | ========================================================================================================== 96 | 97 | Spider cold-restart procedure: 98 | 1. Drop the assemble_factory and formal collections in mongodb 99 | 2. Empty the cache_history table in mysql 100 | truncate table cache_history 101 | 3. Set update_time, latest_blog and isGettingBlog to null in mysql -> user_info_table 102 | update user_info_table set update_time=null,latest_blog=null,isGettingBlog=null -------------------------------------------------------------------------------- /data_transport.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | import math 3 | import threading 4 | import urllib.parse as parse 5 | import urllib.request as request 6 | import time 7 | 8 | # todo: the thread-adjustment part still needs work; when the data is small, reduce the thread count automatically to avoid idle threads 9 | 10 | class upload_list(): 11 | def __init__(self,data,url,setting): 12 | # data : the formation of data should be a list 13 | # url : target url the data should be uploaded 14 | # setting: keys: batch_size 15 | # thread_adjust 16 | # thread_num 17 | self.url=url 18 | self.data_list=data 19 | self.list_len=self.data_list.__len__() 20 | 21 | self.batch_size=1 22 | self.thread_adjust=True 23 | self.thread_num=10 24 | self.setting=setting 25 | self.seting_check() 26 | 27 | self.task_list=[] 28 | self.task_num=0 29 | self.build_task_list() 30 | 31 | def run(self): 32 | stat_ret=[] 33 | if self.thread_adjust: 34 | alive_id=[] 35 | thread_pool=[] 36 | stat_his=[] 37 | 38 | for i in range(self.thread_num): 39 | t=upload_sub(self.task_list,self.url,i,stat_ret,alive_id) 40 | thread_pool.append(t) 41 | for t in thread_pool: 42 | t.start() 43 | 44 
| while self.task_list: 45 | for i in range(self.thread_num): 46 | if not thread_pool[i].is_alive(): 47 | t=upload_sub(self.task_list,self.url,i,stat_ret,alive_id) 48 | thread_pool[i]=t 49 | thread_pool[i].start() 50 | 51 | task_left_num=self.task_list.__len__() 52 | task_done_num=max(self.task_num-task_left_num,0) 53 | show_block=40 54 | task_left_show=min(max(int(show_block*task_left_num/(task_left_num+task_done_num)),0),show_block) 55 | task_done_show=show_block-task_left_show 56 | print(task_done_show*'★'+task_left_show*'☆') 57 | 58 | time.sleep(1) 59 | 60 | while True: 61 | all_dead=True 62 | for t in thread_pool: 63 | if t.is_alive(): 64 | all_dead=False 65 | if all_dead: 66 | break 67 | time.sleep(1) 68 | 69 | 70 | else: 71 | alive_id=[x for x in range(self.thread_num)] 72 | thread_pool=[] 73 | for i in range(self.thread_num): 74 | t=upload_sub(self.task_list,self.url,i,stat_ret,alive_id) 75 | thread_pool.append(t) 76 | for t in thread_pool: 77 | t.start() 78 | 79 | while self.task_list: 80 | for i in range(self.thread_num): 81 | if not thread_pool[i].is_alive(): 82 | t=upload_sub(self.task_list,self.url,i,stat_ret,alive_id) 83 | thread_pool[i]=t 84 | thread_pool[i].start() 85 | 86 | task_left_num=self.task_list.__len__() 87 | task_done_num=max(self.task_num-task_left_num,0) 88 | show_block=40 89 | task_left_show=min(max(int(show_block*task_left_num/(task_left_num+task_done_num)),0),show_block) 90 | task_done_show=show_block-task_left_show 91 | print(task_done_show*'◆'+task_left_show*'―') 92 | 93 | time.sleep(1) 94 | 95 | while True: 96 | all_dead=True 97 | for t in thread_pool: 98 | if t.is_alive(): 99 | all_dead=False 100 | if all_dead: 101 | break 102 | time.sleep(1) 103 | 104 | def seting_check(self): 105 | keys=self.setting.keys() 106 | 107 | if 'batch_size' in keys: 108 | self.batch_size=self.setting['batch_size'] 109 | else: 110 | raise ValueError('Unknown data of batch size') 111 | 112 | if 'thread_adjust' in keys: 113 | self.thread_adjust=self.setting['thread_adjust'] 114 | if not isinstance(self.thread_adjust,bool): 115 | raise ValueError('The type of thread_adjust should be boolean') 116 | else: 117 | self.thread_adjust=True 118 | 119 | if 'thread_num' in keys: 120 | self.thread_num=self.setting['thread_num'] 121 | else: 122 | if not self.thread_adjust: 123 | raise ValueError('thread num should be provided ' 124 | 'if auto adjust is turn off') 125 | 126 | def build_task_list(self): 127 | batch_num=math.ceil(self.list_len/self.batch_size) 128 | for i in range(batch_num): 129 | ori_block=self.data_list[i*self.batch_size:min((i+1)*self.batch_size,self.list_len)] 130 | self.task_list.append(self.pack_block(ori_block,i,batch_num)) 131 | self.task_num=self.task_list.__len__() 132 | self.data_list=None # 注释掉data_list,留下 task_list formation:[{}{}{}] 133 | 134 | def pack_block(self,main_data,pack_id,pack_num): 135 | data={ 136 | 'data':main_data 137 | } 138 | data=parse.urlencode(data).encode('utf8') 139 | return data 140 | 141 | # def judge_tread_num(self): 142 | 143 | class upload_sub(threading.Thread): 144 | def __init__(self,task_list,url,thread_id,stat_ret,alive_id): 145 | threading.Thread.__init__(self) 146 | self.task_list=task_list 147 | self.url=url 148 | self.stat_ret=stat_ret 149 | self.thread_id=thread_id 150 | self.alive_id=alive_id 151 | 152 | def run(self): 153 | while self.task_list: 154 | task=self.task_list.pop(0) 155 | 156 | req=request.Request(self.url,task) 157 | opener=request.build_opener() 158 | 159 | stat={} 160 | stat['id']=self.thread_id 161 | 
stat['start']=time.time() 162 | try: 163 | res=opener.open(req,timeout=20) 164 | res=res.read().decode('utf8') 165 | if 'success' in res: 166 | stat['result']='success' 167 | else: 168 | stat['result']='denied' 169 | except: 170 | self.task_list.append(task) 171 | stat['result']='timeout' 172 | stat['end']=time.time() 173 | stat['gap']=stat['start']-stat['end'] 174 | self.stat_ret.append(stat) 175 | 176 | if self.thread_id not in self.alive_id: 177 | break 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /DB_Interface.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | # import mysql.connector 3 | import pymysql 4 | import traceback 5 | 6 | class MySQL_Interface: 7 | def __init__(self,host='127.0.0.1',user='root',pwd='admin',dbname='microblog_spider'): 8 | self.host=host 9 | self.user=user 10 | self.passwd=pwd 11 | self.db=dbname 12 | try: 13 | self.conn=pymysql.connect( 14 | user=self.user, 15 | password=self.passwd, 16 | host=self.host, 17 | db=self.db, 18 | charset='utf8mb4' 19 | ) 20 | self.cur=self.conn.cursor() 21 | 22 | except Exception as e: 23 | print("ERROR:faile to connect mysql") 24 | print(e) 25 | print(traceback.print_exc()) 26 | 27 | def __del__(self): 28 | try: 29 | self.cur.close() 30 | self.conn.close() 31 | except Exception as e: 32 | print('ERROR:__del__',e) 33 | 34 | def create_table(self,table_name,col_name_list,col_type_list=[]): 35 | """ 36 | #可以有预设值 0 表示 INT; 1表示 float; 2表示 varchar(255) 37 | :param table_name: 38 | :param col_name_list: 39 | :param col_type_list: 40 | :return: 1 create successfully 41 | 0 create fail 42 | """ 43 | if col_type_list==[]: #col_type_list默认为空。如果为空,则默认值为varchar(255) 44 | col_type_list=['varchar(255)']*col_name_list.__len__() 45 | if col_name_list.__len__()!=col_type_list.__len__(): 46 | print('ERROR:列名与列属性长度不一致!') 47 | return -1 48 | q1="create table %s ("%(table_name) 49 | q2="" 50 | for i in range(0,col_name_list.__len__()): 51 | q2=q2+col_name_list[i]+' ' 52 | if col_type_list[i]==0: #可以有预设值 0 表示 int; 1表示 float; 2表示 varchar(100) 53 | q2=q2+'INT,' 54 | elif col_type_list[i]==1: 55 | q2=q2+'FLOAT,' 56 | elif col_type_list[i]==2: 57 | q2=q2+'VARCHAR(255),' 58 | else: 59 | q2=q2+col_type_list[i]+',' 60 | q2=q2[0:q2.__len__()-1] 61 | query=q1+q2+');' 62 | # print(query) 63 | try: 64 | self.cur.execute(query) 65 | self.conn.commit() 66 | return 1 67 | except: 68 | print("ERROR:create_table: 创建表失败") 69 | return -1 70 | 71 | def drop_table(self,table_name): 72 | query="drop table %s ;"%(table_name) 73 | try: 74 | self.cur.execute(query) 75 | self.conn.commit() 76 | except: 77 | print('ERROR: drop table') 78 | 79 | def get_col_name(self,table_name): 80 | query="SHOW COLUMNS FROM %s ;"%(table_name) 81 | try: 82 | self.cur.execute(query) 83 | except: 84 | print('fail to get column info ') 85 | col_name=[x[0] for x in self.cur.fetchall()] 86 | return col_name 87 | 88 | def get_line_num(self,table_name): 89 | query='select count(*) as value from {table_name} ;'.format(table_name=table_name) 90 | num=-1 91 | try: 92 | self.cur.execute(query) 93 | res=self.cur.fetchall() 94 | num=res[0][0] 95 | except Exception as e: 96 | print('fail to get line num ',e) 97 | return num 98 | 99 | 100 | def select_all(self,table_name,code=''): 101 | query="select * from %s ;"%(table_name) 102 | try: 103 | self.cur.execute(query) 104 | except Exception as e: 105 | print('fail to get data from 
%s'%(table_name)) 106 | print(e) 107 | data=[x for x in self.cur.fetchall()] 108 | col_info=self.get_col_name(table_name) 109 | if code=='': 110 | return [data,col_info] 111 | else: 112 | for i in range(data.__len__()): 113 | data[i]=self.list_code_transform(data[i],code) 114 | return [data,col_info] 115 | 116 | def select_asQuery(self,query,code=''): 117 | try: 118 | self.cur.execute(query) 119 | res=[list(x) for x in self.cur.fetchall()] 120 | except Exception as e: 121 | print('fail to execute the query') 122 | print(e) 123 | if code=='': 124 | return res 125 | else: 126 | for i in range(res.__len__()): 127 | res[i]=self.list_code_transform(res[i],code) 128 | return res 129 | 130 | def add_col(self,table_name,new_col_name,new_col_property): 131 | col_info=self.get_col_name(table_name) 132 | if new_col_name in col_info: 133 | print('WARNING:import_data.MSSQL_Interface.add_col: 待插入列已经存在') 134 | else: 135 | query="alter table %s add column %s %s"%(table_name,new_col_name,new_col_property) 136 | try: 137 | self.cur.execute(query) 138 | self.conn.commit() 139 | except Exception as e: 140 | print('fail to add col') 141 | print(e) 142 | 143 | def drop_col(self,table_name,col_name): 144 | col_info=self.get_col_name(table_name) 145 | if col_name not in col_info: 146 | print('WARNING:待删除列不存在!') 147 | else: 148 | query="alter table %s drop column %s;"%(table_name,col_name) 149 | try: 150 | self.cur.execute(query) 151 | self.conn.commit() 152 | except Exception as e: 153 | print('faile to drop column') 154 | print(e) 155 | 156 | def alter_col_property(self,table_name,target_col,target_property): 157 | col_info=self.get_col_name(table_name) 158 | if target_col in col_info: 159 | query="alter table %s modify %s %s"%(table_name,target_col,target_property) 160 | try: 161 | self.cur.execute(query) 162 | self.conn.commit() 163 | except Exception as e: 164 | print('fail to alter column property ', e) 165 | else: 166 | print('target col not exist in table!') 167 | 168 | def update_content(self,table_name,target_col_name,target_col_value,pos_col_name,pos_col_value): 169 | query="update %s set %s=%s where %s=%s ;"%(table_name,target_col_name,target_col_value,pos_col_name,pos_col_value) 170 | try: 171 | self.cur.execute() 172 | self.conn.commit() 173 | except Exception as e: 174 | print('fail to update content ', e) 175 | 176 | def update_asQuery(self,query): 177 | try: 178 | self.cur.execute(query) 179 | self.conn.commit() 180 | except Exception as e: 181 | print(query) 182 | print('Unable to update conent',e) 183 | 184 | # def update_content_asList(self,table_name,param_list): 185 | # query="update %s set %s=%s where %s=%s ;" 186 | # try: 187 | # self.cur.executemany(query,param_list) 188 | # self.conn.commit() 189 | # except Exception as e: 190 | # print(e) 191 | 192 | def insert_asList(self,table_name,data_list,unique=False): 193 | if data_list.__len__()==0: #check the length of data list 194 | print('the length of data is 0') 195 | return -1 196 | if data_list[0]==0: # the the length of columns in data list 197 | print('the length of columns is 0') 198 | return -1 199 | len_set=set([x.__len__() for x in data_list]) 200 | if len_set.__len__()>1: #check if the data list is aligned 201 | print('array is not aligned') 202 | return -1 203 | if isinstance(data_list[0],list): 204 | #check if the data type in [[]..[]] transform to [()..()] 205 | data_list=[tuple(x) for x in data_list] 206 | if unique: 207 | q1="insert ignore into %s values ("%(table_name) 208 | else: 209 | q1="insert into %s values 
("%(table_name) 210 | q2="%s,"*(data_list[0].__len__()-1)+"%s)" 211 | query=q1+q2 212 | try: 213 | self.cur.executemany(query,data_list) 214 | self.conn.commit() 215 | except Exception as e: 216 | print("fail to insert data", e) 217 | 218 | def insert_asQuery(self,query): 219 | try: 220 | self.cur.execute(query) 221 | self.conn.commit() 222 | except Exception as e: 223 | print('fail to insert as query,',e) 224 | 225 | def delete_line(self,table_name,col_name,col_value): 226 | query="delete from {table_name} where {col}=\'{col_value}\'"\ 227 | .format(table_name=table_name,col=col_name,col_value=col_value) 228 | self.cur.execute(query) 229 | self.conn.commit() 230 | 231 | def list_code_transform(self,strlist,codec='gb2312'): 232 | # len=strlist.__len__() 233 | out=[] 234 | for item in strlist: 235 | p=self.code_transform(item,codec) 236 | out.append(p) 237 | return out 238 | 239 | def code_transform(self,strText,codec='gb2312'): 240 | b = bytes((ord(i) for i in strText)) 241 | return b.decode(codec) 242 | 243 | def is_empty(self,table_name): 244 | query='select * from {tname} limit 1 ;'.format(tname=table_name) 245 | res=self.select_asQuery(query) 246 | if res.__len__()==0: 247 | return True 248 | else: 249 | return False 250 | 251 | if __name__=='__main__': 252 | mi=MySQL_Interface() 253 | # data=[('1','2'),('3','4'),('5','6')] 254 | query = 'select * from user_info_table limit 100' 255 | res = mi.select_asQuery(query) 256 | print(res) 257 | -------------------------------------------------------------------------------- /server_proxy.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | """ 3 | NAME: server_proxy.py 4 | PY_VERSION: python3.4 5 | FUNCTION: This part is used to get large number of proxy 6 | from certain http proxy website, then verify if 7 | they are useful. Useful proxy is saved in cache 8 | and provided to client to get info from website 9 | VERSION: _0.2_ 10 | 11 | UPDATE HISTORY: 12 | _0.2_: add the partition to monitor the state of proxy pool 13 | _0.1_: the first edition 14 | """ 15 | #====================================================================== 16 | #----------------import package-------------------------- 17 | # import python package 18 | from multiprocessing import Process 19 | import os 20 | import re,json 21 | import time 22 | import threading 23 | import urllib.request as request 24 | import random 25 | 26 | # import from this folder 27 | from server_config import GET_PROXY_URL,PROXY_POOL_SIZE #about proxy 28 | from server_config import VERIFY_PROXY_THREAD_NUM,MAX_VALID_PROXY_THREAD_NUM 29 | import server_config 30 | import File_Interface as FI 31 | from DB_Interface import MySQL_Interface 32 | #======================================================================= 33 | 34 | class proxy_manager(threading.Thread): 35 | 36 | def __init__(self,proxy_pool,proxy_lock,proxy_pool_size=PROXY_POOL_SIZE): 37 | threading.Thread.__init__(self) 38 | self.proxy_pool=proxy_pool 39 | self.proxy_pool_size=proxy_pool_size 40 | self.proxy_lock=proxy_lock 41 | self.start_up() 42 | 43 | def start_up(self): 44 | """ 45 | function: used to recover info when start up this process 46 | for example, read stored proxy list 47 | """ 48 | pass 49 | #TODO 50 | 51 | def run(self): 52 | """ 53 | function: The main circle of this process. 
54 | Monitor the state of proxy pool 55 | """ 56 | thread_pool=[] 57 | run_value=[int(self.proxy_pool_size/1),int(self.proxy_pool_size/2)] 58 | MAX_VALID_PROXY_THREAD_NUM=2 # maximum num of thread of find valid proxy 59 | for i in range(MAX_VALID_PROXY_THREAD_NUM): # initialize of thread pool 60 | temp_t=find_valid_proxy(self.proxy_pool,self.proxy_lock) 61 | thread_pool.append(temp_t) 62 | 63 | if run_value.__len__()!=MAX_VALID_PROXY_THREAD_NUM: # check data formation 64 | raise ValueError('the length of run_value is not equal to ' 65 | 'MAX_VALID_PROXY_THREAD_NUM') 66 | 67 | maintain_proxy_thread=keep_proxy_valid(self.proxy_pool) 68 | maintain_proxy_thread.start() 69 | 70 | state_persistance_thread=state_persistance(self.proxy_pool) 71 | state_persistance_thread.start() 72 | 73 | while (True): 74 | time.sleep(0.1) 75 | for i in range(thread_pool.__len__()): 76 | if not thread_pool[i].is_alive(): 77 | if self.proxy_pool.size()<=run_value[i]: 78 | thread_pool[i]=find_valid_proxy(self.proxy_pool,self.proxy_lock) 79 | thread_pool[i].start() 80 | if not maintain_proxy_thread.is_alive(): 81 | maintain_proxy_thread=keep_proxy_valid(self.proxy_pool) 82 | maintain_proxy_thread.start() 83 | if not state_persistance_thread.is_alive(): 84 | state_persistance_thread=state_persistance(self.proxy_pool) 85 | state_persistance_thread.start() 86 | 87 | class state_persistance(threading.Thread): 88 | """ 89 | function: monitor and note the state of proxy pool,including the current 90 | size of proxy pool, the input speed of new proxy , and the output speed. 91 | and manage the average size oj of proxy_pool class 92 | """ 93 | def __init__(self,proxy_pool): 94 | threading.Thread.__init__(self) 95 | self.proxy_pool=proxy_pool 96 | self.dbi=MySQL_Interface() 97 | 98 | def run(self): 99 | while True: 100 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 101 | current_size=self.proxy_pool.size() 102 | [input,output]=self.proxy_pool.update_proxy_state() 103 | insert_value=[[current_size,time_stick,input,output]] 104 | self.dbi.insert_asList('proxy_table',insert_value,unique=True) 105 | time.sleep(server_config.PROXY_MONITOR_GAP) 106 | 107 | class find_valid_proxy(threading.Thread): 108 | """ 109 | function: Get raw proxy list,check them ,and find valide proxy list 110 | """ 111 | def __init__(self,proxy_pool,proxy_lock): 112 | threading.Thread.__init__(self) 113 | self.proxy_pool=proxy_pool #proxy pool 114 | self.proxy_lock=proxy_lock 115 | self.raw_proxy=[] 116 | self.raw_proxy_lock=threading.Lock() 117 | 118 | def run(self): 119 | self.get_raw_proxy() 120 | self.threads=[] 121 | for i in range(VERIFY_PROXY_THREAD_NUM): 122 | t=check_proxy(self.raw_proxy,self.proxy_pool,self.raw_proxy_lock,self.proxy_lock) 123 | self.threads.append(t) 124 | for t in self.threads: 125 | t.start() 126 | 127 | def get_raw_proxy(self): 128 | RAW_PROXY_RATIO=5 # the ratio of raw and valid proxy 129 | current_proxy_num=self.proxy_pool.size() 130 | fetch_size=max(0,PROXY_POOL_SIZE-current_proxy_num)*RAW_PROXY_RATIO+1 131 | url=GET_PROXY_URL.format(NUM=fetch_size) 132 | try: 133 | time.sleep(random.randint(2,2*MAX_VALID_PROXY_THREAD_NUM)) 134 | res=request.urlopen(url) 135 | res=res.read() 136 | res=str(res,encoding='utf-8') 137 | self.raw_proxy=res.split('\r\n') 138 | if self.raw_proxy.__len__() get_raw_proxy: ',e) 151 | # if can't get proxy ,sleep for 1 sec , then try again 152 | try: 153 | time.sleep(random.randint(2,2*MAX_VALID_PROXY_THREAD_NUM)) 154 | res=request.urlopen(url).read() 155 | 
res=str(res,encoding='utf-8') 156 | self.raw_proxy=res.split('\r\n') 157 | if self.raw_proxy.__len__() get_raw_proxy: ' 159 | 'the proxy num got from web is not enough \n ' 160 | 'the wanted size is {want_size}, the gotten size is {gotten_size}' 161 | .format(want_size=fetch_size,gotten_size=str(self.raw_proxy.__len__()))) 162 | except Exception as e: 163 | print('error: find_valid_proxy -> get_raw_proxy: ',e) 164 | # raise IOError('Unable to get raw proxy from website') 165 | print('Unable to get raw proxy from website') 166 | 167 | class check_proxy(threading.Thread): 168 | def __init__(self,raw_proxy,proxy_pool,raw_proxy_lock,proxy_lock): 169 | threading.Thread.__init__(self) 170 | self.raw_proxy=raw_proxy 171 | self.proxy_pool=proxy_pool 172 | self.raw_proxy_lock=raw_proxy_lock 173 | self.proxy_lock=proxy_lock 174 | def run(self): 175 | while(True): 176 | if not self.raw_proxy: # if raw_proxy is empty ,end this threading 177 | break 178 | self.raw_proxy_lock.acquire() 179 | try: 180 | current_raw_proxy=self.raw_proxy.pop(0) 181 | except: 182 | break 183 | self.raw_proxy_lock.release() 184 | 185 | handler=request.ProxyHandler({'http':'http://%s'%(current_raw_proxy)}) 186 | self.opener=request.build_opener(handler) 187 | testurl='http://m.weibo.cn/page/tpl?containerid=1005051221171697_-_FOLLOWERS&page=3' 188 | t1=time.time() 189 | try: 190 | page=self.getData(testurl,timeout=10) 191 | page=re.findall(r'"card_group":.+?]}]',page)[0] 192 | page='{'+page[:page.__len__()-1] 193 | page=json.loads(page) 194 | temp_list=[self.card_group_item_parse(x) for x in page['card_group']] 195 | usetime=time.time()-t1 196 | self.proxy_lock.acquire() 197 | self.proxy_pool.add([[current_raw_proxy,usetime]]) 198 | self.proxy_lock.release() 199 | except Exception as e: 200 | pass 201 | 202 | def getData(self,url,timeout=10): 203 | headers= {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) ' 204 | 'AppleWebKit/600.1.3 (KHTML, like Gecko) ' 205 | 'Version/8.0 Mobile/12A4345d Safari/600.1.4'} 206 | req=request.Request(url,headers=headers) 207 | result=self.opener.open(req,timeout=timeout) 208 | return result.read().decode('utf-8') 209 | def card_group_item_parse(self,sub_block): 210 | """ 211 | :param user_block : json type 212 | :return: user : dict type 213 | """ 214 | user_block=sub_block['user'] 215 | user_block_keys=user_block.keys() 216 | user={} 217 | 218 | if 'profile_url' in user_block_keys: 219 | user['basic_page']=user_block['profile_url'] 220 | 221 | if 'screen_name' in user_block_keys: 222 | user['name']=user_block['screen_name'] 223 | 224 | if 'desc2' in user_block_keys: 225 | user['recent_update_time']=user_block['desc2'] 226 | 227 | if 'desc1' in user_block_keys: 228 | user['recent_update_content']=user_block['desc1'] 229 | 230 | if 'gender' in user_block_keys: 231 | user['gender']=('male' if user_block['gender']=='m' else 'female') 232 | 233 | if 'verified_reason' in user_block_keys: 234 | user['verified_reason']=user_block['verified_reason'] 235 | 236 | if 'profile_image_url' in user_block_keys: 237 | user['profile']=user_block['profile_image_url'] 238 | 239 | if 'statuses_count' in user_block_keys: 240 | temp=user_block['statuses_count'] 241 | if isinstance(temp,str): 242 | temp=int(temp.replace('万','0000')) 243 | user['blog_num']=temp 244 | 245 | if 'description' in user_block_keys: 246 | user['description']=user_block['description'] 247 | 248 | if 'follow_me' in user_block_keys: 249 | user['follow_me']=user_block['follow_me'] 250 | 251 | if 'id' in user_block_keys: 252 | 
user['uid']=user_block['id'] 253 | 254 | if 'fansNum' in user_block_keys: 255 | temp=user_block['fansNum'] 256 | if isinstance(temp,str): 257 | temp=int(temp.replace('万','0000')) 258 | user['fans_num']=temp 259 | 260 | return user 261 | 262 | class proxy_pool(): 263 | """ 264 | Core Data: proxy_pool ,formation as [[],[],[]] 265 | Method: get(num) 266 | add(data) 267 | """ 268 | def __init__(self): 269 | self.proxy=[] 270 | 271 | self.ave_proxy_size=0 # used to monitor the state of proxy pool 272 | self.proxy_size_list=[] 273 | self.input_speed=0 274 | self.output_speed=0 275 | 276 | def add(self,data): 277 | """ 278 | Data Formation: each item be formation of list[[],[],...,[]] 279 | [[ip:port(str),timedelay(float)],[ip:port(str),timedelay(float)]] 280 | and so on 281 | """ 282 | self.proxy=data+self.proxy 283 | if isinstance(data,list) and data.__len__()>0: 284 | self.input_speed+=data.__len__() 285 | 286 | def insert(self,single_data): 287 | """ 288 | Data Formation: each item be formation of 289 | [ip:port(str),timedelay(float)] and so on 290 | """ 291 | self.proxy.insert(0,single_data) 292 | self.input_speed+=1 293 | 294 | def sort(self): # sort according to the timedelay 295 | pass 296 | #TODO 297 | 298 | def empty(self): #清空proxy列表 299 | self.output_speed+=self.proxy.__len__() 300 | self.proxy=[] 301 | 302 | def get(self,num): # return [[]...[]] 303 | if self.proxy.__len__()==0: 304 | return [] 305 | if self.proxy.__len__()server_config.PROXY_SIZE_STATE_LIST_LEN: 327 | self.proxy_size_list.pop() 328 | self.proxy_size_list.insert(0,self.proxy.__len__()) 329 | else: 330 | self.proxy_size_list.insert(0,self.proxy.__len__()) 331 | self.ave_proxy_size=int(sum(self.proxy_size_list)/self.proxy_size_list.__len__()) 332 | 333 | a=self.input_speed # and reset these two values as 0 334 | b=self.output_speed 335 | self.input_speed=0 336 | self.output_speed=0 337 | return [a,b] 338 | 339 | class keep_proxy_valid(threading.Thread): 340 | def __init__(self,proxy_pool): 341 | threading.Thread.__init__(self) 342 | self.proxy_pool=proxy_pool 343 | 344 | def run(self): 345 | while True: 346 | if self.proxy_pool.size()==0: 347 | time.sleep(0.5) 348 | continue 349 | try: 350 | c_proxy=self.proxy_pool.pop()[0] 351 | except: 352 | time.sleep(0.5) 353 | continue 354 | # url='http://m.sina.cn/' 355 | url='http://m.weibo.cn/page/tpl?containerid=1005051221171697_-_FOLLOWERS&page=3' 356 | handler=request.ProxyHandler({'http':'http://%s'%(c_proxy)}) 357 | t_start=time.time() 358 | try: 359 | page=self.getData(url,handler,timeout=5) 360 | page=re.findall(r'"card_group":.+?]}]',page)[0] 361 | page='{'+page[:page.__len__()-1] 362 | page=json.loads(page) 363 | temp_list=[self.card_group_item_parse(x) for x in page['card_group']] 364 | usetime=time.time()-t_start 365 | self.proxy_pool.insert([c_proxy,usetime]) 366 | # print('proxy {proxy} is valid, insert it'.format(proxy=c_proxy)) 367 | except Exception as e: 368 | pass 369 | # print(e) 370 | # print('proxy {proxy} is invalid ,drop it'.format(proxy=c_proxy)) 371 | 372 | def getData(self,url,handler,timeout=10): 373 | headers= {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) ' 374 | 'AppleWebKit/600.1.3 (KHTML, like Gecko) ' 375 | 'Version/8.0 Mobile/12A4345d Safari/600.1.4'} 376 | req=request.Request(url,headers=headers) 377 | opener=request.build_opener(handler) 378 | page=opener.open(req,timeout=timeout) 379 | return page.read().decode('utf-8') 380 | 381 | def card_group_item_parse(self,sub_block): 382 | """ 383 | :param user_block : json type 
384 | :return: user : dict type 385 | """ 386 | user_block=sub_block['user'] 387 | user_block_keys=user_block.keys() 388 | user={} 389 | 390 | if 'profile_url' in user_block_keys: 391 | user['basic_page']=user_block['profile_url'] 392 | 393 | if 'screen_name' in user_block_keys: 394 | user['name']=user_block['screen_name'] 395 | 396 | if 'desc2' in user_block_keys: 397 | user['recent_update_time']=user_block['desc2'] 398 | 399 | if 'desc1' in user_block_keys: 400 | user['recent_update_content']=user_block['desc1'] 401 | 402 | if 'gender' in user_block_keys: 403 | user['gender']=('male' if user_block['gender']=='m' else 'female') 404 | 405 | if 'verified_reason' in user_block_keys: 406 | user['verified_reason']=user_block['verified_reason'] 407 | 408 | if 'profile_image_url' in user_block_keys: 409 | user['profile']=user_block['profile_image_url'] 410 | 411 | if 'statuses_count' in user_block_keys: 412 | temp=user_block['statuses_count'] 413 | if isinstance(temp,str): 414 | temp=int(temp.replace('万','0000')) 415 | user['blog_num']=temp 416 | 417 | if 'description' in user_block_keys: 418 | user['description']=user_block['description'] 419 | 420 | if 'follow_me' in user_block_keys: 421 | user['follow_me']=user_block['follow_me'] 422 | 423 | if 'id' in user_block_keys: 424 | user['uid']=user_block['id'] 425 | 426 | if 'fansNum' in user_block_keys: 427 | temp=user_block['fansNum'] 428 | if isinstance(temp,str): 429 | temp=int(temp.replace('万','0000')) 430 | user['fans_num']=temp 431 | 432 | return user 433 | 434 | def proxy_info_print(str_info,type='NORMAL'): # decide if normal of key infomation should be print 435 | from server_config import PROXY_NORMAL_INFO_PRINT 436 | if type=='NORMAL': 437 | if PROXY_NORMAL_INFO_PRINT: 438 | print(str_info) 439 | 440 | if __name__=='__main__': 441 | proxy_lock=threading.Lock() 442 | proxy=proxy_pool() 443 | t=proxy_manager(proxy,proxy_lock) 444 | t.start() 445 | while True: 446 | time.sleep(0.1) 447 | print(proxy.size()) 448 | 449 | 450 | 451 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | """ 3 | NAME: server.py 4 | PY_VERSION: python3.4 5 | FUNCTION: 6 | This server part of distrubuted microblog spider. 7 | The function of server can be divided into 3 parts. 8 | 1. proxy manager. Scratch web page in high speed need a lot of http proxy ip. 9 | Server should maintain a proxy pool which should provide proxy to client. 10 | 2. task manager. Client will require task from server. task list should fetched 11 | from sqlserver and stored in memory 12 | 3. store return info. When client finished searching user information from sina, 13 | client will return this info to server. if the length of data is too lang, client 14 | will seperate it into several parts and send then individually. Server should combine 15 | these data package together. 16 | Besides, server should check whether the received user is already exist in database. 17 | Server has to assure that no repeating data exists in database. It a heavy task for 18 | server to connect with databases. 
19 | 20 | VERSION: 21 | _0.5_ 22 | """ 23 | #====================================================================== 24 | #----------------import package-------------------------- 25 | # import python package 26 | import threading 27 | import time 28 | import sys,os 29 | from random import Random 30 | from pprint import pprint 31 | 32 | # import from outer package 33 | from pymongo import MongoClient 34 | import pymongo 35 | 36 | import tornado.web 37 | import tornado.ioloop 38 | import tornado.options 39 | from tornado.options import define,options 40 | 41 | # import from this folder 42 | from server_proxy import proxy_pool,proxy_manager 43 | import server_config as config 44 | from server_database import DB_manager,deal_cache_user_info,deal_cache_attends 45 | import File_Interface as FI 46 | from DB_Interface import MySQL_Interface 47 | from server_data import DataServer 48 | #======================================================================= 49 | define('port',default=8000,help='run on the given port',type=int) 50 | 51 | class Application(tornado.web.Application): 52 | def __init__(self): 53 | handlers=[ 54 | (r'/auth',AuthHandler), 55 | (r'/proxy/',ProxyHandler), 56 | (r'/task/',TaskHandler), 57 | (r'/proxy_size',ProxySize), 58 | (r'/proxy_empty',ProxyEmpty), 59 | (r'/proxy_return',ProxyReturn), 60 | (r'/info_return',InfoReturn), 61 | (r'/history_report',HistoryReport), 62 | (r'/update_report',UpdateReport) 63 | ] 64 | settings=dict( 65 | debug=True 66 | ) 67 | tornado.web.Application.__init__(self,handlers,**settings) 68 | 69 | class AuthHandler(tornado.web.RequestHandler): 70 | def get(self): 71 | self.write('connection valid') 72 | self.finish() 73 | 74 | class ProxyHandler(tornado.web.RequestHandler): 75 | def get(self): 76 | global proxy 77 | num=int(self.get_argument('num')) 78 | if num>proxy.size(): 79 | self.write('no valid proxy') 80 | self.finish() 81 | else: 82 | proxy_list=proxy.get(num) 83 | try: 84 | proxy_list=['{url},{timedelay};'.format(url=x[0],timedelay=x[1]) for x in proxy_list] 85 | except Exception as e: 86 | self.write('no valid proxy') 87 | self.finish() 88 | print('ERROR:server->ProxyHandler:') 89 | print(e) 90 | return 91 | res='' 92 | for i in proxy_list: res+=i 93 | res=res[0:-1] # 'url,timedelay;url,timedelay;...,' 94 | self.write(res) 95 | self.finish() 96 | 97 | class TaskHandler(tornado.web.RequestHandler): 98 | 99 | def get(self): 100 | global proxy 101 | uuid=str(self.get_argument('uuid')) 102 | task_id=self.task_assign(uuid) 103 | 104 | if proxy.get_ave_proxy_size()<30: # check the size of current proxy size 105 | self.write('no task') 106 | self.finish() 107 | return 108 | 109 | if task_id==-1: # checi if this uuid is valid 110 | self.write('no task') 111 | self.finish() 112 | return 113 | 114 | if task_id==1: # get the social web of certain user 115 | dbi=MySQL_Interface() 116 | query='select * from ready_to_get where is_fetching is null order by fans_num desc limit 1;' 117 | res=dbi.select_asQuery(query) 118 | if res.__len__()==0: 119 | self.write('no task') 120 | self.finish() 121 | return 122 | res=res[0] 123 | col_info=dbi.get_col_name('ready_to_get') 124 | uid=res[col_info.index('uid')] 125 | 126 | self.write('{uid},connect'.format(uid=uid)) 127 | self.finish() 128 | 129 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 130 | query="update ready_to_get set is_fetching=\'{t_time}\' where uid={uid} ;"\ 131 | .format(t_time=time_stick,uid=uid) 132 | dbi.update_asQuery(query) 133 | 134 | 135 | if task_id==2: # get the 
history microblog of a certain user 136 | dbi=MySQL_Interface() 137 | query='select container_id,blog_num from user_info_table ' \ 138 | 'where (isGettingBlog is null and update_time is null and blog_num<{valve} and blog_num>100)' \ 139 | 'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE) 140 | # query='select container_id,blog_num from user_info_table ' \ 141 | # 'order by rand() limit 1 ;' 142 | res=dbi.select_asQuery(query) 143 | if res.__len__()==0: 144 | self.write('no task') 145 | self.finish() 146 | return 147 | [container_id,blog_num]=res[0] 148 | self.write('{c_id};{blog},history' 149 | .format(c_id=container_id,blog=blog_num)) 150 | self.finish() 151 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 152 | query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;"\ 153 | .format(t_time=time_stick,cid=container_id) 154 | dbi.update_asQuery(query) 155 | 156 | if task_id==3: # get the history microblog of a certain user 157 | dbi=MySQL_Interface() 158 | query='select container_id,blog_num from user_info_table ' \ 159 | 'where (isGettingBlog is null and update_time is null and blog_num>={valve} and blog_num>100)' \ 160 | 'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE) 161 | # query='select container_id,blog_num from user_info_table ' \ 162 | # 'order by rand() limit 1 ;' 163 | [container_id,blog_num]=dbi.select_asQuery(query)[0] 164 | self.write('{c_id};{blog},history' 165 | .format(c_id=container_id,blog=blog_num)) 166 | self.finish() 167 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 168 | query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;" \ 169 | .format(t_time=time_stick,cid=container_id) 170 | dbi.update_asQuery(query) 171 | 172 | if task_id==4 or task_id==5 or task_id==100: # this part is in test 173 | dbi=MySQL_Interface() 174 | current_time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 175 | target_time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()-60*60*24*1)) #提早5天 176 | if task_id==4: 177 | batch_size = 100 178 | elif task_id==5: 179 | batch_size = 200 180 | else: 181 | batch_size = 10 182 | query='select container_id,update_time,latest_blog from user_info_table ' \ 183 | 'where update_time<\'{target_time}\' and isGettingBlog is null and blog_num>10 order by fans_num desc limit {batch}' \ 184 | .format(target_time=target_time_stick,batch=batch_size) 185 | print(query) 186 | res=dbi.select_asQuery(query) 187 | 188 | # 将从mysql中取得的用户列表加上必要的变量以后发送给客户端 189 | res=[[line[0],int(time.mktime(line[1].timetuple())),int(time.mktime(line[2].timetuple()))] for line in res] 190 | res_cp=res 191 | 192 | if res_cp.__len__()==0: # if no task ,then return "no task" 193 | print('*** warning: no avaliable update mission ***') 194 | self.write('no task') 195 | self.finish() 196 | return 197 | 198 | # print('debug from task handler') 199 | # pprint(res_cp) 200 | res=[line[0]+'-'+str(line[1])+'-'+str(line[2]) for line in res] 201 | inn='' 202 | for item in res: 203 | inn+=item+';' 204 | inn=inn[0:-1] 205 | # uid-stamp;uid-timestamp;...;,update (the formation of order) 206 | mission_id=random_str(15) 207 | commend='{list};{task_id},update'.format(list=inn,task_id=mission_id) 208 | # 传送给客户端的指令格式: ContainerId-UpdateTime-LatestBlog;...;...;...,update 209 | self.write(commend) 210 | self.finish() 211 | 212 | # 将用户列表,任务id,以及任务开始时间存入mongodb 213 | 
u_list=[dict(container_id=x[0],update_time=x[1],latest_blog=x[2]) for x in res_cp] 214 | data_toMongo=dict( 215 | mission_id = mission_id, 216 | user_list = u_list, 217 | mission_start= int(time.time()) 218 | ) 219 | client=MongoClient('localhost',27017) 220 | db=client['microblog_spider'] 221 | collec=db.update_mission 222 | collec.insert(data_toMongo) 223 | 224 | # 将相关内容从mysql中设置isGettingBlog 225 | user_list_str='' 226 | for line in res_cp: 227 | user_list_str+='\'{cid}\','.format(cid=line[0]) 228 | user_list_str=user_list_str[:-1] 229 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 230 | query='update user_info_table set isGettingBlog=\'{time}\' where container_id in ({ulist})'\ 231 | .format(time=time_stick,ulist=user_list_str) 232 | dbi.update_asQuery(query) 233 | 234 | def task_assign(self,uuid): 235 | t_1=['1'] # get social web 236 | t_2=['2'] # get history weibo , get counter which blog_num<=15000 ,connect with server through www 237 | t_3=['3'] # get history weibo , get counter which blog_num>15000 ,connect with server through localhost 238 | t_4=['4'] # update weibo 239 | t_5=['5'] 240 | t_6=['100'] 241 | if uuid in t_1: 242 | return 1 243 | elif uuid in t_2: 244 | return 2 245 | elif uuid in t_3: 246 | return 3 247 | elif uuid in t_4: 248 | return 4 249 | elif uuid in t_5: 250 | return 5 251 | elif uuid in t_6: 252 | return 100 253 | else: 254 | return -1 255 | 256 | class ProxySize(tornado.web.RequestHandler): 257 | global proxy 258 | def get(self): 259 | self.write(str(proxy.size())) 260 | self.finish() 261 | 262 | class ProxyEmpty(tornado.web.RequestHandler): 263 | global proxy 264 | def get(self): 265 | proxy.empty() 266 | if proxy.size()<2: 267 | self.write('empty proxy success') 268 | self.finish() 269 | 270 | class ProxyReturn(tornado.web.RequestHandler): 271 | def post(self): 272 | global proxy 273 | data=self.get_argument('data') 274 | print('proxy data:',data) 275 | proxy_list=data.split(';') 276 | in_data=[x.split(',') for x in proxy_list] 277 | if in_data.__len__()>0: 278 | proxy.add(in_data) 279 | print('Success to receive returned proxy') 280 | for i in in_data: 281 | print(i) 282 | self.write('return success') 283 | self.finish() 284 | 285 | class InfoReturn(tornado.web.RequestHandler): 286 | def post(self): 287 | 288 | try: 289 | user_basic_info=self.get_argument('user_basic_info') 290 | attends=self.get_argument('user_attends') 291 | user_basic_info=eval(user_basic_info) 292 | attends=eval(attends) 293 | self.write('success to return user info') 294 | self.finish() 295 | except: 296 | self.write('fail to return user info') 297 | self.finish() 298 | return 299 | 300 | try: 301 | dbi=MySQL_Interface() 302 | except: 303 | print('unable to connect to MySql DB') 304 | 305 | try: 306 | if attends.__len__()>0: #store attends info 307 | table_name='cache_attends' 308 | attends_col_info=dbi.get_col_name(table_name) 309 | keys=attends[0].keys() 310 | attends= [[line[i] if i in keys else '' for i in attends_col_info] for line in attends] 311 | fans_col_pos=attends_col_info.index('fans_num') 312 | insert_attends=[] 313 | for line in attends: 314 | if line[fans_col_pos]>1000: 315 | insert_attends.append(line) 316 | dbi.insert_asList(table_name,insert_attends,unique=True) 317 | print('Success : attends of {uid} is stored in {tname}' 318 | .format(uid=user_basic_info['uid'],tname=table_name)) 319 | else: 320 | pass 321 | except Exception as e: 322 | print(e) 323 | path="temp"+os.sep+"{uid}_attends.pkl".format(uid=user_basic_info['uid']) 324 | 
print('unable to store attends of {uid}, it will be stored ' 325 | .format(uid=user_basic_info['uid'])) 326 | FI.save_pickle(attends,path) 327 | 328 | try: 329 | atten_num_real=user_basic_info['attends_num'] 330 | atten_num_get=attends.__len__() 331 | user_basic_info['accuracy']=atten_num_get # 实际获取到的关注数目 332 | col_info=dbi.get_col_name('cache_user_info') # store user basic info 333 | keys=user_basic_info.keys() 334 | data=[user_basic_info[i] if i in keys else '' for i in col_info] 335 | dbi.insert_asList('cache_user_info',[data],unique=True) 336 | print('Success : basic info of {uid} is stored in cache_user_info' 337 | .format(uid=user_basic_info['uid'])) 338 | except Exception as e: 339 | print(e) 340 | path='temp'+os.sep+'{uid}_basic_info.pkl'.format(uid=user_basic_info['uid']) 341 | print('unable to store basic info of {uid} , it will be stored' 342 | .format(uid=user_basic_info['uid'])) 343 | FI.save_pickle(user_basic_info,path) 344 | 345 | try: 346 | if attends.__len__()>0: # store atten connection web 347 | from_uid=user_basic_info['uid'] 348 | from_fans_num=user_basic_info['fans_num'] 349 | from_blog_num=user_basic_info['blog_num'] 350 | data=[[from_uid,from_fans_num,from_blog_num,str(x[attends_col_info.index('uid')]),str(x[attends_col_info.index('fans_num')]),str(x[attends_col_info.index('blog_num')])]for x in attends] 351 | dbi.insert_asList('cache_atten_web',data) 352 | print('Success : conn web of {uid} is stored in cache_atten_web' 353 | .format(uid=user_basic_info['uid'])) 354 | else: 355 | pass 356 | except Exception as e: 357 | print(e) 358 | path='{uid}_atten_web.pkl'.format(uid=user_basic_info['uid']) 359 | print('unable to store atten web of {uid} , it will be stored' 360 | .format(uid=user_basic_info['uid'])) 361 | FI.save_pickle(data,path) 362 | 363 | class HistoryReport(tornado.web.RequestHandler): 364 | def post(self): 365 | 366 | # 从客户端获取信息 367 | try: 368 | latest_time=self.get_argument('latest_time') 369 | latest_timestamp=self.get_argument('latest_timestamp') 370 | container_id=self.get_argument('container_id') 371 | self.write('success') 372 | self.finish() 373 | print('Success: to get data from web') 374 | except Exception as e: 375 | self.write('fail to return user history') 376 | self.finish() 377 | print('Error:server-HistoryReturn:' 378 | 'Unable to get value from http package,Reason:') 379 | print(e) 380 | return 381 | 382 | dbi=MySQL_Interface() 383 | checkin_timestamp=int(time.time()) 384 | col_info=dbi.get_col_name('cache_history') 385 | data=dict( 386 | latest_time=latest_time, 387 | latest_timestamp=latest_timestamp, 388 | container_id=container_id, 389 | checkin_timestamp=checkin_timestamp 390 | ) 391 | keys=data.keys() 392 | insert_data=[[data[item] if item in keys else None for item in col_info]] 393 | dbi.insert_asList('cache_history',insert_data) 394 | 395 | class UpdateReport(tornado.web.RequestHandler): 396 | def post(self): 397 | # 从客户端获取信息 398 | try: 399 | mission_id=self.get_argument('mission_id') 400 | self.write('success') 401 | self.finish() 402 | print('Success: to get update report from web') 403 | except Exception as e: 404 | self.write('fail to return user update') 405 | self.finish() 406 | print('Error:server-UpdateReturn:' 407 | 'Unable to get value from http package,Reason:') 408 | print(e) 409 | return 410 | 411 | # 将该任务在mongodb中设置为组装状态 412 | client=MongoClient('localhost',27017) 413 | db=client['microblog_spider'] 414 | collec=db.update_mission 415 | collec.update({'mission_id':mission_id},{'$set':{'isReported':int(time.time())}}) 
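# ----------------------------------------------------------------------
# Editorial note (added example, not part of the original server.py):
# the update command written by TaskHandler above has the form
#     ContainerId-UpdateTime-LatestBlog;...;mission_id,update
# A minimal sketch of how a client could decode that string, assuming the
# same '-', ';' and ',' separators used when the command was built; the
# helper name parse_update_command is hypothetical and only illustrative.
def parse_update_command(command):
    """Illustrative parser for the TaskHandler update command string."""
    body, task_type = command.rsplit(',', 1)   # -> '...;mission_id', 'update'
    parts = body.split(';')
    mission_id = parts[-1]                     # random 15-char mission id
    users = []
    for item in parts[:-1]:
        cid, update_time, latest_blog = item.split('-')
        users.append(dict(container_id=cid,
                          update_time=int(update_time),
                          latest_blog=int(latest_blog)))
    return task_type, mission_id, users
# ----------------------------------------------------------------------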
416 | 417 | def random_str(randomlength=8): 418 | str = '' 419 | chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789' 420 | length = len(chars) - 1 421 | random = Random() 422 | for i in range(randomlength): 423 | str+=chars[random.randint(0, length)] 424 | return str 425 | 426 | def auto_index(): 427 | client = MongoClient('localhost',27017) 428 | db = client['microblog_spider'] 429 | collec_list = [] 430 | res=db.collection_names() 431 | for x in res: 432 | if 'user' in x: 433 | collec_list.append(x) 434 | print('** start to check the index station of collections in mongodb **') 435 | for name in collec_list: 436 | collec = db.get_collection(name) 437 | indexs = [x for x in collec.list_indexes()] 438 | if indexs.__len__()<3: # 此时没有索引 439 | print('{n} do not have indexes yet, ready to craete'.format(n=name)) 440 | collec.create_index([('user_id',pymongo.DESCENDING)]) 441 | collec.create_index([('id',pymongo.DESCENDING)]) 442 | else: 443 | # print('{n} has 3 indexs, done'.format(n=name)) 444 | pass 445 | print('** all indexes is created **') 446 | 447 | def start_selfcheck(): # 启动自检 448 | print('\n\n********* start to selfcheck *********\n') 449 | mi = MySQL_Interface() 450 | if mi.cur : 451 | print('mysql is connected') 452 | client = MongoClient('localhost',27017) 453 | print('mongodb is connected') 454 | client.close() 455 | auto_index() 456 | print('\n********* selfcheck success *********\n') 457 | 458 | 459 | if __name__=='__main__': 460 | 461 | start_selfcheck() # 启动自检 462 | 463 | proxy_lock=threading.Lock() # proxy thread 464 | global proxy 465 | proxy=proxy_pool() 466 | pm=proxy_manager(proxy,proxy_lock) 467 | pm.start() 468 | 469 | db_thread=DB_manager() # database thread 470 | db_thread.start() 471 | 472 | tornado.options.parse_command_line() # tornado thread 473 | Application().listen(options.port) 474 | # nginx 使用8001接口,分别链接到8002,8003,8004等若干个数据服务器 475 | DataServer().listen(8002) 476 | DataServer().listen(8003) 477 | DataServer().listen(8004) 478 | tornado.ioloop.IOLoop.instance().start() -------------------------------------------------------------------------------- /server_database.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | """ 4 | NAME: server_database 5 | PY_VERSION: python3.4 6 | FUNCTION:-------------------------------------------------------------- 7 | This file deal with the event of databases. 8 | The several main task is: 9 | 1. Check cahce_attends, and check if the user exists in table of ready_to_get 10 | and user_info_table. If not , store it into ready_to_get table 11 | 2. Check the data stored in cache_user_info, and store then into user_info_table. 12 | Be sure that the user in user_info_table should be unique. And delet the user in 13 | ready_to_get table 14 | 3. Check the data stored in cache_user_get , and store then into atten_web table. 15 | Also, the attend connection of two people should be unique 16 | 4. Check the ready_to_get table . 
if some uid is fetching for too much time, set 17 | the value if is_fetching to null 18 | ----------------------------------------------------------------------- 19 | VERSION: _0.2_ 20 | UPDATE_HISTORY: 21 | _0.2: add redis and bloom filter as the cache of mysql 22 | _0.1_: The 1st edition 23 | """ 24 | #====================================================================== 25 | #----------------import package-------------------------- 26 | # import python package 27 | import urllib.request as request 28 | import urllib.parse as parse 29 | from multiprocessing import Process 30 | import threading 31 | import time 32 | import redis 33 | import os 34 | import json 35 | import http.cookiejar 36 | import re 37 | import random 38 | import sys 39 | from pymongo import MongoClient 40 | from pymongo import UpdateOne,UpdateMany 41 | 42 | # import from this folder 43 | import client_config as config 44 | import File_Interface as FI 45 | from DB_Interface import MySQL_Interface 46 | #======================================================================= 47 | 48 | #======================================================================= 49 | #---code session 50 | class deal_cache_attends(threading.Thread): 51 | def __init__(self): 52 | threading.Thread.__init__(self) 53 | dbi=MySQL_Interface() 54 | self.dbi=dbi 55 | self.bf=BloomFilter() 56 | 57 | def run(self): 58 | bag=[] 59 | uid_bag=[] #与bag类似,只不过存储uid 60 | bag_size=1000 #100次插入一次 61 | ready_to_get_col=self.dbi.get_col_name('ready_to_get') 62 | cache_attends_col=self.dbi.get_col_name('cache_attends') 63 | while True: 64 | query='select * from cache_attends limit 5000' 65 | res=self.dbi.select_asQuery(query) 66 | if res.__len__()==0: 67 | if bag.__len__()>0: 68 | self.dbi.insert_asList('ready_to_get',bag,unique=True) 69 | bag=[] 70 | # self.bf.insert_asList(uid_bag,'ready_to_get') 71 | uid_bag=[] 72 | time.sleep(1) 73 | self.dbi=MySQL_Interface() #更新dbi 74 | continue 75 | 76 | print('thread cache attends is working') 77 | 78 | for line in res: 79 | raw_id=line[cache_attends_col.index('uid')] 80 | in_user_info=self.bf.isContains(raw_id,'user_info_table') #此处可优化 81 | if not in_user_info: 82 | data=[line[cache_attends_col.index(col)] if col in cache_attends_col else None for col in ready_to_get_col] 83 | bag.append(data) 84 | uid_bag.append(raw_id) 85 | if bag.__len__()>bag_size: 86 | self.dbi.insert_asList('ready_to_get',bag,unique=True) 87 | # self.bf.insert_asList(uid_bag,'ready_to_get') 88 | print('insert once') 89 | bag=[] 90 | uid_bag=[] 91 | self.dbi.delete_line('cache_attends','uid',raw_id) # 此处可优化 92 | 93 | def isInUserInfo(self,in_uid): 94 | col_user_info=self.dbi.get_col_name('user_info_table') 95 | query='select * from user_info_table where uid={uid}'.format(uid=in_uid) 96 | res=self.dbi.select_asQuery(query) 97 | if res.__len__()==0: 98 | return False 99 | else: 100 | return True 101 | 102 | class deal_cache_user_info(threading.Thread): 103 | def __init__(self): 104 | threading.Thread.__init__(self) 105 | self.dbi=MySQL_Interface() 106 | self.bf=BloomFilter() 107 | 108 | def run(self): 109 | while True: 110 | if self.dbi.is_empty('cache_user_info'): 111 | time.sleep(2) 112 | self.dbi=MySQL_Interface() 113 | continue 114 | [res,cache_user_info_col]=self.dbi.select_all('cache_user_info') 115 | 116 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # insert into user info table 117 | user_info_table_col=self.dbi.get_col_name('user_info_table') 118 | data= [ 119 | [ 120 | line[cache_user_info_col.index(col)] if col 
in cache_user_info_col 121 | else time_stick if col=='insert_time' 122 | else None if col=='update_time' 123 | else None if col=='latest_blog' 124 | else None if col=='isGettingBlog' 125 | else '' 126 | for col in user_info_table_col 127 | ] for line in res] 128 | uid_list=[line[user_info_table_col.index('uid')] for line in data] 129 | self.dbi.insert_asList('user_info_table',data,unique=True) # 插入 user info table 130 | self.bf.insert_asList(uid_list,'user_info_table') 131 | print('insert {num} users into user info table'.format(num=data.__len__())) 132 | 133 | uid_list=[line[cache_user_info_col.index('uid')] for line in res] 134 | q1="delete from {table_name} where uid in ( {id_str_list} ) ;" # 从cache user info 中删除 135 | id_str_list='' 136 | for i in uid_list: 137 | id_str_list=id_str_list+'\''+str(i)+'\''+',' 138 | id_str_list=id_str_list[:-1] 139 | 140 | query=q1.format(id_str_list=id_str_list,table_name='cache_user_info') 141 | self.dbi.cur.execute(query) 142 | self.dbi.conn.commit() 143 | 144 | query=q1.format(id_str_list=id_str_list,table_name='ready_to_get') 145 | self.dbi.cur.execute(query) 146 | self.dbi.conn.commit() 147 | 148 | class deal_fetching_user(threading.Thread): 149 | #定期清理获取时间过长的部分 150 | 151 | def __init__(self): 152 | threading.Thread.__init__(self) 153 | self.dbi=MySQL_Interface() 154 | 155 | def run(self): 156 | while True: 157 | self.dbi=MySQL_Interface() 158 | t=time.time() 159 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t-3600)) 160 | query="update ready_to_get set is_fetching=null where is_fetching < \'{time}\' ;".format(time=time_stick) 161 | # print(query) 162 | # query='select * from ready_to_get where is_fetching < {time}'.format(time=time_stick) 163 | self.dbi.update_asQuery(query) 164 | time.sleep(1) 165 | 166 | class control_ready_table(threading.Thread): 167 | def __init__(self): 168 | threading.Thread.__init__(self) 169 | self.dbi=MySQL_Interface() 170 | def run(self): 171 | while True: 172 | self.dbi=MySQL_Interface() 173 | num=self.dbi.get_line_num('ready_to_get') 174 | if num>150*1000: 175 | query='select m.fans_num from (' \ 176 | 'select fans_num from ready_to_get ' \ 177 | 'ORDER BY fans_num limit 50000' \ 178 | ') as m order by fans_num desc limit 1' 179 | res=self.dbi.select_asQuery(query)[0][0] 180 | query='delete from ready_to_get where fans_num<{num}'\ 181 | .format(num=res) 182 | self.dbi.update_asQuery(query) 183 | else: 184 | time.sleep(600) 185 | 186 | class deal_isGettingBLog_user(threading.Thread): 187 | def __init__(self): 188 | threading.Thread.__init__(self) 189 | self.dbi=MySQL_Interface() 190 | 191 | def run(self): 192 | while True: 193 | self.dbi=MySQL_Interface() 194 | t=time.time() 195 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t-12*60*60)) 196 | 197 | #删掉cache_history中的行 198 | query='delete from cache_history where container_id in (select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null)'\ 199 | .format(time=time_stick) 200 | self.dbi.update_asQuery(query) 201 | 202 | # 删掉mongodb-assemble factory中的相关值 203 | select_query='select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null'.format(time=time_stick) 204 | res=[x[0] for x in self.dbi.select_asQuery(select_query)] 205 | client=MongoClient('localhost',27017) 206 | db=client['microblog_spider'] 207 | assemble_table=db.assemble_factory 208 | assemble_table.remove({'container_id':{'$in':res}}) 209 | 210 | # 将user info table中超时行的isGettingBlog清空 211 | query="update 
user_info_table set isGettingBlog=null where isGettingBlog<\'{time}\' and update_time is null".format(time=time_stick) 212 | self.dbi.update_asQuery(query) 213 | 214 | # 将cache_history中的残留项去除 215 | query = "delete from cache_history where is_dealing<\'{time}\' ;".format(time = time_stick) 216 | self.dbi.update_asQuery(query) 217 | 218 | time.sleep(60) 219 | 220 | class deal_cache_history(threading.Thread): 221 | def __init__(self): 222 | threading.Thread.__init__(self) 223 | 224 | def run(self): 225 | while True: 226 | start_time = time.time() 227 | dbi=MySQL_Interface() 228 | col_info=dbi.get_col_name('cache_history') 229 | query='select * from cache_history where is_dealing is null order by checkin_timestamp limit 1' 230 | 231 | mysql_res=dbi.select_asQuery(query) 232 | if mysql_res.__len__()==0: # cache_history表为空时,睡眠1秒,跳过此次循环 233 | time.sleep(1) 234 | continue 235 | 236 | mysql_res=mysql_res[0] 237 | 238 | # todo for delete----- 239 | print('debug->start to deal with a new task') 240 | print('debug->mysql_res: ') 241 | print(mysql_res) 242 | #------------------------ 243 | 244 | container_id=mysql_res[col_info.index('container_id')] 245 | print('debug->container_id: {cid}'.format(cid=container_id)) 246 | latest_time=mysql_res[col_info.index('latest_time')] 247 | latest_timestamp=mysql_res[col_info.index('latest_timestamp')] 248 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 249 | query = 'update cache_history set is_dealing=\'{time}\' where container_id={cid}'.format(time=time_stick, cid = container_id) 250 | # todo for delete----- 251 | print('debug->query1 : {q}'.format(q=query)) 252 | # ------------------------ 253 | dbi.update_asQuery(query) 254 | 255 | client = MongoClient('localhost', 27017) 256 | db = client['microblog_spider'] 257 | assemble_table = db.assemble_factory 258 | res = assemble_table.find({'container_id': container_id}, {'current_id': 1, 'total_num': 1}) 259 | id_list = [x['current_id'] for x in res] 260 | num = int([x['total_num'] for x in assemble_table.find({'container_id': container_id}).limit(1)][0]) 261 | ## todo for delete----- 262 | print('debug->id_list_len: {len}'.format(len=id_list.__len__())) 263 | print('debug->num: {n}'.format(n=num)) 264 | # ------------------------ 265 | # 检查是否所有包裹已经到齐 266 | check_state = True 267 | if id_list.__len__() < num: 268 | print('server->HistoryReport:The package is not complete, retry to catch data') 269 | check_state = False 270 | 271 | if check_state: 272 | # 如果所有子包已经收集完毕,则将数据放入正式数据库mongodb 273 | # 将装配车间中的相关数据删除 274 | # 并且在Mysql中更新update_time和latest_blog,抹掉isGettingBlog 275 | 276 | # 从mysql获取该用户信息 277 | try: 278 | query = 'select * from user_info_table where container_id=\'{cid}\'' \ 279 | .format(cid=container_id) 280 | user_info = dbi.select_asQuery(query)[0] 281 | # todo fro debug------------- 282 | print('task {cid} :debug->query2: {q}'.format(q=query,cid=container_id)) 283 | print('task {cid} debug->user_info:'.format(cid = container_id)) 284 | print(user_info) 285 | # -------------------------------- 286 | col_name = dbi.get_col_name('user_info_table') 287 | except Exception as e: 288 | print('task {cid} :Error:server-HistoryReturn:' 289 | 'No such user in MySQL.user_info_table,Reason:'.format(cid = container_id)) 290 | print(e) 291 | 292 | # 将数据从assemble factory中提取出来 293 | try: 294 | data_list = assemble_table.find({'container_id':container_id}, {'data': 1 , 'current_id': 1}) 295 | data_list_ori = [x for x in data_list] 296 | data_list = [x['data'] for x in data_list_ori] 297 | 
id_list = [x['current_id'] for x in data_list_ori] 298 | data_list_ori = None 299 | # todo fro debug------------- 300 | print('task {cid} debug->datalist: {len}'.format(len = data_list.__len__(),cid=container_id)) 301 | # -------------------------------- 302 | except Exception as e: 303 | print('Error:server-HistoryReturn:' 304 | 'Unable to get data from MongoDB, assemble factory,Reason:') 305 | print(e) 306 | 307 | # 长度大于预期,说明有重复信息,需要去重 308 | if id_list.__len__() > num : 309 | unique_data_list = [] 310 | check_dict = {} 311 | for i in range(id_list.__len__()) : 312 | try: 313 | # 这里使用字典去重,(算是hash吧) 314 | check_dict[str(id_list[i])] 315 | continue 316 | except: 317 | check_dict[str(id_list[i])] = True 318 | unique_data_list.append(data_list[i]) 319 | # print('data_list.len :{len}'.format(len=data_list.__len__())) 320 | # print('id_list.len :{len}'.format(len=id_list.__len__())) 321 | # print(i) 322 | data_list = unique_data_list 323 | 324 | # 将碎片拼接 325 | try: 326 | data_final = [] 327 | for i in data_list: 328 | data_final = data_final+i 329 | # todo fro debug------------- 330 | print('task {cid} debug->数据拼接完毕,len {len}'.format(len=data_final.__len__(),cid=container_id)) 331 | # -------------------------------- 332 | except Exception as e: 333 | print('Error:server-HistoryReport:' 334 | 'Unable to contact the pieces of information,Reason:') 335 | print(e) 336 | 337 | # 将本次信息录入accuracy_table 用以进一步分析 338 | blog_len = data_final.__len__() 339 | wanted_blog_len = user_info[col_name.index('blog_num')] 340 | blog_accuracy = blog_len/wanted_blog_len 341 | time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 342 | query = 'insert into accuracy_table values ({acc},\'{t_s}\',{num}) ;' \ 343 | .format(acc=blog_accuracy, t_s=time_stick, num=wanted_blog_len) 344 | dbi.insert_asQuery(query) 345 | 346 | # 将数据录入Mongodb 更改Mysql,删除assemble中相关内容 347 | try: 348 | if not user_info[col_name.index('update_time')]: 349 | # 将数据存入 Mongodb 的formal collection 350 | save_data_seperately(data_final) 351 | print('task {cid} Success: Data has saved in Mongodb, size is {size}' 352 | .format(size=sys.getsizeof(data_final),cid=container_id)) 353 | 354 | # # 将关键信息录入Mydql 355 | query = 'update user_info_table set ' \ 356 | 'update_time=\'{up_time}\',' \ 357 | 'latest_blog=\'{latest_blog}\',' \ 358 | 'isGettingBlog=null ' \ 359 | 'where container_id=\'{cid}\';'\ 360 | .format(up_time=time_stick,latest_blog=latest_time,cid=container_id) 361 | # query='update user_info_table set ' \ 362 | # 'update_time=\'{up_time}\',' \ 363 | # 'latest_blog=\'{latest_blog}\'' \ 364 | # 'where container_id=\'{cid}\';' \ 365 | # .format(up_time=time_stick,latest_blog=latest_time,cid=container_id) 366 | #TODO 这里为了方便统计,去掉了抹除isGetting这一项,但是正式运行的时候是要加上的 367 | dbi.update_asQuery(query) 368 | print('task {cid} Success: insert user into MongoDB, the num of data is {len}' 369 | .format(len=blog_len,cid=container_id)) 370 | else: 371 | query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \ 372 | .format(cid=container_id) 373 | dbi.update_asQuery(query) 374 | 375 | except Exception as e: 376 | print('task {cid} Error:server->HistoryReport:' 377 | 'Reason:'.format(cid=container_id)) 378 | print(e) 379 | else: 380 | # 如果所有子包不全,则抹掉isGettingBlog,将装配车间中数据删除 381 | print('task {cid} :Error: the package is not complete ,{a} of {b}' 382 | .format(a=id_list.__len__(),b=num,cid=container_id)) 383 | query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \ 384 | .format(cid=container_id) 
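# Editorial comment (added for clarity, not in the original source):
# on this failure path the assembled packets are incomplete, so the query
# built above only clears isGettingBlog and lets the user be dispatched
# again later. The cleanup that follows (removing this container's packets
# from Mongo's assemble_factory and deleting the cache_history row) runs on
# both the success and the failure path, so no half-assembled data is ever
# promoted to the permanent per-month collections.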
385 | dbi.update_asQuery(query) 386 | 387 | # 将数据从assemble factory 去掉 388 | assemble_table.remove({'container_id':container_id}) 389 | print('task {cid} Success: Data has been removed from assemble factory' 390 | .format(cid=container_id)) 391 | 392 | # 将cache_history中的相应行删掉,表示已经处理完该事物了 393 | query='delete from cache_history where container_id=\'{cid}\'' \ 394 | .format(cid=container_id) 395 | dbi.update_asQuery(query) 396 | 397 | end_time = time.time() 398 | deal_time = end_time - start_time 399 | print('task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds' 400 | .format(cid = container_id, len = data_final.__len__(), t = deal_time)) 401 | 402 | class deal_update_mission(threading.Thread): 403 | def __init__(self): 404 | threading.Thread.__init__(self) 405 | 406 | def run(self): 407 | client=MongoClient('localhost',27017) 408 | while True: 409 | db=client['microblog_spider'] 410 | mission_mongo=db.update_mission 411 | # 表示需要处理,但是现在无人处理的任务 412 | res=mission_mongo.find({'isReported':{'$ne':None},'isDealing':None}).limit(1) 413 | res=[x for x in res] 414 | 415 | # 若没有待完成的任务,则该线程休眠1秒然后继续 416 | if res.__len__()==0: 417 | time.sleep(1) 418 | continue 419 | 420 | # 提取出需要处理的任务 421 | task=res[0] 422 | task.pop('_id') 423 | mission_id=task['mission_id'] 424 | user_content=task['user_list'] 425 | 426 | # 将任务列表中的isDealing设置当前时间,表示当前任务开始受理 427 | mission_mongo.update({'mission_id':mission_id},{'$set':{'isDealing':int(time.time())}}) 428 | print('Update Mission :{mi} set isDealing as {t}'.format(mi=mission_id,t=int(time.time()))) 429 | 430 | # 获取包裹id和总包裹数 431 | assemble_table=db.assemble_factory 432 | res=assemble_table.find({'container_id':mission_id},{'current_id':1,'total_num':1}) 433 | id_list=[x['current_id'] for x in res] 434 | check_state=True 435 | try: 436 | num=int([x['total_num'] for x in assemble_table.find({'container_id':mission_id}).limit(1)][0]) 437 | except: 438 | print('deal_update_mission :{mi} can not get num info from mongo' 439 | .format(mi=mission_id)) 440 | num = 100000000 441 | check_state = False 442 | 443 | #检查是否所有包裹已经到齐 444 | if id_list.__len__()datalist: {len}'.format(len=data_list.__len__(),mi=mission_id)) 461 | except Exception as e: 462 | print('Update Mission :{mi} Error:server_database-deal_update_mission:' 463 | 'Unable to get data from MongoDB, assemble factory,Reason:'.format(mi=mission_id)) 464 | print(e) 465 | 466 | # 长度大于预期,说明有重复信息,需要去重 467 | if id_list.__len__() > num : 468 | unique_data_list = [] 469 | check_dict = {} 470 | for i in range(id_list.__len__()) : 471 | try: 472 | # 这里使用字典去重,(算是hash吧) 473 | check_dict[str(id_list[i])] 474 | continue 475 | except: 476 | check_dict[str(id_list[i])] = True 477 | unique_data_list.append(data_list[i]) 478 | data_list = unique_data_list 479 | 480 | # 将碎片拼接 481 | try: 482 | data_final=[] 483 | for i in data_list: 484 | data_final=data_final+i 485 | print('Update Mission :{mi} success->数据拼接完毕,len {len}' 486 | .format(len=data_final.__len__(),mi=mission_id)) 487 | except Exception as e: 488 | print('Update Mission :{mi} Error:server-HistoryReport:' 489 | 'Unable to contact the pieces of information,Reason:'.format(mi=mission_id)) 490 | print(e) 491 | 492 | # 增加当前时间的转发,点赞和评论数,便于追踪,并制作成UpdateMany对象 493 | user_list=[x['container_id'] for x in user_content] 494 | user_list_str='' 495 | for item in user_list: 496 | user_list_str+='\''+str(item)+'\',' 497 | user_list_str=user_list_str[:-1] 498 | 499 | def temp_add_trace(line): 500 | msg_id=line['id'] 501 | current_status=dict( 502 | 
comments_count=line['comments_count'], 503 | attitudes_count=line['attitudes_count'], 504 | reposts_count=line['reposts_count'] 505 | ) 506 | t=int(time.time()) 507 | t_str=str(t) 508 | line['status_trace.{date}'.format(date=t_str)]=current_status 509 | update_item=UpdateMany({'id':msg_id},{'$set':line},upsert=True) 510 | return update_item 511 | 512 | requests=[temp_add_trace(x) for x in data_final] 513 | latest_mongo=db.latest_history 514 | latest_mongo.bulk_write(requests) 515 | print('Update Mission :{mi} Success: server_database:UpdateMany列表生成,' 516 | '写入latest_history表成功,{len}'.format(len=requests.__len__(),mi=mission_id)) 517 | 518 | # 将获得数据写入各按月份分类的聚合中 519 | table_list=[] 520 | request_updateMonth=[] 521 | for i in range(data_final.__len__()): 522 | temp_time=data_final[i]['created_at'] 523 | temp_table_name='user_{year}_{month}'.format(year=temp_time[0:4],month=temp_time[5:7]) 524 | if temp_table_name in table_list: 525 | request_updateMonth[table_list.index(temp_table_name)].append(requests[i]) 526 | else: 527 | table_list.append(temp_table_name) 528 | request_updateMonth.append([requests[i]]) 529 | print('the number of ori table is {len}'.format(len=request_updateMonth.__len__())) 530 | print(table_list) 531 | selected_num = 5 532 | if table_list.__len__()>selected_num: 533 | packed = [[table_list[i],request_updateMonth[i]] 534 | for i in range(table_list.__len__())] 535 | packed = sorted(packed, key=lambda x:x[0], reverse=True) 536 | packed = packed[:selected_num] 537 | table_list = [x[0] for x in packed] 538 | request_updateMonth = [x[1] for x in packed] 539 | print('the number of dealed table is {len}'.format(len=request_updateMonth.__len__())) 540 | print(table_list) 541 | if request_updateMonth.__len__()>=3: 542 | print('{a}-{b}-{c}'.format( a=request_updateMonth[0].__len__(), 543 | b=request_updateMonth[1].__len__(), 544 | c=request_updateMonth[2].__len__() 545 | )) 546 | 547 | for i in range(table_list.__len__()): 548 | collection=eval('db.{name}'.format(name=table_list[i])) 549 | # todo for debug---------------------------- 550 | print('table {x} is started'.format(x=table_list[i])) 551 | #--------------------------------------------------- 552 | if request_updateMonth[i].__len__()>0: 553 | try: 554 | collection.bulk_write(request_updateMonth[i]) 555 | except Exception as e: 556 | print('Update Mission :{mi} fail to update table {t}' 557 | .format(mi=mission_id,t=table_list[i])) 558 | 559 | print('Update Mission :{mi} Success:server_database:所获的数组已经写入按月分类聚合中' 560 | .format(mi=mission_id)) 561 | 562 | # 清理Mydql,更新相关行数中的update_time和latest_blog 563 | time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 564 | # 找出各用户的最近更新时间 565 | latest_list=[0]*user_list.__len__() 566 | for line in data_final: 567 | this_timestick=int(line['created_timestamp']) 568 | this_container='100505'+str(line['user_id']) 569 | try: 570 | index=user_list.index(this_container) 571 | if latest_list[index]deal_update_mission:' 575 | 'container {id} is not in user_list'.format(id=this_container)) 576 | 577 | # 将各用户最近更新时间固化为mysql更新语句。 578 | case_list='' 579 | updated_user_list='' 580 | for i in range(latest_list.__len__()): 581 | if latest_list[i]>user_content[i]['latest_blog'] : 582 | time_stick_inner=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(latest_list[i])) 583 | case_list+=' when \'{cid}\' then \'{tstick}\' '.format(cid=user_list[i],tstick=time_stick_inner) 584 | updated_user_list+='\'{cid}\','.format(cid=user_list[i]) 585 | updated_user_list=updated_user_list[:-1] 586 | # 
构建mysql更新语句 587 | query1='update user_info_table set update_time=\'{time}\' where container_id in ( {user_list} ) ;'\ 588 | .format(time=time_stick,user_list=user_list_str) 589 | query2='update user_info_table set latest_blog= case container_id {case_list} end where container_id in ( {ulist2} ) ;'\ 590 | .format(case_list=case_list,ulist2=updated_user_list) 591 | dbi=MySQL_Interface() 592 | dbi.update_asQuery(query2) 593 | dbi.update_asQuery(query1) 594 | print('Update Mission :{mi} Success:server_database: UpdateTime和LatestBlog选项已更新' 595 | .format(mi=mission_id)) 596 | if user_list_str.__len__()>0: 597 | query='update user_info_table set isGettingBlog=null where container_id in ({user_list});' \ 598 | .format(user_list=user_list_str) 599 | dbi.update_asQuery(query) 600 | print('Update Mission :{mi} Success:erver_database: isGettingBlog选项已清除'.format(mi=mission_id)) 601 | 602 | else: 603 | if user_list_str.__len__()>0: 604 | query='update user_info_table set isGettingBlog=null where container_id in ({user_list});'\ 605 | .format(user_list=user_list_str) 606 | dbi=MySQL_Interface() 607 | dbi.update_asQuery(query) 608 | 609 | # 将assemble_factory中与当前任务有关数据清空 610 | assemble_table.remove({'container_id':mission_id}) 611 | print('Update Mission :{mi} Success:server_database: assemble_factory in Mongo is cleared' 612 | .format(mi=mission_id)) 613 | 614 | # 将mongodb,任务列表中当前任务项清空 615 | mission_mongo.remove({'mission_id':mission_id}) 616 | print('Update Mission :{mi} Success:server_database: this mission is cleared' 617 | .format(mi=mission_id)) 618 | 619 | class clear_expired_update_mission(threading.Thread): 620 | def __init__(self): 621 | threading.Thread.__init__(self) 622 | 623 | def run(self): 624 | while True: 625 | client=MongoClient('localhost',27017) 626 | db=client['microblog_spider'] 627 | mission_mongo=db.update_mission 628 | assemble_mongo=db.assemble_factory 629 | current_time=int(time.time()) 630 | target_time=current_time-60*60*6 #将6个小时仍未完成的任务清除出去 631 | expired_mission=mission_mongo.find({'mission_start':{'$lt':target_time}}).limit(1) 632 | expired_mission=[x for x in expired_mission] 633 | if expired_mission.__len__()==0: 634 | # 如果没有符合要求的过期任务,则休眠 635 | time.sleep(60) 636 | else: 637 | # 如果有过期的任务 638 | expired_mission=expired_mission[0] 639 | mission_id=expired_mission['mission_id'] 640 | user_content=expired_mission['user_list'] 641 | user_list=[x['container_id'] for x in user_content] 642 | 643 | # 将mysql中相关用户isGettingBlog清空 644 | user_list_str='' 645 | for item in user_list: 646 | user_list_str+='\''+str(item)+'\',' 647 | user_list_str=user_list_str[:-1] 648 | if user_list_str.__len__()>0: 649 | dbi=MySQL_Interface() 650 | query='update user_info_table set isGettingBlog=null where container_id in ({user_list});' \ 651 | .format(user_list=user_list_str) 652 | dbi.update_asQuery(query) 653 | 654 | # 将assemble_factory中数据清空 655 | assemble_mongo.remove({'container_id':mission_id}) 656 | 657 | # 将Mongo中该任务从任务表中清空。 658 | mission_mongo.remove({'mission_id':mission_id}) 659 | 660 | class clear_expired_update_content(threading.Thread): 661 | def __init__(self): 662 | threading.Thread.__init__(self) 663 | 664 | def run(self): 665 | while True: 666 | client=MongoClient('localhost',27017) 667 | db=client['microblog_spider'] 668 | latest_mongo=db.latest_history 669 | t=int(time.time())-60*60*24*15 670 | latest_mongo.remove({'created_timestamp':{'$lt':t}}) 671 | time.sleep(600) 672 | 673 | class DB_manager(threading.Thread): 674 | def __init__(self): 675 | threading.Thread.__init__(self) 676 | 
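# Editorial overview (added comment, not in the original source):
# DB_manager acts as a simple supervisor thread. It instantiates the nine
# worker threads below (p1-p4 build the attention web, p5-p6 handle
# historical blogs, p7-p8 handle update missions, p9 clears expired update
# content) and, in run(), polls them every 5 seconds, re-creating and
# restarting any worker whose is_alive() has become False. Note that in the
# original restart loop p9 is only start()-ed again without being
# re-created; calling start() twice on the same threading.Thread raises
# RuntimeError, so p9 would need to be re-instantiated like the others.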
677 | # p1,p2,p3,p4 used to get the atten web of user in microblog 678 | self.p1=deal_cache_attends() 679 | self.p2=deal_cache_user_info() 680 | self.p3=deal_fetching_user() 681 | self.p4=control_ready_table() 682 | 683 | # p5, p6 are used to get the historical content of microblog 684 | self.p5=deal_isGettingBLog_user() 685 | self.p6=deal_cache_history() 686 | 687 | # self.deal_history = [] 688 | # for i in range(3) : 689 | # self.deal_history.append(deal_cache_history()) 690 | 691 | # p7,p8 used to get the update content of microblog 692 | self.p7=deal_update_mission() 693 | self.p8=clear_expired_update_mission() 694 | 695 | self.p9=clear_expired_update_content() 696 | 697 | def run(self): 698 | self.p1.start() 699 | self.p2.start() 700 | self.p3.start() 701 | self.p4.start() 702 | 703 | self.p5.start() 704 | self.p6.start() 705 | # for t in self.deal_history: 706 | # time.sleep(10) 707 | # t.start() 708 | 709 | self.p7.start() 710 | self.p8.start() 711 | 712 | self.p9.start() 713 | print('Process: deal_cache_attends is started ') 714 | print('Process: deal_cache_user_info is started ') 715 | print('Process: deal_fetching_user is started') 716 | print('Process: control_ready_table is started') 717 | print('Process: deal_isGettingBLog_user is started') 718 | print('Process: deal_cache_history is started') 719 | print('Process: deal_update_mission is started') 720 | print('Process: clear_expired_update_mission') 721 | print('Process: clear_expired_update_content') 722 | 723 | while True: 724 | time.sleep(5) 725 | if not self.p1.is_alive(): 726 | self.p1=deal_cache_attends() 727 | self.p1.start() 728 | print('Process: deal_cache_attends is restarted ') 729 | if not self.p2.is_alive(): 730 | self.p2=deal_cache_user_info() 731 | self.p2.start() 732 | print('Process: deal_cache_user_info is restarted ') 733 | if not self.p3.is_alive(): 734 | self.p3=deal_fetching_user() 735 | self.p3.start() 736 | print('Process: deal_fetching_user is restarted') 737 | if not self.p4.is_alive(): 738 | self.p4=control_ready_table() 739 | self.p4.start() 740 | print('Process: control_ready_table is restarted') 741 | if not self.p5.is_alive(): 742 | self.p5=deal_isGettingBLog_user() 743 | self.p5.start() 744 | print('Process: deal_isGettingBlog_user is restarted') 745 | if not self.p6.is_alive(): 746 | self.p6=deal_cache_history() 747 | self.p6.start() 748 | print('Process: deal_cache_history is restarted') 749 | 750 | # temp_len = self.deal_history.__len__() 751 | # for i in range(temp_len): 752 | # if not self.deal_history[i].is_alive(): 753 | # self.deal_history[i] = deal_isGettingBLog_user() 754 | # self.deal_history[i].start() 755 | # print('Process: deal_cache_history is restarted') 756 | 757 | if not self.p7.is_alive(): 758 | self.p7=deal_update_mission() 759 | self.p7.start() 760 | print('Process: deal_update_mission is restarted') 761 | if not self.p8.is_alive(): 762 | self.p8=clear_expired_update_mission() 763 | self.p8.start() 764 | print('Process: clear_expired_update_mission is restarted') 765 | if not self.p9.is_alive(): 766 | self.p9.start() 767 | print('Process: clear_expired_update_content') 768 | 769 | class SimpleHash(): 770 | def __init__(self,cap,seed): 771 | self.cap=cap 772 | self.seed=seed 773 | def hash(self,value): 774 | ret=0 775 | for i in range(value.__len__()): 776 | ret+=self.seed*ret+ord(value[i]) 777 | return ((self.cap-1) & ret) 778 | 779 | class BloomFilter(): 780 | def __init__(self): 781 | self.bit_size=1<<15 782 | self.seeds=[5,7,11,13,31,37,61] 783 | 
self.r=redis.StrictRedis(host='127.0.0.1',port=6379,db=0) 784 | self.hashFunc=[] 785 | for i in range(self.seeds.__len__()): 786 | self.hashFunc.append(SimpleHash(self.bit_size,self.seeds[i])) 787 | 788 | def isContains(self,str_input,name): 789 | if str_input==None: 790 | return False 791 | if str_input.__len__()==0: 792 | return False 793 | ret=True 794 | for f in self.hashFunc: 795 | loc=f.hash(str_input) 796 | ret=ret & self.r.getbit(name,loc) 797 | return ret 798 | 799 | def insert(self,str_input,name): 800 | for f in self.hashFunc: 801 | loc=f.hash(str_input) 802 | self.r.setbit(name,loc,1) 803 | 804 | def insert_asList(self,list_input,name): 805 | for line in list_input: 806 | self.insert(line,name) 807 | 808 | def save_data_inMongo(dict_data): 809 | client=MongoClient('localhost',27017) 810 | db=client['microblog_spider'] 811 | collection=db.formal 812 | result=collection.insert_many(dict_data) 813 | 814 | def save_data_seperately(dict_data): 815 | client=MongoClient('localhost',27017) 816 | db=client['microblog_spider'] 817 | table_list=[] 818 | data_list=[] 819 | for line in dict_data: 820 | temp_time=line['created_at'] 821 | temp_table_name='user_{year}_{month}'.format(year=temp_time[0:4],month=temp_time[5:7]) 822 | if temp_table_name not in table_list: 823 | table_list.append(temp_table_name) 824 | sub_data_list=[line] 825 | data_list.append(sub_data_list) 826 | else: 827 | data_list[table_list.index(temp_table_name)].append(line) 828 | for i in range(table_list.__len__()): 829 | print('ready to execute collec {c}, len is {len}' 830 | .format(c=table_list[i],len=data_list[i].__len__())) 831 | start_time = time.time() 832 | collection=eval('db.{name}'.format(name=table_list[i])) 833 | try: 834 | collection.insert_many(data_list[i]) 835 | except Exception as e: 836 | print('error from save_data_seperately, error collec is {c}' 837 | .format(c=table_list[i])) 838 | print(e) 839 | end_time = time.time() 840 | time_gap = end_time - start_time 841 | print('this part len is {len}, use {t} secs'.format(len = data_list[i].__len__(), t = time_gap)) 842 | 843 | if __name__=='__main__': 844 | db_thread=DB_manager() # database thread 845 | db_thread.start() -------------------------------------------------------------------------------- /client_asy_update.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | """ 3 | NAME: client_asy_update.py 4 | PY_VERSION: python3.5 5 | FUNCTION: This client part of distrubuted microblog spider. 
6 | Can execute update mission through asynchronize ways 7 | VERSION: _0.1_ 8 | 9 | """ 10 | #====================================================================== 11 | #----------------import package-------------------------- 12 | # import python package 13 | import urllib.request as request 14 | import urllib.parse as parse 15 | from multiprocessing import Process 16 | import threading 17 | import time 18 | import os 19 | import json 20 | import http.cookiejar 21 | import re 22 | import random 23 | from random import Random 24 | import math 25 | import aiohttp 26 | import asyncio 27 | import traceback 28 | 29 | # import from this folder 30 | import client_config as config 31 | import File_Interface as FI 32 | from data_transport import upload_list 33 | from client import parseMicroblogPage 34 | #======================================================================= 35 | 36 | class clientAsy(): 37 | def __init__(self, uuid=None): 38 | self.pm = PrintManager() 39 | self.task_uid = uuid 40 | self.task_type = None 41 | check_server() 42 | self.get_task() 43 | self.proxy_pool=[] 44 | self.get_proxy_pool(self.proxy_pool,config.PROXY_POOL_SIZE) 45 | print(self.pm.gen_block_with_time("SUCCESS TO GET PROXY\nTHE SIZE IS {x}\nSTART TO RUN PROGRAM" 46 | .format(x=self.proxy_pool.__len__()))) 47 | self.run() 48 | 49 | def get_task(self): 50 | 51 | """ 52 | get task user id from server 53 | """ 54 | print(self.pm.gen_block_with_time("MISSION START\nREADY TO GET TASK")) 55 | if not self.task_uid: 56 | self.task_uid = config.UUID 57 | url='{url}/task/?uuid={uuid}'.format(url=config.SERVER_URL,uuid=self.task_uid) 58 | 59 | try: 60 | res=request.urlopen(url,timeout=10).read() 61 | res=str(res,encoding='utf8') 62 | except Exception as e: 63 | check_server() # sleep until server is available 64 | try: 65 | res=request.urlopen(url,timeout=10).read() 66 | res=str(res,encoding='utf8') 67 | except: 68 | err_str='error: client -> get_task : ' \ 69 | 'unable to connect to server, exit process' 70 | info_manager(err_str,type='KEY') 71 | os._exit(0) 72 | 73 | if 'no task' in res: # if server have no task uid ,return 'no task uid' 74 | info_manager(self.pm.gen_block_with_time("UNABLE TO GET TASK\nSLEEP FOR 1 MIN AND EXIT"), 75 | type="KEY",with_time=False) 76 | time.sleep(60) 77 | os._exit(0) 78 | 79 | 80 | try: # try to parse task str 81 | res=res.split(',') 82 | self.task_uid=res[0] 83 | self.task_type=res[1] 84 | except: 85 | info_manager(self.pm.gen_block_with_time("UNABLE TO GET TASK\nSLEEP FOR 1 MIN AND EXIT"), 86 | type="KEY",with_time=False) 87 | os._exit(0) 88 | 89 | info_manager(self.pm.gen_block_with_time("GOTTEN TASK\nREADY TO GET PROXY"), 90 | type='KEY',with_time=False) 91 | 92 | def get_proxy_pool(self,proxy_pool,num): 93 | 94 | """ 95 | request certain number of proxy from server 96 | :param num: 97 | :return: None, but a list of proxy as formation of [[proxy(str),timeout(float)]...[]] 98 | will be added to self.proxy_pool 99 | """ 100 | 101 | url='{url}/proxy/?num={num}'.format(url=config.SERVER_URL,num=num) 102 | 103 | try: 104 | res=request.urlopen(url,timeout=10).read() 105 | res=str(res,encoding='utf8') 106 | except: 107 | time.sleep(5) 108 | check_server() # sleep until server is available 109 | try: 110 | res=request.urlopen(url,timeout=10).read() 111 | res=str(res,encoding='utf8') 112 | except Exception as e: 113 | err_str='error: client -> get_proxy_pool : unable to ' \ 114 | 'connect to proxy server ' 115 | info_manager(err_str,type='KEY') 116 | if config.KEY_INFO_PRINT: 117 | 
print('Error from client.get_proxy_pool,reason:',e) 118 | return 119 | 120 | if 'no valid proxy' in res: # if server return no valid proxy, means server 121 | # cannot provide proxy to this client 122 | err_str='error: client -> get_proxy_pool : fail to ' \ 123 | 'get proxy from server' 124 | info_manager(err_str,type='KEY') 125 | time.sleep(1) 126 | return 127 | 128 | try: 129 | data=res.split(';') # 'url,timedelay;url,timedelay;.....' 130 | data=[proxy_object(x) for x in data] 131 | except Exception as e: 132 | err_str='error: client -> get_proxy_pool : fail to ' \ 133 | 'parse proxy str info:\r\n'+res 134 | info_manager(err_str,type='KEY') 135 | return 136 | 137 | proxy_pool[:]=proxy_pool[:]+data 138 | 139 | def return_proxy(self): 140 | 141 | """ 142 | return useful or unused proxy to server 143 | """ 144 | 145 | check_server() 146 | url='{url}/proxy_return'.format(url=config.SERVER_URL) 147 | proxy_ret= [x.raw_data for x in self.proxy_pool] 148 | proxy_str='' 149 | 150 | for item in proxy_ret: 151 | proxy_str=proxy_str+item 152 | data={ 153 | 'data':proxy_str 154 | } 155 | 156 | data=parse.urlencode(data).encode('utf-8') 157 | 158 | try: 159 | opener=request.build_opener() 160 | req=request.Request(url,data) 161 | res=opener.open(req).read().decode('utf-8') 162 | except: 163 | try: 164 | opener=request.build_opener() 165 | req=request.Request(url,data) 166 | res=opener.open(req).read().decode('utf-8') 167 | except: 168 | err_str='error:client->return_proxy:unable to ' \ 169 | 'connect to server' 170 | info_manager(err_str,type='KEY') 171 | return 172 | 173 | if 'return success' in res: 174 | print('Success: return proxy to server') 175 | return 176 | else: 177 | err_str='error:client->return_proxy:'+res 178 | info_manager(err_str,type='KEY') 179 | # raise ConnectionError('Unable to return proxy') 180 | return 181 | 182 | def run(self): 183 | # 监控proxy pool,建议get_proxy_pool单独开一个线程, 184 | # 如果server立即返回则马上关闭,否则设为长连接 185 | 186 | proxy_thread = proxy_keep_thread(self.proxy_pool) 187 | proxy_thread.start() 188 | 189 | if self.task_type=='update': 190 | asyupdate = AsyUpdateHistory(self.proxy_pool,self.task_uid) 191 | asyupdate.run() 192 | 193 | class proxy_keep_thread(threading.Thread): 194 | def __init__(self,proxy_pool): 195 | self.proxy_pool = proxy_pool 196 | threading.Thread.__init__(self) 197 | 198 | def run(self): 199 | inner_count = 0 200 | while True: 201 | inner_count += 1 202 | time.sleep(0.1) 203 | 204 | if inner_count==20: 205 | # print('proxy_keep_thread.run: ths size of pool is {x}' 206 | # .format(x=self.proxy_pool.__len__())) 207 | inner_count = 0 208 | if self.proxy_pool.__len__()50: 394 | print('ERROR:unable to convey success info to server') 395 | os._exit(0) 396 | 397 | res=res.read().decode('utf8') 398 | if 'success' in res: 399 | suc_str='Success:updateHistory->run:\n'\ 400 | 'Success to return update to server' 401 | info_manager(self.pm.gen_block_with_time(suc_str),type="KEY",with_time=False) 402 | else: 403 | string='warn: updateHistory->run: \n' \ 404 | 'get user update, \n' \ 405 | 'but report was denied by server' 406 | info_manager(self.pm.gen_block_with_time(string),type='KEY',with_time=False) 407 | 408 | self.return_proxy() 409 | os._exit(0) 410 | 411 | def return_proxy(self): 412 | 413 | """ 414 | return useful or unused proxy to server 415 | """ 416 | 417 | # check_server() 418 | url='{url}/proxy_return'.format(url=config.SERVER_URL) 419 | proxy_ret= [x.raw_data for x in self.proxy_pool] 420 | proxy_str='' 421 | 422 | for item in proxy_ret: 423 | 
proxy_str=proxy_str+item+';' 424 | proxy_str=proxy_str[0:-1] 425 | data={ 426 | 'data':proxy_str 427 | } 428 | 429 | data=parse.urlencode(data).encode('utf-8') 430 | 431 | try: 432 | opener=request.build_opener() 433 | req=request.Request(url,data) 434 | res=opener.open(req).read().decode('utf-8') 435 | except: 436 | try: 437 | opener=request.build_opener() 438 | req=request.Request(url,data) 439 | res=opener.open(req).read().decode('utf-8') 440 | except: 441 | err_str='error:client->return_proxy:unable to ' \ 442 | 'connect to server' 443 | info_manager(err_str,type='KEY') 444 | return 445 | 446 | if 'return success' in res: 447 | print('Success: return proxy to server') 448 | return 449 | else: 450 | err_str='error:client->return_proxy:'+res 451 | info_manager(err_str,type='KEY') 452 | # raise ConnectionError('Unable to return proxy') 453 | return 454 | 455 | @asyncio.coroutine 456 | async def asyUpdateHistory_user(self, task_dict, ret_content, page_undealed_list, timeout=10): 457 | 458 | # 初始化变量 459 | container_id = task_dict['container_id'] 460 | update_time = task_dict['update_time'] 461 | latest_blog = task_dict['latest_blog'] 462 | reconn_limit = task_dict['reconn_limit'] 463 | proxy_limit = task_dict['proxy_limit'] 464 | retry_left = task_dict['retry_left'] 465 | 466 | self.url_model='http://m.weibo.cn/page/json?containerid={cid}_-_WEIBO_SECOND_PROFILE_WEIBO&page={page}' 467 | aconn = AsyConnector(self.proxy_pool) 468 | 469 | page = 1 470 | batch = 1 # 每批获取页面的个数 471 | ave_time_per_page = None 472 | 473 | # this func exec the normal seq, and there will be another func to deal with unsuccess page 474 | continue_err_page_count = 0 475 | 476 | while True: 477 | if continue_err_page_count>5: 478 | info_manager('warning: the continue_err_page_count come up to 5, up to {p}, finish {c} task' 479 | .format(c=container_id,p=page)) 480 | self.exec_res.add_user_finish(container_id) 481 | break 482 | # for i in range(batch): 483 | try: 484 | url = self.url_model.format(cid=container_id,page=page) 485 | if self.exec_res.unfinished_size()>0: 486 | # print(self.exec_res.report_unfinished_tasks()) 487 | info_manager('user execution report: user: {i} | current page: {p} | pre actions num: {a}' 488 | .format(i=container_id,a=self.exec_res.get_action_times(container_id),p=page)) 489 | 490 | self.exec_res.add_user_action(container_id) # 对运行结果进行监控 491 | self.exec_res.add_page_action(container_id,page) 492 | time_start = time.time() 493 | 494 | res = await self.getPageContent(url,proxy_limit,reconn_limit,timeout=timeout) 495 | continue_err_page_count = 0 496 | 497 | self.exec_res.add_page_success(container_id,page) # 对运行结果进行监控 498 | time_end = time.time() 499 | self.exec_res.add_exec_time(time_end-time_start) 500 | 501 | # if page>=5 and not ave_time_per_page: # 经过前5页以后,开始估计要多少同步获取才能够在5页内完成 502 | # target_gaps = min((time.time() - latest_blog + 86400*10), 86400*80) 503 | # current_gap = time.time()-res[-1]['created_timestamp'] 504 | # ave_gap_per_page = current_gap/page 505 | # target_gaps -= current_gap 506 | # if target_gaps>0: 507 | # tmp = int(target_gaps/(ave_gap_per_page*5)) 508 | # if tmp>1: 509 | # batch = tmp 510 | 511 | valid_res = self.pick_out_valid_res(res,latest_blog,update_time) 512 | ret_content += valid_res 513 | if valid_res.__len__()0 else 0) 638 | ret += "The page status is {a} / {b}\n".format( 639 | a = self._success_page_count, 640 | b = self._action_page_count 641 | ) 642 | ret += "page success ratio: {p}\n".format( 643 | p = 
str(self._success_page_count/self._action_page_count)[:5] 644 | ) 645 | ret += "this task lasted for {t} secs".format( 646 | t = int (time.time() - self._mission_start_time)) 647 | return ret 648 | 649 | def report_unfinished_tasks(self): 650 | ret = '' 651 | for id in self._unfinished_ids: 652 | tmp = 'id:{i}\taction pages:{p}'.format(i=id,p=self._action_user_set[id]) 653 | ret += tmp + '\n' 654 | return ret[:-1] 655 | 656 | def unfinished_size(self): 657 | return self._total_user_num-self._finished_user_count 658 | 659 | def get_action_times(self,container_id): 660 | return self._action_user_set.get(container_id,0) 661 | 662 | def tmp(self): 663 | return self._action_user_set 664 | 665 | @asyncio.coroutine 666 | async def asyUpdateHistory_undealed(self,task,ret_content,timeout=10): 667 | try: 668 | page_id = task['page_id'] 669 | container_id = task['container_id'] 670 | self.exec_undealed_status.add_action_page(container_id, page_id) 671 | url = self.url_model.format(cid=container_id,page=page_id) 672 | res = await self.getPageContent(url, 673 | task['proxy_limit'], 674 | task['reconn_limit'], 675 | timeout=timeout 676 | ) 677 | valid_res = self.pick_out_valid_res(res,task['latest_blog'],task['update_time']) 678 | ret_content += valid_res 679 | info_manager(' UNDEALED: Success {cid}-page {i} is done'.format(cid=container_id,i=page_id)) 680 | self.exec_undealed_status.add_success_page(container_id, page_id) 681 | except: 682 | if task['retry_left'] > 0: 683 | task['retry_left'] -= 1 684 | info_manager("unable to get page {c}-{i} | left time {l}" 685 | .format(c=task['container_id'],i=task['page_id'],l=task['retry_left'])) 686 | await self.asyUpdateHistory_undealed(task,ret_content,timeout=timeout) 687 | else: 688 | self.exec_undealed_status.add_finished_page(container_id, page_id) 689 | print('sorry about that {c} {i}'.format(c=container_id,i=page_id)) 690 | 691 | class exec_undealed_status(): 692 | def __init__(self): 693 | self.action_page_set = {} 694 | self.action_page_count = 0 695 | self.success_page_set = {} 696 | self.success_page_count = 0 697 | self.finished_page_set = {} 698 | self.finished_page_count = 0 699 | 700 | 701 | def add_action_page(self,container_id, page_id): 702 | key = '{c}-{i}'.format(c=container_id,i=page_id) 703 | tmp = self.action_page_set.get(key, 0) 704 | if tmp==0: 705 | self.action_page_count += 1 706 | self.action_page_set[key] = tmp + 1 707 | 708 | def add_success_page(self,container_id, page_id): 709 | key = '{c}-{i}'.format(c=container_id,i=page_id) 710 | tmp = self.success_page_set.get(key, 0) 711 | if tmp==0: 712 | self.success_page_count += 1 713 | self.success_page_set[key] = tmp + 1 714 | self.add_finished_page(container_id, page_id) 715 | 716 | def add_finished_page(self,container_id, page_id): 717 | key = '{c}-{i}'.format(c=container_id,i=page_id) 718 | tmp = self.finished_page_set.get(key,0) 719 | if tmp==0: 720 | self.finished_page_count += 1 721 | self.finished_page_set[key] = tmp + 1 722 | 723 | class exec_undealed_supervisor(threading.Thread): 724 | def __init__(self, msg_queue, exec_status): 725 | self.exec_status = exec_status 726 | self.msg_queue = msg_queue 727 | threading.Thread.__init__(self) 728 | 729 | def run(self): 730 | pm = PrintManager() 731 | while self.msg_queue.__len__()==0: 732 | time.sleep(10) 733 | if self.msg_queue.__len__()==0: 734 | info_manager( 735 | pm.gen_block_with_time( 736 | "undealed execution status:\n" 737 | "{a} / {b} / {c}".format(a = self.exec_status.success_page_count, 738 | b = 
738 |                                                      b = self.exec_status.finished_page_count,
739 |                                                      c = self.exec_status.action_page_count
740 |                             )
741 |                         ),
742 |                         type = 'KEY',
743 |                         with_time = False
744 |                     )
745 |
746 |
747 |     async def getPageContent(self, url, proxy_limit,
748 |                              reconn_limit, timeout=10):
749 |         aconn = AsyConnector(self.proxy_pool)
750 |
751 |         # get page
752 |         try:
753 |             page = await aconn.getPage(url, proxy_limit,
754 |                                        reconn_limit, timeout=timeout)
755 |         except Exception as e:
756 |             raise IOError("Unable to get page, url: {u}"
757 |                           .format(u=url))
758 |         # parse page
759 |         try:
760 |             pmp = parseMicroblogPage()
761 |             res = pmp.parse_blog_page(page)
762 |             return res
763 |         except Exception as e:
764 |             raise ValueError("Unable to parse page, url: {u}, \n\t\tpage content:{p}".format(p=page,u=url))
765 |
766 |     def pick_out_valid_res(self,init_res,latest_blog,update_time):
767 |         valid_res = []
768 |         for r in init_res:
769 |             if int(r['created_timestamp'])>int(latest_blog)-60*60*24*10 \
770 |                     and int(r['created_timestamp'])>time.time()-60*60*24*80:    # trace back to 10 days before the latest blog, or to the most recent 80 days
771 |                 # if int(r['created_timestamp'])>time.time()-60*60*24*80:
772 |                 valid_res.append(r)
773 |         return valid_res
774 |
775 | class AsyConnector():
776 |     def __init__(self, proxy_pool, if_proxy=True):
777 |         self.proxy_pool = proxy_pool
778 |         self.if_proxy = if_proxy
779 |
780 |     async def getPage(self, url, proxy_limit, reconn_limit,
781 |                       proxy_used=0, timeout=10):
782 |         while True:
783 |             if self.proxy_pool.__len__()>0:
784 |                 proxy = 'http://'+self.proxy_pool.pop(0).url
785 |                 break
786 |             else:
787 |                 info_manager("AsyConnector.getPage.getproxy->"
788 |                              "unable to get proxy, sleep for 3 sec",type="NORMAL")
789 |                 await asyncio.sleep(3)
790 |         try:
791 |             ret_data = await self.__single_connect(url,
792 |                                                    proxy,
793 |                                                    reconn_limit,
794 |                                                    timeout=timeout)
795 |             reconn_times = ret_data['reconn_times']
796 |             page = ret_data['content']
797 |             info_manager("succeeded in getting page {u} \n\t\tafter trying {p} proxies and {r} reconnections"
798 |                          .format(u=url,p=proxy_used,r=reconn_times))
799 |             return page
800 |         except Exception as e:
801 |             info_manager("Error from AsyConnector.getPage {u}\n\t\treason:{e}"
802 |                          .format(e=e,u=url))
803 |             if proxy_used < proxy_limit:
804 |                 info_manager('\t\tthis proxy seems invalid, switching to another, '
805 |                              'the {i}th proxy'.format(i=proxy_used+1))
806 |                 return await self.getPage(url,
807 |                                           proxy_limit,
808 |                                           reconn_limit,
809 |                                           proxy_used+1,
810 |                                           timeout=timeout)
811 |             else:
812 |                 raise RuntimeError("** warning: cannot get page, "
813 |                                    "proxy limit exhausted")
814 |
815 |     async def __single_connect(self,url, proxy, reconn_limit,    # handle the request through a single proxy
816 |                                reconn_times=0, timeout=10):
817 |         headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) '
818 |                                  'AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile'
819 |                                  '/12A4345d Safari/600.1.4'}
820 |         conn = aiohttp.ProxyConnector(proxy=proxy, conn_timeout=timeout)
821 |         async with aiohttp.ClientSession(connector=conn) as session:
822 |             try:
823 |                 with aiohttp.Timeout(timeout+1):
824 |                     async with session.get(url, headers=headers) as resp:
825 |                         content = await resp.read()
826 |                         content = content.decode('utf8')
827 |                         # print("success to get page after reconn {t} times".format(t=reconn_times))
828 |                         ret_data = dict(
829 |                             content = content,
830 |                             reconn_times = reconn_times
831 |                         )
832 |                         return ret_data
833 |             except Exception as e:
834 |                 info_manager("Error from AsyConnector.__single_connect: \n\t\treason :{x}".format(x=e))
835 |                 if reconn_times < reconn_limit:
836 |                     info_manager("\t\treconnecting, attempt {i}".format(i=reconn_times+1))
837 |                     return await self.__single_connect(url, proxy, reconn_limit,
838 |                                                        reconn_times+1, timeout=timeout)
839 |                 else:
840 |                     raise RuntimeError("** warning: cannot get page, ready to change proxy and retry")
841 |
842 | def info_manager(info_str,type='NORMAL',with_time=True):
843 |     time_stick=time.strftime('%Y/%m/%d %H:%M:%S ||',
844 |                              time.localtime(time.time()))
845 |     if with_time:
846 |         msg=time_stick+info_str
847 |     else:
848 |         msg = info_str
849 |     if type=='NORMAL':
850 |         if config.NOMAL_INFO_PRINT:
851 |             print(msg)
852 |     elif type=='KEY':
853 |         if config.KEY_INFO_PRINT:
854 |             print(msg)
855 |     elif type=='DEBUG':
856 |         if config.DEBUG_INFO_PRINT:
857 |             print(msg)
858 |     else:
859 |         print('Error from info_manager : unknown type')
860 |
861 | def check_server():
862 |
863 |     """
864 |     check if the server can provide service
865 |     if the server is valid, it will return 'connection valid'
866 |     """
867 |
868 |     url='{url}/auth'.format(url=config.SERVER_URL)
869 |     while True:
870 |
871 |         # # todo: extremely dangerous!!! this skipped the check_server step
872 |         # break
873 |
874 |         try:
875 |             res=request.urlopen(url,timeout=10).read()
876 |             res=str(res,encoding='utf8')
877 |             #--------------------------------------------------
878 |             if 'connection valid' in res:
879 |                 break
880 |             else:
881 |                 error_str='error: client->check_server: ' \
882 |                           'no auth to connect to server, exiting process'
883 |                 info_manager(error_str,type='KEY')
884 |                 os._exit(0)
885 |         except Exception as e:
886 |             err_str='error:client->check_server:cannot ' \
887 |                     'connect to server; process sleeping'
888 |             info_manager(err_str,type='NORMAL')
889 |             print('Error from check_server',e,' url is',url)
890 |             time.sleep(5)   # sleep for 5 seconds
891 |
892 | class proxy_object():
893 |     def __init__(self,data):    # in this version, data is a string of the form 'str(proxy),int(timedelay)'
894 |         self.raw_data=data
895 |         res=data.split(',')
896 |         self.url=res[0]
897 |         self.timedelay=res[1]
898 |     def getUrl(self):
899 |         return self.url
900 |     def getRawType(self):   # return the original format
901 |         return self.raw_data
902 |
903 | def generate_timestr():
904 |     tstr = time.strftime('%Y/%m/%d %H:%M:%S',time.localtime(time.time()))
905 |     return tstr
906 |
907 | class PrintManager():
908 |     def gen_timestr(self):
909 |         tstr = time.strftime('%Y/%m/%d %H:%M:%S',time.localtime(time.time()))
910 |         return tstr
911 |
912 |     def gen_center_str(self, content, len=42, frame="|||"):
913 |         if type(content)==str:
914 |             content = content.split("\n")
915 |             # content = [content]
916 |         ret = ""
917 |         for s in content:
918 |             left = len-frame.__len__()*2-s.__len__()
919 |             margin_left = left>>1
920 |             margin_right = left-margin_left
921 |             line = "{fr}{ml}{s}{mr}{fr}".format(
922 |                 ml = " "*margin_left,
923 |                 s = s,
924 |                 mr = " "*margin_right,
925 |                 fr = frame
926 |             )
927 |             ret += line+'\n'
928 |         return ret
929 |
930 |     def gen_block(self, content, len=42, frame="|||"):
931 |         ret = "="*len + '\n'
932 |         ret += self.gen_center_str(content,len,frame=frame)
933 |         ret += "="*len + '\n'
934 |         return ret
935 |
936 |
937 |     def gen_block_with_time(self, content, len=42, frame="|||"):
938 |         ret = "="*len+'\n'
939 |         time_s = self.gen_timestr()
940 |         timeline = "TIME: "+time_s
941 |         ret += self.gen_center_str(timeline,len,frame=frame)
942 |         return ret+self.gen_block(content,len,frame=frame)
943 |
944 |
945 |
946 | class upload_history(upload_list):
947 |     def __init__(self,data,url,pack_len,thread_num,container_id):
948 |         self.container_id=container_id
949 |         setting=dict(
950 |             batch_size=pack_len,
951 |             thread_adjust=False,
952 |             thread_num=thread_num
953 |         )
954 |         upload_list.__init__(self,data,url,setting)
955 |
956 |     def pack_block(self,main_data,pack_id,pack_num):
957 |         data={
958 |             'data': main_data,
959 |             'current_id': pack_id,
960 |             'total_num': pack_num,
961 |             'len': main_data.__len__(),
962 |             'container_id':self.container_id
963 |         }
964 |         data=parse.urlencode(data).encode('utf8')
965 |         return data
966 |
967 | if __name__=='__main__':
968 |     p_pool = []
969 |     uuid = 5
970 |     for i in range(3):
971 |         p = Process(target=clientAsy,args=(uuid,))
972 |         p_pool.append(p)
973 |     for p in p_pool:
974 |         p.start()
975 |     while True:
976 |         for i in range(p_pool.__len__()):
977 |             if not p_pool[i].is_alive():
978 |                 p_pool[i] = Process(target=clientAsy,args=(uuid,))
979 |                 x=random.randint(1,10)
980 |                 time.sleep(x)
981 |                 p_pool[i].start()
982 |
983 |
--------------------------------------------------------------------------------