├── KMSD.py ├── README.md ├── master ├── share_code │ ├── scrapy.cfg │ ├── share_code │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── middlewares.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── algorithm │ │ │ ├── best_params.csv │ │ │ ├── similarity.py │ │ │ ├── similarity.pyc │ │ │ └── test1.py │ │ ├── auto_ip.py │ │ ├── auto_ip_multi_pro.py │ │ ├── example.py │ │ ├── extensions.pyc │ │ ├── extensionsItem.py │ │ ├── extensionsItem.pyc │ │ ├── extensionsTime.py │ │ ├── extensionsTime.pyc │ │ ├── items.py │ │ ├── items.pyc │ │ ├── middlewares.py │ │ ├── middlewares.pyc │ │ ├── pipelines.py │ │ ├── pipelines.pyc │ │ ├── settings.py │ │ ├── settings.pyc │ │ ├── spiders │ │ │ ├── ShareSpider.py │ │ │ ├── ShareSpider.pyc │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ └── __pycache__ │ │ │ │ ├── ShareSpider.cpython-36.pyc │ │ │ │ └── __init__.cpython-36.pyc │ │ └── untitled2.py │ └── zookeeper.out └── zoo_detect │ └── zoo_watcher.py ├── nupic_output.py ├── share_experiment.py ├── similarity.py ├── slave ├── share_code │ ├── scrapy.cfg │ └── share_code │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ ├── middlewares.cpython-36.pyc │ │ ├── pipelines.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── auto_ip.py │ │ ├── auto_ip_multi_pro.py │ │ ├── example.py │ │ ├── extensions.pyc │ │ ├── extensionsItem.py │ │ ├── extensionsItem.pyc │ │ ├── extensionsTime.py │ │ ├── extensionsTime.pyc │ │ ├── items.py │ │ ├── items.pyc │ │ ├── middlewares.py │ │ ├── middlewares.pyc │ │ ├── pipelines.py │ │ ├── pipelines.pyc │ │ ├── settings.py │ │ ├── settings.pyc │ │ ├── spiders │ │ ├── SlaveSpider.py │ │ ├── SlaveSpider.pyc │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ └── __pycache__ │ │ │ ├── SlaveSpider.cpython-36.pyc │ │ │ └── __init__.cpython-36.pyc │ │ └── untitled2.py └── zoo_detect │ └── zoo_watcher.py ├── test_nupic.py ├── testbp.py ├── 参考博客 ├── 各组件安装文档说明 ├── 启动步骤 └── 笔记摘录 /KMSD.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from similarity import MSD 3 | import pandas as pd 4 | import numpy as np 5 | import copy 6 | import random 7 | 8 | def local_MSD_layer(a,b): #对应于出现断层数据 9 | l1=np.nonzero(a!=-1)[0] 10 | l2=np.nonzero(b!=-1)[0] 11 | l=list(set(l1) & set(l2)) 12 | sim=MSD(a[l],b[l]) 13 | return sim 14 | 15 | #在这里我将欧式距离改为了MSD距离 16 | def distEclud(vecA,vecB): 17 | return local_MSD_layer(vecA,vecB) 18 | #随机初始化K个质心(质心满足数据边界之内) 19 | def randCent(dataSet,k): 20 | #得到数据样本的维度 21 | n=np.shape(dataSet)[1] 22 | #初始化为一个(k,n)的矩阵 23 | centroids=np.mat(np.zeros((k,n))) 24 | #遍历数据集的每一维度 25 | for j in range(n): 26 | #得到该列数据的最小值 27 | minJ=min(dataSet[:,j]) 28 | #得到该列数据的范围(最大值-最小值) 29 | rangeJ=float(max(dataSet[:,j])-minJ) 30 | #k个质心向量的第j维数据值随机为位于(最小值,最大值)内的某一值 31 | centroids[:,j]=minJ+rangeJ*np.random.rand(k,1) 32 | #返回初始化得到的k个质心向量 33 | return np.array(centroids) 34 | #k-均值聚类算法 35 | #@dataSet:聚类数据集 36 | #@k:用户指定的k个类 37 | #@distMeas:距离计算方法,默认欧氏距离distEclud() 38 | #@createCent:获得k个质心的方法,默认随机获取randCent() 39 | def kMeans(dataSet,k,distMeas=distEclud,createCent=randCent): 40 | dataSet=np.array(dataSet) 41 | #获取数据集样本数 42 | m=np.shape(dataSet)[0] 43 | #初始化一个(m,2)的矩阵 44 | clusterAssment=np.mat(np.zeros((m,2))) 45 | #创建初始的k个质心向量 46 | centroids=createCent(dataSet,k) 47 | #聚类结果是否发生变化的布尔类型 48 | clusterChanged=True 49 | #只要聚类结果一直发生变化,就一直执行聚类算法,直至所有数据点聚类结果不变化 50 
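# The loop below is a standard Lloyd-style k-means iteration: assign each sample to the
# centroid with the smallest distMeas() value, recompute every centroid from the samples
# assigned to it, and repeat until no assignment changes between passes. The only twist is
# that distMeas() is the masked MSD distance defined above, which drops positions marked -1
# (missing values) from both vectors before comparing them.
# MSD itself is imported from similarity.py (not shown in this section); purely for
# illustration, a minimal sketch of what such a mean-squared-difference distance could look
# like (this is an assumption, not the project's actual implementation):
#
#     def msd_distance(a, b):
#         a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
#         return float(np.mean((a - b) ** 2))  # 0.0 means the two series are identical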
| while clusterChanged: 51 | #聚类结果变化布尔类型置为false 52 | clusterChanged=False 53 | #遍历数据集每一个样本向量 54 | for i in range(m): 55 | #初始化最小距离最正无穷;最小距离对应索引为-1 56 | minDist=np.inf;minIndex=-1 57 | #循环k个类的质心 58 | for j in range(k): 59 | #计算数据点到质心的欧氏距离 60 | distJI=distMeas(centroids[j,:],dataSet[i,:].reshape(centroids[j,:].shape)) 61 | #如果距离小于当前最小距离 62 | if distJI=int(0.1*len(tempDF)): #限制为10%的好处是,可以避免这种情况:两个序列中,有一个序列非常多-1,导致计算不准确 109 | print('Not satisfied!') 110 | continue 111 | simL=[] 112 | for j in range(tempDF.shape[1]): 113 | if j!=detect: #自己不会跟自己计算 114 | l=len(np.nonzero(tempDF.iloc[:,j]==-1)[0]) 115 | if l<=int(0.1*len(tempDF)): #如果-1占比不超过10%,计算 116 | sim=local_MSD_layer(tempDF.iloc[:,detect],tempDF.iloc[:,j]) 117 | simL.append({sim:str(detect)+'-'+str(j)}) #相似度做keys 118 | else: 119 | print('%d col is not suitable for %d because -1 too much '%(j,detect)) 120 | D=sortedTw(simL) #得到与detect符合的20个序列序号,以及相似度 121 | Data,Label = update_train(tempDF,D,detect,Data,Label) #更新train训练集和标签 122 | print('Data print: ',Data) 123 | #ime.sleep(6) 124 | return Data,Label 125 | 126 | def percent(a): 127 | a=pd.DataFrame(a) 128 | a.columns=['origin','predict'] 129 | b=[] 130 | for i in range(len(a)): 131 | d=abs(a.iloc[i,0]-a.iloc[i,1])/a.iloc[i,1] if a.iloc[i,1] else 'Error' 132 | if not isinstance(d,str): 133 | d=str(d*100)+'%' 134 | b.append(d) 135 | a['loss']=b 136 | return a 137 | def use_algorithm(Data,Label,pro=0.7): #利用sklearn的神经网络实现预测,默认0.7训练集 138 | #from sklearn.neural_network import MLPRegressor 139 | train_length=int(pro*len(Data)) 140 | trainD=Data[:train_length,:] 141 | testD=Data[train_length:,:] 142 | trainL=Label[:train_length] 143 | testL=Label[train_length:] 144 | print('trainD,trainL,testD,testL: ',trainD.shape,trainL.shape,testD.shape,testL.shape) 145 | #exit(1) 146 | # 神经网络 147 | parameters = { 148 | 'hidden_layer_sizes': [(15,),(7,),(21,),(15,3),(7,3),(21,3),(7,3,3),(7,5,3)], 149 | 'max_iter': [20000,100000], 150 | 'momentum': [0,0.5,1], 151 | 'learning_rate': ['adaptive','constant','invscaling'],\ 152 | 'solver': ['sgd','adam'],\ 153 | 'shuffle': [False],\ 154 | 'activation': ['logistic','relu','tanh'] 155 | } 156 | mlp = MLPRegressor() 157 | clf = GridSearchCV(mlp, parameters) 158 | model=clf.fit(trainD,trainL) 159 | bestp=clf.best_params_ 160 | with open('best_params.csv','w') as f: 161 | f.write(str(bestp)) 162 | f.close() 163 | with open('best_params.csv','r') as f: 164 | bestp=eval(f.read()) 165 | f.close() 166 | print('bestp: ',bestp) 167 | #clf = MLPRegressor(hidden_layer_sizes=(15,), max_iter=100000,learning_rate='adaptive',solver='sgd',shuffle=False,activation='logistic') 168 | #model = clf.fit(trainD,trainL) 169 | predictD = model.predict(testD) #predict data 170 | 171 | nnDF=np.concatenate([predictD.reshape(-1,1),testL.reshape(-1,1)],axis=1) 172 | loss=percent(nnDF) 173 | print('loss: ',loss) 174 | #coefficient of determination ---- 1.0 is the best 175 | score = model.score(testD,testL) 176 | print('score: ',score) 177 | 178 | if __name__=='__main__': 179 | 180 | filepath='/sdbadmin/hadoop/input' 181 | try: 182 | client=Client('http://192.168.111.130:50070') 183 | except Exception as e: 184 | print(e) 185 | 186 | dirs=client.list(filepath) 187 | #将hdfs本地化 188 | print('there are %d shares'%(len(dirs))) 189 | ''' 190 | try: 191 | for i in range(len(dirs)): 192 | client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i]) 193 | except Exception as e: 194 | print(e) 195 | ''' 196 | min_max_scaler = preprocessing.MinMaxScaler() 197 | DD=pd.DataFrame([]) 198 | for i in 
range(len(dirs)): 199 | df=pd.read_csv('/opt/share_code_data/'+dirs[i],index_col=0) 200 | if len(DD)==len(df) or len(DD)==0 and len(df)!=0: 201 | trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1)) 202 | DD[dirs[i].strip().split('.')[0]]=trun.ravel() 203 | elif len(df)!=0: 204 | cols=DD.columns 205 | trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1)) 206 | DD=pd.concat([DD,pd.DataFrame(trun)],axis=1) #长度不一致的合并 207 | f=list(cols) 208 | f.append(dirs[i].strip().split('.')[0]) 209 | DD.columns=f 210 | DD.fillna(-1, inplace=True) 211 | 212 | print('DD shape: ',np.shape(DD)) 213 | print(DD) #[6690 rows x 169 columns] 214 | 215 | Data,Label=create_examples(DD,detect=0) 216 | 217 | use_algorithm(Data,Label) 218 | 219 | 220 | ''' 221 | for i in range(y): #遍历所有share 222 | print('Now is %s'%DD.columns[i]) 223 | for j in range(y): 224 | if i!=j: 225 | l1=np.nonzero(DD.iloc[:,i]!=-1)[0] 226 | l2=np.nonzero(DD.iloc[:,j]!=-1)[0] 227 | l=set(l1) & set(l2) 228 | tempDF=copy.deepcopy(DD.iloc[list(l),[i,j]]) 229 | print('tempDF shape: ',tempDF.shape) 230 | train_len=int(len(tempDF)*0.7) 231 | test_len=int(len(tempDF)*0.3) 232 | ''' 233 | -------------------------------------------------------------------------------- /master/share_code/share_code/auto_ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 13 21:51:29 2018 4 | 5 | @author: Administrator 6 | """ 7 | import urllib 8 | import urllib2 9 | import time 10 | import redis 11 | from scrapy.selector import Selector 12 | import requests 13 | 14 | def get_url(url): # 国内高匿代理的链接 15 | url_list = [] 16 | for i in range(1,100): 17 | url_new = url + str(i) 18 | url_list.append(url_new) 19 | return url_list 20 | def get_content(url): # 获取网页内容 21 | user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0' 22 | headers = {'User-Agent': user_agent} 23 | req = requests.get(url=url, headers=headers) 24 | 25 | ''' 26 | req = urllib.request.Request(url=url, headers=headers) 27 | res = urllib.request.urlopen(req) 28 | content = res.read() 29 | return content.decode('utf-8') 30 | ''' 31 | return req 32 | 33 | def get_info(content): # 提取网页信息 / ip 端口 34 | datas_ip = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract() 35 | datas_head = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract() 36 | datas_port =Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract() 37 | 38 | ''' 39 | datas_ip = Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract() 40 | datas_head = Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract() 41 | datas_port =Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract() 42 | ''' 43 | 44 | #写入redis 45 | print('head: ',datas_head) 46 | try: 47 | R=redis.Redis(host='localhost',port='6379') 48 | except Exception as e: 49 | print(e) 50 | count=0 51 | for head,ip,port in zip(datas_head,datas_ip,datas_port): 52 | p=R.zadd('share:auto_ip_pool',str(head).lower()+'://'+str(ip)+':'+str(port),count) 53 | if p: 54 | count+=1 55 | #print(datas_ip,datas_port) 56 | return datas_head,datas_ip,datas_port 57 | def verify_ip(head,ip,port): # 验证ip有效 58 | user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0' 59 | headers = {'User-Agent':user_agent} 60 | if 'https' in head: 61 | proxy = {'https':'%s://%s:%s'%(head.lower(),ip,port)} 62 | else: 63 | proxy = {'http':'%s://%s:%s'%(head.lower(),ip,port)} 64 | print(proxy) 65 | 66 | #proxy_handler = urllib.request.ProxyHandler(proxy) 67 | proxy_handler=urllib2.ProxyHandler(proxy) 68 | #opener = urllib.request.build_opener(proxy_handler) 69 | opener = urllib2.build_opener(proxy_handler) 70 | urllib2.install_opener(opener) 71 | 72 | #test_url = "https://www.baidu.com/" 73 | test_url = "http://quote.eastmoney.com/stocklist.html#sh" 74 | req = urllib2.Request(url=test_url,headers=headers) 75 | time.sleep(3) 76 | 77 | try: 78 | R=redis.Redis(host='localhost',port='6379') 79 | except Exception as e: 80 | print(e) 81 | count=0 82 | try: 83 | res = urllib2.urlopen(req,timeout=1) 84 | #time.sleep(3) 85 | content = res.read() 86 | if content: 87 | print('that is ok') 88 | R.zadd('share:auto_ip_pool_ok',str(head).lower()+'://'+str(ip)+':'+str(port),count) 89 | count+=1 90 | else: 91 | print('its not ok') 92 | except urllib2.URLError as e: 93 | print(e.reason) 94 | except Exception as e: 95 | print(e) 96 | 97 | if __name__ == '__main__': 98 | url = 'http://www.xicidaili.com/nn/' 99 | url_list = get_url(url) 100 | for i in url_list: 101 | print(i) 102 | content = get_content(i) 103 | time.sleep(3) 104 | data=get_info(content) 105 | for head,ip,port in zip(data[0],data[1],data[2]): 106 | verify_ip(head,ip,port) 107 | -------------------------------------------------------------------------------- /master/share_code/share_code/auto_ip_multi_pro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 11 00:01:28 2018 4 | 5 | @author: Administrator 6 | """ 7 | from scrapy.selector import Selector 8 | import time 9 | import requests 10 | import redis 11 | from multiprocessing import Process, Queue 12 | 13 | 14 | def get_url(url): # 国内高匿代理的链接 15 | url_list = [] 16 | for i in range(1,100): 17 | url_new = url + str(i) 18 | url_list.append(url_new) 19 | return url_list 20 | 21 | def get_content(url): # 获取网页内容 22 | user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36' 23 | headers = {'User-Agent': user_agent} 24 | req = requests.get(url=url, headers=headers) 25 | return req 26 | 27 | def get_info(content): # 提取网页信息 / ip 端口 28 | datas_ip = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract() 29 | datas_head = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract() 30 | datas_port =Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract() 31 | 32 | #写入redis 33 | print('head: ',datas_head) 34 | try: 35 | R=redis.Redis(host='localhost',port='6379') 36 | except Exception as e: 37 | print(e) 38 | count=0 39 | for head,ip,port in zip(datas_head,datas_ip,datas_port): 40 | p=R.zadd('share:auto_ip_pool',str(head).lower()+'://'+str(ip)+':'+str(port),count) 41 | if p: 42 | count+=1 43 | #print(datas_ip,datas_port) 44 | return datas_head,datas_ip,datas_port 45 | 46 | def verify_ip_one(old_queue,new_queue): # 验证ip有效性 47 | while 1: 48 | data=old_queue.get() 49 | print(data) 50 | if data==0: 51 | break 52 | head=data[0].lower() 53 | ip=data[1] 54 | port=data[2] 55 | print('head,data,ip') 56 | 57 | user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0' 58 | accept_encoding ='gzip, deflate, sdch' 59 | accept_language ='zh-CN,zh;q=0.8' 60 | headers = {'User-Agent':user_agent,'Accept-Encoding':accept_encoding,'Accept-Language':accept_language} 61 | if 'https' in head: 62 | proxy = {'https':'%s://%s:%s'%(head,ip,port)} 63 | else: 64 | proxy = {'http':'%s://%s:%s'%(head,ip,port)} 65 | print(proxy) 66 | 67 | test_url = "https://www.baidu.com/" 68 | 69 | try: 70 | req = requests.get(url=test_url,proxies=proxy,headers=headers) 71 | status_code=req.status_code 72 | if status_code==200: 73 | print('that is ok') 74 | print(str(ip) + u":" + str(port)) 75 | new_queue.put([head,ip,port]) 76 | else: 77 | print('its not ok') 78 | except Exception as e: 79 | print('fall down') 80 | 81 | def verif_ip(data): 82 | old_queue=Queue() 83 | for head,ip,port in zip(data[0],data[1],data[2]): 84 | old_queue.put([head,ip,port]) #往没验证过的queue加入数据 85 | print('verify ip.....') 86 | print('old_queue: ',old_queue.qsize()) 87 | old_queue.put(0) #终止条件 88 | new_queue=Queue() 89 | works = [] 90 | for i in range(1): 91 | print('process %s'%i) 92 | works.append(Process(target=verify_ip_one, args=(old_queue,new_queue))) 93 | for work in works: 94 | print('process start') 95 | work.start() 96 | work.join() 97 | ''' 98 | for work in works: 99 | work.join() 100 | ''' 101 | try: 102 | R=redis.Redis(host='localhost',port='6379') 103 | except Exception as e: 104 | print(e) 105 | for i in range(new_queue.qsize()): 106 | head,ip,port=new_queue.get() 107 | R.sadd('share:auto_ip_pool_ok',str(head).lower()+'://'+str(ip)+':'+str(port)) 108 | print('insert one row') 109 | 110 | 111 | if __name__ == '__main__': 112 | url = 'http://www.xicidaili.com/nn/' 113 | url_list = get_url(url) 114 | for i in url_list: 115 | print(i) 116 | content = get_content(i) 117 | time.sleep(3) 118 | data=get_info(content) 119 | verif_ip(data) 120 | ''' 121 | for head,ip,port in zip(data[0],data[1],data[2]): 122 | verif_ip(head.lower(),ip,port) 123 | ''' 124 | -------------------------------------------------------------------------------- /master/share_code/share_code/example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 13 17:35:23 2018 4 | 5 | @author: Administrator 6 | """ 7 | 8 | from multiprocessing import Process, Queue 9 | 10 | def f(q,n): 11 | q.put([42, n, 'hello']) 12 | 13 | if __name__ == '__main__': 14 | q = Queue() 15 | p_list=[] 16 | for i in range(3): 17 | p = Process(target=f, args=(q,i)) 18 | p_list.append(p) 19 | p.start() 20 | print(q.get()) 21 | print(q.get()) 22 | print(q.get()) 23 | for i in p_list: 24 | i.join() -------------------------------------------------------------------------------- /master/share_code/share_code/extensions.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/extensions.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/extensionsItem.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 20 19:18:54 2018 4 | 5 | @author: Administrator 6 | """ 7 | import time 8 | import logging 9 | from scrapy import signals 10 | from scrapy.exceptions import 
NotConfigured 11 | logger = logging.getLogger(__name__) 12 | 13 | class SpiderOpenCloseLogging(object): 14 | 15 | def __init__(self, item_count,idle_number,crawler): 16 | self.item_count = item_count 17 | 18 | self.items_scraped = 0 19 | 20 | self.idle_count = 0 21 | 22 | self.idle_list = [] 23 | 24 | self.crawler = crawler 25 | 26 | self.idle_number = idle_number 27 | 28 | @classmethod 29 | def from_crawler(cls, crawler): 30 | # first check if the extension should be enabled and raise 31 | 32 | # NotConfigured otherwise 33 | 34 | if not crawler.settings.getbool('MYEXT_ENABLED'): 35 | 36 | raise NotConfigured 37 | #idle_number 38 | idle_number = crawler.settings.getint('IDLE_NUMBER', 10) 39 | 40 | # get the number of items from settings 41 | 42 | item_count = crawler.settings.getint('ITEM_NUMBER', 10000000) 43 | 44 | # instantiate the extension object 45 | 46 | ext = cls(item_count,idle_number,crawler) 47 | 48 | # connect the extension object to signals 49 | 50 | crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened) 51 | 52 | crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed) 53 | 54 | crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped) 55 | 56 | crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle) 57 | 58 | # return the extension object 59 | 60 | return ext 61 | 62 | def spider_opened(self, spider): 63 | logger.info("opened spider %s redis spider Idle, Continuous idle limit: %d", spider.name, self.idle_number) 64 | 65 | def spider_closed(self, spider,reason='finished'): #默认结束reason是finished,如果spider是被引擎的 close_spider 方法所关闭,则其为调用该方法时传入的 reason 参数(默认为 'cancelled') 66 | logger.info("closed spider %s, idle count %d , Continuous idle count %d ,closed reason %s", 67 | spider.name, self.idle_count, len(self.idle_list),reason) 68 | 69 | def item_scraped(self, item, spider): 70 | if item: 71 | self.items_scraped += 1 72 | print('self.items: ',item) 73 | if self.items_scraped % self.item_count == 0: 74 | spider.log("scraped %d items" % self.items_scraped) 75 | 76 | def spider_idle(self, spider): 77 | self.idle_count += 1 # 空闲计数 78 | print('idle_count:',self.idle_count) 79 | self.idle_list.append(time.time()) # 每次触发 spider_idle时,记录下触发时间戳 80 | idle_list_len = len(self.idle_list) # 获取当前已经连续触发的次数 81 | if idle_list_len > 8: 82 | # 连续触发的次数达到配置次数后关闭爬虫 83 | logger.info('\n continued idle number exceed {} Times' 84 | '\n meet the idle shutdown conditions, will close the reptile operation' 85 | '\n idle start time: {}, close spider time: {}'.format(8, 86 | self.idle_list[0], self.idle_list[-1])) 87 | self.crawler.engine.close_spider(spider, 'closespider_ForNullRun') 88 | -------------------------------------------------------------------------------- /master/share_code/share_code/extensionsItem.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/extensionsItem.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/extensionsTime.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from time import time 4 | 5 | from scrapy.exceptions import NotConfigured 6 | from twisted.internet import task 7 | from scrapy import signals 8 | 9 | 10 | class Latencies(object): 11 | """ 12 | An extension that measures 
throughput and latencies. 13 | """ 14 | @classmethod 15 | def from_crawler(cls, crawler): 16 | return cls(crawler) 17 | 18 | def __init__(self, crawler): 19 | self.crawler = crawler 20 | self.interval = crawler.settings.getfloat('LATENCIES_INTERVAL') 21 | 22 | if not self.interval: 23 | raise NotConfigured 24 | 25 | cs = crawler.signals 26 | cs.connect(self._spider_opened, signal=signals.spider_opened) 27 | cs.connect(self._spider_closed, signal=signals.spider_closed) 28 | cs.connect(self._request_scheduled, signal=signals.request_scheduled) 29 | cs.connect(self._response_received, signal=signals.response_received) 30 | cs.connect(self._item_scraped, signal=signals.item_scraped) 31 | 32 | self.latency, self.proc_latency, self.items = 0, 0, 0 33 | 34 | def _spider_opened(self, spider): 35 | self.task = task.LoopingCall(self._log, spider) 36 | self.task.start(self.interval) 37 | 38 | def _spider_closed(self, spider, reason): 39 | if self.task.running: 40 | self.task.stop() 41 | 42 | def _request_scheduled(self, request, spider): 43 | request.meta['schedule_time'] = time() 44 | 45 | def _response_received(self, response, request, spider): 46 | request.meta['received_time'] = time() 47 | 48 | def _item_scraped(self, item, response, spider): 49 | self.latency += time() - response.meta['schedule_time'] 50 | self.proc_latency += time() - response.meta['received_time'] 51 | self.items += 1 52 | 53 | def _log(self, spider): 54 | irate = float(self.items) / self.interval #interval 时间内处理item数 55 | latency = self.latency / self.items if self.items else 0 #单个时延 56 | proc_latency = self.proc_latency / self.items if self.items else 0 #单个响应时间 57 | 58 | spider.logger.info(("Scraped %d items at %.1f items/s, avg latency: " 59 | "%.2f s and avg time in pipelines: %.2f s") % 60 | (self.items, irate, latency, proc_latency)) 61 | 62 | self.latency, self.proc_latency, self.items = 0, 0, 0 63 | -------------------------------------------------------------------------------- /master/share_code/share_code/extensionsTime.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/extensionsTime.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy.loader import ItemLoader 10 | from scrapy.loader.processors import MapCompose, TakeFirst, Join 11 | 12 | 13 | class ShareCodeItem(scrapy.Item): 14 | # define the fields for your item here like: 15 | # name = scrapy.Field() 16 | number = scrapy.Field() 17 | 18 | data = scrapy.Field() 19 | 20 | class ShareLoader(ItemLoader): 21 | default_item_class = ShareCodeItem 22 | default_input_processor = MapCompose(lambda s: s.lstrip().replace('\r',''))#去掉左边空格 23 | default_output_processor = Join() 24 | description_out = Join() 25 | -------------------------------------------------------------------------------- /master/share_code/share_code/items.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/items.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | import scrapy 9 | from scrapy import signals 10 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 11 | from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware 12 | import random 13 | import redis 14 | 15 | 16 | class ShareCodeSpiderMiddleware(object): 17 | # Not all methods need to be defined. If a method is not defined, 18 | # scrapy acts as if the spider middleware does not modify the 19 | # passed objects. 20 | 21 | @classmethod 22 | def from_crawler(cls, crawler): 23 | # This method is used by Scrapy to create your spiders. 24 | s = cls() 25 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 26 | return s 27 | 28 | def process_spider_input(self, response, spider): 29 | # Called for each response that goes through the spider 30 | # middleware and into the spider. 31 | 32 | # Should return None or raise an exception. 33 | return None 34 | 35 | def process_spider_output(self, response, result, spider): 36 | # Called with the results returned from the Spider, after 37 | # it has processed the response. 38 | 39 | # Must return an iterable of Request, dict or Item objects. 40 | for i in result: 41 | yield i 42 | 43 | def process_spider_exception(self, response, exception, spider): 44 | # Called when a spider or process_spider_input() method 45 | # (from other spider middleware) raises an exception. 46 | 47 | # Should return either None or an iterable of Response, dict 48 | # or Item objects. 49 | pass 50 | 51 | def process_start_requests(self, start_requests, spider): 52 | # Called with the start requests of the spider, and works 53 | # similarly to the process_spider_output() method, except 54 | # that it doesn’t have a response associated. 55 | 56 | # Must return only requests (not items). 57 | for r in start_requests: 58 | yield r 59 | 60 | def spider_opened(self, spider): 61 | spider.logger.info('Spider opened: %s' % spider.name) 62 | 63 | 64 | class ShareCodeDownloaderMiddleware(object): 65 | # Not all methods need to be defined. If a method is not defined, 66 | # scrapy acts as if the downloader middleware does not modify the 67 | # passed objects. 68 | 69 | @classmethod 70 | def from_crawler(cls, crawler): 71 | # This method is used by Scrapy to create your spiders. 72 | s = cls() 73 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 74 | return s 75 | 76 | def process_request(self, request, spider): 77 | # Called for each request that goes through the downloader 78 | # middleware. 79 | 80 | # Must either: 81 | # - return None: continue processing this request 82 | # - or return a Response object 83 | # - or return a Request object 84 | # - or raise IgnoreRequest: process_exception() methods of 85 | # installed downloader middleware will be called 86 | return None 87 | 88 | def process_response(self, request, response, spider): 89 | # Called with the response returned from the downloader. 
90 | 91 | # Must either; 92 | # - return a Response object 93 | # - return a Request object 94 | # - or raise IgnoreRequest 95 | return response 96 | 97 | def process_exception(self, request, exception, spider): 98 | # Called when a download handler or a process_request() 99 | # (from other downloader middleware) raises an exception. 100 | 101 | # Must either: 102 | # - return None: continue processing this exception 103 | # - return a Response object: stops process_exception() chain 104 | # - return a Request object: stops process_exception() chain 105 | pass 106 | 107 | def spider_opened(self, spider): 108 | spider.logger.info('Spider opened: %s' % spider.name) 109 | 110 | class DoubleUserAgentMiddleware(UserAgentMiddleware): 111 | ''' 112 | 设置User-Agent 113 | ''' 114 | 115 | def __init__(self, user_agent): 116 | self.user_agent = user_agent 117 | 118 | @classmethod 119 | def from_crawler(cls, crawler): 120 | return cls( 121 | user_agent=crawler.settings.get('DOUBLE_USER_AGENT') 122 | ) 123 | 124 | def process_request(self, request, spider): 125 | agent = random.choice(self.user_agent) 126 | request.headers['User-Agent'] = agent 127 | 128 | class AuToIpMiddleware(HttpProxyMiddleware): 129 | '''设置自动IP''' 130 | def __init__(self,ipaddr=''): 131 | super(AuToIpMiddleware,self).__init__ 132 | self.ipaddr=ipaddr 133 | 134 | def process_request(self, request, spider): 135 | '''对request对象加上proxy''' 136 | self.ipaddr=self.get_random_proxy() 137 | print('The chosen ip is: '+str(self.ipaddr)) 138 | request.meta["proxy"] = self.ipaddr 139 | 140 | def process_response(self, request, response, spider): 141 | '''对返回的response处理''' 142 | # 如果返回的response状态不是200,重新生成当前request对象 143 | if response.status != 200 : 144 | if 'Unauthorized' in response.body: 145 | print('length: ',len(response.body)) 146 | print('content: ',response.body) 147 | proxy = self.get_random_proxy() 148 | print("Response ip:"+proxy) 149 | # 对当前reque加上代理 150 | request.meta['proxy'] = self.ipaddr 151 | return request 152 | return request 153 | return response 154 | 155 | def get_random_proxy(self): 156 | try: 157 | R=redis.Redis(host='localhost',port='6379') 158 | except Exception as e: 159 | print(e) 160 | length=R.zcard('share:auto_ip_pool_ok') 161 | number=random.randint(0,length-1) 162 | ipaddr=R.zrange('share:auto_ip_pool_ok',number,number)[0] #取出随机一个ip 163 | ipaddr=ipaddr.decode('utf-8') 164 | return ipaddr 165 | -------------------------------------------------------------------------------- /master/share_code/share_code/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/middlewares.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | #from hdfs import Client 8 | from scrapy.exceptions import DropItem 9 | import logging 10 | logger=logging.getLogger(__name__) 11 | 12 | class ShareCodePipeline(object): 13 | # 初始化方法 14 | def __init__(self): 15 | logger.info("ShareCodePipeline __init__") 16 | ''' 17 | try: 18 | 
self.client=Client('http://192.168.111.130:50070') 19 | except Exception as e: 20 | print(e) 21 | ''' 22 | 23 | def process_item(self, item, spider): 24 | logger.info("ShareCodePipeline process_item") 25 | 26 | if item['number']: 27 | number=item['number'] 28 | logger.info('number exists') 29 | else: 30 | raise DropItem('Missing number in %s'%item) 31 | if item['data']: 32 | data=item['data'] 33 | logger.info('data exists') 34 | #print('data:',data,'\n\n') 35 | else: 36 | raise DropItem('Missing data in %s'%item) 37 | 38 | data_str=data.encode('utf-8') #内含中文,先编码成utf-8 39 | logger.info('ShareCodePipeline process_item success') 40 | ''' 41 | try: 42 | print('begin write') 43 | self.client.write('/sdbadmin/hadoop/input/'+str(number)+'.csv',data=data_str) 44 | print('end write') 45 | except Exception as e: 46 | print(e) 47 | ''' 48 | -------------------------------------------------------------------------------- /master/share_code/share_code/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/pipelines.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for share_code project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'share_code' 13 | 14 | SPIDER_MODULES = ['share_code.spiders'] 15 | NEWSPIDER_MODULE = 'share_code.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'share_code (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'share_code.middlewares.ShareCodeSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 57 | 'share_code.middlewares.DoubleUserAgentMiddleware': 240, 58 | 'share_code.middlewares.AuToIpMiddleware': None, #340 59 | 'share_code.middlewares.ShareCodeDownloaderMiddleware': 543, 60 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware':800, 61 | 'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None 62 | } 63 | 64 | # Enable or disable extensions 65 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 72 | #ITEM_PIPELINES = { 73 | # 'share_code.pipelines.ShareCodePipeline': 300, 74 | #} 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | 97 | MYEXT_ENABLED = True 98 | 99 | LATENCIES_INTERVAL = 5 100 | ITEM_NUMBER = 10000000 101 | IDLE_NUMBER = 8 102 | 103 | MEMUSAGE_NOTIFY_MAIL = ['3081881935@qq.com'] 104 | MEMUSAGE_REPORT = True 105 | MEMUSAGE_ENABLED = True 106 | MEMUSAGE_LIMIT_MB = 2048 107 | MEMDEBUG_ENABLED = True 108 | MEMDEBUG_NOTIFY = [] 109 | 110 | EXTENSIONS = { 111 | 'share_code.extensionsItem.SpiderOpenCloseLogging': 100, 112 | 'share_code.extensionsTime.Latencies': 120, 113 | 'scrapy.extensions.memusage.MemoryUsage': 50, 114 | 'scrapy.extensions.memdebug.MemoryDebugger': 60 115 | } 116 | 117 | ITEM_PIPELINES = { 118 | 'share_code.pipelines.ShareCodePipeline':100 119 | } 120 | 121 | # Enables scheduling/storing the request queue in Redis. 122 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 123 | 124 | # Ensure all spiders share the same duplicates filter through Redis.
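# With the scrapy_redis scheduler above, every worker that points at the same REDIS_HOST
# shares one crawl frontier: by default pending requests live in the Redis key
# "<spider>:requests" and request fingerprints in "<spider>:dupefilter", so the master and
# slave machines never re-crawl each other's URLs.
# REDIS_START_URLS_AS_SET = True (set below) makes the spider read its start URLs from a
# Redis *set* seeded with SADD (e.g. `sadd share:start_urls http://quote.eastmoney.com/stocklist.html#sh`,
# as noted at the top of ShareSpider.py) instead of a list.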
125 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 126 | 127 | REDIS_START_URLS_AS_SET = True 128 | 129 | REDIS_HOST = '192.168.111.130' 130 | REDIS_PORT = '6379' 131 | 132 | DOUBLE_USER_AGENT = [ 133 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 134 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 135 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 136 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 137 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 138 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 139 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 140 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 141 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 142 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 143 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 144 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 145 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 146 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 147 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 148 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 149 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 150 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 151 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 152 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 153 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", 154 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 155 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 156 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 157 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", 158 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 159 
| "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 160 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 161 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 162 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 163 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", 164 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", 165 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 166 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", 167 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", 168 | ] 169 | -------------------------------------------------------------------------------- /master/share_code/share_code/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/settings.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/spiders/ShareSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #redis-cli sadd share:start_urls http://quote.eastmoney.com/stocklist.html#sh 4 | #scrapy crawl share 5 | #del share:download_url share:temp_urls share:share_code 6 | 7 | 8 | from scrapy_redis.spiders import RedisSpider 9 | import redis 10 | import scrapy 11 | from scrapy import log 12 | from share_code.items import ShareLoader 13 | from scrapy.selector import Selector 14 | import re 15 | from kazoo.client import KazooClient 16 | #rom scrapy import log 17 | 18 | class ShareSpider(RedisSpider): 19 | name = "share" 20 | #allowed_domains = ["share.org"]redis_key = 'share:start_urls' 21 | #start_urls = 'http://quote.eastmoney.com/stocklist.html#sh' 22 | redis_key = 'share:start_urls' 23 | 24 | temp_url_piece = 'http://quotes.money.163.com/trade/lsjysj_' #中间url的片段 25 | download_url_piece = 'http://quotes.money.163.com/service/chddata.html?code=0' 26 | 27 | zk = KazooClient(hosts='127.0.0.1:2181') 28 | zk.start() 29 | 30 | # Ensure a path, create if necessary 31 | zk.ensure_path("/ip_process") 32 | 33 | # Create a node with data 34 | zk.create("/ip_process/192.168.111.130", 35 | value=b"ok", ephemeral=True) 36 | 37 | pool=redis.ConnectionPool(host='localhost', port=6379, decode_responses=True) 38 | 39 | def parse(self, response): 40 | self.log('parse begin!',level=log.INFO) 41 | #r = ShareLoader(response=response) 42 | #response.encoding='GB2312' 43 | node_list = Selector(response=response).xpath('//*[@id="quotesearch"]/ul[1]/li/a/text()').extract() 44 | #print(node_list) 45 | code_list = [] 46 | count = 1 #zadd score 47 | totalCount = 0 #total share url 48 | 49 | R = redis.Redis(connection_pool=self.pool) 50 | 51 | self.log('Redis connect success!',level=log.INFO) 52 | for node in node_list: 53 | #r.add_value('number',node) 54 | try: 55 | code = 
re.match(r'.*?\((\d+)\)', node).group(1) 56 | print ('code: ',code) 57 | code_list.append(code) 58 | p0 = R.zadd('share:share_code',code,count) #增加code进sorted_set 59 | if p0==1: #无重复 60 | print('Add share code success') 61 | totalCount += 1 62 | print('code totalCount +=1') 63 | p1 = R.sadd('share:temp_urls',self.temp_url_piece +str(code) + '.html') 64 | if p1 ==1: #无重复 65 | print('share:temp_urls count: ',R.scard('share:temp_urls')) 66 | temp_url = self.temp_url_piece +str(code) + '.html' 67 | count += 1 68 | yield scrapy.Request(url=temp_url,callback=self.parse2,meta={'code':code}) 69 | self.log('parse end!',level=log.INFO) 70 | else: 71 | print('Add share code duplicate') 72 | except Exception as e: 73 | print(e) 74 | continue 75 | 76 | def parse2(self,response): 77 | #print('response body: ',response.body) 78 | if response.status == 200: 79 | #print('url: ',response.url) 80 | #print('response.body: ',response.body) 81 | code = response.meta['code'] 82 | #print('__init__: ',Selector(response=response).xpath('//input[@name="date_start_type"]/@value').extract()) 83 | if Selector(response=response).xpath('//input[@name="date_start_type"]/@value').extract(): 84 | start_date = Selector(response=response).xpath('//input[@name="date_start_type"]/@value').extract()[0].replace('-','') 85 | else: 86 | start_date = Selector(response=response).xpath('//input[@type="text"]/@value').extract()[0].replace('-','') 87 | end_date = Selector(response=response).xpath('//input[@name="date_end_type"]/@value').extract()[0].replace('-','') 88 | print('start_date: ',start_date) 89 | download_url = self.download_url_piece+str(code)+"&start="+str(start_date)+"&end="+str(end_date)+"&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP" 90 | #yield scrapy.Request(url=download_url,meta={'code',code},callback=self.parse3) 91 | r = ShareLoader(response=response) 92 | pool=redis.ConnectionPool(host='localhost', port=6379, decode_responses=True) 93 | R = redis.Redis(connection_pool=pool) 94 | 95 | p2 = R.sadd('share:download_url',download_url) 96 | 97 | if p2: 98 | print('download_url success') 99 | print('download_url: ',download_url) 100 | 101 | r.add_value('number',code) 102 | r.add_value('data',download_url) 103 | self.log('Add download_url one more', level=log.INFO) 104 | 105 | return r.load_item() 106 | else: 107 | self.log('Response not 200!! ',level=log.WARNING) 108 | -------------------------------------------------------------------------------- /master/share_code/share_code/spiders/ShareSpider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/spiders/ShareSpider.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
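# Illustrative sketch only, not part of the project: ShareSpider.parse2 above stores every
# per-share history download URL in the Redis set 'share:download_url'. A consumer on any
# machine could take one of those URLs and fetch the CSV roughly as below; the Redis
# connection mirrors the one ShareSpider itself uses (localhost:6379), while the output
# directory is a hypothetical choice.
import redis
import requests

r = redis.Redis(host='localhost', port=6379, decode_responses=True)
url = r.spop('share:download_url')                 # pop (and remove) one queued download URL; None if the set is empty
if url:
    resp = requests.get(url, timeout=30)
    code = url.split('code=0')[1].split('&')[0]    # the share code sits right after 'code=0' in the URL
    with open('/tmp/%s.csv' % code, 'wb') as f:    # hypothetical local output path
        f.write(resp.content)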
5 | -------------------------------------------------------------------------------- /master/share_code/share_code/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/spiders/__init__.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/spiders/__pycache__/ShareSpider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/spiders/__pycache__/ShareSpider.cpython-36.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/master/share_code/share_code/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /master/share_code/share_code/untitled2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 20 01:08:28 2018 4 | 5 | @author: Administrator 6 | """ 7 | 8 | class zoo: 9 | b=1 10 | def __init__(self): 11 | print('init',self.b) 12 | @classmethod 13 | def ok(cls,iin): 14 | print('classmethod: ',iin) 15 | print('hello') 16 | global a 17 | a=1 18 | a+=1 19 | print(a) 20 | zoo() -------------------------------------------------------------------------------- /master/share_code/zookeeper.out: -------------------------------------------------------------------------------- 1 | 2018-05-12 17:40:42,831 [myid:] - INFO [main:QuorumPeerConfig@134] - Reading configuration from: /opt/zookeeper-3.4.10/bin/../conf/zoo.cfg 2 | 2018-05-12 17:40:42,875 [myid:] - INFO [main:QuorumPeer$QuorumServer@167] - Resolved hostname: 192.168.111.131 to address: /192.168.111.131 3 | 2018-05-12 17:40:42,877 [myid:] - INFO [main:QuorumPeer$QuorumServer@167] - Resolved hostname: 192.168.111.128 to address: /192.168.111.128 4 | 2018-05-12 17:40:42,880 [myid:] - INFO [main:QuorumPeer$QuorumServer@167] - Resolved hostname: 192.168.111.129 to address: /192.168.111.129 5 | 2018-05-12 17:40:42,881 [myid:] - INFO [main:QuorumPeer$QuorumServer@167] - Resolved hostname: 192.168.111.130 to address: /192.168.111.130 6 | 2018-05-12 17:40:42,882 [myid:] - WARN [main:QuorumPeerConfig@352] - Non-optimial configuration, consider an odd number of servers. 7 | 2018-05-12 17:40:42,882 [myid:] - INFO [main:QuorumPeerConfig@396] - Defaulting to majority quorums 8 | 2018-05-12 17:40:42,893 [myid:0] - INFO [main:DatadirCleanupManager@78] - autopurge.snapRetainCount set to 3 9 | 2018-05-12 17:40:42,893 [myid:0] - INFO [main:DatadirCleanupManager@79] - autopurge.purgeInterval set to 0 10 | 2018-05-12 17:40:42,894 [myid:0] - INFO [main:DatadirCleanupManager@101] - Purge task is not scheduled. 
11 | 2018-05-12 17:40:42,926 [myid:0] - INFO [main:QuorumPeerMain@127] - Starting quorum peer 12 | 2018-05-12 17:40:42,947 [myid:0] - INFO [main:NIOServerCnxnFactory@89] - binding to port 0.0.0.0/0.0.0.0:2181 13 | 2018-05-12 17:40:42,949 [myid:0] - ERROR [main:QuorumPeerMain@89] - Unexpected exception, exiting abnormally 14 | java.net.BindException: 地址已在使用 15 | at sun.nio.ch.Net.bind0(Native Method) 16 | at sun.nio.ch.Net.bind(Net.java:433) 17 | at sun.nio.ch.Net.bind(Net.java:425) 18 | at sun.nio.ch.ServerSocketChannelImpl.bind(ServerSocketChannelImpl.java:223) 19 | at sun.nio.ch.ServerSocketAdaptor.bind(ServerSocketAdaptor.java:74) 20 | at sun.nio.ch.ServerSocketAdaptor.bind(ServerSocketAdaptor.java:67) 21 | at org.apache.zookeeper.server.NIOServerCnxnFactory.configure(NIOServerCnxnFactory.java:90) 22 | at org.apache.zookeeper.server.quorum.QuorumPeerMain.runFromConfig(QuorumPeerMain.java:130) 23 | at org.apache.zookeeper.server.quorum.QuorumPeerMain.initializeAndRun(QuorumPeerMain.java:111) 24 | at org.apache.zookeeper.server.quorum.QuorumPeerMain.main(QuorumPeerMain.java:78) 25 | -------------------------------------------------------------------------------- /master/zoo_detect/zoo_watcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 20 00:57:45 2018 4 | 5 | @author: Administrator 6 | """ 7 | 8 | from kazoo.client import KazooClient 9 | 10 | import time 11 | 12 | import logging 13 | logging.basicConfig() 14 | 15 | zk = KazooClient(hosts='127.0.0.1:2181') 16 | zk.start() 17 | 18 | # Determine if a node exists 19 | while True: 20 | for ip in ['192.168.111.130']: 21 | if zk.exists("/ip_process/" + ip): 22 | print ("%s is alive!"%ip) 23 | else: 24 | print ("%s is dead!"%ip) 25 | break 26 | time.sleep(6) 27 | 28 | zk.stop() 29 | -------------------------------------------------------------------------------- /nupic_output.py: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | # ---------------------------------------------------------------------- 3 | # Numenta Platform for Intelligent Computing (NuPIC) 4 | # Copyright (C) 2013, Numenta, Inc. Unless you have an agreement 5 | # with Numenta, Inc., for a separate license for this software code, the 6 | # following terms and conditions apply: 7 | # 8 | # This program is free software: you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License version 3 as 10 | # published by the Free Software Foundation. 11 | # 12 | # This program is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 15 | # See the GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with this program. If not, see http://www.gnu.org/licenses. 19 | # 20 | # http://numenta.org/licenses/ 21 | # ---------------------------------------------------------------------- 22 | import csv 23 | from collections import deque 24 | from abc import ABCMeta, abstractmethod 25 | from nupic.data.inference_shifter import InferenceShifter 26 | # Some users might not have matplotlib, and will only be using NuPICFileOutput. 27 | # So we can attempt to import and swallow any import errors that occur. 
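# (If the import below fails, plt and gridspec are simply left undefined: NuPICFileOutput
#  keeps working, but constructing NuPICPlotOutput would then raise a NameError at plt.ion().)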
28 | try: 29 | import matplotlib.pyplot as plt 30 | import matplotlib.gridspec as gridspec 31 | except ImportError: 32 | pass 33 | 34 | 35 | WINDOW = 360 36 | 37 | 38 | class NuPICOutput(object): 39 | 40 | __metaclass__ = ABCMeta 41 | 42 | 43 | def __init__(self, name, show_anomaly_score=False): 44 | self.name = name 45 | self.show_anomaly_score = show_anomaly_score 46 | 47 | 48 | @abstractmethod 49 | def write(self, index, value, prediction_result, prediction_step=1): 50 | pass 51 | 52 | 53 | @abstractmethod 54 | def close(self): 55 | pass 56 | 57 | 58 | 59 | class NuPICFileOutput(NuPICOutput): 60 | 61 | 62 | def __init__(self, *args, **kwargs): 63 | super(NuPICFileOutput, self).__init__(*args, **kwargs) 64 | self.linecount = 0 65 | output_filename = '/home/sdbadmin/runSwarm/'+"%s.csv" % self.name 66 | print "Preparing to output to %s" % output_filename 67 | self.file = open(output_filename, 'w') 68 | self.writer = csv.writer(self.file) 69 | header_row = ['Time', 'value', 'prediction'] 70 | if self.show_anomaly_score: 71 | header_row.append('anomaly score') 72 | self.writer.writerow(header_row) 73 | 74 | 75 | def write(self, index, value, prediction_result, prediction_step=1): 76 | prediction = prediction_result.inferences\ 77 | ['multiStepBestPredictions'][prediction_step] 78 | output_row = [index, value, prediction] 79 | if self.show_anomaly_score: 80 | output_row.append(prediction_result.inferences['anomalyScore']) 81 | self.writer.writerow(output_row) 82 | self.linecount = self.linecount + 1 83 | 84 | 85 | def close(self): 86 | self.file.close() 87 | print "Done. Wrote %i data lines to %s." % (self.linecount, self.file.name) 88 | 89 | 90 | 91 | class NuPICPlotOutput(NuPICOutput): 92 | 93 | 94 | def __init__(self, *args, **kwargs): 95 | super(NuPICPlotOutput, self).__init__(*args, **kwargs) 96 | # turn matplotlib interactive mode on (ion) 97 | plt.ion() 98 | plt.figure(figsize=(14, 10)) 99 | gs = gridspec.GridSpec(2, 1, height_ratios=[3,1]) 100 | # plot title, legend, etc 101 | plt.title('Sine prediction example') 102 | plt.ylabel('Sine (rad)') 103 | # The shifter will align prediction and actual values. 104 | self.shifter = InferenceShifter() 105 | # Keep the last WINDOW predicted and actual values for plotting. 106 | self.actual_history = deque([0.0] * WINDOW, maxlen=360) 107 | self.predicted_history = deque([0.0] * WINDOW, maxlen=360) 108 | if self.show_anomaly_score: 109 | self.anomaly_score = deque([0.0] * WINDOW, maxlen=360) 110 | # Initialize the plot lines that we will update with each new record. 111 | if self.show_anomaly_score: 112 | plt.subplot(gs[0]) 113 | self.actual_line, = plt.plot(range(WINDOW), self.actual_history) 114 | self.predicted_line, = plt.plot(range(WINDOW), self.predicted_history) 115 | plt.legend(tuple(['actual','predicted']), loc=3) 116 | if self.show_anomaly_score: 117 | plt.subplot(gs[1]) 118 | self.anomaly_score_line, = plt.plot(range(WINDOW), self.anomaly_score, 'r-') 119 | plt.legend(tuple(['anomaly score']), loc=3) 120 | 121 | # Set the y-axis range. 122 | self.actual_line.axes.set_ylim(-1, 1) 123 | self.predicted_line.axes.set_ylim(-1, 1) 124 | if self.show_anomaly_score: 125 | self.anomaly_score_line.axes.set_ylim(-1, 1) 126 | 127 | 128 | 129 | def write(self, index, value, prediction_result, prediction_step=1): 130 | shifted_result = self.shifter.shift(prediction_result) 131 | # shifted_result = prediction_result 132 | # Update the trailing predicted and actual value deques. 
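# InferenceShifter.shift() (called above) delays each prediction so that the value predicted
# for record t is reported alongside the actual value of record t; without it the plotted
# "predicted" curve would run prediction_step records ahead of the "actual" curve.
# 'multiStepBestPredictions' maps each configured step count to the model's single best
# predicted value for that horizon.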
133 | inference = shifted_result.inferences\ 134 | ['multiStepBestPredictions'][prediction_step] 135 | if inference is not None: 136 | self.actual_history.append(shifted_result.rawInput['sine']) 137 | self.predicted_history.append(inference) 138 | if self.show_anomaly_score: 139 | anomaly_score = prediction_result.inferences['anomalyScore'] 140 | self.anomaly_score.append(anomaly_score) 141 | 142 | # Redraw the chart with the new data. 143 | self.actual_line.set_ydata(self.actual_history) # update the data 144 | self.predicted_line.set_ydata(self.predicted_history) # update the data 145 | if self.show_anomaly_score: 146 | self.anomaly_score_line.set_ydata(self.anomaly_score) # update the data 147 | plt.draw() 148 | plt.tight_layout() 149 | 150 | 151 | 152 | def close(self): 153 | plt.ioff() 154 | plt.show() 155 | 156 | 157 | 158 | NuPICOutput.register(NuPICFileOutput) 159 | NuPICOutput.register(NuPICPlotOutput)
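A minimal usage sketch for NuPICFileOutput, assuming NuPIC is installed and the hard-coded /home/sdbadmin/runSwarm/ output directory exists; the FakeResult stub is hypothetical and only mimics the inferences layout of the OPF ModelResult that model.run() normally returns:

from collections import namedtuple
from nupic_output import NuPICFileOutput

# Hypothetical stand-in for an OPF ModelResult; only the fields used by write() are mimicked.
FakeResult = namedtuple("FakeResult", ["inferences"])

output = NuPICFileOutput("demo_output", show_anomaly_score=True)
for i, value in enumerate([0.1, 0.5, 0.9]):
    result = FakeResult(inferences={"multiStepBestPredictions": {1: value},
                                    "anomalyScore": 0.0})
    output.write(i, value, result, prediction_step=1)  # one CSV row: Time, value, prediction, anomaly score
output.close()  # closes the CSV and reports how many data lines were written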
-------------------------------------------------------------------------------- /share_experiment.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/python 3 | 4 | # ---------------------------------------------------------------------- 5 | # Numenta Platform for Intelligent Computing (NuPIC) 6 | # Copyright (C) 2013, Numenta, Inc. Unless you have an agreement 7 | # with Numenta, Inc., for a separate license for this software code, the 8 | # following terms and conditions apply: 9 | # 10 | # This program is free software: you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License version 3 as 12 | # published by the Free Software Foundation. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 17 | # See the GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see http://www.gnu.org/licenses. 21 | # 22 | # http://numenta.org/licenses/ 23 | # ---------------------------------------------------------------------- 24 | import csv 25 | from nupic.frameworks.opf.model_factory import ModelFactory 26 | from nupic_output import NuPICFileOutput, NuPICPlotOutput 27 | from nupic.swarming import permutations_runner 28 | 29 | 30 | # Change this to switch from a matplotlib plot to file output.
31 | PLOT = False 32 | SWARM_CONFIG = { 33 | "includedFields": [ 34 | { 35 | "fieldName": "value", 36 | "fieldType": "float", 37 | "maxValue": 1.0, 38 | "minValue": 0.0 39 | } 40 | ], 41 | "streamDef": { 42 | "info": "value", 43 | "version": 1, 44 | "streams": [ 45 | { 46 | "info": "choice.csv", 47 | "source": "file://home/sdbadmin/runSwarm/choice.csv", 48 | "columns": [ 49 | "*" 50 | ] 51 | } 52 | ] 53 | }, 54 | "inferenceType": "TemporalAnomaly", 55 | "inferenceArgs": { 56 | "predictionSteps": [ 57 | 3 58 | ], 59 | "predictedField": "value" 60 | }, 61 | "swarmSize": "medium" 62 | } 63 | 64 | 65 | 66 | def swarm_over_data(filename): 67 | config=SWARM_CONFIG 68 | print('filename: ',filename) 69 | 70 | print config['streamDef']['streams'] 71 | config['streamDef']['streams'][0]['info']=filename 72 | config['streamDef']['streams'][0]['source']="file://runSwarm/"+filename 73 | return permutations_runner.runWithConfig(config, 74 | {'maxWorkers': 4, 'overwrite': True}) 75 | 76 | 77 | 78 | def run_sine_experiment(): 79 | input_file = "sine.csv" 80 | generate_data.run(input_file) 81 | model_params = swarm_over_data() 82 | if PLOT: 83 | output = NuPICPlotOutput("sine_output", show_anomaly_score=True) 84 | else: 85 | output = NuPICFileOutput("sine_output", show_anomaly_score=True) 86 | model = ModelFactory.create(model_params) 87 | model.enableInference({"predictedField": "sine"}) 88 | 89 | with open(input_file, "rb") as sine_input: 90 | csv_reader = csv.reader(sine_input) 91 | 92 | # skip header rows 93 | csv_reader.next() 94 | csv_reader.next() 95 | csv_reader.next() 96 | 97 | # the real data 98 | for row in csv_reader: 99 | angle = float(row[0]) 100 | sine_value = float(row[1]) 101 | result = model.run({"sine": sine_value}) 102 | output.write(angle, sine_value, result, prediction_step=1) 103 | 104 | output.close() 105 | 106 | def generate_data(a,filename): 107 | print "Generating data into %s" % filename 108 | fileHandle = open('/home/sdbadmin/runSwarm/'+filename,"w") 109 | writer = csv.writer(fileHandle) 110 | writer.writerow(["Time","value"]) 111 | writer.writerow(["int","float"]) 112 | writer.writerow(["",""]) 113 | 114 | for i in range(len(a)): 115 | time=i 116 | value=a[i] 117 | writer.writerow([time, value]) 118 | 119 | fileHandle.close() 120 | print "Generated %i rows of output data into %s" % (len(a), filename) 121 | 122 | 123 | def swarm(a,number,col): 124 | generate_data(a,str(col)+'.csv') 125 | model_params = swarm_over_data(filename=str(col)+'.csv') 126 | 127 | '''model params save''' 128 | import json 129 | fp=file('/home/sdbadmin/'+str(col)+'_swarmParams.csv','w') 130 | json.dump(model_params,fp) 131 | fp.close() 132 | 133 | if PLOT: 134 | output = NuPICPlotOutput(str(col)+"_swarm__output", show_anomaly_score=True) 135 | else: 136 | output = NuPICFileOutput(str(col)+"_swarm_output", show_anomaly_score=True) 137 | model = ModelFactory.create(model_params) 138 | model.enableInference({"predictedField": "value"}) 139 | 140 | input_file='/home/sdbadmin/runSwarm/'+str(col)+'.csv' 141 | with open(input_file, "rb") as sine_input: 142 | csv_reader = csv.reader(sine_input) 143 | # the real data 144 | 145 | # skip header rows 146 | csv_reader.next() 147 | csv_reader.next() 148 | csv_reader.next() 149 | 150 | for row in csv_reader: 151 | time=float(row[0]) 152 | value = float(row[1]) 153 | result = model.run({"value": value}) 154 | output.write(time,value, result, prediction_step=3) 155 | 156 | output.close() 157 | return model_params
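A usage sketch for the swarm() helper above, assuming NuPIC and its swarming package are installed and the hard-coded /home/sdbadmin/runSwarm/ directory exists; the toy series and the name "demo" are illustrative only, and the number argument is accepted but unused by swarm() itself:

import numpy as np
from share_experiment import swarm

# Toy series scaled into the [0, 1] range declared for "value" in SWARM_CONFIG.
series = (np.sin(np.linspace(0, 2 * np.pi, 200)) + 1) / 2
# Writes demo.csv under runSwarm/, swarms over it, saves the best model params to
# /home/sdbadmin/demo_swarmParams.csv and writes predictions to demo_swarm_output.csv.
model_params = swarm(series, number=0, col="demo")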
-------------------------------------------------------------------------------- /similarity.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | compute Time Series Trend Similarity 4 | D(msd)= D(Euclid)*(2-ASD/SAD) 5 | ASD --- absolute value of the sum of differences 6 | SAD --- sum of absolute differences (Manhattan distance) 7 | ''' 8 | import numpy as np 9 | 10 | def MSD(a,b): 11 | if len(a)!=len(b): 12 | print('a,b:',len(a),len(b)) 13 | print('length not equal,quit') 14 | return 15 | if not isinstance(a,np.ndarray): 16 | a=np.array(a).reshape(-1) 17 | if not isinstance(b,np.ndarray): 18 | b=np.array(b).reshape(-1) 19 | a=a.reshape(-1) 20 | b=b.reshape(-1) 21 | if (a==b).all():return 0 22 | #Euclidean distance 23 | Deuclid=np.linalg.norm(a-b) 24 | #print('Deuclid: ',Deuclid) 25 | #Manhattan distance 26 | Dmahat=sum(abs(a-b)) 27 | #print('mahaton: ',Dmahat) 28 | #ASD 29 | ASD=abs(sum(a-b)) 30 | #print('ASD: ',ASD) 31 | #MSD 32 | msd=Deuclid*(2-ASD/Dmahat) 33 | #print('msd: ',msd) 34 | return msd 35 | -------------------------------------------------------------------------------- /slave/share_code/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = share_code.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = share_code 12 | -------------------------------------------------------------------------------- /slave/share_code/share_code/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__init__.py -------------------------------------------------------------------------------- /slave/share_code/share_code/__init__.pyc:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__init__.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/__pycache__/middlewares.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/middlewares.cpython-36.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/auto_ip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 13 21:51:29 2018 4 | 5 | @author: Administrator 6 | """ 7 | import urllib 8 | import urllib2 9 | import time 10 | import redis 11 | from scrapy.selector import Selector 12 | import requests 13 | 14 | def get_url(url): # 国内高匿代理的链接 15 | url_list = [] 16 | for i in range(1,100): 17 | url_new = url + str(i) 18 | url_list.append(url_new) 19 | return url_list 20 | def get_content(url): # 获取网页内容 21 | user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0' 22 | headers = {'User-Agent': user_agent} 23 | req = requests.get(url=url, headers=headers) 24 | 25 | ''' 26 | req = urllib.request.Request(url=url, headers=headers) 27 | res = urllib.request.urlopen(req) 28 | 
content = res.read() 29 | return content.decode('utf-8') 30 | ''' 31 | return req 32 | 33 | def get_info(content): # 提取网页信息 / ip 端口 34 | datas_ip = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract() 35 | datas_head = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract() 36 | datas_port =Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract() 37 | 38 | ''' 39 | datas_ip = Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract() 40 | datas_head = Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract() 41 | datas_port =Selector(text=content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract() 42 | ''' 43 | 44 | #写入redis 45 | print('head: ',datas_head) 46 | try: 47 | R=redis.Redis(host='localhost',port='6379') 48 | except Exception as e: 49 | print(e) 50 | count=0 51 | for head,ip,port in zip(datas_head,datas_ip,datas_port): 52 | p=R.zadd('share:auto_ip_pool',str(head).lower()+'://'+str(ip)+':'+str(port),count) 53 | if p: 54 | count+=1 55 | #print(datas_ip,datas_port) 56 | return datas_head,datas_ip,datas_port 57 | def verify_ip(head,ip,port): # 验证ip有效 58 | user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0' 59 | headers = {'User-Agent':user_agent} 60 | if 'https' in head: 61 | proxy = {'https':'%s://%s:%s'%(head.lower(),ip,port)} 62 | else: 63 | proxy = {'http':'%s://%s:%s'%(head.lower(),ip,port)} 64 | print(proxy) 65 | 66 | #proxy_handler = urllib.request.ProxyHandler(proxy) 67 | proxy_handler=urllib2.ProxyHandler(proxy) 68 | #opener = urllib.request.build_opener(proxy_handler) 69 | opener = urllib2.build_opener(proxy_handler) 70 | urllib2.install_opener(opener) 71 | 72 | #test_url = "https://www.baidu.com/" 73 | test_url = "http://quote.eastmoney.com/stocklist.html#sh" 74 | req = urllib2.Request(url=test_url,headers=headers) 75 | time.sleep(3) 76 | 77 | try: 78 | R=redis.Redis(host='localhost',port='6379') 79 | except Exception as e: 80 | print(e) 81 | count=0 82 | try: 83 | res = urllib2.urlopen(req,timeout=3) 84 | #time.sleep(3) 85 | content = res.read() 86 | if content: 87 | print('that is ok') 88 | R.zadd('share:auto_ip_pool_ok',str(head).lower()+'://'+str(ip)+':'+str(port),count) 89 | count+=1 90 | else: 91 | print('its not ok') 92 | except urllib2.URLError as e: 93 | print(e.reason) 94 | except Exception as e: 95 | print(e) 96 | 97 | if __name__ == '__main__': 98 | url = 'http://www.xicidaili.com/nn/' 99 | url_list = get_url(url) 100 | for i in url_list: 101 | print(i) 102 | content = get_content(i) 103 | time.sleep(3) 104 | data=get_info(content) 105 | for head,ip,port in zip(data[0],data[1],data[2]): 106 | verify_ip(head,ip,port) 107 | -------------------------------------------------------------------------------- /slave/share_code/share_code/auto_ip_multi_pro.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 11 00:01:28 2018 4 | 5 | @author: Administrator 6 | """ 7 | from scrapy.selector import Selector 8 | import time 9 | import requests 10 | import redis 11 | from multiprocessing import Process, Queue 12 | 13 | 14 | def get_url(url): # 国内高匿代理的链接 15 | url_list = [] 16 | for i in range(1,100): 17 | url_new = url + str(i) 18 | url_list.append(url_new) 19 | return url_list 20 | 21 | def get_content(url): # 获取网页内容 22 | 
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36' 23 | headers = {'User-Agent': user_agent} 24 | req = requests.get(url=url, headers=headers) 25 | return req 26 | 27 | def get_info(content): # 提取网页信息 / ip 端口 28 | datas_ip = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[2]/text()').extract() 29 | datas_head = Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[6]/text()').extract() 30 | datas_port =Selector(content).xpath('//table[contains(@id,"ip_list")]/tr/td[3]/text()').extract() 31 | 32 | #写入redis 33 | print('head: ',datas_head) 34 | try: 35 | R=redis.Redis(host='localhost',port='6379') 36 | except Exception as e: 37 | print(e) 38 | count=0 39 | for head,ip,port in zip(datas_head,datas_ip,datas_port): 40 | p=R.zadd('share:auto_ip_pool',str(head).lower()+'://'+str(ip)+':'+str(port),count) 41 | if p: 42 | count+=1 43 | #print(datas_ip,datas_port) 44 | return datas_head,datas_ip,datas_port 45 | 46 | def verify_ip_one(old_queue,new_queue): # 验证ip有效性 47 | while 1: 48 | data=old_queue.get() 49 | print(data) 50 | if data==0: 51 | break 52 | head=data[0].lower() 53 | ip=data[1] 54 | port=data[2] 55 | print('head,data,ip') 56 | 57 | user_agent ='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0' 58 | accept_encoding ='gzip, deflate, sdch' 59 | accept_language ='zh-CN,zh;q=0.8' 60 | headers = {'User-Agent':user_agent,'Accept-Encoding':accept_encoding,'Accept-Language':accept_language} 61 | if 'https' in head: 62 | proxy = {'https':'%s://%s:%s'%(head,ip,port)} 63 | else: 64 | proxy = {'http':'%s://%s:%s'%(head,ip,port)} 65 | print(proxy) 66 | 67 | test_url = "https://www.baidu.com/" 68 | 69 | try: 70 | req = requests.get(url=test_url,proxies=proxy,headers=headers) 71 | status_code=req.status_code 72 | if status_code==200: 73 | print('that is ok') 74 | print(str(ip) + u":" + str(port)) 75 | new_queue.put([head,ip,port]) 76 | else: 77 | print('its not ok') 78 | except Exception as e: 79 | print('fall down') 80 | 81 | def verif_ip(data): 82 | old_queue=Queue() 83 | for head,ip,port in zip(data[0],data[1],data[2]): 84 | old_queue.put([head,ip,port]) #往没验证过的queue加入数据 85 | print('verify ip.....') 86 | print('old_queue: ',old_queue.qsize()) 87 | old_queue.put(0) #终止条件 88 | new_queue=Queue() 89 | works = [] 90 | for i in range(1): 91 | print('process %s'%i) 92 | works.append(Process(target=verify_ip_one, args=(old_queue,new_queue))) 93 | for work in works: 94 | print('process start') 95 | work.start() 96 | work.join() 97 | ''' 98 | for work in works: 99 | work.join() 100 | ''' 101 | try: 102 | R=redis.Redis(host='localhost',port='6379') 103 | except Exception as e: 104 | print(e) 105 | for i in range(new_queue.qsize()): 106 | head,ip,port=new_queue.get() 107 | R.sadd('share:auto_ip_pool_ok',str(head).lower()+'://'+str(ip)+':'+str(port)) 108 | print('insert one row') 109 | 110 | 111 | if __name__ == '__main__': 112 | url = 'http://www.xicidaili.com/nn/' 113 | url_list = get_url(url) 114 | for i in url_list: 115 | print(i) 116 | content = get_content(i) 117 | time.sleep(3) 118 | data=get_info(content) 119 | verif_ip(data) 120 | ''' 121 | for head,ip,port in zip(data[0],data[1],data[2]): 122 | verif_ip(head.lower(),ip,port) 123 | ''' 124 | -------------------------------------------------------------------------------- /slave/share_code/share_code/example.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 13 17:35:23 2018 4 | 5 | @author: Administrator 6 | """ 7 | 8 | from multiprocessing import Process, Queue 9 | 10 | def f(q,n): 11 | q.put([42, n, 'hello']) 12 | 13 | if __name__ == '__main__': 14 | q = Queue() 15 | p_list=[] 16 | for i in range(3): 17 | p = Process(target=f, args=(q,i)) 18 | p_list.append(p) 19 | p.start() 20 | print(q.get()) 21 | print(q.get()) 22 | print(q.get()) 23 | for i in p_list: 24 | i.join() -------------------------------------------------------------------------------- /slave/share_code/share_code/extensions.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/extensions.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/extensionsItem.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 20 19:18:54 2018 4 | 5 | @author: Administrator 6 | """ 7 | import time 8 | import logging 9 | from scrapy import signals 10 | from scrapy.exceptions import NotConfigured 11 | logger = logging.getLogger(__name__) 12 | 13 | class SpiderOpenCloseLogging(object): 14 | 15 | def __init__(self, item_count,idle_number,crawler): 16 | self.item_count = item_count 17 | 18 | self.items_scraped = 0 19 | 20 | self.idle_count = 0 21 | 22 | self.idle_list = [] 23 | 24 | self.crawler = crawler 25 | 26 | self.idle_number = idle_number 27 | 28 | @classmethod 29 | def from_crawler(cls, crawler): 30 | # first check if the extension should be enabled and raise 31 | 32 | # NotConfigured otherwise 33 | 34 | if not crawler.settings.getbool('MYEXT_ENABLED'): 35 | 36 | raise NotConfigured 37 | #idle_number 38 | idle_number = crawler.settings.getint('IDLE_NUMBER', 20) 39 | 40 | # get the number of items from settings 41 | 42 | item_count = crawler.settings.getint('ITEM_NUMBER', 10) 43 | 44 | # instantiate the extension object 45 | 46 | ext = cls(item_count,idle_number,crawler) 47 | 48 | # connect the extension object to signals 49 | 50 | crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened) 51 | 52 | crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed) 53 | 54 | crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped) 55 | 56 | crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle) 57 | 58 | # return the extension object 59 | 60 | return ext 61 | 62 | def spider_opened(self, spider): 63 | logger.info("opened spider %s redis spider Idle, Continuous idle limit: %d", spider.name, self.idle_number) 64 | 65 | def spider_closed(self, spider,reason='finished'): #默认结束reason是finished,如果spider是被引擎的 close_spider 方法所关闭,则其为调用该方法时传入的 reason 参数(默认为 'cancelled') 66 | logger.info("closed spider %s, idle count %d , Continuous idle count %d ,closed reason %s", 67 | spider.name, self.idle_count, len(self.idle_list),reason) 68 | 69 | def item_scraped(self, item, spider): 70 | self.items_scraped += 1 71 | if item: 72 | print('items add one ') 73 | else: 74 | print('item is None') 75 | print('self.idle_count: ',self.idle_count) 76 | if self.items_scraped % self.item_count == 0: 77 | spider.log("scraped %d items" % self.items_scraped) 78 | 79 | def 
spider_idle(self, spider): 80 | self.idle_count += 1 # 空闲计数 81 | print('idle_count:',self.idle_count) 82 | self.idle_list.append(time.time()) # 每次触发 spider_idle时,记录下触发时间戳 83 | idle_list_len = len(self.idle_list) # 获取当前已经连续触发的次数 84 | if idle_list_len > self.idle_number: 85 | # 连续触发的次数达到配置次数后关闭爬虫 86 | logger.info('\n continued idle number exceed {} Times' 87 | '\n meet the idle shutdown conditions, will close the reptile operation' 88 | '\n idle start time: {}, close spider time: {}'.format(self.idle_count, 89 | self.idle_list[0], self.idle_list[-1])) 90 | self.crawler.engine.close_spider(spider, 'closespider_ForNullRun') 91 | -------------------------------------------------------------------------------- /slave/share_code/share_code/extensionsItem.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/extensionsItem.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/extensionsTime.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from time import time 4 | 5 | from scrapy.exceptions import NotConfigured 6 | from twisted.internet import task 7 | from scrapy import signals 8 | 9 | 10 | class Latencies(object): 11 | """ 12 | An extension that measures throughput and latencies. 13 | """ 14 | @classmethod 15 | def from_crawler(cls, crawler): 16 | return cls(crawler) 17 | 18 | def __init__(self, crawler): 19 | self.crawler = crawler 20 | self.interval = crawler.settings.getfloat('LATENCIES_INTERVAL') 21 | 22 | if not self.interval: 23 | raise NotConfigured 24 | 25 | cs = crawler.signals 26 | cs.connect(self._spider_opened, signal=signals.spider_opened) 27 | cs.connect(self._spider_closed, signal=signals.spider_closed) 28 | cs.connect(self._request_scheduled, signal=signals.request_scheduled) 29 | cs.connect(self._response_received, signal=signals.response_received) 30 | cs.connect(self._item_scraped, signal=signals.item_scraped) 31 | 32 | self.latency, self.proc_latency, self.items = 0, 0, 0 33 | 34 | def _spider_opened(self, spider): 35 | self.task = task.LoopingCall(self._log, spider) 36 | self.task.start(self.interval) 37 | 38 | def _spider_closed(self, spider, reason): 39 | if self.task.running: 40 | self.task.stop() 41 | 42 | def _request_scheduled(self, request, spider): 43 | request.meta['schedule_time'] = time() 44 | 45 | def _response_received(self, response, request, spider): 46 | request.meta['received_time'] = time() 47 | 48 | def _item_scraped(self, item, response, spider): 49 | self.latency += time() - response.meta['schedule_time'] 50 | self.proc_latency += time() - response.meta['received_time'] 51 | self.items += 1 52 | 53 | def _log(self, spider): 54 | irate = float(self.items) / self.interval #interval 时间内处理item数 55 | latency = self.latency / self.items if self.items else 0 #单个时延 56 | proc_latency = self.proc_latency / self.items if self.items else 0 #单个响应时间 57 | 58 | spider.logger.info(("Scraped %d items at %.1f items/s, avg latency: " 59 | "%.2f s and avg time in pipelines: %.2f s") % 60 | (self.items, irate, latency, proc_latency)) 61 | 62 | self.latency, self.proc_latency, self.items = 0, 0, 0 63 | spider.logger.info('now init values again: %d %d %d'%(self.latency, self.proc_latency, self.items)) 64 | 65 | 
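Both extensions above are configured purely through crawler settings; the following is a minimal sketch of the keys their from_crawler() methods read (values here are illustrative, matching the in-code defaults):

# settings.py fragment (illustrative values)
MYEXT_ENABLED = True       # required by SpiderOpenCloseLogging, otherwise it raises NotConfigured
IDLE_NUMBER = 20           # consecutive spider_idle signals tolerated before the spider is closed
ITEM_NUMBER = 10           # log a progress line every N scraped items
LATENCIES_INTERVAL = 5.0   # seconds between Latencies throughput/latency reports (required, non-zero)

EXTENSIONS = {
    'share_code.extensionsItem.SpiderOpenCloseLogging': 100,
    'share_code.extensionsTime.Latencies': 120,
}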
-------------------------------------------------------------------------------- /slave/share_code/share_code/extensionsTime.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/extensionsTime.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy.loader import ItemLoader 10 | from scrapy.loader.processors import MapCompose, TakeFirst, Join 11 | 12 | 13 | class ShareCodeItem(scrapy.Item): 14 | # define the fields for your item here like: 15 | # name = scrapy.Field() 16 | number = scrapy.Field() 17 | 18 | data = scrapy.Field() 19 | 20 | class ShareLoader(ItemLoader): 21 | default_item_class = ShareCodeItem 22 | default_input_processor = MapCompose(lambda s: s.lstrip().replace('\r',''))#去掉左边空格 23 | default_output_processor = Join() 24 | description_out = Join() 25 | -------------------------------------------------------------------------------- /slave/share_code/share_code/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/items.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | import scrapy 9 | from scrapy import signals 10 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 11 | from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware 12 | import random 13 | import redis 14 | 15 | 16 | class ShareCodeSpiderMiddleware(object): 17 | # Not all methods need to be defined. If a method is not defined, 18 | # scrapy acts as if the spider middleware does not modify the 19 | # passed objects. 20 | 21 | @classmethod 22 | def from_crawler(cls, crawler): 23 | # This method is used by Scrapy to create your spiders. 24 | s = cls() 25 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 26 | return s 27 | 28 | def process_spider_input(self, response, spider): 29 | # Called for each response that goes through the spider 30 | # middleware and into the spider. 31 | 32 | # Should return None or raise an exception. 33 | return None 34 | 35 | def process_spider_output(self, response, result, spider): 36 | # Called with the results returned from the Spider, after 37 | # it has processed the response. 38 | 39 | # Must return an iterable of Request, dict or Item objects. 40 | for i in result: 41 | yield i 42 | 43 | def process_spider_exception(self, response, exception, spider): 44 | # Called when a spider or process_spider_input() method 45 | # (from other spider middleware) raises an exception. 
46 | 47 | # Should return either None or an iterable of Response, dict 48 | # or Item objects. 49 | pass 50 | 51 | def process_start_requests(self, start_requests, spider): 52 | # Called with the start requests of the spider, and works 53 | # similarly to the process_spider_output() method, except 54 | # that it doesn’t have a response associated. 55 | 56 | # Must return only requests (not items). 57 | for r in start_requests: 58 | yield r 59 | 60 | def spider_opened(self, spider): 61 | spider.logger.info('Spider opened: %s' % spider.name) 62 | 63 | 64 | class ShareCodeDownloaderMiddleware(object): 65 | # Not all methods need to be defined. If a method is not defined, 66 | # scrapy acts as if the downloader middleware does not modify the 67 | # passed objects. 68 | 69 | @classmethod 70 | def from_crawler(cls, crawler): 71 | # This method is used by Scrapy to create your spiders. 72 | s = cls() 73 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 74 | return s 75 | 76 | def process_request(self, request, spider): 77 | # Called for each request that goes through the downloader 78 | # middleware. 79 | 80 | # Must either: 81 | # - return None: continue processing this request 82 | # - or return a Response object 83 | # - or return a Request object 84 | # - or raise IgnoreRequest: process_exception() methods of 85 | # installed downloader middleware will be called 86 | return None 87 | 88 | def process_response(self, request, response, spider): 89 | # Called with the response returned from the downloader. 90 | 91 | # Must either; 92 | # - return a Response object 93 | # - return a Request object 94 | # - or raise IgnoreRequest 95 | return response 96 | 97 | def process_exception(self, request, exception, spider): 98 | # Called when a download handler or a process_request() 99 | # (from other downloader middleware) raises an exception. 
100 | 101 | # Must either: 102 | # - return None: continue processing this exception 103 | # - return a Response object: stops process_exception() chain 104 | # - return a Request object: stops process_exception() chain 105 | pass 106 | 107 | def spider_opened(self, spider): 108 | spider.logger.info('Spider opened: %s' % spider.name) 109 | 110 | class DoubleUserAgentMiddleware(UserAgentMiddleware): 111 | ''' 112 | 设置User-Agent 113 | ''' 114 | 115 | def __init__(self, user_agent): 116 | self.user_agent = user_agent 117 | 118 | @classmethod 119 | def from_crawler(cls, crawler): 120 | return cls( 121 | user_agent=crawler.settings.get('DOUBLE_USER_AGENT') 122 | ) 123 | 124 | def process_request(self, request, spider): 125 | agent = random.choice(self.user_agent) 126 | request.headers['User-Agent'] = agent 127 | 128 | class AuToIpMiddleware(HttpProxyMiddleware): 129 | '''设置自动IP''' 130 | def __init__(self,ipaddr=''): 131 | super(AuToIpMiddleware,self).__init__ 132 | self.ipaddr=ipaddr 133 | 134 | def process_request(self, request, spider): 135 | '''对request对象加上proxy''' 136 | self.ipaddr=self.get_random_proxy() 137 | print('The ip is: '+str(self.ipaddr)) 138 | request.meta["proxy"] = self.ipaddr 139 | 140 | def process_response(self, request, response, spider): 141 | '''对返回的response处理''' 142 | # 如果返回的response状态不是200,重新生成当前request对象 143 | if response.status != 200: 144 | proxy = self.get_random_proxy() 145 | print("this is response ip:"+proxy) 146 | # 对当前reque加上代理 147 | request.meta['proxy'] = self.ipaddr 148 | return request 149 | return response 150 | 151 | def get_random_proxy(self): 152 | try: 153 | R=redis.Redis(host='192.168.111.130',port='6379') 154 | except Exception as e: 155 | print(e) 156 | length=R.zcard('share:auto_ip_pool_ok') 157 | number=random.randint(0,length-1) 158 | ipaddr=R.zrange('share:auto_ip_pool_ok',number,number)[0] #取出随机一个ip 159 | ipaddr=ipaddr.decode('utf-8') 160 | return ipaddr 161 | -------------------------------------------------------------------------------- /slave/share_code/share_code/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/middlewares.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from hdfs import Client 8 | from scrapy.exceptions import DropItem 9 | 10 | 11 | class ShareCodePipeline(object): 12 | def __init__(self,client): 13 | print("ShareCodePipeline __init__") 14 | self.client=client 15 | ''' 16 | try: 17 | client=Client('http://192.168.111.130:50070') 18 | except Exception as e: 19 | print(e) 20 | self.client=client 21 | ''' 22 | @classmethod 23 | def from_settings(cls,settings): 24 | hdfs_master=settings['HDFS_MASTER'] 25 | hdfs_address=settings['HDFS_ADDRESS'] 26 | try: 27 | client=Client('http://'+str(hdfs_master)+':'+str(hdfs_address)) 28 | except Exception as e: 29 | print(e) 30 | 31 | return cls(client) 32 | 33 | def process_item(self, item, spider): 34 | print("ShareCodePipeline process_item") 35 | 36 | if item['number']: 37 | 
number=item['number'] 38 | else: 39 | raise DropItem('Missing number in %s'%item) 40 | if item['data']: 41 | data=item['data'] 42 | else: 43 | raise DropItem('Missing data in %s'%item) 44 | 45 | data_str=data #内含中文,先编码成utf-8 46 | ''' 47 | try: 48 | client=Client('http://192.168.111.130:50070') 49 | except Exception as e: 50 | print(e) 51 | ''' 52 | 53 | try: 54 | print('begin write') 55 | if not self.client.content('/sdbadmin/hadoop/input/'+str(number)+'.csv',strict=False): 56 | self.client.write('/sdbadmin/hadoop/input/'+str(number)+'.csv',data=data_str,encoding='utf-8') 57 | print('hdfs client close!') 58 | print('end write') 59 | else: 60 | print('dupilicate data!') 61 | except Exception as e: 62 | print(e) 63 | 64 | return item 65 | -------------------------------------------------------------------------------- /slave/share_code/share_code/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/pipelines.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for share_code project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'share_code' 13 | 14 | SPIDER_MODULES = ['share_code.spiders'] 15 | NEWSPIDER_MODULE = 'share_code.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'share_code (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'share_code.middlewares.ShareCodeSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'scrapy.downloadermiddleware.useragent.UserAgentMiddleware': None, 57 | 'share_code.middlewares.DoubleUserAgentMiddleware': 240, 58 | 
'share_code.middlewares.AuToIpMiddleware': None, #340 59 | 'share_code.middlewares.ShareCodeDownloaderMiddleware': 543, 60 | 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware':None, 61 | } 62 | 63 | # Enable or disable extensions 64 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 71 | ITEM_PIPELINES = { 72 | 'share_code.pipelines.ShareCodePipeline': None, 73 | } 74 | 75 | # Enable and configure the AutoThrottle extension (disabled by default) 76 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 77 | #AUTOTHROTTLE_ENABLED = True 78 | # The initial download delay 79 | #AUTOTHROTTLE_START_DELAY = 5 80 | # The maximum download delay to be set in case of high latencies 81 | #AUTOTHROTTLE_MAX_DELAY = 60 82 | # The average number of requests Scrapy should be sending in parallel to 83 | # each remote server 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG = False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED = True 91 | #HTTPCACHE_EXPIRATION_SECS = 0 92 | #HTTPCACHE_DIR = 'httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 94 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 95 | 96 | MYEXT_ENABLED = True 97 | 98 | LATENCIES_INTERVAL = 5 99 | ITEM_NUMBERS=10 100 | IDLE_NUMBERS=20 101 | 102 | MEMUSAGE_NOTIFY_MAIL = ['3081881935@qq.com'] 103 | MEMUSAGE_REPORT = True 104 | MEMUSAGE_ENABLED = True 105 | MEMUSAGE_LIMIT_MB = 4112 106 | MEMDEBUG_ENABLED = True 107 | MEMDEBUG_NOTIFY = [] 108 | 109 | EXTENSIONS = { 110 | 'share_code.extensionsItem.SpiderOpenCloseLogging': 100, 111 | 'share_code.extensionsTime.Latencies': 120, 112 | 'scrapy.contrib.memusage.MemoryUsage': 50, 113 | 'scrapy.contrib.memdebug.MemoryDebugger': 60 114 | } 115 | 116 | ITEM_PIPELINES = { 117 | 'share_code.pipelines.ShareCodePipeline':100 118 | } 119 | 120 | # Enables scheduling storing requests queue in redis. 121 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 122 | 123 | # Ensure all spiders share same duplicates filter through redis. 
124 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 125 | 126 | REDIS_START_URLS_AS_SET = True 127 | 128 | REDIS_HOST = '192.168.111.130' 129 | REDIS_PORT = '6379' 130 | 131 | HDFS_MASTER = '192.168.111.130' 132 | HDFS_ADDRESS = '50070' 133 | 134 | DOUBLE_USER_AGENT = [ 135 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 136 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 137 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 138 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 139 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 140 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 141 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 142 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 143 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 144 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 145 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 146 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 147 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 148 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 149 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 150 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 151 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 152 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 153 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 154 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 155 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", 156 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 157 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 158 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 159 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", 160 | "Mozilla/4.0 (compatible; 
MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 161 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 162 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 163 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 164 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 165 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", 166 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", 167 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 168 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", 169 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", 170 | ] 171 | -------------------------------------------------------------------------------- /slave/share_code/share_code/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/settings.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/spiders/SlaveSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 26 13:42:52 2018 4 | 5 | @author: Administrator 6 | """ 7 | import time 8 | from scrapy_redis.spiders import RedisSpider 9 | import redis 10 | from scrapy import log 11 | from share_code.items import ShareLoader 12 | #from scrapy.selector import Selector 13 | import re 14 | from kazoo.client import KazooClient 15 | import sys 16 | reload(sys) 17 | sys.setdefaultencoding('gbk') 18 | 19 | class SlaveSpider(RedisSpider): 20 | name = "spider" 21 | download_delay=2 22 | #allowed_domains = ["spider.org"] 23 | redis_key = 'share:download_url' 24 | 25 | zk = KazooClient(hosts='127.0.0.1:2181') 26 | zk.start() 27 | 28 | # Ensure a path, create if necessary 29 | zk.ensure_path("/ip_process") 30 | 31 | # Create a node with data 32 | zk.create("/ip_process/192.168.111.129", 33 | value=b"ok", ephemeral=True) 34 | 35 | pool=redis.ConnectionPool(host='192.168.111.130', port=6379, decode_responses=True) 36 | 37 | 38 | def parse(self, response): 39 | #print('response.body: ',response.body.decode('gbk')) #str 40 | print('response.encoding: ',response.encoding) 41 | #print('body: ',response.body) 42 | #print('response.request.meta: ',response.request.meta) 43 | self.log('parse begin!',level=log.INFO) 44 | try: 45 | #global pool 46 | R=redis.Redis(connection_pool=self.pool) 47 | except Exception as e: 48 | self.log(e,level=log.ERROR) 49 | print('response.url: ',response.url) 50 | if R.sismember('share:dupefilter_bak',response.url): #如果url不重复 51 | self.log('download_url repeat! 
stop this chance and continue',level=log.DEBUG) 52 | #return None 53 | else: 54 | #打印user-agent 55 | print('user-agent: ',response.request.headers['User-Agent']) 56 | time.sleep(10) 57 | el=ShareLoader(response=response) 58 | 59 | #text=str(response.body,encoding='gbk') 60 | text=str(response.body) if type(response.body) != 'str' else response.body 61 | el.add_value('data',text) 62 | 63 | pat=re.compile('.*?code=0(\d+).*?') #正则表达式,取出code 64 | download_url=response.url 65 | code=pat.findall(download_url) 66 | print('code: ',code) 67 | 68 | el.add_value('number',code) 69 | 70 | R.sadd('share:dupefilter_bak',download_url) #储存已经spider的网页url,实现去重 71 | 72 | self.log('parse end!',level=log.INFO) 73 | 74 | return el.load_item() 75 | -------------------------------------------------------------------------------- /slave/share_code/share_code/spiders/SlaveSpider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/spiders/SlaveSpider.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /slave/share_code/share_code/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/spiders/__init__.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/spiders/__pycache__/SlaveSpider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/spiders/__pycache__/SlaveSpider.cpython-36.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jiede1/spider-based-on-scrapy_redis-for-share-and-share-prediction-algorithm-search/9c9b2988dce14da21e2a90bec5038bb5512e9e5c/slave/share_code/share_code/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /slave/share_code/share_code/untitled2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 20 01:08:28 2018 4 | 5 | @author: Administrator 6 | """ 7 | 8 | class zoo: 9 | b=1 10 | def __init__(self): 11 | print('init',self.b) 12 | @classmethod 13 | def ok(cls,iin): 14 | print('classmethod: ',iin) 15 | print('hello') 16 | global a 17 | a=1 18 | a+=1 19 | print(a) 20 | zoo() -------------------------------------------------------------------------------- 
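The SlaveSpider shown above keeps its own de-duplication set in Redis (share:dupefilter_bak) alongside scrapy_redis's request fingerprints: parse() skips any response whose URL is already in the set and adds the URL once the item has been built. A minimal standalone sketch of that mechanism, reusing the host and key from the spider code (the example URL is hypothetical):

import redis

# Connection settings match SlaveSpider.pool above.
pool = redis.ConnectionPool(host='192.168.111.130', port=6379, decode_responses=True)
r = redis.Redis(connection_pool=pool)

def already_crawled(url):
    # True if some slave has already parsed this download url.
    return r.sismember('share:dupefilter_bak', url)

def mark_crawled(url):
    # Record the url so the other slaves skip it next time.
    r.sadd('share:dupefilter_bak', url)

if __name__ == '__main__':
    url = 'http://example.com/history?code=0600000'  # hypothetical stock-data url
    if already_crawled(url):
        print('download_url repeat! skip')
    else:
        # ... download and parse the page, build the ShareLoader item ...
        mark_crawled(url)
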
/slave/zoo_detect/zoo_watcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 20 00:57:45 2018 4 | 5 | @author: Administrator 6 | """ 7 | 8 | from kazoo.client import KazooClient 9 | 10 | import time 11 | 12 | import logging 13 | logging.basicConfig() 14 | 15 | zk = KazooClient(hosts='127.0.0.1:2181') 16 | zk.start() 17 | 18 | # Determine if a node exists 19 | while True: 20 | for ip in ['192.168.111.129']: 21 | if zk.exists("/ip_process/" + ip): 22 | print ("%s is alive!"%ip) 23 | else: 24 | print ("%s is dead!"%ip) 25 | break 26 | time.sleep(5) 27 | 28 | zk.stop() 29 | -------------------------------------------------------------------------------- /test_nupic.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import time 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | from hdfs.client import Client 8 | from similarity import MSD 9 | import copy 10 | from sklearn.neural_network import MLPRegressor 11 | from sklearn import preprocessing 12 | from sklearn.model_selection import GridSearchCV 13 | from KMSD import kmeans_MSD 14 | from share_experiment import swarm 15 | from nupic.frameworks.opf.model_factory import ModelFactory 16 | from nupic_output import NuPICFileOutput, NuPICPlotOutput 17 | from nupic.swarming import permutations_runner 18 | ''' 19 | filepath='/sdbadmin/hadoop/input' 20 | try: 21 | client=Client('http://192.168.111.130:50070') 22 | except Exception as e: 23 | print(e) 24 | 25 | dirs=client.list(filepath) 26 | ''' 27 | 28 | import os 29 | filepath='/opt/share_code_data' 30 | dirs=os.listdir(filepath) 31 | 32 | #将hdfs本地化 33 | print('there are %d shares'%(len(dirs))) 34 | ''' 35 | try: 36 | for i in range(len(dirs)): 37 | client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i]) 38 | print('download success') 39 | except Exception as e: 40 | print(e) 41 | ''' 42 | #即使是使用nupic,也考虑预处理 43 | min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)) 44 | DD=pd.DataFrame([]) 45 | for i in range(len(dirs)): 46 | df=pd.read_csv('/opt/share_code_data/'+dirs[i],index_col=0) #利用第一列做索引 47 | if len(DD)==len(df) and len(df)!=0: 48 | trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1, 1) ) 49 | DD[dirs[i].strip().split('.')[0]]=trun 50 | elif len(df)!=0: 51 | cols=DD.columns 52 | trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1)) 53 | DD=pd.concat([DD,pd.DataFrame(trun)],axis=1) #长度不一致的合并 54 | f=list(cols) 55 | f.append(dirs[i].strip().split('.')[0]) 56 | DD.columns=f 57 | DD.fillna(0, inplace=True) 58 | 59 | print('DD shape: ',np.shape(DD)) 60 | print(DD.head()) #[6690 rows x 169 columns] 61 | #time.sleep(6) 62 | clusters_DF=kmeans_MSD(DD.values,k=3) # 返回一个dict,里面包含了所有的聚类,k是聚类数 63 | clusterAss=clusters_DF[1] 64 | 65 | newD=[] 66 | leftD=[] 67 | for i in range(DD.shape[1]): 68 | if len(np.nonzero(DD.values [:,i]==0)[0])<=int(0.3*DD.shape[0]): 69 | if len(newD)==0: 70 | newD=copy.deepcopy(DD.iloc[:,i]) 71 | else: 72 | newD=pd.concat([newD,copy.deepcopy(DD.iloc[:,i])],axis=1) 73 | else: 74 | if len(leftD)==0: 75 | leftD=copy.deepcopy(DD.iloc[:,i]) 76 | else: 77 | leftD=pd.concat([leftD,copy.deepcopy(DD.iloc[:,i])],axis=1) 78 | 79 | ser=set(list(np.array(clusterAss[:,0]))) #聚类数 80 | print('cluster leibie: ',ser) 81 | 82 | #对newD的数据进行操作 83 | for clu in ser: 84 | if str(clu)=='0.0':continue 85 | 
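    # For each remaining cluster label, the block below:
    #   1. selects the columns of newD that k-means assigned to this cluster,
    #   2. picks the series closest to the cluster centroid (clusterAss[:,1]
    #      holds that distance) as the representative series,
    #   3. runs the NuPIC swarm on the non-zero values of that representative
    #      series to obtain the best model parameters and builds one model,
    #   4. replays every CSV of the cluster through that single model, writing
    #      predictions and anomaly scores with NuPICFileOutput.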
print('newD:',newD.shape,clusterAss.shape) 86 | Data=copy.deepcopy(newD.iloc[:,np.nonzero(clusterAss[:,0]==clu)[0]]) #选出某一聚类的所有数据 87 | clusterA=copy.deepcopy(clusterAss[np.nonzero(clusterAss[:,0]==clu)[0]]) #选出某一聚类的所有类分类结果数据 88 | minV=min(clusterA[:,1]) #选出最短距离 89 | index=list(clusterA[:,1]).index(minV) #找出最短距离对应的share数据是哪一条 90 | 91 | data=copy.deepcopy(Data.iloc[:,index].values) 92 | data=data[np.nonzero(data!=0)[0]] #swarm只考虑非0数据 93 | print('Data.columns: ',Data.columns) 94 | #time.sleep(6) 95 | paras=swarm(data,number=index,col=Data.columns[index]) #运行swarm 96 | print('paras: ',paras) #best params 97 | import csv 98 | model = ModelFactory.create(paras) 99 | model.enableInference({"predictedField": "value"}) 100 | 101 | output = NuPICFileOutput("output"+str(clu), show_anomaly_score=True) 102 | 103 | for i in range(Data.shape[1]): 104 | input_file='/opt/share_code_data/'+str(Data.columns[i])+'.csv' 105 | with open(input_file, "rb") as sine_input: 106 | csv_reader = csv.reader(sine_input) 107 | # the real data 108 | 109 | # skip header rows 110 | csv_reader.next() 111 | csv_reader.next() 112 | csv_reader.next() 113 | 114 | for row in csv_reader: 115 | timeS=row[0] 116 | value = float(row[6]) 117 | result = model.run({"value": value}) 118 | output.write(timeS,value, result, prediction_step=3) 119 | 120 | 121 | 122 | #对leftD数据进行操作 123 | maxl=0;maxi=-1 124 | for i in range(leftD.shape[1]): #找出最长的非0序列 125 | if maxl=int(0.1*len(tempDF)): #对于目标列,限制为10%的好处是,可以避免这种情况:两个序列中,有一个序列非常多-1,导致计算不准确 109 | print('Not satisfied!') 110 | continue 111 | simL=[] 112 | for j in range(tempDF.shape[1]): #遍历所有股票数 113 | if j!=detect: #自己不会跟自己计算 114 | l=len(np.nonzero(tempDF.iloc[:,j]==-1)[0]) 115 | if l<=int(0.1*len(tempDF)): #如果-1占比不超过10%,计算 116 | sim=local_MSD_layer(tempDF.iloc[:,detect],tempDF.iloc[:,j]) 117 | simL.append({sim:str(detect)+'-'+str(j)}) #相似度做keys 118 | else: 119 | print('%d col is not suitable for %d line because -1 too much '%(j,detect)) 120 | D=sortedTw(simL) #得到与detect符合的20个序列序号,以及相似度 121 | Data,Label = update_train(tempDF,D,detect,Data,Label) #更新train训练集和标签 122 | print('Data print: ',Data) 123 | #time.sleep(6) 124 | return Data,Label 125 | 126 | def percent(a): 127 | a=pd.DataFrame(a) 128 | a.columns=['origin','predict'] 129 | b=[] 130 | for i in range(len(a)): 131 | d=abs(a.iloc[i,0]-a.iloc[i,1])/a.iloc[i,1] if a.iloc[i,1] else 'Error' 132 | if not isinstance(d,str): 133 | d=str(d*100)+'%' 134 | b.append(d) 135 | a['loss']=b 136 | return a 137 | def use_algorithm(Data,Label,pro=0.7): #利用sklearn的神经网络实现预测,默认0.7训练集 138 | #from sklearn.neural_network import MLPRegressor 139 | train_length=int(pro*len(Data)) 140 | trainD=Data[:train_length,:] 141 | testD=Data[train_length:,:] 142 | trainL=Label[:train_length] 143 | testL=Label[train_length:] 144 | print('trainD,trainL,testD,testL: ',trainD.shape,trainL.shape,testD.shape,testL.shape) 145 | #exit(1) 146 | # 神经网络 147 | parameters = { 148 | 'hidden_layer_sizes': [(15,),(15,7,3),(21,3),(7,3,3),(15,7,5)], 149 | 'max_iter': [20000,100000,200000], 150 | 'momentum': [0,0.5,0.7,1], 151 | 'learning_rate': ['adaptive','constant','invscaling'],\ 152 | 'solver': ['sgd','adam'],\ 153 | 'shuffle': [False],\ 154 | 'activation': ['logistic','relu','tanh'] 155 | } 156 | mlp = MLPRegressor() 157 | clf = GridSearchCV(mlp, parameters) 158 | model=clf.fit(trainD,trainL) 159 | bestp=clf.best_params_ 160 | with open('best_params.csv','w') as f: 161 | f.write(str(bestp)) 162 | f.close() 163 | with open('best_params.csv','r') as f: 164 | bestp=eval(f.read()) 165 | f.close() 
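    # The best parameters found by GridSearchCV are persisted to
    # best_params.csv as a repr() string and read straight back with eval();
    # this is only safe because the file was just written locally above.
    # The explicit f.close() calls are redundant inside "with" blocks.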
166 | print('bestp: ',bestp) 167 | #clf = MLPRegressor(hidden_layer_sizes=(15,), max_iter=100000,learning_rate='adaptive',solver='sgd',shuffle=False,activation='logistic') 168 | #model = clf.fit(trainD,trainL) 169 | predictD = model.predict(testD) #predict data 170 | 171 | nnDF=np.concatenate([predictD.reshape(-1,1),testL.reshape(-1,1)],axis=1) 172 | loss=percent(nnDF) 173 | print('loss: ',loss) 174 | #coefficient of determination ---- 1.0 is the best 175 | score = model.score(testD,testL) 176 | print('score: ',score) 177 | loss.to_csv('/home/sdbadmin/loss.csv',index=False) #将预测结果本地化 178 | 179 | if __name__=='__main__': 180 | 181 | import time 182 | beginTime=time.time() 183 | ''' 184 | filepath='/sdbadmin/hadoop/input' 185 | try: 186 | client=Client('http://192.168.111.130:50070') 187 | except Exception as e: 188 | print(e) 189 | 190 | dirs=client.list(filepath) 191 | ''' 192 | import os 193 | filepath='/opt/share_code_data' 194 | dirs=os.listdir(filepath) 195 | #将hdfs本地化 196 | print('there are %d shares'%(len(dirs))) 197 | ''' 198 | try: 199 | for i in range(len(dirs)): 200 | client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i]) 201 | except Exception as e: 202 | print(e) 203 | ''' 204 | min_max_scaler = preprocessing.MinMaxScaler() 205 | DD=pd.DataFrame([]) 206 | for i in range(len(dirs)): 207 | print('which csv: ',dirs[i]) 208 | df=pd.read_csv('/opt/share_code_data/'+dirs[i],index_col=0) 209 | if len(DD)==len(df) or len(DD)==0 and len(df)!=0: 210 | trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1)) 211 | DD[dirs[i].strip().split('.')[0]]=trun.ravel() 212 | elif len(df)!=0: 213 | cols=DD.columns 214 | try: 215 | trun = min_max_scaler.fit_transform(copy.deepcopy(df.iloc[:,5]).values.reshape(-1,1)) 216 | except Exception as e: 217 | print(e) 218 | print(df.iloc[:,5]) 219 | time.sleep(56) 220 | DD=pd.concat([DD,pd.DataFrame(trun)],axis=1) #长度不一致的合并 221 | f=list(cols) 222 | f.append(dirs[i].strip().split('.')[0]) 223 | DD.columns=f 224 | DD.fillna(-1, inplace=True) 225 | 226 | print('DD shape: ',np.shape(DD)) 227 | print(DD) #[6690 rows x 169 columns] 228 | 229 | Data,Label=create_examples(DD,detect=0) 230 | 231 | use_algorithm(Data,Label) 232 | endTime=time.time() 233 | print('\nalgorithm runTime: ',endTime-beginTime) 234 | 235 | ''' 236 | for i in range(y): #遍历所有share 237 | print('Now is %s'%DD.columns[i]) 238 | for j in range(y): 239 | if i!=j: 240 | l1=np.nonzero(DD.iloc[:,i]!=-1)[0] 241 | l2=np.nonzero(DD.iloc[:,j]!=-1)[0] 242 | l=set(l1) & set(l2) 243 | tempDF=copy.deepcopy(DD.iloc[list(l),[i,j]]) 244 | print('tempDF shape: ',tempDF.shape) 245 | train_len=int(len(tempDF)*0.7) 246 | test_len=int(len(tempDF)*0.3) 247 | ''' 248 | -------------------------------------------------------------------------------- /参考博客: -------------------------------------------------------------------------------- 1 | 解决 Scrapy-Redis 空跑问题,链接跑完后自动关闭爬虫 2 | https://my.oschina.net/2devil/blog/1631116 3 | 4 | Scrapy之Extension实例——计算吞吐量及时延 5 | https://blog.csdn.net/q_an1314/article/details/51188137 6 | 7 | 使用Python进行分布式系统协调 (ZooKeeper/Consul/etcd) 8 | https://blog.csdn.net/younger_china/article/details/53063426 9 | 10 | 在Scrapy中使用IP池或用户代理(python3) 11 | https://www.cnblogs.com/xiaomingzaixian/p/7121280.html 12 | 13 | 爬取网易财经中股票的历史交易数据 14 | https://blog.csdn.net/pythoncodez/article/details/77623287 15 | 16 | 使用scrapy框架爬取股票数据 17 | https://blog.csdn.net/u010986776/article/details/79261999 18 | 19 | https://blog.csdn.net/u012150179/article/details/38091411 20 | 
scrapy-redis实现爬虫分布式爬取分析与实现 21 | 22 | 使用scrapy-redis构建简单的分布式爬虫 23 | https://blog.csdn.net/howtogetout/article/details/51633814 24 | 25 | scrapy-redis介绍(一) 26 | https://blog.csdn.net/hjhmpl123/article/details/53292602 27 | -------------------------------------------------------------------------------- /各组件安装文档说明: -------------------------------------------------------------------------------- 1 | #1.zookeeper 安装 2 | 3 | [root@sdb3 opt]# tar -zxf zookeeper-3.4.10.tar.gz 4 | [root@sdb3 opt]# chown -R sdbadmin:sdbadmin_group zookeeper-3.4.10 5 | [root@sdb3 opt]# su – sdbadmin 6 | [sdbadmin@sdb3 src]$ cd ../conf/ 7 | [sdbadmin@sdb3 conf]$ vim zoo.cfg 8 | tickTime = 2000 9 | dataDir = /opt/zookeeper-3.4.10/data 10 | clientPort = 2181 11 | initLimit = 5 12 | syncLimit = 2 13 | server.0=192.168.111.130:2888:3888 14 | server.1=192.168.111.129:2888:3888 15 | server.2=192.168.111.128:2888:3888 16 | [sdbadmin@sdb3 conf]$ cd ../ 17 | [sdbadmin@sdb3 zookeeper-3.4.10]$ mkdir data 18 | [sdbadmin@sdb3 zookeeper-3.4.10]$ cd data 19 | [sdbadmin@sdb3 data]$ echo '2' > myid 20 | --------------------------------------------------- 21 | 22 | #2.hive安装 23 | 24 | 利用MySQL储存hive的元数据信息,因此先安装MySQL 25 | 26 | yum install mysql 27 | yum install mysql-server 28 | yum install mysql-devel 29 | 30 | 下载hive安装包 31 | tar -zxcf /opt/apache-hive-1.2.2-bin.tar.gz 32 | 配置hive与MySQL的集成,储存元信息 33 | 34 | CREATE USER 'sdbadmin'@'%' IDENTIFIED BY ''; 35 | GRANT ALL ON *.* TO 'sdbadmin'@'%'; 36 | flush privileges; 37 | create databases hive; 38 | 39 | #配置hive 40 | 具体参考https://www.cnblogs.com/kinginme/p/7233315.html 41 | https://blog.csdn.net/jssg_tzw/article/details/72354470 42 | https://blog.csdn.net/jssg_tzw/article/detahttps://mp.weixin.qq.com/s?__biz=MzIzODExMDE5MA==&mid=2694182433&idx=1&sn=687b754cddc7255026434c683f487ac0#rdils/72354470 43 | 44 | 45 | #hive学习 46 | https://mp.weixin.qq.com/s?__biz=MzIzODExMDE5MA==&mid=2694182433&idx=1&sn=687b754cddc7255026434c683f487ac0#rd 47 | https://blog.csdn.net/jssg_tzw/article/details/72354470 48 | https://www.cnblogs.com/kinginme/p/7233315.html 49 | 50 | #利用hive读取hdfs数据到外部表 51 | hive> create external table os 52 | > (dt string, StockCode string, Name string, ClosingPrice float, HighestPrice float, LowestPrice float, OpeningPrice float, LastClose float, Change float, QuoteChange float, Turnover float, DealAmount float, DealValue float, TotalMarketValue float, CirculationMarketValue float) 53 | > row format delimited fields terminated by ',' lines terminated by '\n' 54 | > stored as textfile 55 | > tblproperties ("skip.header.line.count"="1"); 56 | 57 | >load data inpath '/sdbadmin/hadoop/input/601799.csv' into table os; 58 | >select * from os; 59 | --------------------------------------------------------- 60 | 61 | #3.安装redis 62 | 63 | $ wget http://download.redis.io/releases/redis-2.8.17.tar.gz 64 | $ tar xzf redis-2.8.17.tar.gz 65 | $ cd redis-2.8.17 66 | $ make 67 | $ cd src 68 | $ ./redis-server 69 | --------------------------------------- 70 | 71 | #4.安装scrapy 72 | 73 | wget https://twistedmatrix.com/Releases/Twisted/13.1/Twisted-13.1.0.tar.bz2 74 | 1. wget https://twistedmatrix.com/Releases/Twisted/17.1/Twisted-17.1.0.tar.bz2 75 | 2. tar -jxvf Twisted-17.1.0.tar.bz2 76 | 3. cd Twisted-17.1.0 77 | 4. python setup.py install 78 | 5. cd .. 79 | 6. pip install scrapy 80 | 81 | # 安装scrapy-redis 82 | 7. 
pip install scrapy-redis 83 | 84 | 解决scrapy任何地方启动: 85 | 进入 Python 的主目录,如cd /usr/local/python3.6/bin,查找 scrapy 项 86 | (2)检查 cd /usr/bin/ | ll | grep scrapy,查看是否存在 87 | (3)不存在则执行 88 | ln -s /usr/local/python-2.7/bin/scrapy /usr/bin/scrapy 89 | (4)回到shell,执行 scrapy version,成功 90 | 91 | 92 | -------------------------------------------------------------------------------- /启动步骤: -------------------------------------------------------------------------------- 1 | #项目的框架有点大,所以启动步骤有点麻烦,下面贴出了master/slave主机的启动步骤。 2 | 3 | #Master主机(Master端的程序负责给定初始url,程序基于此下载好目标url,储存进redis,供slave端程序调用) 4 | 1. [sdbadmin@sdb1 ~]$ /opt/zookeeper-3.4.10/bin/zkServer.sh start #启动zookeeper 5 | 2. [sdbadmin@sdb1 ~]$ /opt/hadoop-2.7.2/sbin/start-all.sh #启动Hadoop 6 | 3. [sdbadmin@sdb1 ~]$ /opt/redis-2.8.17/src/redis-server #启动redis 7 | 4. [sdbadmin@sdb1 ~]$hdfs dfs -mkdir -p /sdbadmin/hadoop/input 8 | 9 | 给定初始url,准备工作完毕 10 | 5. [sdbadmin@sdb1 share_code]$ /opt/redis-2.8.17/src/redis-cli sadd share:start_urls http://quote.eastmoney.com/stocklist.html#sh 11 | 12 | 进入爬虫所在目录,开始跑爬虫程序 13 | 6. [sdbadmin@sdb1 share_code]$ scrapy crawl share 14 | 15 | #Slave端主机(Slave端程序负责从redis中读取目标url,下载好股票数据,并存进hdfs) 16 | 1. [root@sdb2 ~]# service mysqld start #启动MySQL 为hive做准备 17 | 2. [sdbadmin@sdb2 ~]# hive --service metastore #启动hive 18 | 3. [sdbadmin@sdb2 share_code]# scrapy crawl spider #进入爬虫所在目录,运行爬虫 19 | 20 | -------------------------------------------------------------------------------- /笔记摘录: -------------------------------------------------------------------------------- 1 | 在redis的服务器中,会至少存在三个队列: 2 | a.用于请求对象去重的集合,队列的名称为spider.name:dupefilter,其中spider.name就是我们自定义的spider的名字,下同。 3 | b.待抓取的request对象的有序集合,队列的名称为spider.name:requests 4 | c.保存提取到item的列表,队列的名称为spider.name:items 5 | d.可能存在存放初始url的集合或者是列表,队列的名称可能是spider.name:start_urls 6 | 7 | 内存监控: 8 | MEMUSAGE_NOTIFY_MAIL = ['3081881935@qq.com'] 9 | MEMUSAGE_REPORT = True 10 | MEMUSAGE_ENABLED = True 11 | MEMUSAGE_LIMIT_MB = 2048 12 | MEMDEBUG_ENABLED = True 13 | MEMDEBUG_NOTIFY = [] 14 | 15 | EXTENSIONS = { 16 | 'share_code.extensionsItem.SpiderOpenCloseLogging': 100, 17 | 'share_code.extensionsTime.Latencies': None, 18 | 'scrapy.contrib.memusage.MemoryUsage': 50, 19 | 'scrapy.contrib.memdebug.MemoryDebugger': 60 20 | } 21 | 22 | 基于形态相似距离的时间序列相似度计算 李中刘洋洋 23 | https://wenku.baidu.com/view/58dfefbc2b160b4e777fcf77.html 24 | 25 | 26 | 隐藏层大小:(输入大小+输出大小)*2/3 27 | 28 | hive -e "select * from /sdbadmin/hadoop/input/900915.csv" >> res1.csv新建csv文件,在此之前先将hdfs数据导入hive。 29 | 30 | 启动hive: 31 | Service mysqld start 32 | hive --service metastore 33 | hive 34 | 35 | 36 | scrapy_redis原理 37 | 38 | (https://blog.csdn.net/hjhmpl123/article/details/53292602) 39 | scrapy-redis原理: 40 | 1.spider解析下载器下载下来的response,返回item或者是links 41 | 2.item或者links经过spidermiddleware的process_spider_out()方法,交给engine。 42 | 3.engine将item交给itempipeline,将links交给调度器 43 | 4.在调度器中,先将request对象利用scrapy内置的指纹函数,生成一个指纹对象 44 | 5.如果request对象中的dont_filter参数设置为False,并且该request对象的指纹不在信息指纹的队列中,那么就把该request对象放到优先级的队列中 45 | 6.从优先级队列中获取request对象,交给engine 46 | 7.engine将request对象交给下载器下载,期间会通过downloadmiddleware的process_request()方法 47 | 8.下载器完成下载,获得response对象,将该对象交给engine,期间会通过downloadmiddleware的process_response()方法 48 | 9.engine将获得的response对象交给spider进行解析,期间会经过spidermiddleware的process_spider_input()方法 49 | 10.从第一步开始循环 50 | 51 | 52 | 53 | zookeeper 路由和负载均衡的实现。 54 | 在zookeeper中,一但服务器与zookeeper集群断开连接,znode节点已经不存在,此时通过注册相应的watcher机制,服务消费者能够第一时间获取服务提供者信息的变更。利用znode的特点和watcher机制将其作为动态注册和获取服务信息的配置中心,统一管理服务名称和其对应的服务器列表,能够实时的感知到后端服务器的状态,从而保持服务配置信息能够一致以及进行简单的扩容。 55 | 
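A minimal sketch of the registration-and-watch pattern described above, using the same kazoo client as the project's zoo_watcher.py (the /ip_process path and the slave IP come from the project code; the callback body is illustrative only):

from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()

# Provider side: register this slave as an ephemeral znode; it disappears
# automatically when the ZooKeeper session is lost, which is what makes
# failure detection possible.
zk.ensure_path("/ip_process")
zk.create("/ip_process/192.168.111.129", value=b"ok", ephemeral=True)

# Consumer side: instead of polling every few seconds like zoo_watcher.py,
# register a watcher that fires whenever the set of live slaves changes.
@zk.ChildrenWatch("/ip_process")
def on_slaves_changed(children):
    print("live slaves:", children)
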
Zookeeper类似于一棵节点树,当服务提供者启动时,将服务器的名称、地址以节点的信息添加到配置中心,而服务消费中通过服务器名称以及配置中心来获得需要调用的服务节点下的服务地址,再利用负载均衡算法选取其中的某一台服务器进行调用。 56 | 57 | 58 | 59 | HIVE 介绍 60 | (https://mp.weixin.qq.com/s?__biz=MzIzODExMDE5MA==&mid=2694182433&idx=1&sn=687b754cddc7255026434c683f487ac0#rd) 61 | (1)hive 是基于 Hadoop 的一个数据仓库工具,可以将结构化的数据文件映射为一张数据库表,并提供完整的 sql 查询功能,可以将 sql 语句转换为 MapReduce 任务进行运行。其优点是学习成本低,可以通过类 SQL 语句快速实现简单的 MapReduce 统计,不必开发专门的 MapReduce 应用,十分适合数据仓库的统计分析。 62 | (2)Hive 是建立在 Hadoop 上的数据仓库基础构架。它提供了一系列的工具,可以用来进行数据提取转化加载(ETL),这是一种可以存储、查询和分析存储在 Hadoop 中的大规模数据的机制。Hive 定义了简单的类 SQL 查询语言,称为 HQL,它允许熟悉 SQL 的用户查询数据。同时,这个语言也允许熟悉 MapReduce 开发者的开发自定义的 mapper 和 reducer 来处理内建的 mapper 和 reducer 无法完成的复杂的分析工作。 63 | 64 | --------------------------------------------------------------------------------
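As a concrete view of the queue layout described at the top of these notes, the following hedged Python helper lists the scrapy_redis keys this project may use and their sizes. Key names follow the spider.name:<suffix> convention above; "share" is the master spider, share:download_url is the slave's redis_key, and share:dupefilter_bak is the slave-side backup set — adjust the list if your spider names or settings differ.

import redis

# Redis host/port match the project settings.
r = redis.Redis(host='192.168.111.130', port=6379, decode_responses=True)

for key in ('share:start_urls', 'share:download_url', 'share:dupefilter',
            'share:requests', 'share:items', 'share:dupefilter_bak'):
    kind = r.type(key)          # 'set', 'zset', 'list' or 'none'
    if kind == 'set':
        size = r.scard(key)
    elif kind == 'zset':
        size = r.zcard(key)
    elif kind == 'list':
        size = r.llen(key)
    else:
        size = 0
    print('%-22s %-5s %d' % (key, kind, size))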